Python Notes: Decision Trees (ID3)

This post walks through the ID3 algorithm, a decision-tree construction method based on information entropy. It covers the algorithm's cost function, the entropy computation, best-feature selection, and the tree-building process, and demonstrates classification on a small example data set.

from math import log
import operator
########################################
#ID3 algorithm: greedy splitting by maximum information gain
#(equivalently, the split that minimizes the weighted entropy of the children)
#Cost function:
#Suppose the samples fall into K classes.
#Look at the leaf nodes of a tree:
#If all n samples in a leaf belong to a single class, that leaf's entropy is 0.
#If the leaf's n samples are spread evenly over the K classes, the probability
#of drawing class k is (n/K)/n = 1/K, so the information content of class k is
#-ln(1/K) and the leaf's entropy is ln(K).
#Define the cost function as the weighted sum of the leaf entropies,
#i.e. the sum over leaves of (leaf sample count n) * (leaf entropy).
#The smaller this value, the better the tree classifies the training samples,
#but pushing it too low tends to overfit and weakens the tree's generalization.
########################################
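#A tiny worked illustration of this cost function (hypothetical numbers, not
#from the original post): a tree with two leaves, one holding 3 samples of a
#single class (entropy 0) and one holding 2 samples split 1/1 over two classes
#(entropy 1, using the base-2 entropy computed below), has cost 3*0 + 2*1 = 2.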
#In: a data set
#Out: its Shannon entropy
def calShannonEnt(dataSet):
    numEntries = len(dataSet)
    lblCnts = {}
    for featVec in dataSet:
        curLbl = featVec[-1]
        if curLbl not in lblCnts:
            lblCnts[curLbl] = 0
        lblCnts[curLbl] += 1
    shannonEnt = 0.0
    for key in lblCnts:
        prob = float(lblCnts[key])/numEntries
        #log base 2; without the second argument, log() is the natural log (base e)
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
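#A quick sanity check (illustrative calls, not part of the original script):
#  calShannonEnt([[1, 'yes'], [1, 'yes']])  -> 0.0  (pure set)
#  calShannonEnt([[1, 'yes'], [1, 'no']])   -> 1.0  (two classes, 50/50)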

#Create the sample data set
#Out: the data set (5 samples, 8 binary features plus a 'yes'/'no' class label)
#and the list of feature names
def createDataSet():
    dataSet = [[1, 1, 0, 1, 0, 1, 0, 0, 'yes'],
               [1, 1, 0, 1, 0, 1, 1, 1, 'yes'],
               [1, 1, 0, 1, 0, 1, 0, 1, 'no'],
               [0, 1, 1, 1, 0, 1, 1, 1, 'no'],
               [0, 1, 0, 1, 0, 0, 0, 1, 'no']]
    labels = ['no surfacing', 'flippers', 'big', 'orientation', 'test', 'block', 'flat', 'active']
    #labels = ['no surfacing', 'flippers', 'big']
    return dataSet, labels
#dataSet: the input data set  axis: index of the feature  value: the feature value to match
#Out: the matching rows with the axis-th feature column removed
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            #slicing is a half-open interval: [0, axis)
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
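#For example (an illustrative call, not part of the original script):
#  splitDataSet([[1, 'a', 'yes'], [0, 'b', 'no']], 0, 1) -> [['a', 'yes']]
#i.e. keep the rows where feature 0 equals 1, then drop that column.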
#In: a data set of n samples and M features, an n-row, (M+1)-column matrix
#whose extra column holds the y value (class label)
#Out: the column index of the feature best suited for splitting
def chooseBestFeature(dataSet):
    print("Choose DataSet:",dataSet)
    numFeatures = len(dataSet[0])-1
    baseEnt = calShannonEnt(dataSet)
    print('baseEnt:', baseEnt)
    bestInfoGain = 0.0
    bestFeature = -1

    for i in range(numFeatures):
        #all values in the i-th column
        featList = [eg[i] for eg in dataSet]
        #print("featList:", featList)
        #use a set to drop duplicates
        uniqueVals = set(featList)
        #print('uniqueVals:', uniqueVals)
        newEnt = 0.0
        #entropy after splitting on the i-th feature
        for value in uniqueVals:
            #first compute the entropy of each subset
            subDataSet = splitDataSet(dataSet, i, value)
            #weight = fraction of samples falling into this subset
            prob = len(subDataSet)/float(len(dataSet))
            newEnt += prob * calShannonEnt(subDataSet)
        #information gain of the split
        infoGain = baseEnt - newEnt
        print("InfoGain:", infoGain)
        #keep the largest information gain and its feature
        #(>= means a later feature wins ties, as in the sample run below)
        if infoGain >= bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    print('bestFeature:', bestFeature)
    return bestFeature
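#In formula form, the gain computed above for a feature A is
#  Gain(D, A) = H(D) - sum over values v of A of |D_v|/|D| * H(D_v)
#where H is the Shannon entropy and D_v is the subset of D with A = v.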
# In: a list of class label names
# Out: the class label that occurs most often
def majorityCnt(classList):
    #dictionary: key is the label, value is how often that label occurs
    classCnt = {}
    for vote in classList:
        if vote not in classCnt.keys():
            classCnt[vote] = 0
        classCnt[vote] += 1
    # sort by count, descending (the default is ascending); the comparison key
    # is the value: operator.itemgetter(1)
    sortedClassCnt = sorted(classCnt.items(), key=operator.itemgetter(1), reverse=True)
    #return the key of the first item after sorting, i.e. the most common label
    print("classList:",classList)
    print("classCnt:", classCnt)
    print("sortedClassCnt:", sortedClassCnt)
    print("majorityCnt:", sortedClassCnt[0][0])
    return sortedClassCnt[0][0]
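#An equivalent one-liner using the standard library (an alternative sketch,
#not the author's code): collections.Counter(classList).most_common(1)[0][0]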

def createTree(dataSet, labels):
    #print("\nDataSet:", dataSet, 'Labels:', labels)
    #collect every y value (class label) in dataSet
    classList = [eg[-1] for eg in dataSet]
    #print('classList:', classList, '\n')
    #stopping condition: if the data set contains only one class, return that class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    #if all features are used up and only the y column remains, stop and return
    #the majority class label (each split removes one feature; see splitDataSet)
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    #index of the feature best suited for splitting
    bestFeat = chooseBestFeature(dataSet)
    #name of that feature
    bestFeatLabel = labels[bestFeat]
    #copy labels so the input list is not modified
    subLabels = labels[:]
    #remove the feature being used for this split
    del(subLabels[bestFeat])
    #collect every value of this feature in the data set, dropping duplicates
    featValues = [eg[bestFeat] for eg in dataSet]
    uniqueValues = set(featValues)
    print('uniqueValue:', uniqueValues)
    print('subLabels:', subLabels)
    #Tree: key is a feature name; value maps each feature value to a subtree
    #(at the bottom only y values remain, all features removed) -- a recursive structure
    myTree = {bestFeatLabel: {}}

    for value in uniqueValues:
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    print("Tree:", myTree)
    return myTree
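#Note how the recursion builds the tree bottom-up: in the sample run below, the
#innermost subtree {'flat': {...}} is printed first, then it appears embedded
#under 'active', and finally under the root 'no surfacing'.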


#In: a decision tree, a sample's feature values, and the matching feature names
#(the two lists correspond one to one)
#Out: the classification result
def JudgeResult(myTree, featList, labels):
    if len(featList) != len(labels):
        return 'Unknown'
    #copy the features and feature names so the inputs are not modified
    newFeatList = featList[:]
    newLabels = labels[:]
    #walk through all the features
    for i in newLabels:
        #find the feature name at this tree node
        if i in myTree.keys():
            if type(myTree[i]).__name__ == 'dict':
                #a dict means there are child nodes; look up the matching feature value
                featValue = newFeatList[newLabels.index(i)]
                if featValue in myTree[i].keys():
                    #the sample's value for this feature picks the branch;
                    #recurse if the branch is still a dict, otherwise return the label
                    if isinstance(myTree[i][featValue], dict):
                        #remove this feature and its name, then keep judging with the
                        #remaining subtree, feature list, and feature names
                        newFeatList.pop(newLabels.index(i))
                        newLabels.remove(i)
                        #print('\nFeatList:', newFeatList, 'Labels:', newLabels, 'Tree:', myTree[i][featValue], '\n')
                        return JudgeResult(myTree[i][featValue], newFeatList, newLabels)
                    else:
                        return myTree[i][featValue]
    return 'Unknown'

#Get the number of leaf nodes in the tree
def getAllLeafs(myTree):
    numLeafs = 0
    firstKeyList = [key for key in myTree.keys()]
    #(in Python 3, dict.keys() is a view, so myTree.keys()[0] would not work)
    firstValue = myTree[firstKeyList[0]]
    for key in firstValue.keys():
        if type(firstValue[key]).__name__ == 'dict':
            numLeafs +=getAllLeafs(firstValue[key])
        else:
            numLeafs += 1
    return numLeafs


#Get the depth of the tree (the longest chain of decision nodes)
def getTreeDepth(myTree):
    maxDepth = 0
    firstKeyList = [key for key in myTree.keys()]
    firstValue = myTree[firstKeyList[0]]
    for key in firstValue.keys():
        if type(firstValue[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(firstValue[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth
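#For the tree produced by the sample run below,
#  {'no surfacing': {0: 'no', 1: {'active': {0: 'yes', 1: {'flat': {0: 'no', 1: 'yes'}}}}}}
#getAllLeafs returns 4 (the leaves 'no', 'yes', 'no', 'yes') and getTreeDepth
#returns 3 (the chain 'no surfacing' -> 'active' -> 'flat').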

# __name__ is "__main__" when this file is run directly, and the module name when imported
if __name__ == "__main__":
    # the function returns two values, so unpack into two variables
    # (a single variable would receive the whole tuple)
    dataS, labels = createDataSet()
    shannonEnt = calShannonEnt(dataS)
    # print('ShannonEnt:', shannonEnt)
    # reduceDataSet=splitDataSet(dataS, 0, 1)
    bestFeature = chooseBestFeature(dataS)
    myTree = createTree(dataS, labels)
    print(bestFeature)
    print(myTree)
    featTest = [1, 1, 0, 1, 0, 1, 0, 1]

    result = JudgeResult(myTree, featTest, labels)
    print(result)
    Leafs = getAllLeafs(myTree)
    print('Leafs:', Leafs)

    TreeDepth = getTreeDepth(myTree)
    print('TreeDepth:', TreeDepth)
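
For the test sample featTest = [1, 1, 0, 1, 0, 1, 0, 1], classification walks the tree as follows: 'no surfacing' = 1 descends into the 'active' subtree, 'active' = 1 descends into the 'flat' subtree, and 'flat' = 0 yields the label 'no', matching the output below.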

Result:

Choose DataSet: [[1, 1, 0, 1, 0, 1, 0, 0, 'yes'], [1, 1, 0, 1, 0, 1, 1, 1, 'yes'], [1, 1, 0, 1, 0, 1, 0, 1, 'no'], [0, 1, 1, 1, 0, 1, 1, 1, 'no'], [0, 1, 0, 1, 0, 0, 0, 1, 'no']]
baseEnt: 0.9709505944546686
InfoGain: 0.4199730940219749
InfoGain: 0.0
InfoGain: 0.17095059445466854
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.17095059445466854
InfoGain: 0.01997309402197489
InfoGain: 0.3219280948873623
bestFeature: 0
Choose DataSet: [[1, 1, 0, 1, 0, 1, 0, 0, 'yes'], [1, 1, 0, 1, 0, 1, 1, 1, 'yes'], [1, 1, 0, 1, 0, 1, 0, 1, 'no'], [0, 1, 1, 1, 0, 1, 1, 1, 'no'], [0, 1, 0, 1, 0, 0, 0, 1, 'no']]
baseEnt: 0.9709505944546686
InfoGain: 0.4199730940219749
InfoGain: 0.0
InfoGain: 0.17095059445466854
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.17095059445466854
InfoGain: 0.01997309402197489
InfoGain: 0.3219280948873623
bestFeature: 0
uniqueValue: {0, 1}
subLabels: ['flippers', 'big', 'orientation', 'test', 'block', 'flat', 'active']
Choose DataSet: [[1, 0, 1, 0, 1, 0, 0, 'yes'], [1, 0, 1, 0, 1, 1, 1, 'yes'], [1, 0, 1, 0, 1, 0, 1, 'no']]
baseEnt: 0.9182958340544896
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.2516291673878229
InfoGain: 0.2516291673878229
bestFeature: 6
uniqueValue: {0, 1}
subLabels: ['flippers', 'big', 'orientation', 'test', 'block', 'flat']
Choose DataSet: [[1, 0, 1, 0, 1, 1, 'yes'], [1, 0, 1, 0, 1, 0, 'no']]
baseEnt: 1.0
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 1.0
bestFeature: 5
uniqueValue: {0, 1}
subLabels: ['flippers', 'big', 'orientation', 'test', 'block']
Tree: {'flat': {0: 'no', 1: 'yes'}}
Tree: {'active': {0: 'yes', 1: {'flat': {0: 'no', 1: 'yes'}}}}
Tree: {'no surfacing': {0: 'no', 1: {'active': {0: 'yes', 1: {'flat': {0: 'no', 1: 'yes'}}}}}}
0
{'no surfacing': {0: 'no', 1: {'active': {0: 'yes', 1: {'flat': {0: 'no', 1: 'yes'}}}}}}
no
Leafs: 4
TreeDepth: 3