from math import log
import operator
########################################
# ID3 algorithm: greedy splitting by maximum information gain
# (i.e. maximum reduction in entropy)
# Evaluation function
# Let the samples fall into K classes.
# Consider one leaf node of a tree:
# If all n samples at the leaf belong to a single class, the leaf's entropy is 0.
# If the n samples at the leaf are spread evenly over all K classes, the probability
# of drawing the k-th class is (n/K)/n = 1/K, so the information content is -ln(1/K)
# and the entropy is ln K.
# Define the evaluation function as the weighted sum of the leaf entropies, i.e. the
# sum over all leaves of (number of samples n at the leaf) * (entropy of the leaf).
# The smaller this value, the better the tree classifies the training samples,
# but minimizing it often overfits, weakening the tree's generalization ability.
# (A sketch of this cost function appears right after calShannonEnt below.)
########################################
# In:  dataset
# Out: Shannon entropy of the class labels in the dataset
def calShannonEnt(dataSet):
    numEntries = len(dataSet)
    lblCnts = {}
    for featVec in dataSet:
        curLbl = featVec[-1]
        if curLbl not in lblCnts:
            lblCnts[curLbl] = 0
        lblCnts[curLbl] += 1
    shannonEnt = 0.0
    for key in lblCnts:
        prob = float(lblCnts[key])/numEntries
        # log base 2; without a base argument, log defaults to the natural logarithm (base e)
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
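# A minimal sketch (not part of the original code) of the evaluation function from the
# header comment: C(T) = sum over leaves of n_t * H(t), where n_t is the number of
# samples at leaf t and H(t) its entropy. It assumes the leaves are given as a list of
# datasets (one per leaf) in the same [features..., label] row format used in this file.
def leafWeightedEntropy(leafDataSets):
    cost = 0.0
    for leaf in leafDataSets:
        # weight each leaf's entropy by the number of samples it holds
        cost += len(leaf) * calShannonEnt(leaf)
    return cost
# Example: a pure leaf contributes 0; a 2-sample leaf split evenly over 2 classes
# contributes 2 * 1.0 = 2.0, so smaller values mean cleaner leaves.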
# Create the sample dataset
# Out: the generated dataset and the list of feature names
def createDataSet():
    dataSet = [[1, 1, 0, 1, 0, 1, 0, 0, 'yes'],
               [1, 1, 0, 1, 0, 1, 1, 1, 'yes'],
               [1, 1, 0, 1, 0, 1, 0, 1, 'no'],
               [0, 1, 1, 1, 0, 1, 1, 1, 'no'],
               [0, 1, 0, 1, 0, 0, 0, 1, 'no']]
    labels = ['no surfacing', 'flippers', 'big', 'orientation', 'test', 'block', 'flat', 'active']
    #labels = ['no surfacing', 'flippers', 'big']
    return dataSet, labels
# dataSet: input dataset; axis: index of the feature; value: the feature value to match
# Out: the subset of rows matching value on that feature, with the axis-th feature removed
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # slices are half-open intervals [start, end)
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
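# A quick illustration (values chosen here, not from the original): matching rows keep
# every column except the one at axis, so
#   splitDataSet([[1, 'a', 'yes'], [0, 'b', 'no']], 0, 1)  ->  [['a', 'yes']]
# only the first row matches value 1 on feature 0, and that feature column is dropped.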
# In:  n samples with M features, as an n-row, (M+1)-column matrix; the extra column is y (the class label)
# Out: the column index of the feature best suited for splitting
def chooseBestFeature(dataSet):
    print("Choose DataSet:", dataSet)
    numFeatures = len(dataSet[0]) - 1
    baseEnt = calShannonEnt(dataSet)
    print('baseEnt:', baseEnt)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # all values in the i-th column
        featList = [eg[i] for eg in dataSet]
        #print("featList:", featList)
        # use a set to drop duplicates
        uniqueVals = set(featList)
        #print('uniqueVals:', uniqueVals)
        newEnt = 0.0
        # compute the entropy after splitting on the i-th feature
        for value in uniqueVals:
            # entropy of each subset
            subDataSet = splitDataSet(dataSet, i, value)
            # probability (weight) of the subset
            prob = len(subDataSet)/float(len(dataSet))
            newEnt += prob * calShannonEnt(subDataSet)
        # information gain
        infoGain = baseEnt - newEnt
        print("InfoGain:", infoGain)
        # keep the largest information gain and its feature (>= means ties go to the later feature)
        if infoGain >= bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    print('bestFeature:', bestFeature)
    return bestFeature
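# Worked example for feature 0 on the sample dataset above (matches the InfoGain trace
# in the output at the end of this file): the full set has 2 'yes' / 3 'no', so
#   baseEnt = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ~= 0.9710
# splitting on feature 0 gives {value 1: 2 'yes' / 1 'no', H ~= 0.9183} and
# {value 0: 2 'no', H = 0}, hence
#   infoGain = 0.9710 - (3/5)*0.9183 - (2/5)*0 ~= 0.4200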
# In:  a list of class labels
# Out: the most frequent class label
def majorityCnt(classList):
    # dict: key = label, value = number of occurrences
    classCnt = {}
    for vote in classList:
        if vote not in classCnt:
            classCnt[vote] = 0
        classCnt[vote] += 1
    # sort by count in descending order (the default is ascending); compare on each
    # item's value: operator.itemgetter(1)
    sortedClassCnt = sorted(classCnt.items(), key=operator.itemgetter(1), reverse=True)
    # the key of the first item after sorting is the most frequent label
    print("classList:", classList)
    print("classCnt:", classCnt)
    print("sortedClassCnt:", sortedClassCnt)
    print("majorityCnt:", sortedClassCnt[0][0])
    return sortedClassCnt[0][0]
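# The standard library computes the same majority vote in one line; an equivalent
# sketch (ties may resolve in a different order than the sorted() version above):
from collections import Counter
def majorityCnt2(classList):
    return Counter(classList).most_common(1)[0][0]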
def createTree(dataSet, labels):
    #print("\nDataSet:", dataSet, 'Labels:', labels)
    # collect every y value (class label) in dataSet
    classList = [eg[-1] for eg in dataSet]
    #print('classList:', classList, '\n')
    # stopping condition: if only one class remains in the dataset, return that class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # if all features have been used up and only y remains, stop (each split removes one
    # feature, see splitDataSet) and return the majority class label of the current dataset
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # index of the feature best suited for splitting
    bestFeat = chooseBestFeature(dataSet)
    # name of that feature
    bestFeatLabel = labels[bestFeat]
    # copy labels so the input is not modified
    subLabels = labels[:]
    # remove the feature used for this split
    del(subLabels[bestFeat])
    # collect every value this feature takes in the dataset, dropping duplicates
    featValues = [eg[bestFeat] for eg in dataSet]
    uniqueValues = set(featValues)
    print('uniqueValue:', uniqueValues)
    print('subLabels:', subLabels)
    # Tree: key = feature name; value = {feature value: subtree} -- a recursive structure
    # (at the bottom only y values remain, all features having been removed)
    myTree = {bestFeatLabel: {}}
    for value in uniqueValues:
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    print("Tree:", myTree)
    return myTree
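# For the sample dataset above this produces the nested-dict tree shown in the output
# at the end of this file:
#   {'no surfacing': {0: 'no', 1: {'active': {0: 'yes', 1: {'flat': {0: 'no', 1: 'yes'}}}}}}
# each inner dict maps a feature value to either a subtree (dict) or a class label (str).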
# In:  the decision tree, a sample's feature values, and the matching feature names (one-to-one)
# Out: the classification result
def JudgeResult(myTree, featList, labels):
    if len(featList) != len(labels):
        return 'Unknown'
    # copy the features and feature names so the inputs are not modified
    newFeatList = featList[:]
    newLabels = labels[:]
    # walk through every feature name
    for i in newLabels:
        # find the feature name stored at the tree's root node
        if i in myTree:
            if isinstance(myTree[i], dict):
                # a dict means there are child nodes; look up the sample's value for this feature
                featValue = newFeatList[newLabels.index(i)]
                if featValue in myTree[i]:
                    # the sample's value decides the branch: if the branch is still a dict,
                    # recurse; otherwise return the class label
                    if isinstance(myTree[i][featValue], dict):
                        # remove this feature and its name, then keep judging with the
                        # remaining subtree, feature list, and feature names
                        newFeatList.pop(newLabels.index(i))
                        newLabels.remove(i)
                        #print('\nFeatList:', newFeatList, 'Labels:', newLabels, 'Tree:', myTree[i][featValue], '\n')
                        return JudgeResult(myTree[i][featValue], newFeatList, newLabels)
                    else:
                        return myTree[i][featValue]
    return 'Unknown'
# count the number of leaf nodes in the tree
def getAllLeafs(myTree):
    numLeafs = 0
    firstKeyList = [key for key in myTree.keys()]
    #firstKey = myTree.keys()[0]  # Python 2 style; dict.keys() is not indexable in Python 3
    firstValue = myTree[firstKeyList[0]]
    for key in firstValue.keys():
        if isinstance(firstValue[key], dict):
            numLeafs += getAllLeafs(firstValue[key])
        else:
            numLeafs += 1
    return numLeafs
# depth of the tree: the number of decision nodes along the longest root-to-leaf path
def getTreeDepth(myTree):
    maxDepth = 0
    firstKeyList = [key for key in myTree.keys()]
    firstValue = myTree[firstKeyList[0]]
    for key in firstValue.keys():
        if isinstance(firstValue[key], dict):
            thisDepth = 1 + getTreeDepth(firstValue[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth
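# For the tree printed in the output below, getAllLeafs counts the 4 string leaves
# ('no', 'yes', 'no', 'yes') and getTreeDepth follows the longest chain of nested
# dicts: 'no surfacing' -> 'active' -> 'flat', giving a depth of 3.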
# __name__ is "__main__" when this file is run directly, and the module name when imported
if __name__ == "__main__":
    # a function returning two values needs two variables to unpack them; with a single
    # variable the result is bound as one tuple
    dataS, labels = createDataSet()
    shannonEnt = calShannonEnt(dataS)
    # print('ShannonEnt:', shannonEnt)
    # reduceDataSet = splitDataSet(dataS, 0, 1)
    bestFeature = chooseBestFeature(dataS)
    myTree = createTree(dataS, labels)
    print(bestFeature)
    print(myTree)
    featTest = [1, 1, 0, 1, 0, 1, 0, 1]
    result = JudgeResult(myTree, featTest, labels)
    print(result)
    Leafs = getAllLeafs(myTree)
    print('Leafs:', Leafs)
    TreeDepth = getTreeDepth(myTree)
    print('TreeDepth:', TreeDepth)
Output:
Choose DataSet: [[1, 1, 0, 1, 0, 1, 0, 0, 'yes'], [1, 1, 0, 1, 0, 1, 1, 1, 'yes'], [1, 1, 0, 1, 0, 1, 0, 1, 'no'], [0, 1, 1, 1, 0, 1, 1, 1, 'no'], [0, 1, 0, 1, 0, 0, 0, 1, 'no']]
baseEnt: 0.9709505944546686
InfoGain: 0.4199730940219749
InfoGain: 0.0
InfoGain: 0.17095059445466854
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.17095059445466854
InfoGain: 0.01997309402197489
InfoGain: 0.3219280948873623
bestFeature: 0
Choose DataSet: [[1, 1, 0, 1, 0, 1, 0, 0, 'yes'], [1, 1, 0, 1, 0, 1, 1, 1, 'yes'], [1, 1, 0, 1, 0, 1, 0, 1, 'no'], [0, 1, 1, 1, 0, 1, 1, 1, 'no'], [0, 1, 0, 1, 0, 0, 0, 1, 'no']]
baseEnt: 0.9709505944546686
InfoGain: 0.4199730940219749
InfoGain: 0.0
InfoGain: 0.17095059445466854
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.17095059445466854
InfoGain: 0.01997309402197489
InfoGain: 0.3219280948873623
bestFeature: 0
uniqueValue: {0, 1}
subLabels: ['flippers', 'big', 'orientation', 'test', 'block', 'flat', 'active']
Choose DataSet: [[1, 0, 1, 0, 1, 0, 0, 'yes'], [1, 0, 1, 0, 1, 1, 1, 'yes'], [1, 0, 1, 0, 1, 0, 1, 'no']]
baseEnt: 0.9182958340544896
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.2516291673878229
InfoGain: 0.2516291673878229
bestFeature: 6
uniqueValue: {0, 1}
subLabels: ['flippers', 'big', 'orientation', 'test', 'block', 'flat']
Choose DataSet: [[1, 0, 1, 0, 1, 1, 'yes'], [1, 0, 1, 0, 1, 0, 'no']]
baseEnt: 1.0
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 0.0
InfoGain: 1.0
bestFeature: 5
uniqueValue: {0, 1}
subLabels: ['flippers', 'big', 'orientation', 'test', 'block']
Tree: {'flat': {0: 'no', 1: 'yes'}}
Tree: {'active': {0: 'yes', 1: {'flat': {0: 'no', 1: 'yes'}}}}
Tree: {'no surfacing': {0: 'no', 1: {'active': {0: 'yes', 1: {'flat': {0: 'no', 1: 'yes'}}}}}}
0
{'no surfacing': {0: 'no', 1: {'active': {0: 'yes', 1: {'flat': {0: 'no', 1: 'yes'}}}}}}
no
Leafs: 4
TreeDepth: 3