在学习《机器学习实战》决策树这一章的时候,有些地方的代码看不太懂,看了几篇博客,还是没有弄明白。最后仔细看书,才发现是自己没有理解数据集的组织方式。希望大家看的时候也注意一下。决策树函数所调用的数据要满足如下要求:
1、数据必须是由列表元素组成的列表,所有的列表元素都要具有相同的数据长度;
2、数据(也就是数据集中的实例)的最后一列或者每个实例的最后一个元素是当前实例的类别标签。
数据集满足如上要求后,就可以在函数的第一行(通过 len(dataSet[0])-1)判定当前数据集包含多少个特征属性。
from math import log
def calShannonEnt(dataSet):
    """Return the Shannon entropy (base 2) of the class labels in dataSet.

    Args:
        dataSet: a list of instances, each a sequence whose last element is
            the instance's class label.

    Returns:
        The entropy as a float; 0.0 for an empty data set or a data set
        whose instances all share one label.
    """
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        # Count every occurrence, including the first one.  Initialising a
        # new label to 0 without incrementing it undercounts each class and
        # leads to log(0) for labels that appear exactly once.
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
def createDataSet():
    """Build the small sample data set used to exercise the tree functions.

    Returns:
        A tuple (dataSet, labels): dataSet is a list of five instances of the
        form [feature0, feature1, classLabel]; labels holds the names of the
        two feature columns.
    """
    labels = ['no surfacing', 'flippers']
    rows = (
        (1, 1, 'yes'),
        (1, 1, 'yes'),
        (1, 0, 'no'),
        (0, 1, 'no'),
        (0, 1, 'no'),
    )
    # Each instance is materialised as its own fresh list.
    dataSet = [list(row) for row in rows]
    return dataSet, labels
def splitDataSet(dataSet, axis, value):
    """Select the instances whose feature at index axis equals value.

    Args:
        dataSet: a list of instances, each of them a list.
        axis: index of the feature column to test.
        value: the feature value an instance must have to be kept.

    Returns:
        A new list containing, for every matching instance, a new list with
        the element at index axis removed.  The input is not modified.
    """
    def _withoutAxis(row):
        # Equivalent to deleting row[axis]: keep what precedes it, then
        # append what follows it.
        remainder = row[:axis]
        remainder.extend(row[axis + 1:])
        return remainder

    return [_withoutAxis(row) for row in dataSet if row[axis] == value]
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature index whose split yields the largest information gain.

    Args:
        dataSet: a non-empty list of instances (lists of equal length) whose
            last element is the class label.

    Returns:
        The index of the best feature, or -1 when no feature gives an
        information gain greater than 0.0.
    """
    # The last element of every instance is its class label, not a feature.
    featureCount = len(dataSet[0]) - 1
    # Entropy of the whole, unsplit data set.
    baseEntropy = calShannonEnt(dataSet)
    bestFeature = -1
    bestInfoGain = 0.0
    for i in range(featureCount):
        distinctValues = set(example[i] for example in dataSet)
        # Entropy after the split: each subset's entropy weighted by the
        # fraction of instances that fall into it.
        newEntropy = 0.0
        for value in distinctValues:
            subset = splitDataSet(dataSet, i, value)
            weight = len(subset) / float(len(dataSet))
            newEntropy += weight * calShannonEnt(subset)
        gain = baseEntropy - newEntropy
        if gain > bestInfoGain:
            bestInfoGain, bestFeature = gain, i
    return bestFeature