import operator
from math import log
def createDataSet():
    """Build the hard-coded sample data set and its feature names.

    Returns:
        A ``(dataSet, labels)`` tuple. ``dataSet`` is a list of 15 rows; each
        row is a list of four integer feature codes followed by the class
        string ``'yes'`` or ``'no'``. ``labels`` is the list of the four
        feature names, in column order.
    """
    # Column order: age, has a job, owns a house, credit standing, class.
    rawRows = (
        (0, 0, 0, 0, 'no'),
        (0, 0, 0, 1, 'no'),
        (0, 1, 0, 1, 'yes'),
        (0, 1, 1, 0, 'yes'),
        (0, 0, 0, 0, 'no'),
        (1, 0, 0, 0, 'no'),
        (1, 0, 0, 1, 'no'),
        (1, 1, 1, 1, 'yes'),
        (1, 0, 1, 2, 'yes'),
        (1, 0, 1, 2, 'yes'),
        (2, 0, 1, 2, 'yes'),
        (2, 0, 1, 1, 'yes'),
        (2, 1, 0, 1, 'yes'),
        (2, 1, 0, 2, 'yes'),
        (2, 0, 0, 0, 'no'),
    )
    featureNames = ['年龄', '有工作', '有自己的房子', '信贷情况']
    # Every call hands back fresh, mutable list rows.
    return [list(row) for row in rawRows], featureNames
def calcEnt(dataSet):
    """Return the base-2 Shannon entropy of the class column of ``dataSet``.

    The class of a row is its last element.

    Args:
        dataSet: sequence of rows whose last element is the class label.

    Returns:
        The entropy as a float; ``0.0`` for an empty data set.
    """
    total = len(dataSet)
    # Tally how many rows carry each class label (first-seen order is kept).
    tally = {}
    for row in dataSet:
        cls = row[-1]
        tally[cls] = tally.get(cls, 0) + 1
    Ent = 0.0
    for count in tally.values():
        share = count / float(total)
        Ent -= share * log(share, 2)
    return Ent
def splitDataSet(dataSet, axis, value):
    """Select the rows whose column ``axis`` equals ``value`` and drop that column.

    The input is never modified; every returned row is a newly built list.
    Rows may be lists or any other sliceable sequence (e.g. tuples).

    :param dataSet: data set to split (sequence of rows)
    :param axis: index of the feature column to split on
    :param value: feature value a row must have in that column to be kept
    :return: list of the matching rows, each without the ``axis`` column
    """
    retDataSet = []
    for eigenVec in dataSet:
        if eigenVec[axis] == value:
            # Copy the prefix into a fresh list so non-list rows work too and
            # the caller's row is left untouched.
            reducedFeatVec = list(eigenVec[:axis])
            reducedFeatVec.extend(eigenVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
def chooseBeatFeatureToSplit(dataSet):
    """Find the feature column whose split yields the largest information gain.

    For every feature column the data set is partitioned by that column's
    distinct values, and the size-weighted entropy of the partitions is
    subtracted from the entropy of the whole set. Each partition and each
    per-feature gain is printed along the way.

    :param dataSet: rows of feature values followed by a class label
    :return: index of the feature with the highest gain, or ``-1`` when no
        feature has a gain strictly greater than zero
    """
    rowTotal = float(len(dataSet))
    featureCount = len(dataSet[0]) - 1  # the last column is the class label
    entropyBefore = calcEnt(dataSet)
    winner, winnerGain = -1, 0.0
    for col in range(featureCount):
        distinct = set([row[col] for row in dataSet])
        entropyAfter = 0.0
        for val in distinct:
            part = splitDataSet(dataSet, col, val)
            print(part)
            weight = len(part) / rowTotal
            entropyAfter += weight * calcEnt(part)
        gain = entropyBefore - entropyAfter
        print("第%d个特征的增益为%.3f" % (col, gain))
        # Strict comparison: on equal gains the earliest column wins.
        if gain > winnerGain:
            winner, winnerGain = col, gain
    return winner
def majorityCnt(classList):
    """Return the most frequent element of ``classList``.

    When several elements share the highest count, the one that appears
    first in ``classList`` is returned.

    :param classList: non-empty sequence of hashable class labels
    :return: the label with the highest count
    """
    tally = {}
    for vote in classList:
        tally[vote] = tally.get(vote, 0) + 1
    ranked = list(tally.items())
    # Stable descending sort by count keeps first-seen order among ties.
    ranked.sort(key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
def createTree(dataSet, labels, featLabels):
    """Recursively build a decision tree from ``dataSet``.

    The feature with the highest information gain is chosen at every level,
    the data set is partitioned on its values, and each partition becomes a
    subtree. Recursion stops when a partition is pure, when no feature
    columns are left, or when no remaining feature gives a positive gain; in
    the latter two cases the majority class is used as the leaf.

    :param dataSet: training rows (feature values followed by a class label)
    :param labels: feature names, aligned with the feature columns of
        ``dataSet``; this list is not modified
    :param featLabels: output list; the name of every feature chosen for a
        split is appended to it, in the order the splits are made
    :return: the decision tree as a nested dict, e.g.
        {'有自己的房子': {0: {'有工作': {0: 'no', 1: 'yes'}}, 1: 'yes'}},
        or a bare class label when no split is made
    """
    classList = [example[-1] for example in dataSet]
    # Every row has the same class: this is a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column is left: nothing to split on.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBeatFeatureToSplit(dataSet)
    # A negative index means no feature had a positive gain. Using it as a
    # real index would split on the class column, so fall back to a leaf.
    if bestFeat < 0:
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    featLabels.append(bestFeatLabel)
    myTree = {bestFeatLabel: {}}
    # Build a new label list instead of deleting in place: the caller's list
    # stays intact and sibling branches all see labels that match their
    # columns, no matter what deeper recursion levels do.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels, featLabels)
    return myTree
if __name__ == '__main__':
    # Demo run: print the sample data, its entropy, the index of the best
    # split feature, and finally the tree built from the data.
    dataSet, labels = createDataSet()
    print("数据集:", dataSet)
    entropy = calcEnt(dataSet)
    print("信息熵:", entropy)
    bestIndex = chooseBeatFeatureToSplit(dataSet)
    print("最优特征索引值:" + str(bestIndex))
    featLabels = list()
    myTree = createTree(dataSet, labels, featLabels)
    print(myTree)