# 《统计学习方法》 (Statistical Learning Methods), Chapter 5: CART algorithm practice code for Example 5.4
from numpy import *
def loadDataSet():
    """Return the textbook example data set and its feature names.

    Returns:
        A ``(dataset, label)`` tuple.  ``dataset`` is a list of 15 rows; each
        row is a list of four feature values followed by the class label
        ('是' / '否') in the last position.  ``label`` lists the four feature
        names in column order.  Fresh lists are built on every call, so the
        caller may modify them freely.
    """
    dataset = [
        ['青年', '否', '否', '一般', '否'],
        ['青年', '否', '否', '好', '否'],
        ['青年', '是', '否', '好', '是'],
        ['青年', '是', '是', '一般', '是'],
        ['青年', '否', '否', '一般', '否'],
        ['中年', '否', '否', '一般', '否'],
        ['中年', '否', '否', '好', '否'],
        ['中年', '是', '是', '好', '是'],
        ['中年', '否', '是', '非常好', '是'],
        ['中年', '否', '是', '非常好', '是'],
        ['老年', '否', '是', '非常好', '是'],
        ['老年', '否', '是', '好', '是'],
        ['老年', '是', '否', '好', '是'],
        ['老年', '是', '否', '非常好', '是'],
        ['老年', '否', '否', '一般', '否'],
    ]
    # Column names: age, has a job, owns a house, credit rating.
    label = ['年龄', '有工作', '有自己的房子', '信贷情况']
    return dataset, label
def calculateGini(dataSet):
    """Return the Gini index of a data set.

    The class label of each row is taken from its last element and the
    result is ``1 - sum(p_k ** 2)`` over the class proportions ``p_k``.

    Args:
        dataSet: sequence of rows; ``row[-1]`` is the class label.

    Returns:
        The Gini index as a float.  An empty data set yields ``1.0``: no
        class proportion is subtracted from the initial 1.
    """
    total = len(dataSet)
    if total == 0:
        return 1.0

    # Tally every class in a single pass instead of calling list.count()
    # once per distinct label.
    classCounts = {}
    for row in dataSet:
        classCounts[row[-1]] = classCounts.get(row[-1], 0) + 1

    gini = 1.0
    for count in classCounts.values():
        prob = count / float(total)
        gini -= prob ** 2
    return gini
def splitDataSet(dataSet, i, value):
    """Split a data set in two on whether column ``i`` equals ``value``.

    Column ``i`` is removed from every returned row.  The input rows are
    not modified.

    Args:
        dataSet: sequence of rows (lists).
        i: index of the column to test and drop.
        value: value that column ``i`` is compared against.

    Returns:
        ``(returnSameDataSet, returnDiffDataSet)``: the reduced rows whose
        column ``i`` equals ``value``, and the reduced rows where it does
        not, each in their original order.
    """
    returnSameDataSet = []
    returnDiffDataSet = []
    for data in dataSet:
        # Build the row without column i once; both branches need it.
        reducedRow = data[:i] + data[i + 1:]
        if data[i] == value:
            returnSameDataSet.append(reducedRow)
        else:
            returnDiffDataSet.append(reducedRow)
    return returnSameDataSet, returnDiffDataSet
def calculateGiniWithValue(dataSet, i):
    """Find the best binary split on column ``i`` by Gini index.

    For every distinct value ``v`` in column ``i`` the data set is split
    into "column i == v" and "column i != v", and the split is scored with
    the size-weighted Gini index

        |D1| / |D| * Gini(D1) + |D2| / |D| * Gini(D2)

    Args:
        dataSet: sequence of rows; ``row[-1]`` is the class label.
        i: index of the feature column to evaluate.

    Returns:
        ``(bestGiniCoefficient, feat)``: the smallest weighted Gini index
        and the value that produced it.  For an empty data set the result
        is ``(inf, 0)``.
    """
    m = float(len(dataSet))
    bestGiniCoefficient = float("inf")
    feat = 0

    # Visit candidate values in order of first appearance so that ties are
    # broken the same way on every run (set iteration order over strings
    # varies with the hash seed).
    uniqueValueList = []
    seen = set()
    for data in dataSet:
        if data[i] not in seen:
            seen.add(data[i])
            uniqueValueList.append(data[i])

    for value in uniqueValueList:
        returnSameDataSet, returnDiffDataSet = splitDataSet(dataSet, i, value)
        # Each candidate is scored on its own (no accumulation across
        # values) and BOTH partitions are weighted by their share of rows.
        curGiniCoefficient = (
            (len(returnSameDataSet) / m) * calculateGini(returnSameDataSet)
            + (len(returnDiffDataSet) / m) * calculateGini(returnDiffDataSet)
        )
        if curGiniCoefficient < bestGiniCoefficient:
            bestGiniCoefficient = curGiniCoefficient
            feat = value
    return bestGiniCoefficient, feat
def chooseBestValueToSplit(dataSet):
    """Pick the column and value whose binary split has the lowest Gini index.

    Every column except the last (the class label) is evaluated with
    ``calculateGiniWithValue``.

    Args:
        dataSet: non-empty sequence of equally long rows; ``row[-1]`` is
            the class label.

    Returns:
        ``(bestValue, bestFeat)``.  Despite the names, ``bestValue`` is the
        index of the chosen column and ``bestFeat`` is the chosen value
        within that column.  If there is no feature column to evaluate the
        result is ``(-1, 0)``.

    Raises:
        ValueError: if ``dataSet`` is empty.
    """
    if len(dataSet) == 0:
        raise ValueError("dataSet must contain at least one row")

    # The column count comes straight from the first row; there is no need
    # to convert the whole data set to an array just to read its shape.
    n = len(dataSet[0])
    bestGini = float("inf")
    bestValue = -1  # column index of the best split
    bestFeat = 0    # value inside that column
    for i in range(n - 1):
        curGini, feat = calculateGiniWithValue(dataSet, i)
        if curGini < bestGini:
            bestGini = curGini
            bestValue = i
            bestFeat = feat
    return bestValue, bestFeat
def maxResult(resultList):
    """Return the most frequent label in ``resultList``.

    When several labels share the highest count, the one whose last
    occurrence comes latest in ``resultList`` is returned.

    Args:
        resultList: non-empty list of hashable class labels.

    Returns:
        The majority label.

    Raises:
        ValueError: if ``resultList`` is empty.
    """
    if len(resultList) == 0:
        raise ValueError("maxResult() requires at least one label")

    # One pass to count, instead of list.count() for every element.
    counts = {}
    for result in resultList:
        counts[result] = counts.get(result, 0) + 1

    winner = resultList[0]
    winnerCount = 0
    for result in resultList:
        # ">=" lets a later label replace an earlier one with the same
        # count, which gives the tie rule described above.
        if counts[result] >= winnerCount:
            winner = result
            winnerCount = counts[result]
    return winner
def createTree(dataSet, labels, a):
    """Recursively build a binary decision tree using Gini-based splits.

    Args:
        dataSet: non-empty sequence of rows (lists); ``row[-1]`` is the
            class label and the other columns are features.
        labels: feature names, one per feature column, in column order.
            The list is NOT modified.
        a: sample-count threshold; a node with fewer than ``a`` rows
            becomes a leaf labelled with its majority class.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{'<feature>为<value>': {'是': subtree, '否': subtree}}`` where the
        '是' branch holds the rows whose feature equals the value and the
        '否' branch holds the remaining rows.

    Raises:
        ValueError: if ``dataSet`` is empty.
    """
    if len(dataSet) == 0:
        raise ValueError("createTree() requires a non-empty dataSet")

    resultList = [data[-1] for data in dataSet]
    # Stop conditions: too few samples, a pure node, or no feature left.
    if len(dataSet) < a:
        return maxResult(resultList)
    if resultList.count(resultList[0]) == len(resultList):
        return resultList[0]
    if len(dataSet[0]) == 1:
        return maxResult(resultList)

    bestValue, bestFeat = chooseBestValueToSplit(dataSet)
    bestLabel = labels[bestValue]
    # Build a new label list rather than deleting from the caller's list;
    # sharing one mutated list between both branches used to break the
    # recursion.
    subLabels = labels[:bestValue] + labels[bestValue + 1:]
    nodeKey = bestLabel + '为' + str(bestFeat)

    returnSameDataSet, returnDiffDataSet = splitDataSet(dataSet, bestValue, bestFeat)

    branches = {}
    for answer, subset in (('是', returnSameDataSet), ('否', returnDiffDataSet)):
        if len(subset) == 0:
            # The chosen feature is constant on this node, so one side of
            # the split is empty; fall back to this node's majority class
            # instead of recursing into an empty data set.
            branches[answer] = maxResult(resultList)
        else:
            branches[answer] = createTree(subset, subLabels, a)
    return {nodeKey: branches}
# Script entry: load the example data set and print the tree built with a
# sample-count threshold of 4.
dataSet,label=loadDataSet()
print(createTree(dataSet,label,4))
# Result (the data set is small and a sample-count threshold is set, so the tree has few levels):
# {'有自己的房子为否': {'是': {'有工作为否': {'是': '否', '否': '是'}}, '否': '是'}}
# made by zcl at CUMT
# I know I can because I have a heart that beats