Decision Tree Algorithm:
Decision trees are commonly used for classification problems and in data mining. Their main strength is that the concept is simple and intuitive, and the resulting model is very easy to understand.
Pros: low computational complexity; output that is easy to interpret; insensitivity to missing intermediate values; ability to handle irrelevant features.
Cons: prone to overfitting.
Applicable data types: numeric and nominal.
Splitting the data: the ID3 algorithm is used here; the guiding principle is that each split should make the disordered data more ordered.
Information gain: the change in information before and after a split is called the information gain.
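Concretely, disorder is measured with Shannon entropy: for a set whose classes occur with probabilities p_i, the entropy is H = -Σ p_i · log2(p_i), and the gain of a split is the entropy before it minus the weighted average entropy of the resulting subsets. A quick worked example (our own numbers, not from the original): a set with two classes at 0.5/0.5 has H = 1 bit, while one at 0.9/0.1 has H ≈ 0.469, so the second set is already far more ordered.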
Gini impurity: a measure of how disordered a set is, similar to entropy. The idea: randomly pick an item from the data set and measure the probability that it gets placed in the wrong group; equivalently, the probability that a random event flips to its opposite outcome. (The Gini impurity example below follows https://blog.youkuaiyun.com/JJBOOM425/article/details/79997440 by 青山孤客.)
Example:
For a random event X with P(X=0) = 0.5 and P(X=1) = 0.5,
the Gini impurity is P(X=0)(1 - P(X=0)) + P(X=1)(1 - P(X=1)) = 0.5.
For a random event Y with P(Y=0) = 0.1 and P(Y=1) = 0.9,
the Gini impurity is P(Y=0)(1 - P(Y=0)) + P(Y=1)(1 - P(Y=1)) = 0.18.
Clearly X is more disordered than Y: with both outcomes at probability 0.5 it is hard to predict which one occurs, whereas Y is far more certain because Y=1 is very likely, and its Gini impurity is correspondingly smaller.
Conclusions:
(1) Gini impurity can serve as a measure of how disordered a system is;
(2) the smaller the Gini impurity, the higher the purity, the more ordered the set, and the better the classification;
(3) a Gini impurity of 0 means every item in the set belongs to the same class;
(4) in a decision tree, comparing Gini impurities lets us pick better decision conditions (child nodes).
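The two numbers above are easy to verify with a one-line helper (a minimal sketch; the gini name is ours, not part of the example code that follows):
def gini(probs):
    # Gini impurity: sum of p * (1 - p) over all class probabilities
    return sum(p * (1 - p) for p in probs)
print(gini([0.5, 0.5]))  # 0.5
print(gini([0.1, 0.9]))  # ≈ 0.18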
# Example code:
my_data = [['fan', 'C', 'yes', 32, 'None'],
           ['fang', 'U', 'yes', 23, 'Premium'],
           ['ming', 'F', 'no', 28, 'Basic']]

# Count how often each distinct value appears in the target column
def uniqueCounts(rows):
    results = {}
    for row in rows:
        # To count the last column instead:
        # r = row[len(row) - 1]
        # Count the third column from the end, i.e. the yes/no column
        r = row[len(row) - 3]
        if r not in results: results[r] = 0
        results[r] += 1
    return results
# Gini impurity example
def giniImpurityExample(rows):
    total = len(rows)
    print(total)
    counts = uniqueCounts(rows)
    print(counts)
    imp = 0
    for k1 in counts:  # two nested loops; the single loop commented out below is simpler and faster
        p1 = float(counts[k1]) / total
        print(counts[k1])
        for k2 in counts:
            if k1 == k2: continue
            p2 = float(counts[k2]) / total
            imp += p1 * p2
    '''
    for k in counts:
        imp += float(counts[k]) / total * (1 - float(counts[k]) / total)
    '''
    return imp

gini = giniImpurityExample(my_data)
print('gini Impurity is %s' % gini)
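For my_data the yes/no column has counts {'yes': 2, 'no': 1}, so the expected output is 2 · (2/3) · (1/3) = 4/9 ≈ 0.444.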
Now we use the information-gain approach.
Splitting the data set:
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if (featVec[axis] == value):
            # Keep the matching row, minus the column we split on
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
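For instance, on the toy dataset returned by createDataSet further below, splitting on feature 0 with value 1 keeps the matching rows and drops that column:
myDat = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
print(splitDataSet(myDat, 0, 1))  # [[1, 'yes'], [1, 'yes'], [0, 'no']]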
Choosing the best feature to split on:
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1          # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)      # entropy before any split
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        feature = [example[i] for example in dataSet]
        feature_unique = set(feature)
        newEntropy = 0.0
        for value in feature_unique:
            splitData = splitDataSet(dataSet, i, value)
            prob = len(splitData) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(splitData)
        newInfoGain = baseEntropy - newEntropy  # information gain of splitting on feature i
        if (newInfoGain > bestInfoGain):
            bestInfoGain = newInfoGain
            bestFeature = i
    return bestFeature
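On the toy dataset myDat above, the base entropy is -(2/5)·log2(2/5) - (3/5)·log2(3/5) ≈ 0.971 (our own worked numbers): splitting on feature 0 yields a gain of ≈ 0.420 versus ≈ 0.171 for feature 1, so chooseBestFeatureToSplit(myDat) returns 0.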
Recursively building the tree:
def majorityCnt(classList):
    # Return the class that occurs most often (used when features are exhausted)
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    classCountsorted = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return classCountsorted[0][0]

def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if (classList.count(classList[0]) == len(classList)):
        return classList[0]                 # all samples share one class: stop
    if (len(dataSet[0]) == 1):
        return majorityCnt(classList)       # no features left: majority vote
    bestFeature = chooseBestFeatureToSplit(dataSet)
    bestLabel = labels[bestFeature]
    del (labels[bestFeature])
    new_tree = {bestLabel: {}}
    featValues = [example[bestFeature] for example in dataSet]
    uniquevalue = set(featValues)
    for value in uniquevalue:
        label_tree = labels[:]              # copy so sibling branches see the same labels
        new_tree[bestLabel][value] = createTree(splitDataSet(dataSet, bestFeature, value), label_tree)
    return new_tree
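Built on the toy dataset (via the createDataSet helper in the full listing below), this yields the nested-dictionary tree used for the rest of the walkthrough:
myDat, labels = createDataSet()
print(createTree(myDat, labels.copy()))
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}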
Helper functions:
# Requires: from math import log (see the full listing)
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]          # the class label is the last column
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)   # H = -sum(p * log2(p))
    return shannonEnt
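A quick sanity check against the worked entropy number above, with myDat as in the splitDataSet example:
print(calcShannonEnt(myDat))  # ≈ 0.971 for 2 'yes' / 3 'no'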
Classifying with the decision tree:
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]        # feature tested at this node
    secondStr = inputTree[firstStr]
    featureIndex = featLabels.index(firstStr)   # map the feature name to its column index
    labels = None                               # stays None if the feature value was never seen in training
    for key in secondStr.keys():
        if (testVec[featureIndex] == key):
            if (type(secondStr[key]).__name__ == 'dict'):
                labels = classify(secondStr[key], featLabels, testVec)  # descend into the subtree
            else:
                labels = secondStr[key]         # reached a leaf: this is the class
    return labels
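For example, with classTree being the tree built above, classifying [1, 0] follows the 'no surfacing' = 1 branch and then 'flippers' = 0:
print(classify(classTree, ['no surfacing', 'flippers'], [1, 0]))  # 'no'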
Storing and loading the decision tree:
def storeTree(inputTree, filename):
    # Serialize the tree to disk so it does not have to be rebuilt each time
    import pickle
    fw = open(filename, "wb")
    pickle.dump(inputTree, fw)
    fw.close()

def grabTree(filename):
    import pickle
    fr = open(filename, "rb")
    return pickle.load(fr)
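A slightly more idiomatic variant (a sketch, equivalent in behavior) uses context managers so the files are closed even if pickling fails:
def storeTree(inputTree, filename):
    import pickle
    with open(filename, "wb") as fw:   # file is closed automatically
        pickle.dump(inputTree, fw)
def grabTree(filename):
    import pickle
    with open(filename, "rb") as fr:
        return pickle.load(fr)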
Drawing the tree with matplotlib:
# Plotting test --- example
decisionNode = dict(boxstyle="sawtooth", fc="0.8")   # box style for internal (decision) nodes
leafNode = dict(boxstyle="round4", fc="0.8")         # box style for leaf nodes
arrow_args = dict(arrowstyle="<-")

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    # Draw one node box with an arrow pointing back to its parent
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction', xytext=centerPt, textcoords='axes fraction',
                            va='center', ha='center', bbox=nodeType, arrowprops=arrow_args)

def plotMidText(cntrPt, parentPt, txtString):
    # Label the edge between parent and child with the split value
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)

def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    # Center this node horizontally above its leaves
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD   # step down one level
    for key in secondDict.keys():
        if (type(secondDict[key]).__name__ == 'dict'):
            plotTree(secondDict[key], cntrPt, str(key))     # recurse into the subtree
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD   # step back up

def createPlot(inTree):
    fig = plt.figure(1, facecolor="white")
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))    # total width, in leaf units
    plotTree.totalD = float(getTreeDepth(inTree))   # total depth, in levels
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), "")
    # plotNode("a decision node", (0.5, 0.1), (0.1, 0.5), decisionNode)
    # plotNode("a leaf node", (0.8, 0.1), (0.3, 0.8), leafNode)
    plt.show()
# Plotting test --- example

def getNumLeafs(myTree):
    # Count the leaves to size the plot horizontally
    numLeafs = 0
    firstStrs = list(myTree.keys())
    firstStr = firstStrs[0]
    secondStr = myTree[firstStr]
    for key in secondStr.keys():
        if (type(secondStr[key]).__name__ == 'dict'):
            numLeafs += getNumLeafs(secondStr[key])
        else:
            numLeafs += 1
    return numLeafs

def getTreeDepth(myTree):
    # Depth of the deepest branch, counting decision levels
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondStr = myTree[firstStr]
    for key in secondStr.keys():
        thisDepth = 0
        if (type(secondStr[key]).__name__ == 'dict'):
            thisDepth = 1 + getTreeDepth(secondStr[key])
        else:
            thisDepth = 1
        if maxDepth < thisDepth: maxDepth = thisDepth
    return maxDepth

def retrieveTree(i):
    # Two hard-coded trees for testing the plotting code
    listOfTrees = [{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
                   {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
                   ]
    return listOfTrees[i]
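A quick check of the traversal helpers on the first hard-coded tree (the same calls appear in the full listing below):
myTree = retrieveTree(0)
print(getNumLeafs(myTree))   # 3
print(getTreeDepth(myTree))  # 2
createPlot(myTree)           # pops up the rendered tree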
Complete code for this case:
from math import log
import operator
import matplotlib.pyplot as plt
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        feature = [example[i] for example in dataSet]
        feature_unique = set(feature)
        newEntropy = 0.0
        for value in feature_unique:
            splitData = splitDataSet(dataSet, i, value)
            prob = len(splitData) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(splitData)
        newInfoGain = baseEntropy - newEntropy
        if (newInfoGain > bestInfoGain):
            bestInfoGain = newInfoGain
            bestFeature = i
    return bestFeature
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    classCountsorted = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return classCountsorted[0][0]

def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if (classList.count(classList[0]) == len(classList)):
        return classList[0]
    if (len(dataSet[0]) == 1):
        return majorityCnt(classList)
    bestFeature = chooseBestFeatureToSplit(dataSet)
    bestLabel = labels[bestFeature]
    del (labels[bestFeature])
    new_tree = {bestLabel: {}}
    featValues = [example[bestFeature] for example in dataSet]
    uniquevalue = set(featValues)
    for value in uniquevalue:
        label_tree = labels[:]
        new_tree[bestLabel][value] = createTree(splitDataSet(dataSet, bestFeature, value), label_tree)
    return new_tree
def createDataSet():
    dataSet = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels

def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if (featVec[axis] == value):
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]
    secondStr = inputTree[firstStr]
    featureIndex = featLabels.index(firstStr)
    labels = None   # stays None if the feature value was never seen in training
    for key in secondStr.keys():
        if (testVec[featureIndex] == key):
            if (type(secondStr[key]).__name__ == 'dict'):
                labels = classify(secondStr[key], featLabels, testVec)
            else:
                labels = secondStr[key]
    return labels
def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, "wb")
    pickle.dump(inputTree, fw)
    fw.close()

def grabTree(filename):
    import pickle
    fr = open(filename, "rb")
    return pickle.load(fr)
myDat, labels = createDataSet()
print(myDat)
print(labels)
# print(splitDataSet(myDat, 0, 1))
# print(calcShannonEnt(myDat))
# print(chooseBestFeatureToSplit(myDat))
# print(createTree(myDat, labels))
feat_labels = labels.copy()
feat_labels2 = labels.copy()
classTree = createTree(myDat, feat_labels)
storeTree(classTree, "classifierfileStore.txt")
print("grabTree: ", grabTree("classifierfileStore.txt"))
print(classTree)
print("classify test result:", classify(classTree, feat_labels2, [1, 1]))
# Plotting test --- example
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction', xytext=centerPt, textcoords='axes fraction',
                            va='center', ha='center', bbox=nodeType, arrowprops=arrow_args)

def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)

def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if (type(secondDict[key]).__name__ == 'dict'):
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD

def createPlot(inTree):
    fig = plt.figure(1, facecolor="white")
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), "")
    # plotNode("a decision node", (0.5, 0.1), (0.1, 0.5), decisionNode)
    # plotNode("a leaf node", (0.8, 0.1), (0.3, 0.8), leafNode)
    plt.show()
# Plotting test --- example
def getNumLeafs(myTree):
    numLeafs = 0
    firstStrs = list(myTree.keys())
    firstStr = firstStrs[0]
    secondStr = myTree[firstStr]
    for key in secondStr.keys():
        if (type(secondStr[key]).__name__ == 'dict'):
            numLeafs += getNumLeafs(secondStr[key])
        else:
            numLeafs += 1
    return numLeafs

def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondStr = myTree[firstStr]
    for key in secondStr.keys():
        thisDepth = 0
        if (type(secondStr[key]).__name__ == 'dict'):
            thisDepth = 1 + getTreeDepth(secondStr[key])
        else:
            thisDepth = 1
        if maxDepth < thisDepth: maxDepth = thisDepth
    return maxDepth

def retrieveTree(i):
    listOfTrees = [{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
                   {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
                   ]
    return listOfTrees[i]
myTree = retrieveTree(0)
print(getNumLeafs(myTree))
print(getTreeDepth(myTree))
createPlot(myTree)
fr = open(r"C:\Users\tycho-wjr\Desktop\数据结构和算法相关\机器学习\机器学习实战中文\MLiA_SourceCode\machinelearninginaction\Ch03\lenses.txt")
lenses = [inst.strip().split("\t") for inst in fr.readlines()]
lensesLables = ['age', 'prescript', 'astigmatic', 'tearRate']
print(lenses)
lensesTree = createTree(lenses, lensesLables.copy())
print(lensesTree)
createPlot(lensesTree)