1. Principle derivation
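For reference, the quantities that the code below implements are the standard ones. For a dataset $D$ with $K$ classes, where class $C_k$ contains $|C_k|$ samples, and a feature $A$ that partitions $D$ into subsets $D_1, \dots, D_V$:

$$H(D) = -\sum_{k=1}^{K} \frac{|C_k|}{|D|} \log \frac{|C_k|}{|D|} \qquad \text{(entropy)}$$

$$H(D \mid A) = \sum_{v=1}^{V} \frac{|D_v|}{|D|}\, H(D_v) \qquad \text{(conditional entropy)}$$

$$g(D, A) = H(D) - H(D \mid A) \qquad \text{(information gain, ID3)}$$

$$g_R(D, A) = \frac{g(D, A)}{H_A(D)}, \quad H_A(D) = -\sum_{v=1}^{V} \frac{|D_v|}{|D|} \log \frac{|D_v|}{|D|} \qquad \text{(gain ratio, C4.5)}$$

$$\mathrm{Gini}(D) = 1 - \sum_{k=1}^{K} \left(\frac{|C_k|}{|D|}\right)^2 \qquad \text{(Gini index, CART classification)}$$

$$\mathrm{err}(D) = \sum_{x_i \in D} (y_i - \bar{y})^2 \qquad \text{(total squared error, CART regression)}$$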
2. ID3 algorithm implementation
1. Computing the information entropy
import numpy as np

def calaShannonEnt(dataSet, index=-1):
    '''
    Shannon entropy of column `index` (default -1: the class label column).
    Passing a feature index instead gives that feature's own entropy,
    which C4.5 uses as the split information.
    '''
    m = len(dataSet)
    labelCounts = {}
    # count how often each value of the chosen column occurs
    for line in dataSet:
        fea = line[index]
        if fea not in labelCounts:
            labelCounts[fea] = 0
        labelCounts[fea] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / m
        shannonEnt -= prob * np.log(prob)  # natural log: entropy in nats
    return shannonEnt
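A quick sanity check on a toy dataset (hypothetical data: two 'yes' rows, three 'no' rows). Note that np.log is the natural log, so the entropy comes out in nats rather than bits; that only rescales every entropy by the same constant and does not change which feature wins the comparison below.

# toy dataset: two features plus a class label per row
dataSet = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'],
           [0, 1, 'no'], [0, 1, 'no']]
print(calaShannonEnt(dataSet))  # ~0.673 nats (~0.971 bits)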
2. Splitting the data
def spiltDataSet(dataSet, axis, value):
    '''
    para:
        dataSet: feature + label rows
        axis: index of the feature to split on
        value: feature value to select
    return:
        the rows whose feature `axis` equals `value`,
        with that feature column removed
    '''
    retData = []
    for fea in dataSet:
        if fea[axis] == value:
            # drop column `axis` so the feature is not reused downstream
            feaVec = fea[:axis]
            feaVec.extend(fea[axis+1:])
            retData.append(feaVec)
    return retData
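For example, on the toy dataset above, splitting on feature 0 with value 1 keeps the matching rows and drops the feature column:

spiltDataSet(dataSet, 0, 1)
# -> [[1, 'yes'], [1, 'yes'], [0, 'no']]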
3. Choosing the best feature to split on
def chooseBestFeatureToSplit(dataSet, imp=0):
    '''
    Idea:
        outer loop over every feature, inner loop over every value
        of that feature.
    para:
        dataSet: feature + label rows
        imp: 0 -> information gain (ID3), otherwise gain ratio (C4.5)
    return:
        index of the best feature
    '''
    m = len(dataSet)
    n = len(dataSet[0]) - 1
    if n == 1:
        return 0
    baseEnt = calaShannonEnt(dataSet)
    bestFea = -1
    bestGain = 0
    bestGainRate = 0
    # loop over all features
    for i in range(n):
        # column i of every row
        FeaList = [fea[i] for fea in dataSet]
        uniqueVars = set(FeaList)
        newEnt = 0
        for value in uniqueVars:
            subData = spiltDataSet(dataSet, i, value)
            prob = len(subData) / float(m)
            newEnt += prob * calaShannonEnt(subData)
        infoGain = baseEnt - newEnt
        if imp == 0:  # information gain: ID3
            if infoGain > bestGain:
                bestFea = i
                bestGain = infoGain
        else:  # gain ratio: C4.5
            # entropy of the feature itself (the split information),
            # not the class entropy conditioned on the feature
            iv = calaShannonEnt(dataSet, i)
            if iv == 0:
                continue
            infoGainRate = infoGain / iv
            if bestGainRate < infoGainRate:
                bestFea = i
                bestGainRate = infoGainRate
    return bestFea
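On the same toy dataset, both criteria pick feature 0:

print(chooseBestFeatureToSplit(dataSet))         # 0 (information gain)
print(chooseBestFeatureToSplit(dataSet, imp=1))  # 0 (gain ratio)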
4. Deciding a leaf node's class by majority vote
def majorityCnt(classList):
    '''
    Decide a leaf node's class by majority vote.
    '''
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # sort by count, descending (sorting by key alone would be a bug)
    sortedClass = sorted(classCount.items(), key=lambda item: item[1], reverse=True)
    return sortedClass[0][0]
5. Building the classification tree
def createTree(dataSet, labels, imp=0):
    '''
    Idea:
        1. pick the best feature and make it the root node
        2. for every value of that feature, recursively build a subtree
    Note: `labels` is mutated (the chosen feature is deleted),
    so pass in a copy if you still need the original afterwards.
    '''
    # list of class labels for all rows
    classList = [lab[-1] for lab in dataSet]
    # base case 1: every row already belongs to the same class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # base case 2: all features used up but classes still mixed -> vote
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFea = chooseBestFeatureToSplit(dataSet, imp)
    bestFeaLabel = labels[bestFea]
    myTree = {bestFeaLabel: {}}
    del labels[bestFea]
    featValues = [fea[bestFea] for fea in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeaLabel][value] = createTree(
            spiltDataSet(dataSet, bestFea, value), subLabels, imp)
    return myTree
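Putting it together on the toy dataset (hypothetical feature names; pass a copy of labels, since createTree mutates the list):

labels = ['no surfacing', 'flippers']
myTree = createTree(dataSet, labels[:])
# -> {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}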
6. Prediction function
def predict(myTree, labels, preLabel):
    # the root feature of this (sub)tree
    firstFea = list(myTree.keys())[0]
    # its children, keyed by feature value
    secondDict = myTree[firstFea]
    # index of the root feature in the label list
    featIndex = labels.index(firstFea)
    classLabel = None
    # follow the edge whose value matches the sample
    for key in secondDict:
        if preLabel[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = predict(secondDict[key], labels, preLabel)
            else:
                classLabel = secondDict[key]
    return classLabel
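Classifying a new sample then just follows the edges from the root:

print(predict(myTree, ['no surfacing', 'flippers'], [1, 0]))  # 'no'
print(predict(myTree, ['no surfacing', 'flippers'], [1, 1]))  # 'yes'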
3. C4.5 algorithm implementation
Note: in the information gain ratio, the denominator is the entropy of the feature itself (how mixed the feature's values are), not the entropy of the classes under that feature. Many blog posts get this wrong.
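In symbols, the gain ratio divides by the split information of feature $A$ itself:

$$g_R(D, A) = \frac{g(D, A)}{H_A(D)}, \qquad H_A(D) = -\sum_{v=1}^{V} \frac{|D_v|}{|D|} \log \frac{|D_v|}{|D|}$$

This is exactly what calaShannonEnt(dataSet, i) computes when it is given the feature column index i instead of the default label column. The relevant branch of chooseBestFeatureToSplit: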
if imp == 0:  # information gain: ID3
    if infoGain > bestGain:
        bestFea = i
        bestGain = infoGain
else:  # gain ratio: C4.5
    # entropy of the feature itself (the split information),
    # not the class entropy conditioned on the feature
    iv = calaShannonEnt(dataSet, i)
    if iv == 0:
        continue
    infoGainRate = infoGain / iv
    if bestGainRate < infoGainRate:
        bestFea = i
        bestGainRate = infoGainRate
4. CART classification tree implementation
1. Computing the Gini index
def calaGini(dataSet):
    # count how often each class occurs
    label = [fea[-1] for fea in dataSet]
    feaDict = {}
    for fea in label:
        if fea not in feaDict:
            feaDict[fea] = 0
        feaDict[fea] += 1
    gini0 = 0
    # accumulate the sum of squared class probabilities
    for key in feaDict:
        prob = feaDict[key] / float(len(label))
        gini0 += prob * prob
    return 1 - gini0
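On the toy dataset from the ID3 section (two 'yes', three 'no'):

print(calaGini(dataSet))  # 1 - (0.4**2 + 0.6**2) = 0.48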
2. Splitting the data
CART builds a binary tree: every split partitions the data into exactly two groups, the rows equal to the chosen value versus all the rest, as the example after the function shows.
def spiltDataSet(dataSet, col, value):
    retSetLeft = []
    retSetRight = []
    for fea in dataSet:
        # drop column `col` on both sides
        feaVec = fea[:col]
        feaVec.extend(fea[col+1:])
        if fea[col] == value:
            retSetLeft.append(feaVec)
        else:
            retSetRight.append(feaVec)
    return retSetLeft, retSetRight
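For example, again on the toy dataset:

left, right = spiltDataSet(dataSet, 0, 1)
# left  -> rows with feature 0 == 1: [[1, 'yes'], [1, 'yes'], [0, 'no']]
# right -> all the rest:             [[1, 'no'], [1, 'no']]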
3. Choosing the best feature and feature value
def chooseBestFeatureToSpilt(dataSet):
    '''
    Return the (feature index, feature value) pair whose binary split
    minimises the weighted Gini index, plus that minimum Gini value.
    '''
    numFeat = len(dataSet[0]) - 1
    bestFeat = -1
    minGini = float('inf')
    bestValue = None
    # loop over all features
    for col in range(numFeat):
        colList = [fea[col] for fea in dataSet]  # column `col` of every row
        feat = set(colList)
        # loop over all values of this feature
        for value in feat:
            subData1, subData2 = spiltDataSet(dataSet, col, value)
            prob1 = len(subData1) / float(len(dataSet))
            prob2 = len(subData2) / float(len(dataSet))
            gini = prob1 * calaGini(subData1) + prob2 * calaGini(subData2)
            if gini < minGini:
                minGini = gini
                bestFeat = col
                bestValue = value
    return bestFeat, bestValue, minGini
4. Building the classification tree
class Tree(object):
    def __init__(self, fea, value, isLeaf=False):
        self.left = None
        self.right = None
        self.isLeaf = isLeaf
        self.fea = fea      # feature label at this node (None for a leaf)
        self.value = value  # split value, or the class label for a leaf

def createTree(dataSet, labels):
    '''
    Recursively build a binary CART classification tree.
    '''
    classList = [fea[-1] for fea in dataSet]
    # base case 1: every row already belongs to the same class
    if classList.count(classList[0]) == len(classList):
        return Tree(None, classList[0], True)
    # base case 2: features exhausted -> majority vote
    # (majorityCnt is the helper from the ID3 section)
    if len(dataSet[0]) == 1:
        return Tree(None, majorityCnt(classList), True)
    # find the best feature and feature value
    fea, value, gini = chooseBestFeatureToSpilt(dataSet)
    # create the root node
    feaLabel = labels[fea]
    myTree = Tree(feaLabel, value)
    # split the data and recurse
    dataLeft, dataRight = spiltDataSet(dataSet, fea, value)
    newLabels = labels[:fea]
    newLabels.extend(labels[fea+1:])
    myTree.left = createTree(dataLeft, newLabels)
    myTree.right = createTree(dataRight, newLabels)
    return myTree
5. Prediction
def predict(myTree, labels, preVec):
    # base case: a leaf node holds the class label
    if myTree.isLeaf:
        return myTree.value
    fea = myTree.fea
    value = myTree.value
    index = labels.index(fea)
    preValue = preVec[index]
    # matching values go left, everything else goes right
    if value == preValue:
        return predict(myTree.left, labels, preVec)
    else:
        return predict(myTree.right, labels, preVec)
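A toy run of the CART classifier (same hypothetical dataset and labels as in the ID3 section):

labels = ['no surfacing', 'flippers']
tree = createTree(dataSet, labels[:])
print(predict(tree, labels, [1, 1]))  # 'yes'
print(predict(tree, labels, [0, 1]))  # 'no'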
5. CART regression tree implementation
1. Computing the error
def calaError(dataSet, mean0=None):
    '''
    Total squared error of the target (last) column around `mean0`,
    or around the dataset's own mean if `mean0` is not given.
    '''
    numCount = len(dataSet)
    target = [t[-1] for t in dataSet]
    if mean0 is None:
        mean = sum(target) / float(numCount)
    else:
        # used during post-pruning, where a leaf's value plays the mean
        mean = mean0
    err = sum([(t - mean) ** 2 for t in target])
    return err
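One caveat before the split search below: predict in this section walks the tree with preValue <= value, so the regression tree needs a numeric binary split (left: <= value, right: > value) that keeps every column, because a continuous feature can usefully be split more than once. The categorical spiltDataSet from the classification section would not match that. A minimal sketch of a splitter consistent with the prediction logic:

def spiltDataSet(dataSet, col, value):
    # numeric binary split; all columns are kept so a feature can be reused
    retSetLeft = [fea for fea in dataSet if fea[col] <= value]
    retSetRight = [fea for fea in dataSet if fea[col] > value]
    return retSetLeft, retSetRight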
2. Choosing the best feature and feature value
def chooseBestFeatureToSpilt(dataSet, ops):
    '''
    ops = (minErr, minLeftCount):
        minErr: minimum error reduction required to keep splitting
        minLeftCount: minimum number of samples in each child
    '''
    numFeat = len(dataSet[0]) - 1
    bestErr = float('inf')
    bestIndex = -1
    bestValue = 0
    minLeftCount = ops[1]
    minErr = ops[0]
    baseErr = calaError(dataSet)
    # not enough samples to split while respecting the leaf-size limit
    if len(dataSet) <= 2 * minLeftCount:
        return None, meanValue(dataSet), baseErr
    for i in range(numFeat):
        colList = [fea[i] for fea in dataSet]
        feaList = set(colList)
        for feaValue in feaList:
            subLeft, subRight = spiltDataSet(dataSet, i, feaValue)
            if len(subLeft) < minLeftCount or len(subRight) < minLeftCount:
                continue
            newErr = calaError(subLeft) + calaError(subRight)
            if newErr < bestErr:
                bestErr = newErr
                bestIndex = i
                bestValue = feaValue
    # stop if even the best split barely reduces the error
    if abs(baseErr - bestErr) < minErr:
        return None, meanValue(dataSet), baseErr
    return bestIndex, bestValue, bestErr
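chooseBestFeatureToSpilt also relies on a meanValue helper that isn't shown in the post; a minimal sketch consistent with how it is called (its result becomes the value stored in a leaf):

def meanValue(dataSet):
    # mean of the target (last) column
    target = [t[-1] for t in dataSet]
    return sum(target) / float(len(target))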
3. Building the regression tree
def createTree(dataSet, ops=(1, 4)):
    index, value, err = chooseBestFeatureToSpilt(dataSet, ops)
    # base case: no worthwhile split -> a leaf holding the target mean
    if index is None:
        return value
    myTree = {}
    myTree['spIndex'] = index
    myTree['spValue'] = value
    subLeft, subRight = spiltDataSet(dataSet, index, value)
    myTree['left'] = createTree(subLeft, ops)
    myTree['right'] = createTree(subRight, ops)
    return myTree
4. Prediction
def predict(myTree, preVec):
    # base case: a leaf is a plain number
    if isLeaf(myTree):
        return myTree
    index = myTree['spIndex']
    value = myTree['spValue']
    preValue = preVec[index]
    if preValue <= value:
        return predict(myTree['left'], preVec)
    else:
        return predict(myTree['right'], preVec)
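predict above and prune below use three helpers that aren't shown in the post. Minimal sketches consistent with how they are called, given that an internal node is a dict and a leaf is a plain number:

def isTree(obj):
    # internal nodes are dicts with 'spIndex' / 'spValue' / children
    return isinstance(obj, dict)

def isLeaf(obj):
    return not isinstance(obj, dict)

def getMean(tree):
    # collapse a whole subtree into the mean of its leaf values
    if isTree(tree['left']):
        tree['left'] = getMean(tree['left'])
    if isTree(tree['right']):
        tree['right'] = getMean(tree['right'])
    return (tree['left'] + tree['right']) / 2.0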
5. Post-pruning implementation
def prune(tree, testData):
    '''
    1. route the test data down to the leaves of the regression tree
    2. compare the error before and after merging a pair of leaves
    3. if no test data reaches a subtree, collapse it outright
    '''
    if len(testData) == 0:
        return getMean(tree)
    # partition the test data the same way the tree does
    if isTree(tree['left']) or isTree(tree['right']):
        dataLeft, dataRight = spiltDataSet(testData, tree['spIndex'], tree['spValue'])
        if isTree(tree['left']):
            tree['left'] = prune(tree['left'], dataLeft)
        if isTree(tree['right']):
            tree['right'] = prune(tree['right'], dataRight)
    if isLeaf(tree['left']) and isLeaf(tree['right']):
        dataLeft, dataRight = spiltDataSet(testData, tree['spIndex'], tree['spValue'])
        # note in particular: the error is measured against the leaf values
        # themselves, not against the test subsets' own means
        sumErr = calaError(dataLeft, tree['left']) + calaError(dataRight, tree['right'])
        value = (tree['left'] + tree['right']) / 2.0
        sumErrMer = calaError(testData, value)
        # merge only if the merged leaf gives the smaller test error
        if sumErr > sumErrMer:
            return value
        else:
            return tree
    else:
        return tree
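A toy end-to-end run (hypothetical numbers): fit on training data, then prune against a held-out set.

trainData = [[1, 4.5], [2, 4.8], [3, 9.7], [4, 10.1]]
tree = createTree(trainData, ops=(0.01, 1))
print(predict(tree, [2]))  # 4.65, the mean of the two left-hand targets
testData = [[1, 4.6], [2, 4.9], [3, 9.6], [4, 10.0]]
tree = prune(tree, testData)  # here the split survives, tree is unchanged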

This article dissected decision tree algorithms in depth: the principles and implementations of ID3, C4.5, and CART classification and regression trees, from information entropy and the Gini index to post-pruning, covering tree construction and prediction end to end.






