【python】CART

最新推荐文章于 2023-07-23 15:13:51 发布

原创最新推荐文章于 2023-07-23 15:13:51 发布 · 1k 阅读

2 ·

CC 4.0 BY-SA版权

文章标签：

#决策树 #CART #Gini

ML / Math / Matlab 专栏收录该内容

26 篇文章

订阅专栏

本文深入探讨了CART算法在分类与回归任务中的应用，包括基于熵和基尼系数的划分准则、超参数调整技巧以及剪枝策略等内容。

文章目录

0 CART for Classification
1 hyper-parameters
2 复杂数据的局部性建模
3 连续和离散型特征的树的构建
4 将CART算法用于回归
- 4.1 构建树
- 4.2 运行代码
5 剪枝
- 5.1 预剪枝
- 5.2 后剪枝

节选自《Machine Learning in Action》——Peter Harrington，中文版是《机器学习实战》，感谢深度之眼（学习笔记）

0 CART for Classification

参考【python】ID3 0 和 1 小节，对比能很直观的发现基于熵的和基于Gini 系数的联系和区别

label 有 $C$ 类，第 $i$ 类的概率为 $p_i$
$-\sum_{i=1}^{C}p_{i}\log(p_{i})$

# Shannon entropy (natural logarithm) of a label sequence
def entropy(y):
    """Return -sum(p_i * ln(p_i)) over the distinct labels found in ``y``.

    ``p_i`` is the fraction of entries in ``y`` equal to label ``i``.
    An empty ``y`` gives 0.0 because there is no label to accumulate.
    """
    total = len(y)
    acc = 0.0
    for count in Counter(y).values():
        p = count / total
        acc -= p * log(p)
    return acc

label 有 $C$ 类，第 $i$ 类的概率为 $p_i$
$1-\sum_{i=1}^{C}p_{i}^{2}$

# Gini impurity of a label sequence
def gini(y):
    """Return 1 - sum(p_i ** 2) over the distinct labels found in ``y``.

    ``p_i`` is the fraction of entries in ``y`` equal to label ``i``.
    An empty ``y`` gives 1.0 because nothing is subtracted.
    """
    total = len(y)
    impurity = 1.0
    for count in Counter(y).values():
        share = count / total
        impurity -= share ** 2
    return impurity

信息熵计算较慢（涉及大量对数运算）

let’s get it!

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# Load iris and keep only the last two feature columns
iris = datasets.load_iris()
X = iris.data[:, 2:]  # shape (150, 2)
y = iris.target  # shape (150,), three classes labelled 0, 1, 2

# One scatter call per class so every class gets its own default colour
for label in (0, 1, 2):
    plt.scatter(X[y == label, 0], X[y == label, 1])
plt.show()

from sklearn.tree import DecisionTreeClassifier
# Depth-2 tree using the entropy criterion; random_state fixes the random seed
dt_clf = DecisionTreeClassifier(criterion="entropy", max_depth=2, random_state=42)
dt_clf.fit(X, y)

在这里插入图片描述
可视化

#ravel()：如果没有必要，不会产生源数据的副本
#flatten()：返回源数据的副本

# np.r_是按列连接两个矩阵，就是把两矩阵上下相加，要求列数相等。
# np.c_是按行连接两个矩阵，就是把两矩阵左右相加，要求行数相等。

def plot_decision_boundary(model, axis):
    """Shade the regions predicted by ``model`` over a rectangular window.

    ``axis`` is indexed as [x_min, x_max, y_min, y_max]. A grid with 100
    points per unit length is built along each direction, every grid point
    is classified with ``model.predict`` and the result is drawn on the
    current matplotlib figure with ``plt.contourf``.
    """
    x_min, x_max = axis[0], axis[1]
    y_min, y_max = axis[2], axis[3]

    # Column vectors of sample positions, e.g. 700 points for 0.5..7.5
    xs = np.linspace(x_min, x_max, int((x_max - x_min) * 100)).reshape(-1, 1)
    ys = np.linspace(y_min, y_max, int((y_max - y_min) * 100)).reshape(-1, 1)

    # Coordinate matrices of the grid; both have shape (len(ys), len(xs))
    x0, x1 = np.meshgrid(xs, ys)

    # One row per grid point: (x coordinate, y coordinate)
    grid_points = np.c_[x0.ravel(), x1.ravel()]

    # Predicted labels folded back into the grid layout
    zz = model.predict(grid_points).reshape(x0.shape)

    from matplotlib.colors import ListedColormap
    region_colors = ListedColormap(['#EF9A9A','#FFF59D','#90CAF9'])

    plt.contourf(x0, x1, zz, cmap=region_colors)

调用查看结果

# Decision regions first, then the samples of each class drawn on top
plot_decision_boundary(dt_clf, axis=[0.5, 7.5, 0, 3])
for label in (0, 1, 2):
    plt.scatter(X[y == label, 0], X[y == label, 1])
plt.show()

在这里插入图片描述
参考：

用 Gini 系数划分

from collections import Counter
from math import log

# Partition the samples on feature column d at the threshold `value`
def split(X, y, d, value):
    """Split ``X`` and ``y`` by comparing column ``d`` of ``X`` with ``value``.

    Returns ``(X_left, X_right, y_left, y_right)`` where the left part holds
    the rows with ``X[:, d] <= value`` and the right part the rows with
    ``X[:, d] > value``.
    """
    feature = X[:, d]
    goes_left = feature <= value
    goes_right = feature > value
    return X[goes_left], X[goes_right], y[goes_left], y[goes_right]

# Gini impurity of a label sequence
def gini(y):
    """Return 1 - sum(p_i ** 2) over the distinct labels found in ``y``.

    ``p_i`` is the fraction of entries in ``y`` equal to label ``i``.
    An empty ``y`` gives 1.0 because nothing is subtracted.
    """
    total = len(y)
    impurity = 1.0
    for count in Counter(y).values():
        share = count / total
        impurity -= share ** 2
    return impurity

# Exhaustive search for the feature and threshold with the lowest Gini score
def try_split(X, y):
    """Return ``(best_g, best_d, best_v)`` for the best single split of ``X``.

    For every feature column the samples are ordered by value and the
    midpoint between each pair of neighbouring, distinct values is tried as
    a threshold. The score of a candidate is ``gini(y_left) + gini(y_right)``
    and the lowest score wins; on ties the first candidate found is kept.
    If no candidate exists the result is ``(inf, -1, -1)``.

    NOTE(review): the two child impurities are summed without weighting them
    by child size, unlike the usual CART criterion. The formula is kept
    because the results printed in this article come from it.
    """
    best_g = float('inf')
    best_d, best_v = -1, -1
    for d in range(X.shape[1]):
        # Sample indices ordered by the value of feature d
        order = np.argsort(X[:, d])
        for i in range(1, len(X)):
            current = X[order[i], d]
            previous = X[order[i - 1], d]
            if current == previous:
                # Equal neighbours leave no room for a threshold between them
                continue
            v = (current + previous) / 2
            _, _, y_l, y_r = split(X, y, d, v)
            g = gini(y_l) + gini(y_r)
            if g < best_g:  # a lower Gini score is better
                best_g, best_d, best_v = g, d, v
    return best_g, best_d, best_v

first

# X has shape (150, 2)
# y has shape (150,)
# Search for the best feature and threshold of the first split
best_g, best_d, best_v = try_split(X, y)
print("best_g =", best_g)
print("best_d =", best_d)
print("best_v =", best_v)

# Partition the data with that split and report the Gini value of each side
X1_l, X1_r, y1_l, y1_r = split(X, y, best_d, best_v)
print("y1_l:",gini(y1_l))
print("y1_r:",gini(y1_r))

output

best_g = 0.5
best_d = 0
best_v = 2.45
y1_l: 0.0
y1_r: 0.5

once more

# Second split: search again, this time only on the right part of the first split
best_g2, best_d2, best_v2 = try_split(X1_r, y1_r)
print("best_g =", best_g2)
print("best_d =", best_d2)
print("best_v =", best_v2)

# Partition the right part with the new split and report the Gini value of each side
X2_l, X2_r, y2_l, y2_r = split(X1_r, y1_r, best_d2, best_v2)
print("y2_l:",gini(y2_l))
print("y2_r:",gini(y2_r))

output

best_g = 0.2105714900645938
best_d = 1
best_v = 1.75
y2_l: 0.1680384087791495
y2_r: 0.04253308128544431

1 hyper-parameters

以数据集 make_moons 为例，我们探索下超参数，参考 SKlearn中分类决策树的重要参数详解

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# Noisy two-class "moons" data set; X has shape (100, 2), y has shape (100,)
X, y = datasets.make_moons(noise=0.25, random_state=666)
for label in (0, 1):
    plt.scatter(X[y == label, 0], X[y == label, 1])
plt.show()

在这里插入图片描述

from sklearn.tree import DecisionTreeClassifier

# Fit a tree built with no constructor arguments, i.e. the library defaults
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X, y)

output

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

可视化分类结果

def plot_decision_boundary(model, axis):
    """Shade the regions predicted by ``model`` over a rectangular window.

    Parameters
    ----------
    model : object with a ``predict`` method that accepts an (n, 2) array
        and returns one label per row.
    axis : sequence indexed as [x_min, x_max, y_min, y_max].

    The function draws on the current matplotlib figure and returns nothing.
    """
    # 100 grid points per unit length along each direction;
    # for axis=[-1.5, 2.5, -1.0, 1.5] both x0 and x1 have shape (250, 400)
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1]-axis[0])*100)).reshape(-1, 1), # 400 values
        np.linspace(axis[2], axis[3], int((axis[3]-axis[2])*100)).reshape(-1, 1), # 250 values
    )
    # One row per grid point: (x coordinate, y coordinate)
    X_new = np.c_[x0.ravel(), x1.ravel()]

    y_predict = model.predict(X_new)
    # Fold the flat predictions back into the grid layout
    zz = y_predict.reshape(x0.shape)

    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(['#EF9A9A','#FFF59D','#90CAF9'])

    # The former `linewidth=5` argument was dropped: contourf draws filled
    # regions and does not use that keyword (matplotlib only warns about it).
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
    
# Draw the decision regions of the unrestricted tree, then the samples on top
plot_decision_boundary(dt_clf, axis=[-1.5, 2.5, -1.0, 1.5])
for label in (0, 1):
    plt.scatter(X[y == label, 0], X[y == label, 1])
plt.show()

在这里插入图片描述

1.1 max_depth

限制树的最大深度，超过设定深度的树枝全部剪掉

这是用得最广泛的剪枝参数，在高维度低样本量时非常有效。决策树多生长一层，对样本量的需求会增加一倍，所以限制树深度能够有效地限制过拟合。在集成算法中也非常实用。实际使用时，建议从=3开始尝试，看看拟合的效果再决定是否增加设定深度。

作者：CDA数据分析师培训
链接：https://www.jianshu.com/p/d1d17499365c
来源：简书

# Tree limited to a single level of splits
dt_clf2 = DecisionTreeClassifier(max_depth=1)
dt_clf2.fit(X, y)

plot_decision_boundary(dt_clf2, axis=[-1.5, 2.5, -1.0, 1.5])
for label in (0, 1):
    plt.scatter(X[y == label, 0], X[y == label, 1])
plt.show()

max_depth = 1,2
在这里插入图片描述
max_depth = 3,5

max_depth = 10,50

可以看出设置太小欠拟合，太大就过拟合了

1.2 min_samples_split

min_samples_split 限定，一个节点必须要包含至少 min_samples_split 个训练样本，这个节点才允许被分枝，否则分枝就不会发生。

# A node needs at least min_samples_split samples before it may be split
dt_clf3 = DecisionTreeClassifier(min_samples_split=2)
dt_clf3.fit(X, y)

plot_decision_boundary(dt_clf3, axis=[-1.5, 2.5, -1.0, 1.5])
for label in (0, 1):
    plt.scatter(X[y == label, 0], X[y == label, 1])
plt.show()

min_samples_split=2,3
在这里插入图片描述
min_samples_split=4,5

min_samples_split=10,20

小了过拟合，大了欠拟合

1.3 min_samples_leaf

min_samples_leaf 限定，一个节点在分枝后的每个子节点都必须包含至少min_samples_leaf个训练样本，否则分枝就不会发生，或者，分枝会朝着满足每个子节点都包含min_samples_leaf个样本的方向去发生

一般搭配max_depth使用，在回归树中有神奇的效果，可以让模型变得更加平滑。这个参数的数量设置得太小会引起过拟合，设置得太大就会阻止模型学习数据。一般来说，建议从=5开始使用。如果叶节点中含有的样本量变化很大，建议输入浮点数作为样本量的百分比来使用。同时，这个参数可以保证每个叶子的最小尺寸，可以在回归问题中避免低方差，过拟合的叶子节点出现。对于类别不多的分类问题，=1通常就是最佳选择。

作者：CDA数据分析师培训
链接：https://www.jianshu.com/p/d1d17499365c
来源：简书

较小的叶子使模型更容易捕捉训练数据中的噪声（偏向过拟合）。

# Every leaf must keep at least min_samples_leaf samples
dt_clf4 = DecisionTreeClassifier(min_samples_leaf=1)
dt_clf4.fit(X, y)

plot_decision_boundary(dt_clf4, axis=[-1.5, 2.5, -1.0, 1.5])
for label in (0, 1):
    plt.scatter(X[y == label, 0], X[y == label, 1])
plt.show()

min_samples_leaf=1,2
在这里插入图片描述
min_samples_leaf=3,4

min_samples_leaf=5,6

min_samples_leaf=7,10

偏小过拟合，偏大欠拟合

1.4 max_leaf_nodes

限制最大叶子节点数

# Tree capped at two leaf nodes
dt_clf5 = DecisionTreeClassifier(max_leaf_nodes=2)
dt_clf5.fit(X, y)

plot_decision_boundary(dt_clf5, axis=[-1.5, 2.5, -1.0, 1.5])
for label in (0, 1):
    plt.scatter(X[y == label, 0], X[y == label, 1])
plt.show()

max_leaf_nodes=2，10
在这里插入图片描述
太小欠拟合，太大过拟合！

2 复杂数据的局部性建模

决策树是一种贪心算法，它要在给定时间内作出最佳选择，但是并不关心能否达到全局最优。

树回归

优点：可以对复杂和非线性的数据建模
缺点：结果不易理解

上篇 ID3 算法的缺点

切分过于迅速：每次选取当前最佳的特征来分割数据，并按照该特征所有可能取值来切分。也就是说，如果一个特征有4个取值，那么数据将被切成4份，该特征在之后的算法执行过程中将不会再起作用。
不能处理连续型特征

树回归的一般方法

收集数据：anyway
准备数据：标称型数据应该映射成二值型数据
分析数据：绘出数据的二维可视化显示结果，以字典的方式生成树
训练算法：大部分时间都花费在叶节点树模型的构建上
测试算法：使用测试数据上的R平方值来分析模型的效果
使用算法：使用训练出的树做预测模型，预测结果还可以用来做很多事情

3 连续和离散型特征的树的构建

用字典的形式存储树的数据结构，该字典包含以下4个元素

待切分的特征
待切分的特征值
右子树。当不再需要切分的时候，也可以是单个值
左子树。与右子树类似

from numpy import *
import regTrees
# 4x4 identity matrix; asmatrix is used because the `mat` alias was removed in NumPy 2.0
testMat = asmatrix(eye(4))
# Split on feature column 1 (0-based): rows with a value > 0.5 go to mat0,
# rows with a value <= 0.5 go to mat1
mat0,mat1 = regTrees.binSplitDataSet(testMat,1,0.5)
print (mat0,'\n')
print (mat1)

结果为

[[ 0.  1.  0.  0.]] 

[[ 1.  0.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  0.  1.]]

4 将CART算法用于回归

4.1 构建树

用 createTree() 来构建
伪代码大致如下：

找到最佳的待切分特征：
如果该节点不能再分，将该节点存为叶节点
执行二元切分
在右子树调用createTree（）方法
在左子树调用createTree（）方法

核心代码是chooseBestSplit（）函数。给定某误差计算方法，该函数会找到数据集上最佳的二元切分方式。另外该函数还要确定什么时候停止切分，一旦停止切分会生成一个叶节点。
因此，chooseBestSplit() 只需要完成两件事

1. 用最佳方式切分数据集
2. 生成相应的叶节点

伪代码如下
对每个特征：

对每个特征值：
将数据集切分成两份
计算切分的误差
如果当前误差小于最小误差，那么将当前切分设定为最佳切分并更新最小误差
返回最佳切分的特征和阈值

如果找不到一个好的二元切分，该函数返回None并同时调用createTree（）方法来产生叶子节点

Note:此函数有三种情况不可切分

4.2 运行代码

from numpy import *
import regTrees
# Load the tab-delimited sample file and build a regression tree from it.
# asmatrix is used because the `mat` alias was removed in NumPy 2.0.
myDat = regTrees.loadDataSet('ex00.txt')
myMat = asmatrix(myDat)
regTrees.createTree(myMat)

结果为

{'left': 1.0180967672413792,
 'right': -0.044650285714285719,
 'spInd': 0,
 'spVal': 0.48813}

数据集为

0.203693	-0.064036
0.355688	-0.119399
0.988852	1.069062
0.518735	1.037179
0.514563	1.156648
0.976414	0.862911
0.919074	1.123413
0.697777	0.827805
0.928097	0.883225
0.900272	0.996871
0.344102	-0.061539
0.148049	0.204298
0.130052	-0.026167
0.302001	0.317135
0.337100	0.026332
0.314924	-0.001952
0.269681	-0.165971
0.196005	-0.048847
0.129061	0.305107
0.936783	1.026258
0.305540	-0.115991
0.683921	1.414382
0.622398	0.766330
0.902532	0.861601
0.712503	0.933490
0.590062	0.705531
0.723120	1.307248
0.188218	0.113685
0.643601	0.782552
0.520207	1.209557
0.233115	-0.348147
0.465625	-0.152940
0.884512	1.117833
0.663200	0.701634
0.268857	0.073447
0.729234	0.931956
0.429664	-0.188659
0.737189	1.200781
0.378595	-0.296094
0.930173	1.035645
0.774301	0.836763
0.273940	-0.085713
0.824442	1.082153
0.626011	0.840544
0.679390	1.307217
0.578252	0.921885
0.785541	1.165296
0.597409	0.974770
0.014083	-0.132525
0.663870	1.187129
0.552381	1.369630
0.683886	0.999985
0.210334	-0.006899
0.604529	1.212685
0.250744	0.046297

5 剪枝

5.1 预剪枝

第4节简单的实验结果还是挺满意的，但树的构建对输入的参数tolS和tolN非常敏感，如果使用其他值将不太容易达到这么好的效果。
chooseBestSplit（）的终止条件，实际上是在进行一种所谓的预剪枝（prepruning）操作。也即不断修改ops的参数，但这并不是一个好办法，因为我们有时候不知道到底需要寻找什么样的结果。

另一种形式的剪枝需要使用测试集和训练集，称为后剪枝（postpruning），这是一种更理想化的剪枝方法。

5.2 后剪枝

函数 prune() 伪代码如下
基于已有的树切分测试数据：
如果存在任一子集是一棵树，则在该子集递归剪枝过程
计算将当前两个叶节点合并后的误差
计算不合并的误差
如果合并会降低误差的话，就将叶节点合并

from numpy import *
import regTrees
# Build a tree on the second data set with the default stopping parameters.
# asmatrix is used because the `mat` alias was removed in NumPy 2.0.
myDat2 = regTrees.loadDataSet('ex2.txt')
myMat2 = asmatrix(myDat2)
regTrees.createTree(myMat2)

结果有很多节点

实验代码如下

# Grow the tree with ops=(0,1), the loosest stopping thresholds used in this
# article, then post-prune it against the separate test file.
# asmatrix is used because the `mat` alias was removed in NumPy 2.0.
myTree = regTrees.createTree(myMat2,ops=(0,1))
myDatTest = regTrees.loadDataSet('ex2test.txt')
myMat2Test = asmatrix(myDatTest)
regTrees.prune(myTree,myMat2Test)

结果发现，合并了许多节点

from numpy import *

def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats into a list of rows.

    Each non-blank line becomes one list of floats, one float per
    tab-separated field. The callers in this file treat the last column as
    the target value.

    Parameters
    ----------
    fileName : path of the file to read.

    Returns
    -------
    list of lists of float, in file order.

    Raises
    ------
    ValueError
        If a field on a non-blank line cannot be converted with ``float``.
    """
    dataMat = []
    # The context manager closes the file even when a line fails to parse
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no data
                continue
            dataMat.append([float(item) for item in stripped.split('\t')])
    return dataMat

# Arguments: the data set, the index of the feature to split on, the threshold
def binSplitDataSet(dataSet, feature, value):
    """Split the rows of ``dataSet`` in two by one feature column.

    Returns ``(mat0, mat1)``: ``mat0`` holds the rows whose ``feature``
    column is greater than ``value``, ``mat1`` the rows where it is less
    than or equal to ``value``.
    """
    column = dataSet[:, feature]
    rowsAbove = nonzero(column > value)[0]
    rowsBelow = nonzero(column <= value)[0]
    return dataSet[rowsAbove, :], dataSet[rowsBelow, :]

def regLeaf(dataSet):
    """Return the leaf value: the mean of the last column of ``dataSet``."""
    targets = dataSet[:, -1]
    return mean(targets)

def regErr(dataSet):
    """Return the variance of the last column multiplied by the row count.

    This equals the total squared deviation of the targets from their mean.
    """
    rowCount = shape(dataSet)[0]
    return var(dataSet[:, -1]) * rowCount

def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Find the best binary split of ``dataSet`` or decide to make a leaf.

    Parameters
    ----------
    dataSet : data whose last column is the target. The code relies on
        ``.T.tolist()[0]`` and ``.A``, so a NumPy matrix is expected.
    leafType : function that turns a data set into a leaf value.
    errType : function that returns the error of a data set.
    ops : ``(tolS, tolN)``; ``tolS`` is the minimum error reduction needed
        to accept a split, ``tolN`` the minimum number of rows per side.

    Returns
    -------
    ``(featureIndex, splitValue)`` when a split is accepted, otherwise
    ``(None, leafValue)``.
    """
    tolS, tolN = ops[0], ops[1]

    # Exit condition 1: all target values are identical, nothing to split
    distinctTargets = set(dataSet[:, -1].T.tolist()[0])
    if len(distinctTargets) == 1:
        return None, leafType(dataSet)

    _, n = shape(dataSet)
    # Error before splitting; splits are judged by how much they reduce it
    S = errType(dataSet)
    bestS, bestIndex, bestValue = inf, 0, 0
    for featIndex in range(n - 1):
        # Every distinct value of the feature is a candidate threshold
        candidates = set(dataSet[:, featIndex].T.A.tolist()[0])
        for splitVal in candidates:
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            tooSmall = (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN)
            if tooSmall:
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestS, bestIndex, bestValue = newS, featIndex, splitVal

    # Exit condition 2: the best split does not reduce the error enough
    if (S - bestS) < tolS:
        return None, leafType(dataSet)

    # Exit condition 3: one side of the chosen split is smaller than tolN
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)

    # Feature index and threshold of the accepted split
    return bestIndex, bestValue
    
# Arguments: the data set, the leaf-building function, the error function,
# and a tuple with the remaining tree-building parameters
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Recursively build a tree stored as nested dictionaries.

    Parameters
    ----------
    dataSet : data to split; passed unchanged to ``chooseBestSplit``.
    leafType, errType, ops : forwarded to ``chooseBestSplit`` at every level.

    Returns
    -------
    Either the leaf value produced by ``chooseBestSplit`` when no split is
    made, or a dict with the keys ``'spInd'`` (feature index), ``'spVal'``
    (threshold), ``'left'`` and ``'right'`` (subtrees or leaf values).
    ``'left'`` is built from the rows above the threshold, because it
    receives the first part returned by ``binSplitDataSet``.
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # Identity test: feature index 0 is a valid split and must not be
    # mistaken for the "no split" marker
    if feat is None:
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree

# Helpers for post-pruning a regression tree
# Tell whether a node is a subtree (a dict) rather than a leaf value
def isTree(obj):
    """Return True when ``obj`` is a dict (a subtree), False otherwise.

    ``isinstance`` is used instead of comparing the type name, so dict
    subclasses are recognised and unrelated classes that merely happen to
    be named ``dict`` are not.
    """
    return isinstance(obj, dict)

# Collapse a tree: walk down to the leaves and replace every pair of
# children by the average of the two
def getMean(tree):
    """Collapse ``tree`` into a single value and return it.

    Subtrees are collapsed first (right before left) and stored back into
    ``tree``, so the argument is modified in place. The result is the plain
    average of the left and right values.
    """
    for side in ('right', 'left'):
        if isTree(tree[side]):
            tree[side] = getMean(tree[side])
    return (tree['left'] + tree['right']) / 2.0
    
def prune(tree, testData):
    """Post-prune ``tree`` using ``testData`` and return the result.

    The test data is routed down the tree with the stored splits and the
    subtrees are pruned first. When both children of a node are leaves, the
    squared error on the test data is compared with and without merging
    them into their average; the node is replaced by that average when
    merging gives a lower error. Subtrees are updated in place.

    Returns the (possibly modified) tree dict, or a single value when the
    node was merged or when no test data reached it.
    """
    # No test data reaches this node: collapse the whole subtree
    if shape(testData)[0] == 0:
        return getMean(tree)

    # Prune the children first, each with its own share of the test data
    if isTree(tree['right']) or isTree(tree['left']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        if isTree(tree['left']):
            tree['left'] = prune(tree['left'], lSet)
        if isTree(tree['right']):
            tree['right'] = prune(tree['right'], rSet)

    # A merge is only considered once both children are leaves
    if isTree(tree['left']) or isTree(tree['right']):
        return tree

    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    leftError = sum(power(lSet[:,-1] - tree['left'],2))
    rightError = sum(power(rSet[:,-1] - tree['right'],2))
    errorNoMerge = leftError + rightError
    treeMean = (tree['left']+tree['right'])/2.0
    errorMerge = sum(power(testData[:,-1] - treeMean,2))
    if errorMerge < errorNoMerge:
        print ("merging")
        return treeMean
    return tree