9.3将CART算法用于回归
在python中可以直接以字典这个数据结构来实现树的生成
CART在给定切分特征和特征值后,将数据集划分为左右两个子集
def binSplitDataSet(dataSet, feature, value):
    """Split dataSet into two subsets on a given feature and threshold.

    Args:
        dataSet: numpy matrix; rows are samples, last column is the target.
        feature: column index of the feature to split on.
        value: threshold value for the split.

    Returns:
        (mat0, mat1): mat0 holds the rows where feature > value,
        mat1 holds the rows where feature <= value.

    Bug fix: the original code appended `[0]` after each fancy-index,
    which kept only the FIRST matching row and silently discarded the
    rest of each subset.
    """
    mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :]
    return mat0, mat1
def regLeaf(dataSet):
    """Generate a leaf node: the mean of the target (last) column."""
    target_column = dataSet[:, -1]
    return mean(target_column)
def regErr(dataSet):
    """Total squared error of the target column.

    Computed as (population variance of the last column) * (number of
    samples), i.e. the sum of squared deviations from the mean.
    """
    num_samples = shape(dataSet)[0]
    return num_samples * var(dataSet[:, -1])
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Find the best feature/value pair to split dataSet on.

    Args:
        dataSet: numpy matrix; rows are samples, last column is the target.
        leafType: function that builds a leaf value from a data subset.
        errType: function measuring total squared error of a data subset.
        ops: (tolS, tolN) pre-pruning parameters — tolS is the minimum
            error reduction a split must achieve, tolN the minimum number
            of samples allowed on each side of a split.

    Returns:
        (featureIndex, splitValue) for the best split, or
        (None, leafValue) when the node should become a leaf.
    """
    tolS = ops[0]; tolN = ops[1]
    # exit cond 1: all target values identical -> make a leaf
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    m, n = shape(dataSet)
    S = errType(dataSet)  # error of the unsplit node, the baseline
    bestS = inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n - 1):
        # Bug fix: iterating a numpy matrix column yields unhashable 1x1
        # matrices, so set(dataSet[:, featIndex]) raises TypeError.
        # Convert the column to a plain list first (same idiom as exit
        # cond 1 above).
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            # skip splits that leave too few samples on either side
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # exit cond 2: best split does not reduce error enough -> leaf
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # exit cond 3: best split leaves too few samples on one side -> leaf
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    return bestIndex, bestValue