def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Find the best (feature index, value) pair to binary-split dataSet on.

    dataSet  -- NumPy matrix; the last column is the target variable
    leafType -- function producing a leaf value from a data set
    errType  -- function measuring the error of a data set
    ops      -- (tolS, tolN): minimum error reduction required to accept a
                split, and minimum number of samples allowed on each side

    Returns (featureIndex, splitValue), or (None, leafValue) when one of
    the pre-pruning stop conditions fires.
    """
    tolS = ops[0]; tolN = ops[1]
    # Exit cond 1: all target values are identical -> nothing to split on.
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    m, n = shape(dataSet)
    # The best feature is chosen by the largest reduction in RSS error
    # relative to the error of the unsplit set.
    S = errType(dataSet)
    bestS = inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n - 1):
        for splitVal in set((dataSet[:, featIndex].T.A.tolist())[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            # Skip splits that leave fewer than tolN samples on either side.
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # Exit cond 2: the decrease (S - bestS) is below threshold -> don't split.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # Exit cond 3: the best split still leaves too few samples on one side.
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    # The best feature to split on and the value used for that split.
    return bestIndex, bestValue
def regLeaf(dataSet):
    """Return the value used for a leaf: the mean of the target (last) column."""
    return mean(dataSet[:, -1])


def regErr(dataSet):
    """Return the total squared error (RSS) of the target column.

    var() is the mean squared deviation, so multiplying by the number of
    rows gives the residual sum of squares over the whole data set.
    """
    return var(dataSet[:, -1]) * shape(dataSet)[0]
# 3. Recursively build the tree
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Recursively build a tree from dataSet.

    dataSet is assumed to be a NumPy matrix so array filtering works.
    Returns a leaf value when splitting hits a stop condition, otherwise a
    dict with keys 'spInd' (split feature index), 'spVal' (split value),
    'left' and 'right' (subtrees built from the two halves of the split).
    """
    # Choose the best split; feat is None when a stop condition fired.
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None:
        return val  # splitting stopped: val is the leaf value
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree