Task 2: The CART Tree Algorithm
The impurity measure CART uses for variable selection is the Gini index: the best split is the one that minimizes the weighted Gini index of the resulting child nodes (called GINI_Gain in the reference below).
Reference: https://blog.youkuaiyun.com/u011067360/article/details/24871801
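For reference, with p_k denoting the proportion of class k among the samples D at a node, the Gini index and the split score (GINI_Gain, in the notation of the post above) are

$$\mathrm{Gini}(D) = 1 - \sum_{k=1}^{K} p_k^2, \qquad \mathrm{GINI\_Gain} = \frac{|D_1|}{|D|}\,\mathrm{Gini}(D_1) + \frac{|D_2|}{|D|}\,\mathrm{Gini}(D_2),$$

which is exactly the weighted child score w * score1 + (1 - w) * score2 computed in find_best_split below, with the impurity function supplied through get_score.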
The following implements the CART-style classification tree (plus a regression variant). Note that the base class takes its impurity measure as a parameter (get_score); the classifier subclass plugs in entropy and the regressor plugs in variance, rather than the Gini index itself.
import numpy as np

class Node:
    # A tree node: j and theta describe the split (feature index and
    # threshold); p holds the prediction stored at leaf nodes.
    j = None
    theta = None
    p = None
    left = None
    right = None
class DecisionTreeBase:
    def __init__(self, max_depth, get_score, feature_sample_rate=1.0):
        self.max_depth = max_depth
        self.get_score = get_score  # impurity measure, e.g. entropy or variance
        self.feature_sample_rate = feature_sample_rate

    def split_data(self, j, theta, X, idx):
        # Partition the sample indices in idx by comparing feature j to theta.
        idx1, idx2 = list(), list()
        for i in idx:
            if X[i][j] <= theta:
                idx1.append(i)
            else:
                idx2.append(i)
        return idx1, idx2

    def get_random_features(self, n):
        # Randomly sample a subset of feature indices (useful for random forests).
        shuffled = np.random.permutation(n)
        size = int(self.feature_sample_rate * n)
        return shuffled[:size]
    def find_best_split(self, X, y, idx):
        m, n = X.shape
        best_score, best_j, best_theta = float("inf"), -1, float("inf")
        best_idx1, best_idx2 = list(), list()
        selected_j = self.get_random_features(n)
        for j in selected_j:
            thetas = set([x[j] for x in X])  # candidate thresholds for feature j
            for theta in thetas:
                idx1, idx2 = self.split_data(j, theta, X, idx)
                if min(len(idx1), len(idx2)) == 0:
                    continue
                score1, score2 = self.get_score(y, idx1), self.get_score(y, idx2)
                # Weighted impurity of the two children.
                w = 1.0 * len(idx1) / len(idx)
                score = w * score1 + (1 - w) * score2
                if score < best_score:
                    best_score, best_j, best_theta = score, j, theta
                    best_idx1, best_idx2 = idx1, idx2
        return best_j, best_theta, best_idx1, best_idx2, best_score
    def generate_tree(self, X, y, idx, d):
        r = Node()
        # Store the prediction first so that every node can act as a leaf
        # (including the early return below when no split helps).
        r.p = np.average(y[idx], axis=0)
        if d == 0 or len(idx) == 1:
            return r
        j, theta, idx1, idx2, score = self.find_best_split(X, y, idx)
        current_score = self.get_score(y, idx)
        if score >= current_score:  # splitting does not reduce impurity
            return r
        r.j, r.theta = j, theta
        r.left = self.generate_tree(X, y, idx1, d - 1)
        r.right = self.generate_tree(X, y, idx2, d - 1)
        return r
    def fit(self, X, y):
        self.root = self.generate_tree(X, y, range(len(X)), self.max_depth)

    def get_prediction(self, r, x):
        # Walk down the tree until a leaf is reached.
        if r.left is None and r.right is None:
            return r.p
        if x[r.j] <= r.theta:
            return self.get_prediction(r.left, x)
        else:
            return self.get_prediction(r.right, x)

    def predict(self, X):
        y = list()
        for i in range(len(X)):
            y.append(self.get_prediction(self.root, X[i]))
        return np.array(y)
# Decision tree classifier built on the CART base class
def get_entropy(y, idx):
    # y is a one-hot label matrix of shape (m, k); the small random jitter
    # keeps the argument of log strictly positive.
    _, k = y.shape
    p = np.average(y[idx], axis=0)
    return -np.log(p + 0.001 * np.random.rand(k)).dot(p.T)
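The score returned here is (up to the jitter) the entropy of the node's empirical class distribution,

$$H(D) = -\sum_{k=1}^{K} p_k \log p_k,$$

where p_k is the fraction of samples in class k; the 0.001 * rand(k) term is only there to keep the logarithm finite when some p_k = 0.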
class DecisionTreeClassifier(DecisionTreeBase):
    def __init__(self, max_depth=0, feature_sample_rate=1.0):
        super().__init__(max_depth=max_depth,
                         feature_sample_rate=feature_sample_rate,
                         get_score=get_entropy)

    def predict_proba(self, X):
        return super().predict(X)

    def predict(self, X):
        proba = self.predict_proba(X)
        return np.argmax(proba, axis=1)
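A minimal usage sketch for the classifier (the iris data, train/test split, and max_depth=5 are illustrative assumptions, not part of the original; note that get_entropy expects one-hot labels):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X, y_raw = iris.data, iris.target
y = np.eye(3)[y_raw]  # one-hot encode: get_entropy expects an (m, k) matrix
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)  # class indices via argmax over leaf averages
print("accuracy:", np.mean(y_pred == np.argmax(y_test, axis=1)))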
# Decision tree regressor built on the CART base class
def get_var(y, idx):
    # Empirical variance of the targets in idx, used as the regression impurity.
    y_avg = np.average(y[idx]) * np.ones(len(idx))
    return np.linalg.norm(y_avg - y[idx], 2) ** 2 / len(idx)
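Equivalently, get_var computes

$$\mathrm{Var}(D) = \frac{1}{|D|} \sum_{i \in D} (y_i - \bar{y})^2,$$

so a regression split is chosen to minimize the weighted variance of the two children.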
class DecisionTreeRegressor(DecisionTreeBase):
    def __init__(self, max_depth=0, feature_sample_rate=1.0):
        super().__init__(max_depth=max_depth,
                         feature_sample_rate=feature_sample_rate,
                         get_score=get_var)
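A similar sketch for the regressor on synthetic data (the noisy sine-curve data and max_depth=4 are illustrative assumptions):

np.random.seed(0)
X = np.sort(5 * np.random.rand(100, 1), axis=0)
y = np.sin(X).ravel() + 0.1 * np.random.randn(100)

reg = DecisionTreeRegressor(max_depth=4)
reg.fit(X, y)
y_hat = reg.predict(X)
print("training MSE:", np.mean((y_hat - y) ** 2))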