import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, ensemble
from sklearn.model_selection import train_test_split
Load the datasets
# Diabetes patients dataset (regression)
def load_data_regression():
    diabetes = datasets.load_diabetes()
    return train_test_split(diabetes.data, diabetes.target, test_size=0.25, random_state=0)
# Handwritten digits dataset (classification)
def load_data_classification():
    digits = datasets.load_digits()
    return train_test_split(digits.data, digits.target, test_size=0.25, random_state=0)
GradientBoostingClassifier: gradient-boosted decision trees for classification
Model prototype
class sklearn.ensemble.GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
Parameters
- loss: the loss function to optimize
  - 'deviance': logistic (log) loss
  - 'exponential': exponential loss
- learning_rate
- n_estimators: the number of base decision trees (default 100). GBDT is fairly robust to overfitting, so a larger number usually gives better performance
- subsample: the fraction of the original training set drawn to train each base decision tree
- min_samples_split
- min_samples_leaf
- min_weight_fraction_leaf
- max_depth
- init: an estimator used to compute the initial predictions (if None, loss.init_estimator is used)
- random_state
- max_features
- verbose
- max_leaf_nodes
- warm_start (see the sketch after this list)
- presort
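A minimal sketch of warm_start, assuming the load_data_classification helper defined above: with warm_start=True, raising n_estimators and calling fit again adds new trees on top of the existing ones instead of retraining from scratch.

X_train, X_test, y_train, y_test = load_data_classification()
clf = ensemble.GradientBoostingClassifier(n_estimators=50, warm_start=True)  # keep fitted trees across fit calls
clf.fit(X_train, y_train)
clf.set_params(n_estimators=100)  # grow the ensemble by 50 more trees
clf.fit(X_train, y_train)         # only the new trees are fitted
print('Testing Score:%f' % clf.score(X_test, y_test))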
Attributes
- feature_importances_ (see the sketch after this list)
- oob_improvement_
- train_score_
- init_
- estimators_
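A minimal sketch of inspecting the fitted attributes, assuming the helpers above: feature_importances_ holds one importance score per input feature, and train_score_ holds the loss on the (in-bag) training samples at each boosting stage.

X_train, X_test, y_train, y_test = load_data_classification()
clf = ensemble.GradientBoostingClassifier()
clf.fit(X_train, y_train)
print(clf.feature_importances_.shape)  # one importance score per input feature
print(clf.train_score_[:5])            # training loss at the first five boosting stages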
Methods
- fit(X, y[, sample_weight, monitor])
- predict(X)
- predict_log_proba(X)
- predict_proba(X)
- score(X, y[, sample_weight])
- staged_predict(X) (see the sketch after this list)
- staged_predict_proba(X)
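The staged_* methods yield the ensemble's prediction after each boosting stage without refitting, which makes it cheap to find the best number of trees. A minimal sketch, assuming the helpers above:

X_train, X_test, y_train, y_test = load_data_classification()
clf = ensemble.GradientBoostingClassifier(n_estimators=100)
clf.fit(X_train, y_train)
# staged_predict yields predictions after 1, 2, ..., n_estimators stages
stage_accuracies = [np.mean(y_pred == y_test) for y_pred in clf.staged_predict(X_test)]
print('best stage: %d, accuracy: %f' % (int(np.argmax(stage_accuracies)) + 1, max(stage_accuracies)))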
Using the GradientBoostingClassifier class
def test_GradientBoostingClassifier(*data):
    X_train, X_test, y_train, y_test = data
    clf = ensemble.GradientBoostingClassifier()
    clf.fit(X_train, y_train)
    print('Training Score:%f' % clf.score(X_train, y_train))
    print('Testing Score:%f' % clf.score(X_test, y_test))

X_train, X_test, y_train, y_test = load_data_classification()
test_GradientBoostingClassifier(X_train, X_test, y_train, y_test)
Effect of the number of individual decision trees
def test_GradientBoostingClassifier_num(*data):
    X_train, X_test, y_train, y_test = data
    nums = np.arange(1, 100, step=2)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    training_scores = []
    testing_scores = []
    for num in nums:
        clf = ensemble.GradientBoostingClassifier(n_estimators=num)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(nums, training_scores, label='Training Score')
    ax.plot(nums, testing_scores, label='Testing Score')
    ax.set_xlabel('estimator num')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(0, 1.05)
    plt.suptitle('GradientBoostingClassifier')
    plt.show()

test_GradientBoostingClassifier_num(X_train, X_test, y_train, y_test)
Effect of the maximum depth of the individual decision trees
def test_GradientBoostingClassifier_maxdepth(*data):
    X_train, X_test, y_train, y_test = data
    maxdepths = np.arange(1, 20)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    training_scores = []
    testing_scores = []
    for maxdepth in maxdepths:
        clf = ensemble.GradientBoostingClassifier(max_depth=maxdepth, max_leaf_nodes=None)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(maxdepths, training_scores, label='Training Score')
    ax.plot(maxdepths, testing_scores, label='Testing Score')
    ax.set_xlabel('max_depth')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(0, 1.05)
    plt.suptitle('GradientBoostingClassifier')
    plt.show()

test_GradientBoostingClassifier_maxdepth(X_train, X_test, y_train, y_test)
Effect of the learning rate
def test_GradientBoostingClassifier_learning(*data):
    X_train, X_test, y_train, y_test = data
    learnings = np.linspace(0.01, 1.0)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    training_scores = []
    testing_scores = []
    for learning in learnings:
        clf = ensemble.GradientBoostingClassifier(learning_rate=learning)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(learnings, training_scores, label='Training Score')
    ax.plot(learnings, testing_scores, label='Testing Score')
    ax.set_xlabel('learning_rate')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(0, 1.05)
    plt.suptitle('GradientBoostingClassifier')
    plt.show()

test_GradientBoostingClassifier_learning(X_train, X_test, y_train, y_test)
Effect of subsample
def test_GradientBoostingClassifier_subsample(*data):
    X_train, X_test, y_train, y_test = data
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    subsamples = np.linspace(0.01, 1.0)
    training_scores = []
    testing_scores = []
    for subsample in subsamples:
        clf = ensemble.GradientBoostingClassifier(subsample=subsample)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(subsamples, training_scores, label='Training Score')
    ax.plot(subsamples, testing_scores, label='Testing Score')
    ax.set_xlabel('subsample')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(0, 1.05)
    plt.suptitle('GradientBoostingClassifier')
    plt.show()

test_GradientBoostingClassifier_subsample(X_train, X_test, y_train, y_test)
Effect of the max_features parameter
def test_GradientBoostingClassifier_max_features(*data):
    X_train, X_test, y_train, y_test = data
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    max_features = np.linspace(0.01, 1.0)
    training_scores = []
    testing_scores = []
    for features in max_features:
        clf = ensemble.GradientBoostingClassifier(max_features=features)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(max_features, training_scores, label='Training Score')
    ax.plot(max_features, testing_scores, label='Testing Score')
    ax.set_xlabel('max_features')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(0, 1.05)
    plt.suptitle('GradientBoostingClassifier')
    plt.show()

test_GradientBoostingClassifier_max_features(X_train, X_test, y_train, y_test)
GradientBoostingRegressor: gradient-boosted decision trees for regression
Model prototype
class sklearn.ensemble.GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
Parameters
- loss: the loss function to optimize
  - 'ls': squared (least-squares) loss
  - 'lad': absolute (least-absolute-deviation) loss
  - 'huber': a combination of squared and absolute loss; the mix is controlled by the alpha parameter (see the note after this list)
- learning_rate
- n_estimators: the number of base decision trees (default 100). GBDT is fairly robust to overfitting, so a larger number usually gives better performance
- subsample: the fraction of the original training set drawn to train each base decision tree
- min_samples_split
- min_samples_leaf
- min_weight_fraction_leaf
- max_depth
- init: an estimator used to compute the initial predictions (if None, loss.init_estimator is used)
- random_state
- max_features
- alpha
- verbose
- max_leaf_nodes
- warm_start
- presort
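For reference, the Huber loss is quadratic for small residuals and linear for large ones. Writing the residual as $r = y - f(x)$, and with the threshold $\delta$ set (to my understanding of sklearn's implementation) at each boosting iteration to the alpha-quantile of the absolute residuals:

$$
L_{\delta}(y, f(x)) =
\begin{cases}
\tfrac{1}{2}\,r^{2} & \text{if } |r| \le \delta, \\
\delta\left(|r| - \tfrac{\delta}{2}\right) & \text{if } |r| > \delta.
\end{cases}
$$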
Attributes
- feature_importances_
- oob_improvement_
- train_score_
- init_
- estimators_
Methods
- fit(X, y[, sample_weight, monitor])
- predict(X)
- score(X, y[, sample_weight])
- staged_predict(X) (see the sketch after this list)
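A minimal sketch, assuming the load_data_regression helper above: staged_predict tracks per-stage test R^2, and with subsample < 1.0 the oob_improvement_ attribute records the loss improvement on the held-out (out-of-bag) samples at each stage.

X_train, X_test, y_train, y_test = load_data_regression()
regr = ensemble.GradientBoostingRegressor(n_estimators=100, subsample=0.5)
regr.fit(X_train, y_train)
# per-stage test R^2 computed from staged predictions
stage_r2 = [1 - np.sum((y_test - y_pred) ** 2) / np.sum((y_test - y_test.mean()) ** 2)
            for y_pred in regr.staged_predict(X_test)]
print('best stage: %d, R^2: %f' % (int(np.argmax(stage_r2)) + 1, max(stage_r2)))
print(regr.oob_improvement_[:5])  # OOB loss improvement of the first five stages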
Using the GradientBoostingRegressor class
def test_GradientBoostingRegressor(*data):
    X_train, X_test, y_train, y_test = data
    regr = ensemble.GradientBoostingRegressor()
    regr.fit(X_train, y_train)
    print('Training Score:%f' % regr.score(X_train, y_train))
    print('Testing Score:%f' % regr.score(X_test, y_test))

X_train, X_test, y_train, y_test = load_data_regression()
test_GradientBoostingRegressor(X_train, X_test, y_train, y_test)
Effect of the number of individual regression trees
def test_GradientBoostingRegressor_num(*data):
    X_train, X_test, y_train, y_test = data
    nums = np.arange(1, 200, step=2)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    training_scores = []
    testing_scores = []
    for num in nums:
        clf = ensemble.GradientBoostingRegressor(n_estimators=num)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(nums, training_scores, label='Training Score')
    ax.plot(nums, testing_scores, label='Testing Score')
    ax.set_xlabel('estimator num')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(0, 1.05)
    plt.suptitle('GradientBoostingRegressor')
    plt.show()

test_GradientBoostingRegressor_num(X_train, X_test, y_train, y_test)
Effect of the maximum depth of the individual decision trees
def test_GradientBoostingRegressor_maxdepth(*data):
    X_train, X_test, y_train, y_test = data
    maxdepths = np.arange(1, 20)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    training_scores = []
    testing_scores = []
    for maxdepth in maxdepths:
        clf = ensemble.GradientBoostingRegressor(max_depth=maxdepth, max_leaf_nodes=None)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(maxdepths, training_scores, label='Training Score')
    ax.plot(maxdepths, testing_scores, label='Testing Score')
    ax.set_xlabel('max_depth')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(-1, 1.05)
    plt.suptitle('GradientBoostingRegressor')
    plt.show()

test_GradientBoostingRegressor_maxdepth(X_train, X_test, y_train, y_test)
Effect of the learning rate
def test_GradientBoostingRegressor_learning(*data):
    X_train, X_test, y_train, y_test = data
    learnings = np.linspace(0.01, 1.0)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    training_scores = []
    testing_scores = []
    for learning in learnings:
        clf = ensemble.GradientBoostingRegressor(learning_rate=learning)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(learnings, training_scores, label='Training Score')
    ax.plot(learnings, testing_scores, label='Testing Score')
    ax.set_xlabel('learning_rate')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(-1, 1.05)
    plt.suptitle('GradientBoostingRegressor')
    plt.show()

test_GradientBoostingRegressor_learning(X_train, X_test, y_train, y_test)
Effect of subsample
def test_GradientBoostingRegressor_subsample(*data):
    X_train, X_test, y_train, y_test = data
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    subsamples = np.linspace(0.01, 1.0, num=20)
    training_scores = []
    testing_scores = []
    for subsample in subsamples:
        clf = ensemble.GradientBoostingRegressor(subsample=subsample)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(subsamples, training_scores, label='Training Score')
    ax.plot(subsamples, testing_scores, label='Testing Score')
    ax.set_xlabel('subsample')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(-1, 1.05)
    plt.suptitle('GradientBoostingRegressor')
    plt.show()

test_GradientBoostingRegressor_subsample(X_train, X_test, y_train, y_test)
Effect of the loss function
def test_GradientBoostingRegressor_loss(*data):
    X_train, X_test, y_train, y_test = data
    fig = plt.figure()
    nums = np.arange(1, 200, step=2)
    # plot the huber loss for several alpha values
    ax = fig.add_subplot(2, 1, 1)
    alphas = np.linspace(0.01, 1.0, endpoint=False, num=5)
    for alpha in alphas:
        training_scores = []
        testing_scores = []
        for num in nums:
            regr = ensemble.GradientBoostingRegressor(n_estimators=num,
                loss='huber', alpha=alpha)
            regr.fit(X_train, y_train)
            training_scores.append(regr.score(X_train, y_train))
            testing_scores.append(regr.score(X_test, y_test))
        ax.plot(nums, training_scores,
            label='Training Score:alpha=%f' % alpha)
        ax.plot(nums, testing_scores,
            label='Testing Score:alpha=%f' % alpha)
    ax.set_xlabel('estimator num')
    ax.set_ylabel('score')
    ax.legend(loc='lower right', framealpha=0.4)
    ax.set_ylim(0, 1.05)
    ax.set_title('loss=huber')
    # plot the ls and lad losses
    ax = fig.add_subplot(2, 1, 2)
    for loss in ['ls', 'lad']:
        training_scores = []
        testing_scores = []
        for num in nums:
            regr = ensemble.GradientBoostingRegressor(n_estimators=num,
                loss=loss)
            regr.fit(X_train, y_train)
            training_scores.append(regr.score(X_train, y_train))
            testing_scores.append(regr.score(X_test, y_test))
        ax.plot(nums, training_scores, label='Training Score:loss=%s' % loss)
        ax.plot(nums, testing_scores, label='Testing Score:loss=%s' % loss)
    ax.set_xlabel('estimator num')
    ax.set_ylabel('score')
    ax.legend(loc='lower right', framealpha=0.4)
    ax.set_ylim(0, 1.05)
    ax.set_title('loss=ls,lad')
    plt.suptitle('GradientBoostingRegressor')
    plt.show()

test_GradientBoostingRegressor_loss(X_train, X_test, y_train, y_test)
Effect of the max_features parameter
def test_GradientBoostingRegressor_max_features(*data):
    X_train, X_test, y_train, y_test = data
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    max_features = np.linspace(0.01, 1.0)
    training_scores = []
    testing_scores = []
    for features in max_features:
        clf = ensemble.GradientBoostingRegressor(max_features=features)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(max_features, training_scores, label='Training Score')
    ax.plot(max_features, testing_scores, label='Testing Score')
    ax.set_xlabel('max_features')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(0, 1.05)
    plt.suptitle('GradientBoostingRegressor')
    plt.show()

test_GradientBoostingRegressor_max_features(X_train, X_test, y_train, y_test)