import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets,cross_validation,ensemble
加载数据集
#糖尿病病人
def load_data_regression():
    """Load the diabetes regression dataset and return a 75/25 split.

    Returns:
        X_train, X_test, y_train, y_test — split with random_state=0
        so the experiments below are reproducible.
    """
    # sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
    # model_selection exposes the same train_test_split API.
    from sklearn import model_selection
    diabetes = datasets.load_diabetes()
    return model_selection.train_test_split(diabetes.data, diabetes.target,
                                            test_size=0.25, random_state=0)
#手写识别数据集Digit Dataset
def load_data_classification():
    """Load the handwritten-digits dataset and return a 75/25 split.

    Returns:
        X_train, X_test, y_train, y_test — split with random_state=0
        so the experiments below are reproducible.
    """
    # sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
    # model_selection exposes the same train_test_split API.
    from sklearn import model_selection
    digits = datasets.load_digits()
    return model_selection.train_test_split(digits.data, digits.target,
                                            test_size=0.25, random_state=0)
AdaBoostClassifier分类器
模型原型
class sklearn.ensemble.AdaBoostClassifier(base_estimator=None,n_estimators=50,learning_rate=1.0,algorithm='SAMME.R',random_state=None)
参数
- base_estimator:基础分类器对象(默认为DecisionTreeClassifier)
- n_estimators:指定基础分类器的数量(默认为50)
- learning_rate:用于减少每一步的步长,防止步长太大而跨过了极值点
- algorithm:指定算法,用于多类分类问题
- 'SAMME.R':使用SAMME.R算法,基础分类器对象必须支持计算类别的概率
- 'SAMME':使用SAMME算法
- random_state
属性
- estimators_:所有训练过的基础分类器
- classes_
- n_classes_
- estimator_weights_
- estimator_errors_
- feature_importances_
方法
- fit(X,y[,sample_weight])
- predict(X)
- predict_log_proba(X)
- predict_proba(X)
- score(X,y[,sample_weight])
- staged_predict(X)
- staged_predict_proba(X)
- staged_score(X,y[,sample_weight])
使用AdaBoostClassifier类
def test_AdaBoostClassifier(*data):
    """Fit an AdaBoost classifier and plot the staged train/test scores
    as a function of the number of base estimators consumed."""
    X_train, X_test, y_train, y_test = data
    clf = ensemble.AdaBoostClassifier(learning_rate=0.1)
    clf.fit(X_train, y_train)
    # Plot score-after-each-boosting-stage curves.
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    stages = range(1, len(clf.estimators_) + 1)
    ax.plot(list(stages), list(clf.staged_score(X_train, y_train)),
            label='Training score')
    ax.plot(list(stages), list(clf.staged_score(X_test, y_test)),
            label='Testing score')
    ax.set_xlabel('estimator num')
    ax.set_ylabel('score')
    ax.legend(loc='best')
    ax.set_title('AdaBoostClassifier')
    plt.show()

# Run the classification demo on the digits dataset.
X_train, X_test, y_train, y_test = load_data_classification()
test_AdaBoostClassifier(X_train, X_test, y_train, y_test)
不同类型的个体分类器的影响
def test_AdaBoostClassifier_base_classifier(*data):
    """Compare AdaBoost with its default decision-tree base learner
    against AdaBoost over Gaussian naive Bayes base learners."""
    from sklearn.naive_bayes import GaussianNB
    X_train, X_test, y_train, y_test = data
    fig = plt.figure()
    # (classifier, subplot title) pairs — default base learner first.
    configs = [
        (ensemble.AdaBoostClassifier(learning_rate=0.1),
         'AdaBoostClassifier with Decision Tree'),
        (ensemble.AdaBoostClassifier(learning_rate=0.1,
                                     base_estimator=GaussianNB()),
         'AdaBoostClassifier with Gaussian Naive Bayes'),
    ]
    for plot_idx, (clf, title) in enumerate(configs, start=1):
        ax = fig.add_subplot(2, 1, plot_idx)
        clf.fit(X_train, y_train)
        # Staged score curves for this base learner.
        stages = range(1, len(clf.estimators_) + 1)
        ax.plot(list(stages), list(clf.staged_score(X_train, y_train)),
                label='Training score')
        ax.plot(list(stages), list(clf.staged_score(X_test, y_test)),
                label='Testing score')
        ax.set_xlabel('estimator num')
        ax.set_ylabel('score')
        ax.legend(loc='lower right')
        ax.set_ylim(0, 1)
        ax.set_title(title)
    plt.show()

# Compare base learners on the same digits split.
test_AdaBoostClassifier_base_classifier(X_train, X_test, y_train,
                                        y_test)
学习率的影响
def test_AdaBoostClassifier_learning_rate(*data):
    """Plot final train/test scores versus learning rate for both
    AdaBoost multiclass algorithms (SAMME.R and SAMME)."""
    X_train, X_test, y_train, y_test = data
    learning_rates = np.linspace(0.01, 1)
    fig = plt.figure()
    # (algorithm keyword, subplot title) for the two boosting variants;
    # 'SAMME.R' is the estimator's default algorithm.
    variants = [('SAMME.R', 'AdaBoostClassifier(SAMME.R)'),
                ('SAMME', 'AdaBoostClassifier(SAMME)')]
    for plot_idx, (algorithm, title) in enumerate(variants, start=1):
        ax = fig.add_subplot(2, 1, plot_idx)
        training_scores = []
        testing_scores = []
        for learning_rate in learning_rates:
            clf = ensemble.AdaBoostClassifier(learning_rate=learning_rate,
                                              n_estimators=500,
                                              algorithm=algorithm)
            clf.fit(X_train, y_train)
            training_scores.append(clf.score(X_train, y_train))
            testing_scores.append(clf.score(X_test, y_test))
        ax.plot(learning_rates, training_scores, label='Training score')
        ax.plot(learning_rates, testing_scores, label='Testing score')
        ax.set_xlabel('learning rate')
        ax.set_ylabel('score')
        ax.legend(loc='best')
        ax.set_title(title)
    plt.show()

# Learning-rate study on the digits split prepared above.
test_AdaBoostClassifier_learning_rate(X_train, X_test, y_train, y_test)
algorithm的影响
def test_AdaBoostClassifier_algorithm(*data):
    """Compare the SAMME.R and SAMME algorithms at several learning
    rates, plotting staged train/test scores per subplot.

    Fixes from the original:
      * the testing-score label indexed the *string* `algorithm[i]`
        (yielding a single character) instead of the algorithm name;
      * the inner loop reused index `i`, shadowing the outer subplot
        index. The loop variable `algorithm` is now used directly.
    """
    X_train, X_test, y_train, y_test = data
    algorithms = ['SAMME.R', 'SAMME']
    fig = plt.figure()
    learning_rates = [0.05, 0.1, 0.5, 0.9]
    for plot_idx, learning_rate in enumerate(learning_rates):
        ax = fig.add_subplot(2, 2, plot_idx + 1)
        for algorithm in algorithms:
            clf = ensemble.AdaBoostClassifier(learning_rate=learning_rate,
                                              algorithm=algorithm)
            clf.fit(X_train, y_train)
            # Staged score curves for this algorithm.
            stages = range(1, len(clf.estimators_) + 1)
            ax.plot(list(stages), list(clf.staged_score(X_train, y_train)),
                    label='%s:Training score' % algorithm)
            ax.plot(list(stages), list(clf.staged_score(X_test, y_test)),
                    label='%s:Testing score' % algorithm)
        ax.set_xlabel('estimator num')
        ax.set_ylabel('score')
        ax.legend(loc='lower right')
        ax.set_title('learning rate:%f' % learning_rate)
    fig.suptitle('AdaBoostClassifier')
    plt.show()

# Algorithm comparison on the digits split prepared above.
test_AdaBoostClassifier_algorithm(X_train, X_test, y_train, y_test)
AdaBoostRegressor回归器
模型原型
class sklearn.ensemble.AdaBoostRegressor(base_estimator=None,n_estimators=50,learning_rate=1.0,loss='linear',random_state=None)
参数
- base_estimator
- n_estimators
- learning_rate
- loss:指定损失函数
- 'linear':线性损失函数(默认)
- 'square':平方损失函数
- 'exponential':指数损失函数
- random_state
属性
- estimators_
- estimator_weights_
- estimator_errors_
- feature_importances_
方法
- fit(X,y[,sample_weight])
- predict(X)
- score(X,y[,sample_weight])
- staged_predict(X)
- staged_score(X,y[,sample_weight])
使用AdaBoostRegressor类
def test_AdaBoostRegressor(*data):
    """Fit an AdaBoost regressor and plot the staged train/test scores
    as a function of the number of base estimators consumed."""
    X_train, X_test, y_train, y_test = data
    regr = ensemble.AdaBoostRegressor()
    regr.fit(X_train, y_train)
    # Plot score-after-each-boosting-stage curves.
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    stages = range(1, len(regr.estimators_) + 1)
    ax.plot(list(stages), list(regr.staged_score(X_train, y_train)),
            label='Training score')
    ax.plot(list(stages), list(regr.staged_score(X_test, y_test)),
            label='Testing score')
    ax.set_xlabel('estimator num')
    ax.set_ylabel('score')
    ax.legend(loc='best')
    ax.set_title('AdaBoostRegressor')
    plt.show()

# Regression demos run on the diabetes dataset.
X_train, X_test, y_train, y_test = load_data_regression()
test_AdaBoostRegressor(X_train, X_test, y_train, y_test)
不同类型的个体分类器的影响
def test_AdaBoostRegressor_base_regr(*data):
    """Compare AdaBoost regression with the default decision-tree base
    learner against a linear-SVM base learner."""
    from sklearn.svm import LinearSVR
    X_train, X_test, y_train, y_test = data
    fig = plt.figure()
    regressors = [
        ensemble.AdaBoostRegressor(),
        ensemble.AdaBoostRegressor(
            base_estimator=LinearSVR(epsilon=0.01, C=100)),
    ]
    labels = ['Decision Tree Regressor', 'Linear SVM Regressor']
    for idx, (regr, label) in enumerate(zip(regressors, labels)):
        ax = fig.add_subplot(2, 1, idx + 1)
        regr.fit(X_train, y_train)
        # Staged score curves for this base learner.
        stages = range(1, len(regr.estimators_) + 1)
        ax.plot(list(stages), list(regr.staged_score(X_train, y_train)),
                label='Training score')
        ax.plot(list(stages), list(regr.staged_score(X_test, y_test)),
                label='Testing score')
        ax.set_xlabel('estimator num')
        ax.set_ylabel('score')
        ax.legend(loc='lower right')
        ax.set_ylim(-1, 1)
        ax.set_title('Base_Estimator:%s' % label)
    plt.suptitle('AdaBoostRegressor')
    plt.show()

# Compare base regressors on the same diabetes split.
test_AdaBoostRegressor_base_regr(X_train, X_test, y_train, y_test)
学习率的影响
def test_AdaBoostRegressor_learning_rate(*data):
    """Plot final train/test scores of AdaBoost regression as a
    function of the learning rate."""
    X_train, X_test, y_train, y_test = data
    learning_rates = np.linspace(0.01, 1)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    # Fit one booster per rate; collect (train, test) score pairs.
    score_pairs = []
    for rate in learning_rates:
        regr = ensemble.AdaBoostRegressor(learning_rate=rate,
                                          n_estimators=500)
        regr.fit(X_train, y_train)
        score_pairs.append((regr.score(X_train, y_train),
                            regr.score(X_test, y_test)))
    training_scores, testing_scores = zip(*score_pairs)
    ax.plot(learning_rates, training_scores, label='Training score')
    ax.plot(learning_rates, testing_scores, label='Testing score')
    ax.set_xlabel('learning rate')
    ax.set_ylabel('score')
    ax.legend(loc='best')
    ax.set_title('AdaBoostRegressor')
    plt.show()

# Learning-rate study on the diabetes split prepared above.
test_AdaBoostRegressor_learning_rate(X_train, X_test, y_train, y_test)
损失函数的影响
def test_AdaBoostRegressor_loss(*data):
    """Compare staged train/test scores of AdaBoost regression under
    the three supported loss functions."""
    X_train, X_test, y_train, y_test = data
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    for loss in ('linear', 'square', 'exponential'):
        regr = ensemble.AdaBoostRegressor(loss=loss, n_estimators=30)
        regr.fit(X_train, y_train)
        # One pair of curves per loss, distinguished by label.
        stages = range(1, len(regr.estimators_) + 1)
        ax.plot(list(stages), list(regr.staged_score(X_train, y_train)),
                label='Training score:loss=%s' % loss)
        ax.plot(list(stages), list(regr.staged_score(X_test, y_test)),
                label='Testing score:loss=%s' % loss)
    ax.set_xlabel('estimator num')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(-1, 1)
    plt.suptitle('AdaBoostRegressor')
    plt.show()

# Loss-function comparison on the diabetes split prepared above.
test_AdaBoostRegressor_loss(X_train, X_test, y_train, y_test)