# 乳腺癌数据集上的 stacking 集成学习案例
# 1. 读入样本数据
# Silence warnings so the demo's console output stays readable.
import warnings
# BUG FIX: the original used full-width curly quotes (“ignore”), which is
# a SyntaxError in Python; use plain ASCII double quotes.
warnings.filterwarnings("ignore")
from sklearn import datasets
# Load the breast-cancer dataset as a Bunch (dict-like) object.
file = datasets.load_breast_cancer()
X = file["data"]    # feature matrix, shape (n_samples, n_features)
y = file["target"]  # binary class labels
# 20% of the samples will be held out as test data (split performed below).
# 2. 分拆为训练集和测试集
# Hold out 20% of the samples as the test set.
# NOTE(review): no random_state is set, so the split — and therefore the
# final reported accuracy — changes on every run; consider pinning it
# for reproducibility.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
# 3. 初级学习——通过交叉验证产生次级学习数据集
import numpy as np
from sklearn.model_selection import KFold
def stacking(model, x_train, y_train, x_test, k=5):
    """Build the second-level (meta) feature columns for one base learner.

    Runs k-fold cross-validation over the training data: the out-of-fold
    predictions form the meta training column, and the k per-fold
    predictions on ``x_test`` are averaged and rounded back to class
    labels to form the meta test column.

    Parameters
    ----------
    model : sklearn-style estimator exposing ``fit`` and ``predict``.
    x_train, y_train : numpy arrays — level-one training features/labels.
    x_test : numpy array — level-one test features.
    k : int, number of cross-validation folds (default 5).

    Returns
    -------
    tuple of 1-D numpy arrays
        ``(second_level_train_set, second_level_test_set)`` with lengths
        ``len(x_train)`` and ``len(x_test)`` respectively.
    """
    train_num, test_num = x_train.shape[0], x_test.shape[0]
    second_level_train_set = np.zeros(train_num)
    second_level_test_set = np.zeros(test_num)
    # One column of test-set predictions per fold.
    test_nfolds_sets = np.zeros((test_num, k))
    kf = KFold(n_splits=k)
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr, y_tr = x_train[train_index], y_train[train_index]
        x_ts = x_train[test_index]
        # Fit on the k-1 training folds of round i.
        model.fit(x_tr, y_tr)
        # BUG FIX: the original called the *global* ``clf`` here, not the
        # ``model`` argument — it only worked because the caller's loop
        # variable happened to be named ``clf``. Use the parameter.
        second_level_train_set[test_index] = model.predict(x_ts)
        # Round-i predictions on the held-out test set.
        test_nfolds_sets[:, i] = model.predict(x_test)
    # Average the k test-set prediction columns and round the mean back
    # to integer class labels.
    second_level_test_set[:] = np.round(test_nfolds_sets.mean(axis=1))
    return second_level_train_set, second_level_test_set
# Collect one meta-feature column per base learner.
train_sets = []
test_sets = []
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# Six level-one (base) learners, all with default hyper-parameters.
# NOTE(review): in a real pipeline these would be tuned first.
rf= RandomForestClassifier()
ada = AdaBoostClassifier()
gbdt = GradientBoostingClassifier()
svc= SVC()
lgbm=LGBMClassifier()
xgb=XGBClassifier()
# Run the stacking CV routine once per base learner.
for clf in [rf,ada,gbdt,svc,lgbm,xgb]:
    train_set, test_set = stacking(clf, X_train,y_train,X_test)
    train_sets.append(train_set)
    test_sets.append(test_set)
# Stack each learner's 1-D prediction vector as a column, producing the
# level-two (meta) training and test feature matrices — one column per
# base learner.
X_train_new = np.concatenate([result_set.reshape(-1,1) for result_set in train_sets], axis=1)
X_test_new = np.concatenate([y_test_set.reshape(-1,1) for y_test_set in test_sets], axis=1)
# 4. 次级学习及预测
# Six base classifiers were used above; normally each would be tuned
# first, which is skipped here. LightGBM serves as the final (level-two)
# learner, fitted on the meta features.
# NOTE(review): this reuses the same `lgbm` instance that acted as a
# base learner; refitting overwrites its earlier state, which is
# harmless here but worth knowing.
lgbm.fit(X_train_new, y_train)
y_pred = lgbm.predict(X_test_new)
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error, confusion_matrix
# BUG FIX: the original evaluated accuracy_score(...) as a bare
# expression, which silently discards the result when run as a script
# (it only displays inside a notebook); print it explicitly.
print(accuracy_score(y_test, y_pred))