Table of Contents
1. Learning Content
1. Understand the purpose of model fusion
2. Perform model fusion on multiple tuned models
3. Learn the various methods of model fusion
For this project, see https://github.com/datawhalechina/team-learning
2. Why Perform Model Fusion
Different models emphasize different things when making predictions, so considering several different models together is likely to produce complementary effects and improve the overall prediction.
3. Fusing Regression Results or Classification Probabilities
3.1 Simple Weighted Averaging
If the prediction returned is a probability (or another continuous value), the predictions of the different models can be combined with a weighted average.
import numpy as np
import pandas as pd
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
# Generate some simple sample data; test_prei denotes the predictions of the i-th model
test_pre1 = [1.2, 3.2, 2.1, 6.2]
test_pre2 = [0.9, 3.1, 2.0, 5.9]
test_pre3 = [1.1, 2.9, 2.2, 6.0]
# y_test_true denotes the ground-truth values
y_test_true = [1, 3, 2, 6]
# Define a weighted-average fusion of the predictions
def Weighted_method(test_pre1, test_pre2, test_pre3, w = [1/3, 1/3, 1/3]):
    Weighted_result = w[0] * pd.Series(test_pre1) + \
                      w[1] * pd.Series(test_pre2) + \
                      w[2] * pd.Series(test_pre3)
    return Weighted_result
# Compute the MAE of each model's predictions
print('Pred1 MAE:',metrics.mean_absolute_error(y_test_true, test_pre1))
print('Pred2 MAE:',metrics.mean_absolute_error(y_test_true, test_pre2))
print('Pred3 MAE:',metrics.mean_absolute_error(y_test_true, test_pre3))
## Compute the MAE of the weighted fusion
w = [0.3, 0.4, 0.3]  # define the weights
Weighted_pre = Weighted_method(test_pre1, test_pre2, test_pre3, w)
print('Weighted_pre MAE:', metrics.mean_absolute_error(y_test_true, Weighted_pre))
Pred1 MAE: 0.1750000000000001
Pred2 MAE: 0.07499999999999993
Pred3 MAE: 0.10000000000000009
Weighted_pre MAE: 0.05750000000000027
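The weights above were fixed by hand. A common heuristic, sketched here as an addition rather than taken from the original, is to make each model's weight inversely proportional to its validation error and then normalize the weights to sum to one:
# Sketch (assumption, not from the original): weights inversely proportional to MAE
mae = np.array([metrics.mean_absolute_error(y_test_true, p)
                for p in [test_pre1, test_pre2, test_pre3]])
w_inv = (1 / mae) / (1 / mae).sum()  # normalize so the weights sum to 1
print('Inverse-MAE weights:', w_inv)
Weighted_pre_inv = Weighted_method(test_pre1, test_pre2, test_pre3, w_inv)
print('Inverse-MAE Weighted MAE:',
      metrics.mean_absolute_error(y_test_true, Weighted_pre_inv))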
# Define a fusion based on the mean of the predictions
def Mean_method(test_pre1, test_pre2, test_pre3):
    Mean_result = pd.concat([pd.Series(test_pre1), pd.Series(test_pre2),
                             pd.Series(test_pre3)], axis = 1).mean(axis = 1)
    return Mean_result
Mean_pre = Mean_method(test_pre1, test_pre2, test_pre3)
print('Mean_pre MAE:', metrics.mean_absolute_error(y_test_true, Mean_pre))
Mean_pre MAE: 0.06666666666666693
# Define a fusion based on the median of the predictions
def Median_method(test_pre1, test_pre2, test_pre3):
    Median_result = pd.concat([pd.Series(test_pre1), pd.Series(test_pre2),
                               pd.Series(test_pre3)], axis = 1).median(axis = 1)
    return Median_result
Median_pre = Median_method(test_pre1, test_pre2, test_pre3)
print('Median_pre MAE:', metrics.mean_absolute_error(y_test_true, Median_pre))
Median_pre MAE: 0.07500000000000007
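The median is mainly useful when one model occasionally produces outliers. A tiny illustration of ours (the outlier value 60.2 is invented for the example): replacing one prediction with an outlier barely moves the median fusion but ruins the mean fusion.
# Sketch (not from the original): the median shrugs off a single outlier model
bad_pre = [1.2, 3.2, 2.1, 60.2]  # the last prediction is an outlier
print('Mean MAE with outlier:', metrics.mean_absolute_error(
    y_test_true, Mean_method(test_pre1, test_pre2, bad_pre)))
print('Median MAE with outlier:', metrics.mean_absolute_error(
    y_test_true, Median_method(test_pre1, test_pre2, bad_pre)))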
3.2 Stacking
In short, stacking splits training into two stages. In the first stage, several different models are trained on the training set and used to predict both the training set and the test set; those predictions are then added as new features to the training set and the test set, respectively. In the second stage, a model is trained and evaluated on the augmented training and test sets.
A very similar fusion method is called Blending. The two share the same core idea and differ only in implementation details; see [1] for the specifics, and the sketch at the end of this subsection.
Here we choose a linear model for the second stage.
from sklearn import linear_model
def Stacking_method(train_reg1, train_reg2, train_reg3, y_train_true,
                    test_pre1, test_pre2, test_pre3,
                    model_L2 = linear_model.LinearRegression()):
    model_L2.fit(pd.concat([pd.Series(train_reg1), pd.Series(train_reg2),
                            pd.Series(train_reg3)], axis = 1).values, y_train_true)
    Stacking_result = model_L2.predict(pd.concat([pd.Series(test_pre1),
                                                  pd.Series(test_pre2),
                                                  pd.Series(test_pre3)],
                                                 axis = 1).values)
    return Stacking_result
# Generate some simple sample data; train_regi denotes the i-th model's predictions on the training set
train_reg1 = [3.2, 8.2, 9.1, 5.2]
train_reg2 = [2.9, 8.1, 9.0, 4.9]
train_reg3 = [3.1, 7.9, 9.2, 5.0]
# y_train_true denotes the ground-truth values of the training set
y_train_true = [3, 8, 9, 5]
test_pre1 = [1.2, 3.2, 2.1, 6.2]
test_pre2 = [0.9, 3.1, 2.0, 5.9]
test_pre3 = [1.1, 2.9, 2.2, 6.0]
# y_test_true denotes the ground-truth values of the test set
y_test_true = [1, 3, 2, 6]
model_L2 = linear_model.LinearRegression()
Stacking_pre = Stacking_method(train_reg1, train_reg2, train_reg3, y_train_true,
test_pre1, test_pre2, test_pre3, model_L2)
print('Stacking_pre MAE:', metrics.mean_absolute_error(y_test_true, Stacking_pre))
Stacking_pre MAE: 0.042134831460675204
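Since Blending is mentioned above but not demonstrated, here is a minimal sketch of it under the usual description (the helper name Blending_method and the 70/30 split are our assumptions): instead of out-of-fold predictions, part of the training data is held out, the base models are trained on the rest, and the second-stage model is fitted on their predictions for the held-out part.
from sklearn.model_selection import train_test_split
# Minimal blending sketch (assumption, not from the original source)
def Blending_method(X, y, X_test, base_models,
                    model_L2 = linear_model.LinearRegression()):
    # Hold out part of the training data for the second-stage model
    X_base, X_hold, y_base, y_hold = train_test_split(X, y, test_size = 0.3,
                                                      random_state = 0)
    hold_feats, test_feats = [], []
    for m in base_models:
        m.fit(X_base, y_base)                 # train on the non-held-out part
        hold_feats.append(m.predict(X_hold))  # stage-2 training features
        test_feats.append(m.predict(X_test))  # stage-2 prediction features
    model_L2.fit(np.column_stack(hold_feats), y_hold)
    return model_L2.predict(np.column_stack(test_feats))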
4. Fusing Classification Models
4.1 Import the Relevant Modules
from sklearn.datasets import make_blobs
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
4.2 Voting
For a classification problem, when several models give different predictions, the final result can be decided by majority rule; this is the voting method. Voting comes in a hard and a soft variant: hard voting gives each model one vote for its predicted class, while soft voting averages the predicted class probabilities and can weight the models differently (effectively giving some models more votes).
# Hard voting: each model votes directly for its predicted class, with no weighting; the class with the most votes is the final prediction.
iris = datasets.load_iris()
x = iris.data
y = iris.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)
# Note: iris has three classes; some xgboost versions reject objective = 'binary:logistic'
# on multiclass labels, in which case 'multi:softprob' is needed.
clf1 = XGBClassifier(learning_rate = 0.1, n_estimators = 150, max_depth = 3,
                     min_child_weight = 2, subsample = 0.7,
                     colsample_bytree = 0.6, objective = 'binary:logistic')
# With max_depth = 1 and min_samples_leaf = 63 this forest underfits badly,
# which explains its 0.33 accuracy below.
clf2 = RandomForestClassifier(n_estimators = 50, max_depth = 1,
                              min_samples_split = 4,
                              min_samples_leaf = 63, oob_score = True)
clf3 = SVC(C = 0.1)
# Hard voting
eclf = VotingClassifier(estimators = [('xgb', clf1), ('rf', clf2),
                                      ('svc', clf3)], voting = 'hard')
for clf, label in zip([clf1, clf2, clf3, eclf],
                      ['XGBBoosting', 'Random Forest', 'SVM', 'Ensemble']):
    scores = cross_val_score(clf, x, y, cv = 5, scoring = 'accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
Accuracy: 0.96 (+/- 0.02) [XGBBoosting]
Accuracy: 0.33 (+/- 0.00) [Random Forest]
Accuracy: 0.95 (+/- 0.03) [SVM]
Accuracy: 0.95 (+/- 0.03) [Ensemble]
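To make the mechanism concrete, here is hard voting done by hand, a sketch of ours rather than code from the original: fit the three base models, collect their predicted labels, and take the per-sample majority class.
# Sketch (not from the original): hard voting by hand
preds = []
for clf in [clf1, clf2, clf3]:
    clf.fit(x_train, y_train)
    preds.append(clf.predict(x_test))
preds = np.array(preds).astype(int)  # shape: (n_models, n_samples)
majority = np.apply_along_axis(lambda col: np.bincount(col).argmax(),
                               axis = 0, arr = preds)
print('Manual hard-vote accuracy:', accuracy_score(y_test, majority))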
# Soft voting: same principle as hard voting, but with the option to assign different weights to the models, reflecting their different importance.
x = iris.data
y = iris.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)
clf1 = XGBClassifier(learning_rate = 0.1, n_estimators = 150, max_depth = 3,
                     min_child_weight = 2, subsample = 0.8,
                     colsample_bytree = 0.8, objective = 'binary:logistic')
clf2 = RandomForestClassifier(n_estimators = 50, max_depth = 1,
                              min_samples_split = 4,
                              min_samples_leaf = 63, oob_score = True)
clf3 = SVC(C = 0.1, probability = True)  # probability = True is required for soft voting
# Soft voting, weighting the XGBoost model twice as heavily as the others
eclf = VotingClassifier(estimators = [('xgb', clf1), ('rf', clf2),
                                      ('svc', clf3)], voting = 'soft',
                        weights = [2, 1, 1])
for clf, label in zip([clf1, clf2, clf3, eclf],
                      ['XGBBoosting', 'Random Forest', 'SVM', 'Ensemble']):
    scores = cross_val_score(clf, x, y, cv = 5, scoring = 'accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
Accuracy: 0.96 (+/- 0.02) [XGBBoosting]
Accuracy: 0.33 (+/- 0.00) [Random Forest]
Accuracy: 0.95 (+/- 0.03) [SVM]
Accuracy: 0.96 (+/- 0.02) [Ensemble]
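The weighted soft vote can likewise be reproduced by hand, again as a sketch of ours: average the three models' probability matrices with the same weights [2, 1, 1] and take the argmax per sample.
# Sketch (not from the original): soft voting by hand
probas = []
for clf in [clf1, clf2, clf3]:
    clf.fit(x_train, y_train)
    probas.append(clf.predict_proba(x_test))
avg_proba = np.average(np.array(probas), axis = 0, weights = [2, 1, 1])
manual_soft = avg_proba.argmax(axis = 1)
print('Manual soft-vote accuracy:', accuracy_score(y_test, manual_soft))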
4.3 Stacking
# 5-Fold Stacking
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier,GradientBoostingClassifier
import pandas as pd
# Create the training data: the first 100 iris samples (classes 0 and 1, a binary problem)
data_0 = iris.data
data = data_0[:100,:]
target_0 = iris.target
target = target_0[:100]
# The individual base models used in the fusion
clfs = [LogisticRegression(solver = 'lbfgs'),
        RandomForestClassifier(n_estimators = 5, n_jobs = -1, criterion = 'gini'),
        ExtraTreesClassifier(n_estimators = 5, n_jobs = -1, criterion = 'gini'),
        ExtraTreesClassifier(n_estimators = 5, n_jobs = -1, criterion = 'entropy'),
        GradientBoostingClassifier(learning_rate = 0.05, subsample = 0.5,
                                   max_depth = 6, n_estimators = 5)]
# Split off part of the data as a test set
X, X_predict, y, y_predict = train_test_split(data, target, test_size = 0.3,
                                              random_state = 2020)
dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)))
# 5-fold stacking
n_splits = 5
skf = StratifiedKFold(n_splits)
for j, clf in enumerate(clfs):
    # Train each base model in turn
    dataset_blend_test_j = np.zeros((X_predict.shape[0], n_splits))
    # Call skf.split(X, y) afresh for every model: the original code called it
    # once outside this loop, and the resulting one-shot generator was exhausted
    # after the first base model.
    for i, (train, test) in enumerate(skf.split(X, y)):
        # 5-fold cross-training: hold out the i-th fold, train on the rest, and
        # use the predictions on the held-out fold as its new feature.
        X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
        clf.fit(X_train, y_train)
        y_submission = clf.predict_proba(X_test)[:, 1]
        dataset_blend_train[test, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1]
    # For the test set, use the mean of the k fold-models' predictions as the new feature
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
    print("val auc Score: %f" % roc_auc_score(y_predict, dataset_blend_test[:, j]))
# Second stage: fit a logistic regression on the out-of-fold features
clf = LogisticRegression(solver = 'lbfgs')
clf.fit(dataset_blend_train, y)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]
print("Val auc Score of Stacking: %f" % (roc_auc_score(y_predict, y_submission)))
val auc Score: 1.000000
val auc Score: 0.500000
val auc Score: 0.500000
val auc Score: 0.500000
val auc Score: 0.500000
Val auc Score of Stacking: 1.000000
A note on the output: the per-model scores of 0.500000 were produced by the original code, where skf.split(X, y) was called once before the model loop; the one-shot generator was exhausted after the first base model, so models 2-5 were never trained, their test features stayed at zero, and a constant score vector yields an AUC of exactly 0.5. (The original write-up was brief here because the source code left much of this unexplained.)
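For reference, scikit-learn 0.22+ packages this whole procedure as sklearn.ensemble.StackingClassifier. A rough equivalent of the pipeline above (the estimator subset and default settings are our assumptions):
from sklearn.ensemble import StackingClassifier
# Sketch (assumption, not from the original): 5-fold stacking via scikit-learn's API
stack = StackingClassifier(
    estimators = [('lr', LogisticRegression(solver = 'lbfgs')),
                  ('rf', RandomForestClassifier(n_estimators = 5)),
                  ('gb', GradientBoostingClassifier(n_estimators = 5))],
    final_estimator = LogisticRegression(solver = 'lbfgs'),
    cv = 5)  # out-of-fold features are generated internally with 5-fold CV
stack.fit(X, y)
y_sub = stack.predict_proba(X_predict)[:, 1]
print("Val auc Score of StackingClassifier: %f" % roc_auc_score(y_predict, y_sub))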
5. References
1. https://blog.youkuaiyun.com/sunnyxidian/article/details/89279875