# Import the required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Create a binary-classification dataset
from sklearn import datasets
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

# Synthetic two-cluster dataset; `target` holds the cluster label of each row.
data, target = make_blobs(
    n_samples=10000,
    centers=2,
    cluster_std=1.0,
    random_state=1,
)

# Hold out 20% of the samples as the final test set.
X_train1, X_test, y_train1, y_test = train_test_split(
    data, target, test_size=0.2, random_state=1
)

# Split the remaining samples 70/30 into a training set (for the first-layer
# models) and a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train1, y_train1, test_size=0.3, random_state=1
)

# Report the shape of every split, in the order train / test / validation.
for _label, _split in (
    ("The shape of training X:", X_train),
    ("The shape of training y:", y_train),
    ("The shape of test X:", X_test),
    ("The shape of test y:", y_test),
    ("The shape of validation X:", X_val),
    ("The shape of validation y:", y_val),
):
    print(_label, _split.shape)
# Set up the classifiers
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# First-layer (base) classifiers.
# SVC needs probability=True so that predict_proba is available.
# random_state is fixed on the stochastic estimators so that repeated runs
# give the same results, matching the seeded data generation and splits.
clfs = [
    SVC(probability=True, random_state=1),
    RandomForestClassifier(
        n_estimators=5, n_jobs=-1, criterion="gini", random_state=1
    ),
    KNeighborsClassifier(),
]

# Second-layer (meta) classifier, trained on the first-layer outputs.
lr = LogisticRegression()
# 输出第一层的验证集结果与测试集结果
# Initialize an array of shape (X_val.shape[0], len(clfs)); it is filled below with each model's predictions
# One column per first-layer classifier; each column is filled below with that
# classifier's predicted probability of class 1.
val_features = np.zeros((X_val.shape[0], len(clfs)))  # first-layer outputs on the validation set
test_features = np.zeros((X_test.shape[0], len(clfs)))  # first-layer outputs on the test set

# Note: the stored features are class-1 probabilities, not hard 0/1 labels.
for i, clf in enumerate(clfs):
    clf.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of belonging to class 1.
    val_feature = clf.predict_proba(X_val)[:, 1]
    # Print the values already computed instead of calling predict_proba again.
    print(val_feature)
    test_feature = clf.predict_proba(X_test)[:, 1]
    val_features[:, i] = val_feature
    test_features[:, i] = test_feature
# Also tried: the predicted class labels can be used directly instead of the probabilities
# for i, clf in enumerate(clfs):
# clf.fit(X_train, y_train)
# val_feature = clf.predict(X_val)
# print(clf.predict_proba(X_val))
# test_feature = clf.predict(X_test)
# val_features[:, i] = val_feature
# test_features[:, i] = test_feature
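# Use the predictions obtained above as the input to the second layer, train again to get the final predictions, and check the model with 5-fold cross-validation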
# 将第一层的验证集的结果输入第二层训练第二层分类器
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

lr.fit(val_features, y_val)  # train the meta-classifier on the validation-set first-layer outputs
p = lr.predict(test_features)  # predict test labels from the test-set first-layer outputs
# lr.coef_  # coefficients of the logistic-regression meta-classifier
# p[p != 1] = 0

# Print the scores explicitly: a bare expression is silently discarded when
# this file is run as a script rather than in an interactive cell.
print("Stacking accuracy on the test set:", accuracy_score(y_test, p))

# 5-fold cross-validation of the meta-classifier on the test-set features.
# The default scoring for a classifier is accuracy; one score per fold.
print(
    "5-fold cross-validation accuracy on the test features:",
    cross_val_score(lr, test_features, y_test, cv=5),
)