In Python, the scikit-learn library can be used to implement several types of cross-validation.
1. K-Fold Cross Validation
K-fold cross validation splits the dataset into K non-overlapping subsets. In each of K iterations, one subset serves as the test set and the remaining K - 1 subsets serve as the training set; the K evaluation results are then averaged.
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
import numpy as np

# Generate a sample dataset
X, y = make_classification(n_samples=100, n_features=10, n_informative=5, n_redundant=0, random_state=42)

# Create the K-fold cross-validation object, with K set to 5
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create the logistic regression model
model = LogisticRegression()

# Store the accuracy of each fold
accuracies = []

# Run K-fold cross-validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train the model on this fold's training set
    model.fit(X_train, y_train)
    # Predict on this fold's test set
    y_pred = model.predict(X_test)
    # Compute and record the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

# Print each fold's accuracy and the mean accuracy
print("Accuracy of each fold:", accuracies)
print("Mean accuracy:", np.mean(accuracies))
2. Stratified K-Fold Cross Validation
When the class distribution in the dataset is imbalanced, stratified K-fold cross validation ensures that each subset preserves the same class proportions as the original dataset.
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
import numpy as np

# Generate an imbalanced sample dataset (roughly 20% class 0, 80% class 1)
X, y = make_classification(n_samples=100, n_features=10, n_informative=5, n_redundant=0, weights=[0.2, 0.8], random_state=42)

# Create the stratified K-fold cross-validation object, with K set to 5
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create the logistic regression model
model = LogisticRegression()

# Store the accuracy of each fold
accuracies = []

# Run stratified K-fold cross-validation; split() needs y to stratify
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train the model on this fold's training set
    model.fit(X_train, y_train)
    # Predict on this fold's test set
    y_pred = model.predict(X_test)
    # Compute and record the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

# Print each fold's accuracy and the mean accuracy
print("Accuracy of each fold:", accuracies)
print("Mean accuracy:", np.mean(accuracies))
3. Leave-One-Out Cross Validation (LOOCV)
Leave-one-out cross validation is a special case of K-fold cross validation in which K equals the number of samples: each iteration holds out a single sample as the test set and trains on all remaining samples.
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
import numpy as np

# Generate a small sample dataset (LOOCV fits the model once per sample)
X, y = make_classification(n_samples=10, n_features=10, n_informative=5, n_redundant=0, random_state=42)

# Create the leave-one-out cross-validation object
loo = LeaveOneOut()

# Create the logistic regression model
model = LogisticRegression()

# Store the accuracy of each iteration
accuracies = []

# Run leave-one-out cross-validation
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train the model on all samples except the held-out one
    model.fit(X_train, y_train)
    # Predict the single held-out sample
    y_pred = model.predict(X_test)
    # Accuracy on a one-sample test set is either 0.0 or 1.0
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

# Print each iteration's accuracy and the mean accuracy
print("Accuracy of each iteration:", accuracies)
print("Mean accuracy:", np.mean(accuracies))