Cross-Validation for Machine Learning in Python

This post uses the Python machine learning library sklearn to preprocess a fruit dataset, engineer its features, and train models on it. It covers data loading, feature normalization, cross-validation, and model selection and evaluation, and demonstrates the k-nearest neighbors, support vector machine, and decision tree algorithms.


Dataset link: https://pan.baidu.com/s/1yW6gye5rJQ-Rn_iKlKUm1g  password: ejki

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

1. Data Loading

# Load the dataset
fruits_df = pd.read_table('fruit_data_with_colors.txt')
print(fruits_df.head())
print('Number of samples:', len(fruits_df))
# Build a dictionary mapping target labels to fruit names
fruit_name_dict = dict(zip(fruits_df['fruit_label'], fruits_df['fruit_name']))
print(fruit_name_dict)
# Split the dataset
X = fruits_df[['mass', 'width', 'height', 'color_score']]
y = fruits_df['fruit_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4, random_state=0)
print('Total samples: {}, training samples: {}, test samples: {}'.format(len(X), len(X_train), len(X_test)))
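With only a few dozen samples, a purely random split can skew the class proportions between the training and test sets. As an optional variant (not part of the original workflow), `train_test_split`'s standard `stratify` argument keeps the proportions consistent:

# Optional variant: preserve class proportions in both splits
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y, test_size=1/4, random_state=0, stratify=y)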

2. Feature Normalization

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
for i in range(4):
    print('Before scaling, training feature {}: max={:.3f}, min={:.3f}'.format(i + 1, X_train.iloc[:, i].max(), X_train.iloc[:, i].min()))
    print('After scaling, training feature {}: max={:.3f}, min={:.3f}'.format(i + 1, X_train_scaled[:, i].max(), X_train_scaled[:, i].min()))
print()

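For reference, MinMaxScaler's transform is just a per-feature rescaling to [0, 1] using training-set statistics; a minimal sketch of the equivalent manual computation (an illustration, not part of the original code):

# Equivalent manual min-max scaling (per-feature, training-set statistics)
X_min = X_train.min(axis=0)
X_range = X_train.max(axis=0) - X_min
X_train_manual = (X_train - X_min) / X_range  # same values as X_train_scaled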
3. Cross-Validation

# Single hyperparameter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

k_range = [2, 4, 5, 10]
cv_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train_scaled, y_train, cv=3)
    cv_score = np.mean(scores)
    print('k={}, validation accuracy={:.3f}'.format(k, cv_score))
    cv_scores.append(cv_score)
best_k = k_range[np.argmax(cv_scores)]
best_knn = KNeighborsClassifier(n_neighbors=best_k)
best_knn.fit(X_train_scaled, y_train)
print('Test set accuracy:', best_knn.score(X_test_scaled, y_test))
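One caveat: `cross_val_score` above runs on data scaled with statistics from the whole training set, so each validation fold has already seen a little information from the others. A leak-free sketch using sklearn's `Pipeline`, which refits the scaler inside every fold (an addition, not part of the original post):

from sklearn.pipeline import Pipeline

pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('knn', KNeighborsClassifier(n_neighbors=best_k))])
pipe_scores = cross_val_score(pipe, X_train, y_train, cv=3)  # scaler refit per fold
print('Leak-free CV accuracy: {:.3f}'.format(np.mean(pipe_scores)))

# Use validation_curve to plot how a hyperparameter affects training and validation scores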
from sklearn.model_selection import validation_curve
from sklearn.svm import SVC
c_range = [1e-3, 1e-2, 0.1, 1, 10, 100, 1000, 10000]
train_scores, test_scores = validation_curve(SVC(kernel='linear'), X_train_scaled, y_train,
                                             param_name='C', param_range=c_range,
                                             cv=5, scoring='accuracy')
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.figure(figsize=(10, 8))
plt.title('Validation Curve with SVM')
plt.xlabel('C')
plt.ylabel('Score')
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(c_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(c_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(c_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(c_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.show()
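Instead of eyeballing the curve, the best C can also be read off the mean cross-validation scores directly; a small sketch:

best_C = c_range[np.argmax(test_scores_mean)]
print('Best C by mean CV accuracy:', best_C)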
# The validation curve above shows C=100 is the best parameter for the linear SVM
svm_model = SVC(kernel='linear', C=100)
svm_model.fit(X_train_scaled, y_train)
print('Test set accuracy:', svm_model.score(X_test_scaled, y_test))

# Multiple hyperparameters
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

parameters = {'max_depth': [3, 5, 7, 9], 'min_samples_leaf': [1, 2, 3, 4]}
clf = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, scoring='accuracy')
clf.fit(X_train, y_train)  # decision trees are scale-invariant, so unscaled features are fine
print('Best parameters:', clf.best_params_)
print('Best cross-validation score:', clf.best_score_)
# Retrieve the best model
best_model = clf.best_estimator_
print('Test set accuracy:', best_model.score(X_test, y_test))
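GridSearchCV also records the score of every parameter combination in `cv_results_`, which is convenient to inspect as a DataFrame; for example:

results = pd.DataFrame(clf.cv_results_)
print(results[['param_max_depth', 'param_min_samples_leaf', 'mean_test_score']]
      .sort_values('mean_test_score', ascending=False).head())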

4. Model Evaluation Metrics

k = 1
# Convert to a binary problem: class 1 vs. all other classes
y_train_binary = y_train.copy()
y_test_binary = y_test.copy()
y_train_binary[y_train_binary != 1] = 0
y_test_binary[y_test_binary != 1] = 0
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train_scaled, y_train_binary)
y_pred = knn.predict(X_test_scaled)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Accuracy
print('Accuracy: {:.3f}'.format(accuracy_score(y_test_binary, y_pred)))
# Precision
print('Precision: {:.3f}'.format(precision_score(y_test_binary, y_pred)))
# Recall
print('Recall: {:.3f}'.format(recall_score(y_test_binary, y_pred)))
# F1 score
print('F1 score: {:.3f}'.format(f1_score(y_test_binary, y_pred)))
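The four metrics above (plus per-class support) can also be produced in a single call with sklearn's `classification_report`:

from sklearn.metrics import classification_report
print(classification_report(y_test_binary, y_pred))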

# PR curve
from sklearn.metrics import precision_recall_curve, average_precision_score

# precision, recall, _ = precision_recall_curve(y_test_binary, y_pred)
print('AP score: {:.3f}'.format(average_precision_score(y_test_binary, y_pred)))
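The commented-out `precision_recall_curve` call is more informative when fed continuous scores rather than hard 0/1 predictions; a sketch using the KNN class probabilities (an addition to the original code):

y_score = knn.predict_proba(X_test_scaled)[:, 1]  # probability of class 1
precision, recall, _ = precision_recall_curve(y_test_binary, y_score)
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('PR Curve')
plt.show()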

# ROC curve
from sklearn.metrics import roc_auc_score, roc_curve
# fpr, tpr, _ = roc_curve(y_test_binary, y_pred)
print('AUC score: {:.3f}'.format(roc_auc_score(y_test_binary, y_pred)))
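Likewise, `roc_curve` with the same probability scores traces the full ROC trade-off; a minimal sketch:

y_score = knn.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, _ = roc_curve(y_test_binary, y_score)
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle='--')  # chance level
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()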

# Confusion matrix
from sklearn.metrics import confusion_matrix

# best_model is the decision tree from GridSearchCV, which was trained on
# unscaled features, so it must predict on the unscaled test set
y_pred = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
plt.figure()
plt.grid(False)
plt.imshow(cm, cmap='jet')
plt.colorbar()
### Python 中实现机器学习模型的交叉验证 #### 使用 K 折交叉验证评估模型性能 为了更好地理解如何在 Python 中实现机器学习模型的交叉验证,下面展示了一个具体的例子。此示例展示了如何利用 `sklearn` 库中的 `KFold` 和 `cross_val_score` 函数来执行 k 折交叉验证。 ```python from sklearn.model_selection import KFold, cross_val_score from sklearn.datasets import load_iris from sklearn.tree import DecisionTreeClassifier # 加载数据集 data = load_iris() X, y = data.data, data.target # 创建决策树分类器实例 model = DecisionTreeClassifier() # 定义折叠数 k_folds = KFold(n_splits=5) # 执行交叉验证并计算平均得分 scores = cross_val_score(model, X, y, cv=k_folds) average_accuracy = scores.mean() print(f'Cross-validation scores: {scores}') print(f'Average accuracy: {average_accuracy:.2f}')[^1] ``` 这段代码首先加载了鸢尾花数据集作为样本输入,并创建了一个简单的决策树分类器。接着定义了五折交叉验证策略 (`n_splits=5`) 并应用到该数据集中。最后输出每次迭代后的准确性分数以及它们的均值。 #### 利用 ShuffleSplit 进行随机划分 除了标准的 k 折分割外,还可以采用其他类型的拆分方式比如 `ShuffleSplit` 来增加灵活性: ```python from sklearn.model_selection import ShuffleSplit shuffle_split = ShuffleSplit(test_size=.2, n_splits=5, random_state=0) scores_shuffle = cross_val_score(model, X, y, cv=shuffle_split) avg_acc_shuffle = scores_shuffle.mean() print(f'Shuffled split validation scores: {scores_shuffle}') print(f'Average shuffled accuracy: {avg_acc_shuffle:.2f}')[^2] ``` 在这个版本里,选择了不同的测试比例 (test_size),并且指定了重复次数 (n_splits) 以便于比较不同配置下的效果差异。 通过上述两种方法之一,能够有效地评估所选算法对于给定问题的有效性和稳定性,从而帮助挑选最优解方案[^3]。