How to represent ROC curve when using Cross-Validation

本文探讨了在使用逻辑回归分类器进行K折交叉验证时,如何正确地绘制ROC曲线并计算AUC值。作者对比了两种方法:一种是直接平均每个折叠的FPR和TPR值;另一种则是收集所有折叠的预测概率后统一绘制ROC曲线。文章讨论了这两种方法的适用场景及可能存在的问题。


I am performing k-Fold Cross Validation using a Logistic Regression classifier on a dataset and computing the ROC curve and the AUC for each fold. My desired output is one ROC curve with a corresponding AUC value.

One method (taken from here) is to take the mean false positive rate (fpr) and the mean true positive rate (tpr) over all folds and plot the overall ROC curve using these mean tpr and fpr values, then compute the AUC from this mean ROC curve. However, this method does not work well when the dataset is small. Without going into a long explanation: my classification is a diagnosis that uses many samples for one diagnosis, which reduces the number of predictions per fold to around 3-5.

The alternative method is to save the predicted probabilities from every fold, then construct a single ROC curve after the k-Fold CV has finished and compute the AUC from that curve. However, this would mean that various models, trained on different datasets, are combined into one ROC curve. I don't know whether this is an issue.

What is the industry standard for model evaluation reporting when using ROC and AUC combined with k-Fold Cross validation?

-feel free to edit my question.

import os
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import (
    auc,
    classification_report,
    confusion_matrix,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array

# The training-history plot below uses Chinese labels. Put fonts that ship
# CJK glyphs ahead of matplotlib's defaults; the defaults stay in the list, so
# nothing changes on machines where neither font is installed.
plt.rcParams['font.sans-serif'] = [
    'Microsoft YaHei',
    'SimHei',
    *plt.rcParams['font.sans-serif'],
]
plt.rcParams['axes.unicode_minus'] = False

# Root directory of the image dataset.
base_dir = "D:/dataset"


def create_dataframe(dataset_path):
    """Collect PNG image paths and their binary labels into a DataFrame.

    Images are looked up with the glob patterns ``<dataset_path>/*/0/*.png``
    and ``<dataset_path>/*/1/*.png``. The label is taken from the name of the
    directory that holds the image (``0`` or ``1``), not from the file name.

    Args:
        dataset_path: Root directory of the dataset.

    Returns:
        A ``pandas.DataFrame`` with columns ``path`` and ``label``. All
        label-0 rows come first, followed by all label-1 rows. The frame is
        empty when no file matches.
    """
    rows = []
    for label in (0, 1):
        for img_path in glob(f"{dataset_path}/*/{label}/*.png"):
            # glob matching may be case-insensitive on some platforms; keep
            # the original exact-suffix filter so the same files are selected.
            if img_path.endswith('.png'):
                rows.append([img_path, label])
    return pd.DataFrame(rows, columns=['path', 'label'])


# Build the dataset table once.
df = create_dataframe(base_dir)
print("总样本数:", len(df))
print(df['label'].value_counts())
print(df.head())

# Report the class balance.
print(f"阴性样本数(0): {len(df[df['label'] == 0])}")
print(f"阳性样本数(1): {len(df[df['label'] == 1])}")

# Hold out 20% of the data as the test set, stratified by label.
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label']
)
# Carve a validation set out of the remaining training data. Early stopping
# (with restore_best_weights) picks the best epoch on whatever is passed as
# validation_data; if that were the test set, the test metrics reported
# afterwards would be optimistically biased.
train_df, val_df = train_test_split(
    train_df, test_size=0.1, random_state=42, stratify=train_df['label']
)


class CustomDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that loads image batches from file paths.

    Each batch is a pair ``(images, labels)`` where ``images`` is an array of
    images resized to ``img_size`` and scaled by ``1 / 255``, and ``labels``
    holds the matching entries of the ``label`` column.
    """

    def __init__(self, df, batch_size=32, img_size=(50, 50), shuffle=True,
                 augment=False, **kwargs):
        """Store the configuration and prepare the first epoch's ordering.

        Args:
            df: DataFrame with ``path`` and ``label`` columns.
            batch_size: Number of samples per batch.
            img_size: Target size passed to ``load_img``.
            shuffle: Whether to reshuffle the samples at every epoch end.
            augment: Whether to apply random augmentation to each image.
            **kwargs: Forwarded to the ``Sequence`` base-class constructor.
        """
        # Newer Keras versions expect the base constructor to be called.
        super().__init__(**kwargs)
        self.df = df
        self.batch_size = batch_size
        self.img_size = img_size
        self.shuffle = shuffle
        self.augment = augment
        # The augmenter is only built when augmentation was requested.
        self.augmenter = ImageDataGenerator(
            rotation_range=20,
            width_shift_range=0.2,
            height_shift_range=0.2,
            shear_range=0.2,
            zoom_range=0.2,
            horizontal_flip=True,
            fill_mode='nearest'
        ) if augment else None
        # Populates self.paths / self.labels (shuffled if requested).
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (last one may be short)."""
        return int(np.ceil(len(self.df) / self.batch_size))

    def __getitem__(self, index):
        """Load and return batch number ``index`` as ``(images, labels)``."""
        start = index * self.batch_size
        stop = (index + 1) * self.batch_size
        batch_paths = self.paths[start:stop]
        batch_labels = self.labels[start:stop]

        batch_images = []
        for path in batch_paths:
            img = load_img(path, target_size=self.img_size)
            img_array = img_to_array(img) / 255.0  # scale pixel values
            if self.augment and self.augmenter is not None:
                img_array = self.augmenter.random_transform(img_array)
            batch_images.append(img_array)

        return np.array(batch_images), np.array(batch_labels)

    def on_epoch_end(self):
        """Reset the sample order from ``self.df``; reshuffle if enabled."""
        self.paths = self.df['path'].values
        self.labels = self.df['label'].values
        if self.shuffle:
            order = np.random.permutation(len(self.paths))
            self.paths = self.paths[order]
            self.labels = self.labels[order]


# Image size and batch size.
img_width, img_height = 50, 50
batch_size = 32

# Training data: shuffled and augmented.
train_generator = CustomDataGenerator(
    train_df,
    batch_size=batch_size,
    img_size=(img_width, img_height),
    augment=True
)

# Validation data: fixed order, no augmentation.
val_generator = CustomDataGenerator(
    val_df,
    batch_size=batch_size,
    img_size=(img_width, img_height),
    shuffle=False
)

# Test data: fixed order so predictions line up with test_df rows.
test_generator = CustomDataGenerator(
    test_df,
    batch_size=batch_size,
    img_size=(img_width, img_height),
    shuffle=False
)

# Build the CNN: three conv/pool stages followed by a dense classifier head
# with a single sigmoid output.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(img_width, img_height, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model. The metric order here fixes the index of each value in
# the list returned by model.evaluate() below (index 0 is the loss).
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy',
                       tf.keras.metrics.Precision(name='precision'),
                       tf.keras.metrics.Recall(name='recall'),
                       tf.keras.metrics.AUC(name='auc')])

# Stop when the validation loss has not improved for 5 epochs and roll back
# to the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model, monitoring the validation split (not the test set).
history = model.fit(
    train_generator,
    epochs=30,
    validation_data=val_generator,
    callbacks=[early_stop]
)

# Evaluate on the held-out test set.
test_results = model.evaluate(test_generator)
print(
    f"测试集准确率: {test_results[1]:.4f}, 精确率: {test_results[2]:.4f}, 召回率: {test_results[3]:.4f}, AUC: {test_results[4]:.4f}")

# Save the trained model.
model.save('breast_cancer_cnn.h5')

# Plot the training history: accuracy on the left, loss on the right.
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='训练准确率')
plt.plot(history.history['val_accuracy'], label='验证准确率')
plt.title('模型准确率')
plt.ylabel('准确率')
plt.xlabel('轮次')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='训练损失')
plt.plot(history.history['val_loss'], label='验证损失')
plt.title('模型损失')
plt.ylabel('损失')
plt.xlabel('轮次')
plt.legend()
plt.savefig('training_history.png')
plt.show()

# True test labels. The test generator does not shuffle, so its batches
# follow test_df row order; reading the column directly avoids loading every
# test image a second time just to recover the labels.
test_labels = test_df['label'].to_numpy()

# Predicted probabilities, flattened from (N, 1) to (N,) so that the metric
# functions below receive 1-D scores.
pred_probs = model.predict(test_generator).ravel()
# Convert probabilities to class labels with a 0.5 threshold.
pred_labels = (pred_probs > 0.5).astype(int)

# Confusion matrix; labels are pinned so the matrix is always 2x2 and lines
# up with the two tick labels used in the heatmap.
cm = confusion_matrix(test_labels, pred_labels, labels=[0, 1])

# Plot and save the confusion matrix.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['IDC(-)', 'IDC(+)'],
            yticklabels=['IDC(-)', 'IDC(+)'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('confusion_matrix.png')
plt.show()

# === ROC curve ===
# Compute the ROC curve points and the area under the curve.
fpr, tpr, thresholds = roc_curve(test_labels, pred_probs)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve together with the chance-level diagonal.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # random guess
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)

# Save the ROC curve figure.
plt.savefig('roc_curve.png')
plt.show()

# === Classification report ===
report = classification_report(test_labels, pred_labels,
                               labels=[0, 1],
                               target_names=['IDC(-)', 'IDC(+)'])
print("分类报告:\n", report)

# Request that accompanied this script in the original post (kept verbatim):
# "说明ROC曲线的用途英文介绍" - i.e. explain, in English, what the ROC curve
# is used for.
09-12
【无人机】基于改进粒子群算法的无人机路径规划研究[和遗传算法、粒子群算法进行比较](Matlab代码实现)内容概要:本文围绕基于改进粒子群算法的无人机路径规划展开研究,重点探讨了在复杂环境中利用改进粒子群算法(PSO)实现无人机三维路径规划的方法,并将其与遗传算法(GA)、标准粒子群算法等传统优化算法进行对比分析。研究内容涵盖路径规划的多目标优化、避障策略、航路点约束以及算法收敛性和寻优能力的评估,所有实验均通过Matlab代码实现,提供了完整的仿真验证流程。文章还提到了多种智能优化算法在无人机路径规划中的应用比较,突出了改进PSO在收敛速度和全局寻优方面的优势。; 适合人群:具备一定Matlab编程基础和优化算法知识的研究生、科研人员及从事无人机路径规划、智能优化算法研究的相关技术人员。; 使用场景及目标:①用于无人机在复杂地形或动态环境下的三维路径规划仿真研究;②比较不同智能优化算法(如PSO、GA、蚁群算法、RRT等)在路径规划中的性能差异;③为多目标优化问题提供算法选型和改进思路。; 阅读建议:建议读者结合文中提供的Matlab代码进行实践操作,重点关注算法的参数设置、适应度函数设计及路径约束处理方式,同时可参考文中提到的多种算法对比思路,拓展到其他智能优化算法的研究与改进中。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值