作ROC曲线时遇上的问题

使用ROC曲线评估分类模型是非常通用的手段,但是,使用它的时候要注意两点:
1、分类的类型。
必须为数值型。
2、只针对二分类问题。
ROC曲线是根据一系列不同的二分类方式(分界值或决定阈),以真阳性率(灵敏度)为纵坐标,假阳性率(1-特异度)为横坐标绘制的曲线。传统的诊断试验评价方法有一个共同的特点,必须将试验结果分为两类,再进行统计分析。
下面有个例子:

import numpy as np
from sklearn.metrics import roc_auc_score
y_scores=np.array([ 0.63, 0.53, 0.36, 0.02, 0.70 ,1 , 0.48, 0.46, 0.57])
y_true=np.array(['0', '1', '0', '0', '1', '1', '1', '1', '1'])
roc_auc_score(y_true, y_scores)

当运行此代码时,会出现ValueError: Data is not binary and pos_label is not specified for roc_curve的错误。y_true有问题。我们再看源码定义的y_true:

classes = np.unique(y_true)
if (pos_label is None and not (np.all(classes == [0, 1]) or
 np.all(classes == [-1, 1]) or
 np.all(classes == [0]) or
 np.all(classes == [-1]) or
 np.all(classes == [1]))):
    raise ValueError("Data is not binary and pos_label is not specified")

可以看出他是数值型的变量,而不是字符串的变量,所以,将y_true改为:

y_true=np.array([0, 1, 0, 0, 1, 1, 1, 1, 1])

就行了!
同时,也许你还会遇上这样的错误:
ValueError: multiclass format is not supported
这是因为,你的类别数多于了两个,前面我们也说过ROC只能针对二分类问题。所以,对于多分类问题,我们不用ROC曲线去评估。如果,不信,可以将y_true变为:

y_true=np.array([0, 1, 0, 0, 1, 1, 1, 1, 2])

看看是不是会出现:ValueError: multiclass format is not supported

请你帮我检查以下R studio脚本,并做修改,确保可以运行,我的数据文件在桌面,名称为“all.xlsx”,我数据文件中的列名分别为paraffin pathology,Radiologist and AI,Radiologist,AI ,并且是竖列。# ====================== # ROC分析脚本 - 修复版 # 数据名称:all (小写,已导入RStudio环境) # ====================== # 0. 初始化环境 cat("步骤0: 初始化环境...\n") options(warn = 1) # 显示警告但不转为错误 # 1. 安装和加载必要的包 cat("步骤1: 检查并安装必要的包...\n") required_packages <- c("pROC", "ggplot2", "caret", "dplyr") new_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])] if (length(new_packages) > 0) { cat("正在安装缺失的包:", paste(new_packages, collapse = ", "), "\n") install.packages(new_packages, dependencies = TRUE) } # 加载包 suppressPackageStartupMessages({ library(pROC) library(ggplot2) library(caret) library(dplyr) }) cat("包加载成功!\n\n") # 2. 数据准备和验证 - 修复大小写问题 cat("步骤2: 数据准备和验证...\n") # 检查数据是否存在(小写) if (!exists("all")) { # 尝试从桌面读取数据 cat("环境变量中未找到'all',尝试从桌面读取...\n") desktop_path <- file.path(Sys.getenv("USERPROFILE"), "Desktop") file_path <- file.path(desktop_path, "All.xlsx") if (file.exists(file_path)) { cat("找到文件:", file_path, "\n") all <- readxl::read_excel(file_path) cat("数据成功从Excel导入!\n") } else { stop("错误: 未找到'all'数据框,也未在桌面找到All.xlsx文件") } } else { cat("找到'all'数据框\n") } # 创建数据副本 data <- as.data.frame(all) cat("数据维度:", dim(data), "\n") # 显示数据列名(重要诊断步骤) cat("数据列名:", paste(names(data), collapse = ", "), "\n") # 检查必要列是否存在 - 使用小写列名 required_columns <- c("paraffin_pathology", "rad_ai", "radiologist", "ai") cat("需要列名:", paste(required_columns, collapse = ", "), "\n") # 检查缺失列 missing_columns <- setdiff(required_columns, names(data)) if (length(missing_columns) > 0) { cat("警告: 缺少必要列 -", paste(missing_columns, collapse = ", "), "\n") # 尝试忽略大小写匹配 cat("尝试忽略大小写匹配列名...\n") actual_columns <- names(data) matched_columns <- sapply(required_columns, function(col) { match <- grep(paste0("^", col, "$"), actual_columns, ignore.case = TRUE, value = TRUE) if (length(match) > 0) match[1] else col }) # 更新数据框列名 names(data)[match(matched_columns, names(data))] <- names(matched_columns) cat("更新后的列名:", paste(names(data), collapse = ", "), "\n") # 再次检查缺失列 missing_columns <- setdiff(required_columns, names(data)) if (length(missing_columns) > 0) { stop("错误: 仍然缺少必要列 - ", paste(missing_columns, collapse = ", ")) } else { cat("列名匹配成功!\n") } } else { cat("所有必要列都存在!\n") } # 3. 数据预处理 cat("\n步骤3: 数据预处理...\n") # 显示金标准分布 cat("金标准原始值:\n") print(table(data$paraffin_pathology)) # 转换为因子(1=阴性, 2=阳性) data$paraffin_pathology <- factor( data$paraffin_pathology, levels = c(1, 2), labels = c("Negative", "Positive") ) cat("\n金标准分布(转换后):\n") print(table(data$paraffin_pathology)) # 检查预测值 check_range <- function(col_name) { vals <- data[[col_name]] if (!is.numeric(vals)) { cat(sprintf("%-20s: 非数值型 - 尝试转换\n", col_name)) vals <- as.numeric(vals) } cat(sprintf("%-20s: 最小值=%.4f, 最大值=%.4f, 缺失值=%d\n", col_name, min(vals, na.rm=TRUE), max(vals, na.rm=TRUE), sum(is.na(vals)))) return(vals) } cat("\n预测值范围检查:\n") data$rad_ai <- check_range("rad_ai") data$radiologist <- check_range("radiologist") data$ai <- check_range("ai") # 4. 定义评估函数 cat("\n步骤4: 定义评估函数...\n") evaluate_model <- function(model_name, true_labels, predictions) { # 移除缺失值 complete_idx <- complete.cases(true_labels, predictions) true_labels_complete <- true_labels[complete_idx] predictions_complete <- predictions[complete_idx] if (length(unique(true_labels_complete)) < 2) { stop("错误: 金标准只有单一类别") } cat("有效样本量:", sum(complete_idx), "\n") # 计算ROC曲线 roc_obj <- roc( response = true_labels_complete, predictor = predictions_complete, levels = c("Negative", "Positive"), direction = "<", ci = TRUE, auc = TRUE ) # 计算最佳阈值 best_threshold <- coords( roc_obj, x = "best", best.method = "youden", transpose = TRUE ) # 二分类预测 pred_class <- ifelse(predictions_complete >= best_threshold["threshold"], "Positive", "Negative") pred_class <- factor(pred_class, levels = c("Negative", "Positive")) # 计算混淆矩阵 cm <- confusionMatrix( data = pred_class, reference = true_labels_complete, positive = "Positive" ) # 计算95%置信区间 ci_auc <- ci.auc(roc_obj) # 提取指标 metrics <- data.frame( Model = model_name, Sensitivity = round(cm$byClass["Sensitivity"], 4), Specificity = round(cm$byClass["Specificity"], 4), PPV = round(cm$byClass["Pos Pred Value"], 4), NPV = round(cm$byClass["Neg Pred Value"], 4), F1 = round(cm$byClass["F1"], 4), AUC = round(auc(roc_obj), 4), AUC_CI_Lower = round(ci_auc[1], 4), AUC_CI_Upper = round(ci_auc[3], 4), Threshold = round(best_threshold["threshold"], 4), N = sum(complete_idx), stringsAsFactors = FALSE ) return(list(roc = roc_obj, metrics = metrics)) } # 5. 模型评估 cat("\n步骤5: 评估模型...\n") models <- list( "Radiologist and AI" = data$rad_ai, "Radiologist" = data$radiologist, "AI" = data$ai ) results <- list() for (model_name in names(models)) { cat("\n评估模型:", model_name, "\n") result <- tryCatch({ evaluate_model( model_name = model_name, true_labels = data$paraffin_pathology, predictions = models[[model_name]] ) }, error = function(e) { cat("!! 评估出错:", e$message, "\n") return(NULL) }) if (!is.null(result)) { results[[model_name]] <- result cat("评估成功!\n") } else { cat("评估失败\n") } } # 6. 结果整理 cat("\n步骤6: 整理结果...\n") if (length(results) == 0) { stop("所有模型评估均失败,无法继续") } # 合并指标结果 metrics_df <- do.call(rbind, lapply(results, function(x) x$metrics)) # 准备ROC曲线数据 roc_data <- data.frame() for (model_name in names(results)) { roc_obj <- results[[model_name]]$roc temp <- data.frame( Model = model_name, FPR = 1 - roc_obj$specificities, TPR = roc_obj$sensitivities, Thresholds = roc_obj$thresholds ) roc_data <- rbind(roc_data, temp) } # 7. 绘制ROC曲线 cat("\n步骤7: 绘制ROC曲线...\n") roc_plot <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Model)) + geom_line(size = 1.2) + geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray") + labs( title = "ROC Curve Comparison", x = "False Positive Rate (1 - Specificity)", y = "True Positive Rate (Sensitivity)", color = "Models" ) + theme_minimal() + theme( legend.position = "bottom", plot.title = element_text(hjust = 0.5, size = 16, face = "bold") ) + scale_color_manual(values = c("#E41A1C", "#377EB8", "#4DAF4A")) # 8. 输出结果 cat("\n步骤8: 输出结果...\n") cat("\n===== 模型性能指标 =====\n") print(metrics_df) # 9. 保存结果 cat("\n步骤9: 保存结果...\n") # 保存指标结果 write.csv(metrics_df, "Model_Performance_Metrics.csv", row.names = FALSE) cat("指标结果已保存到: Model_Performance_Metrics.csv\n") # 保存ROC曲线 ggsave("ROC_Curve.png", plot = roc_plot, width = 8, height = 6, dpi = 300) cat("ROC曲线已保存到: ROC_Curve.png\n") # 10. 显示结果 cat("\n步骤10: 显示结果...\n") # 显示ROC曲线 print(roc_plot) # 完成 cat("\n===== 分析成功完成! =====\n")
最新发布
07-08
评论 6
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值