请你帮我检查以下R studio脚本,并做修改,确保可以运行,我的数据文件在桌面,名称为“all.xlsx”,我数据文件中的列名分别为paraffin pathology,Radiologist and AI,Radiologist,AI ,并且是竖列。# ======================
# ROC分析脚本 - 修复版
# 数据名称:all (小写,已导入RStudio环境)
# ======================
# 0. 初始化环境
cat("步骤0: 初始化环境...\n")
options(warn = 1) # 显示警告但不转为错误
# 1. 安装和加载必要的包
cat("步骤1: 检查并安装必要的包...\n")
required_packages <- c("pROC", "ggplot2", "caret", "dplyr")
new_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])]
if (length(new_packages) > 0) {
cat("正在安装缺失的包:", paste(new_packages, collapse = ", "), "\n")
install.packages(new_packages, dependencies = TRUE)
}
# 加载包
suppressPackageStartupMessages({
library(pROC)
library(ggplot2)
library(caret)
library(dplyr)
})
cat("包加载成功!\n\n")
# 2. 数据准备和验证 - 修复大小写问题
cat("步骤2: 数据准备和验证...\n")
# 检查数据是否存在(小写)
if (!exists("all")) {
# 尝试从桌面读取数据
cat("环境变量中未找到'all',尝试从桌面读取...\n")
desktop_path <- file.path(Sys.getenv("USERPROFILE"), "Desktop")
file_path <- file.path(desktop_path, "All.xlsx")
if (file.exists(file_path)) {
cat("找到文件:", file_path, "\n")
all <- readxl::read_excel(file_path)
cat("数据成功从Excel导入!\n")
} else {
stop("错误: 未找到'all'数据框,也未在桌面找到All.xlsx文件")
}
} else {
cat("找到'all'数据框\n")
}
# 创建数据副本
data <- as.data.frame(all)
cat("数据维度:", dim(data), "\n")
# 显示数据列名(重要诊断步骤)
cat("数据列名:", paste(names(data), collapse = ", "), "\n")
# 检查必要列是否存在 - 使用小写列名
required_columns <- c("paraffin_pathology", "rad_ai", "radiologist", "ai")
cat("需要列名:", paste(required_columns, collapse = ", "), "\n")
# 检查缺失列
missing_columns <- setdiff(required_columns, names(data))
if (length(missing_columns) > 0) {
cat("警告: 缺少必要列 -", paste(missing_columns, collapse = ", "), "\n")
# 尝试忽略大小写匹配
cat("尝试忽略大小写匹配列名...\n")
actual_columns <- names(data)
matched_columns <- sapply(required_columns, function(col) {
match <- grep(paste0("^", col, "$"), actual_columns, ignore.case = TRUE, value = TRUE)
if (length(match) > 0) match[1] else col
})
# 更新数据框列名
names(data)[match(matched_columns, names(data))] <- names(matched_columns)
cat("更新后的列名:", paste(names(data), collapse = ", "), "\n")
# 再次检查缺失列
missing_columns <- setdiff(required_columns, names(data))
if (length(missing_columns) > 0) {
stop("错误: 仍然缺少必要列 - ", paste(missing_columns, collapse = ", "))
} else {
cat("列名匹配成功!\n")
}
} else {
cat("所有必要列都存在!\n")
}
# 3. 数据预处理
cat("\n步骤3: 数据预处理...\n")
# 显示金标准分布
cat("金标准原始值:\n")
print(table(data$paraffin_pathology))
# 转换为因子(1=阴性, 2=阳性)
data$paraffin_pathology <- factor(
data$paraffin_pathology,
levels = c(1, 2),
labels = c("Negative", "Positive")
)
cat("\n金标准分布(转换后):\n")
print(table(data$paraffin_pathology))
# 检查预测值
check_range <- function(col_name) {
vals <- data[[col_name]]
if (!is.numeric(vals)) {
cat(sprintf("%-20s: 非数值型 - 尝试转换\n", col_name))
vals <- as.numeric(vals)
}
cat(sprintf("%-20s: 最小值=%.4f, 最大值=%.4f, 缺失值=%d\n",
col_name, min(vals, na.rm=TRUE), max(vals, na.rm=TRUE), sum(is.na(vals))))
return(vals)
}
cat("\n预测值范围检查:\n")
data$rad_ai <- check_range("rad_ai")
data$radiologist <- check_range("radiologist")
data$ai <- check_range("ai")
# 4. 定义评估函数
cat("\n步骤4: 定义评估函数...\n")
evaluate_model <- function(model_name, true_labels, predictions) {
# 移除缺失值
complete_idx <- complete.cases(true_labels, predictions)
true_labels_complete <- true_labels[complete_idx]
predictions_complete <- predictions[complete_idx]
if (length(unique(true_labels_complete)) < 2) {
stop("错误: 金标准只有单一类别")
}
cat("有效样本量:", sum(complete_idx), "\n")
# 计算ROC曲线
roc_obj <- roc(
response = true_labels_complete,
predictor = predictions_complete,
levels = c("Negative", "Positive"),
direction = "<",
ci = TRUE,
auc = TRUE
)
# 计算最佳阈值
best_threshold <- coords(
roc_obj,
x = "best",
best.method = "youden",
transpose = TRUE
)
# 二分类预测
pred_class <- ifelse(predictions_complete >= best_threshold["threshold"],
"Positive", "Negative")
pred_class <- factor(pred_class, levels = c("Negative", "Positive"))
# 计算混淆矩阵
cm <- confusionMatrix(
data = pred_class,
reference = true_labels_complete,
positive = "Positive"
)
# 计算95%置信区间
ci_auc <- ci.auc(roc_obj)
# 提取指标
metrics <- data.frame(
Model = model_name,
Sensitivity = round(cm$byClass["Sensitivity"], 4),
Specificity = round(cm$byClass["Specificity"], 4),
PPV = round(cm$byClass["Pos Pred Value"], 4),
NPV = round(cm$byClass["Neg Pred Value"], 4),
F1 = round(cm$byClass["F1"], 4),
AUC = round(auc(roc_obj), 4),
AUC_CI_Lower = round(ci_auc[1], 4),
AUC_CI_Upper = round(ci_auc[3], 4),
Threshold = round(best_threshold["threshold"], 4),
N = sum(complete_idx),
stringsAsFactors = FALSE
)
return(list(roc = roc_obj, metrics = metrics))
}
# 5. 模型评估
cat("\n步骤5: 评估模型...\n")
models <- list(
"Radiologist and AI" = data$rad_ai,
"Radiologist" = data$radiologist,
"AI" = data$ai
)
results <- list()
for (model_name in names(models)) {
cat("\n评估模型:", model_name, "\n")
result <- tryCatch({
evaluate_model(
model_name = model_name,
true_labels = data$paraffin_pathology,
predictions = models[[model_name]]
)
}, error = function(e) {
cat("!! 评估出错:", e$message, "\n")
return(NULL)
})
if (!is.null(result)) {
results[[model_name]] <- result
cat("评估成功!\n")
} else {
cat("评估失败\n")
}
}
# 6. 结果整理
cat("\n步骤6: 整理结果...\n")
if (length(results) == 0) {
stop("所有模型评估均失败,无法继续")
}
# 合并指标结果
metrics_df <- do.call(rbind, lapply(results, function(x) x$metrics))
# 准备ROC曲线数据
roc_data <- data.frame()
for (model_name in names(results)) {
roc_obj <- results[[model_name]]$roc
temp <- data.frame(
Model = model_name,
FPR = 1 - roc_obj$specificities,
TPR = roc_obj$sensitivities,
Thresholds = roc_obj$thresholds
)
roc_data <- rbind(roc_data, temp)
}
# 7. 绘制ROC曲线
cat("\n步骤7: 绘制ROC曲线...\n")
roc_plot <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Model)) +
geom_line(size = 1.2) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray") +
labs(
title = "ROC Curve Comparison",
x = "False Positive Rate (1 - Specificity)",
y = "True Positive Rate (Sensitivity)",
color = "Models"
) +
theme_minimal() +
theme(
legend.position = "bottom",
plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
) +
scale_color_manual(values = c("#E41A1C", "#377EB8", "#4DAF4A"))
# 8. 输出结果
cat("\n步骤8: 输出结果...\n")
cat("\n===== 模型性能指标 =====\n")
print(metrics_df)
# 9. 保存结果
cat("\n步骤9: 保存结果...\n")
# 保存指标结果
write.csv(metrics_df, "Model_Performance_Metrics.csv", row.names = FALSE)
cat("指标结果已保存到: Model_Performance_Metrics.csv\n")
# 保存ROC曲线
ggsave("ROC_Curve.png", plot = roc_plot, width = 8, height = 6, dpi = 300)
cat("ROC曲线已保存到: ROC_Curve.png\n")
# 10. 显示结果
cat("\n步骤10: 显示结果...\n")
# 显示ROC曲线
print(roc_plot)
# 完成
cat("\n===== 分析成功完成! =====\n")
最新发布