# 加载所需库
library(dplyr)
library(glmnet)
library(pROC)
library(caret)
library(ggplot2)
library(DMwR) # 用于SMOTE算法
library(caTools) # 用于sample.split
# 设置随机种子确保结果可重现
set.seed(145)
# 1. 数据准备与分组(与原代码相同)
data <- GCM_base_information_plot_S108
sample <- sample.split(data$PRE_response, SplitRatio = 0.8)
train <- subset(data, sample == TRUE)
test <- subset(data, sample == FALSE)
# 显示分组基本信息
cat("数据分组信息:\n")
cat("训练集样本量:", nrow(train), "\n")
cat("测试集样本量:", nrow(test), "\n")
cat("训练集响应变量分布:\n")
print(table(train$PRE_response))
cat("测试集响应变量分布:\n")
print(table(test$PRE_response))
# 2. 代谢物数据处理(与原代码相同)
train_M <- semi_join(GCM_base_M_sig, train, by = "ID")
train_M$PRE_response <- as.factor(train_M$PRE_response)
row.names(train_M) <- train_M$ID
train_M <- train_M[-1] # 移除ID列
test_M <- semi_join(GCM_base_M_sig, test, by = "ID")
test_M$PRE_response <- as.factor(test_M$PRE_response)
row.names(test_M) <- test_M$ID
test_M <- test_M[-1] # 移除ID列
# 提取特征矩阵
xtr <- as.matrix(train_M[, -ncol(train_M)])
xte <- as.matrix(test_M[, -ncol(test_M)])
cat("\n原始特征数量:", ncol(xtr), "\n")
# 3. 高相关特征处理(与原代码相同)
cor_matrix <- cor(xtr)
remove_highly_correlated <- function(cor_matrix, threshold = 0.8) {
features_to_remove <- character(0)
features_to_keep <- colnames(cor_matrix)
high_corr <- which(abs(cor_matrix) > threshold & upper.tri(cor_matrix), arr.ind = TRUE)
if (nrow(high_corr) > 0) {
feature_importance <- apply(xtr, 2, var)
for (i in 1:nrow(high_corr)) {
feat1 <- colnames(cor_matrix)[high_corr[i, 1]]
feat2 <- colnames(cor_matrix)[high_corr[i, 2]]
if (feature_importance[feat1] > feature_importance[feat2]) {
features_to_remove <- c(features_to_remove, feat2)
} else {
features_to_remove <- c(features_to_remove, feat1)
}
}
features_to_remove <- unique(features_to_remove)
features_to_keep <- setdiff(colnames(cor_matrix), features_to_remove)
}
return(list(remove = features_to_remove, keep = features_to_keep))
}
result <- remove_highly_correlated(cor_matrix, threshold = 0.8)
cat("特征筛选结果:\n")
cat("移除的高相关特征数量:", length(result$remove), "\n")
cat("保留的特征数量:", length(result$keep), "\n")
if (length(result$remove) > 0) {
cat("移除的特征:", paste(result$remove, collapse = ", "), "\n")
}
# 处理训练集和测试集
if (length(result$remove) > 0) {
xtr_processed <- xtr[, result$keep, drop = FALSE]
xte_processed <- xte[, result$keep, drop = FALSE]
} else {
xtr_processed <- xtr
xte_processed <- xte
}
# 4. 数据标准化(与原代码相同)
metabolite_cols <- colnames(xtr_processed)
train_processed <- data.frame(
xtr_processed,
PRE_response = train_M$PRE_response
)
test_processed <- data.frame(
xte_processed,
PRE_response = test_M$PRE_response
)
train_scaled <- scale(train_processed[, metabolite_cols])
train_processed[, metabolite_cols] <- train_scaled
test_scaled <- scale(test_processed[, metabolite_cols],
center = attr(train_scaled, "scaled:center"),
scale = attr(train_scaled, "scaled:scale"))
test_processed[, metabolite_cols] <- test_scaled
# 准备特征矩阵和响应变量
xtr_final <- as.matrix(train_processed[, metabolite_cols])
xte_final <- as.matrix(test_processed[, metabolite_cols])
y_train <- train_processed$PRE_response
y_test <- test_processed$PRE_response
# 显示最终数据信息
cat("\n最终数据维度:\n")
cat("训练集特征:", ncol(xtr_final), "训练集样本:", nrow(xtr_final), "\n")
cat("测试集特征:", ncol(xte_final), "测试集样本:", nrow(xte_final), "\n")
cat("训练集响应分布:\n")
print(table(y_train))
cat("测试集响应分布:\n")
print(table(y_test))
# 定义评估函数
evaluate_model <- function(predictions, actual, method_name, dataset_name) {
auc_val <- roc(actual, as.numeric(predictions))$auc
ci_val <- ci.auc(auc_val)
# 计算其他指标
pred_class <- ifelse(predictions > 0.5, "R", "NR")
pred_class <- factor(pred_class, levels = levels(actual))
cm <- confusionMatrix(pred_class, actual)
cat(paste0("\n", method_name, " - ", dataset_name, ":\n"))
cat("AUC:", round(auc_val, 3), "\n")
cat("AUC 95% CI:", round(ci_val[1], 3), "-", round(ci_val[3], 3), "\n")
cat("准确率:", round(cm$overall["Accuracy"], 3), "\n")
cat("灵敏度:", round(cm$byClass["Sensitivity"], 3), "\n")
cat("特异度:", round(cm$byClass["Specificity"], 3), "\n")
return(list(auc = auc_val, ci = ci_val, cm = cm))
}
# 修复代码中的错误
# 主要问题是字符串连接操作符错误
# 重新定义方法函数,修复字符串连接问题
method1_lasso <- function(xtr, y_train, xte, y_test) {
cat("\n", paste(rep("=", 50), collapse = ""), "\n")
cat("方法1: 直接LASSO回归\n")
cat(paste(rep("=", 50), collapse = ""), "\n")
set.seed(6)
fitCV <- cv.glmnet(xtr, y_train, family = "binomial",
type.measure = "class", nfolds = 10)
lambda.1se <- fitCV$lambda.1se
lambda.min <- fitCV$lambda.min
# lambda.1se评估
pred_train_1se <- predict(fitCV, newx = xtr, s = "lambda.1se", type = "response")
pred_test_1se <- predict(fitCV, newx = xte, s = "lambda.1se", type = "response")
train_results_1se <- evaluate_model(pred_train_1se, y_train, "LASSO (lambda.1se)", "训练集")
test_results_1se <- evaluate_model(pred_test_1se, y_test, "LASSO (lambda.1se)", "测试集")
# lambda.min评估
pred_train_min <- predict(fitCV, newx = xtr, s = "lambda.min", type = "response")
pred_test_min <- predict(fitCV, newx = xte, s = "lambda.min", type = "response")
train_results_min <- evaluate_model(pred_train_min, y_train, "LASSO (lambda.min)", "训练集")
test_results_min <- evaluate_model(pred_test_min, y_test, "LASSO (lambda.min)", "测试集")
# 获取系数
coef_min <- coef(fitCV, s = "lambda.min")
coef_1se <- coef(fitCV, s = "lambda.1se")
return(list(
fitCV = fitCV,
coef_min = coef_min,
coef_1se = coef_1se,
results = list(
lambda.1se = list(train = train_results_1se, test = test_results_1se),
lambda.min = list(train = train_results_min, test = test_results_min)
)
))
}
method2_top3_lr <- function(xtr, y_train, xte, y_test, lasso_coef) {
cat("\n", paste(rep("=", 50), collapse = ""), "\n")
cat("方法2: 前3特征逻辑回归\n")
cat(paste(rep("=", 50), collapse = ""), "\n")
# 从LASSO系数中提取前4个特征(按绝对值)
coef_df <- data.frame(
feature = lasso_coef@Dimnames[[1]][lasso_coef@i + 1],
coefficient = lasso_coef@x
)
coef_df <- coef_df[coef_df$feature != "(Intercept)", ]
if(nrow(coef_df) == 0) {
cat("没有选择到任何特征!使用所有特征。\n")
top3_features <- colnames(xtr)
if(length(top3_features) > 3) {
top3_features <- top3_features[1:3]
}
} else {
coef_df$abs_coef <- abs(coef_df$coefficient)
coef_df <- coef_df[order(-coef_df$abs_coef), ]
top3_features <- head(coef_df$feature, 3)
}
cat("选择的Top3特征:", paste(top3_features, collapse = ", "), "\n")
# 构建逻辑回归模型
train_df <- data.frame(xtr[, top3_features, drop = FALSE], PRE_response = y_train)
test_df <- data.frame(xte[, top3_features, drop = FALSE], PRE_response = y_test)
lr_model <- glm(PRE_response ~ ., data = train_df, family = binomial)
pred_train <- predict(lr_model, newdata = train_df, type = "response")
pred_test <- predict(lr_model, newdata = test_df, type = "response")
train_results <- evaluate_model(pred_train, y_train, "Top3逻辑回归", "训练集")
test_results <- evaluate_model(pred_test, y_test, "Top3逻辑回归", "测试集")
return(list(
features = top3_features,
model = lr_model,
results = list(train = train_results, test = test_results)
))
}
method3_smote_lasso <- function(xtr, y_train, xte, y_test) {
cat("\n", paste(rep("=", 50), collapse = ""), "\n")
cat("方法3: SMOTE平衡后LASSO\n")
cat(paste(rep("=", 50), collapse = ""), "\n")
# 创建SMOTE数据
smote_data <- data.frame(xtr, PRE_response = y_train)
# 计算SMOTE参数(少数类达到多数类数量)
table_response <- table(y_train)
majority_class <- names(which.max(table_response))
minority_class <- names(which.min(table_response))
perc_over <- (table_response[majority_class] / table_response[minority_class] - 1) * 100
set.seed(42)
smote_data_balanced <- SMOTE(PRE_response ~ ., data = smote_data,
perc.over = perc_over, perc.under = 100)
cat("SMOTE后数据分布:\n")
print(table(smote_data_balanced$PRE_response))
# SMOTE后LASSO
xtr_smote <- as.matrix(smote_data_balanced[, -ncol(smote_data_balanced)])
y_smote <- smote_data_balanced$PRE_response
set.seed(6)
fitCV_smote <- cv.glmnet(xtr_smote, y_smote, family = "binomial",
type.measure = "class", nfolds = 10)
# 使用lambda.min(SMOTE后通常更稳定)
pred_train_smote <- predict(fitCV_smote, newx = xtr_smote, s = "lambda.min", type = "response")
pred_test_smote <- predict(fitCV_smote, newx = xte, s = "lambda.min", type = "response")
train_results <- evaluate_model(pred_train_smote, y_smote, "SMOTE-LASSO", "训练集")
test_results <- evaluate_model(pred_test_smote, y_test, "SMOTE-LASSO", "测试集")
return(list(
fitCV = fitCV_smote,
smote_data = smote_data_balanced,
results = list(train = train_results, test = test_results)
))
}
method5_weighted_lasso <- function(xtr, y_train, xte, y_test) {
cat("\n", paste(rep("=", 50), collapse = ""), "\n")
cat("方法5: 加权LASSO\n")
cat(paste(rep("=", 50), collapse = ""), "\n")
# 计算类别权重
table_response <- table(y_train)
class_weights <- 1 / table_response
# 归一化权重
obs_weights <- ifelse(y_train == "NR", class_weights["NR"], class_weights["R"])
obs_weights <- obs_weights / mean(obs_weights) # 标准化权重
cat("类别权重:\n")
print(class_weights)
set.seed(6)
fitCV_weighted <- cv.glmnet(xtr, y_train, family = "binomial",
type.measure = "class", nfolds = 10,
weights = obs_weights)
# 使用lambda.min
pred_train_weighted <- predict(fitCV_weighted, newx = xtr, s = "lambda.min", type = "response")
pred_test_weighted <- predict(fitCV_weighted, newx = xte, s = "lambda.min", type = "response")
train_results <- evaluate_model(pred_train_weighted, y_train, "加权LASSO", "训练集")
test_results <- evaluate_model(pred_test_weighted, y_test, "加权LASSO", "测试集")
return(list(
fitCV = fitCV_weighted,
weights = obs_weights,
results = list(train = train_results, test = test_results)
))
}
# 重新执行所有方法
results <- list()
# 方法1: 直接LASSO回归
results$method1 <- method1_lasso(xtr_final, y_train, xte_final, y_test)
# 方法2: 基于lambda.min的前3特征逻辑回归
results$method2_min <- method2_top3_lr(xtr_final, y_train, xte_final, y_test, results$method1$coef_min)
# 方法2: 基于lambda.1se的前3特征逻辑回归
results$method2_1se <- method2_top3_lr(xtr_final, y_train, xte_final, y_test, results$method1$coef_1se)
# 方法3: SMOTE平衡后LASSO
results$method3 <- method3_smote_lasso(xtr_final, y_train, xte_final, y_test)
# 方法4: SMOTE后前3特征逻辑回归(基于SMOTE LASSO系数)
results$method4 <- method2_top3_lr(xtr_final, y_train, xte_final, y_test,
coef(results$method3$fitCV, s = "lambda.min"))
# 方法5: 加权LASSO
results$method5 <- method5_weighted_lasso(xtr_final, y_train, xte_final, y_test)
# 方法6: 加权后前3特征逻辑回归
results$method6 <- method2_top3_lr(xtr_final, y_train, xte_final, y_test,
coef(results$method5$fitCV, s = "lambda.min"))
# 汇总所有结果
cat("\n", paste(rep("=", 60), collapse = ""), "\n")
cat("所有方法测试集AUC汇总\n")
cat(paste(rep("=", 60), collapse = ""), "\n")
method_names <- c(
"方法1: LASSO (lambda.min)",
"方法1: LASSO (lambda.1se)",
"方法2: 前3特征LR (基于lambda.min)",
"方法2: 前3特征LR (基于lambda.1se)",
"方法3: SMOTE-LASSO",
"方法4: SMOTE-前3特征LR",
"方法5: 加权LASSO",
"方法6: 加权-前3特征LR"
)
test_aucs <- c(
results$method1$results$lambda.min$test$auc,
results$method1$results$lambda.1se$test$auc,
results$method2_min$results$test$auc,
results$method2_1se$results$test$auc,
results$method3$results$test$auc,
results$method4$results$test$auc,
results$method5$results$test$auc,
results$method6$results$test$auc
)
summary_df <- data.frame(
方法 = method_names,
测试集AUC = round(test_aucs, 3)
)
print(summary_df)
# 绘制ROC曲线比较
par(mfrow = c(2, 2))
# 方法1比较
roc_min <- roc(y_test, as.numeric(predict(results$method1$fitCV, newx = xte_final,
s = "lambda.min", type = "response")))
roc_1se <- roc(y_test, as.numeric(predict(results$method1$fitCV, newx = xte_final,
s = "lambda.1se", type = "response")))
plot(roc_min, main = "1: LASSO", col = "blue")
lines(roc_1se, col = "red")
legend("bottomright", legend = c("lambda.min", "lambda.1se"),
col = c("blue", "red"), lwd = 2)
# 平衡方法比较
roc_smote <- roc(y_test, as.numeric(predict(results$method3$fitCV, newx = xte_final,
s = "lambda.min", type = "response")))
roc_weighted <- roc(y_test, as.numeric(predict(results$method5$fitCV, newx = xte_final,
s = "lambda.min", type = "response")))
plot(roc_smote, main = "banlanced ways", col = "green")
lines(roc_weighted, col = "orange")
legend("bottomright", legend = c("SMOTE-LASSO", "weight-LASSO"),
col = c("green", "orange"), lwd = 2)
# 前4特征方法比较
pred_method2 <- predict(results$method2_min$model, newdata = data.frame(xte_final), type = "response")
pred_method4 <- predict(results$method4$model, newdata = data.frame(xte_final), type = "response")
pred_method6 <- predict(results$method6$model, newdata = data.frame(xte_final), type = "response")
roc_method2 <- roc(y_test, as.numeric(pred_method2))
roc_method4 <- roc(y_test, as.numeric(pred_method4))
roc_method6 <- roc(y_test, as.numeric(pred_method6))
plot(roc_method2, main = "top 4 features", col = "purple")
lines(roc_method4, col = "brown")
lines(roc_method6, col = "pink")
legend("bottomright", legend = c("original", "SMOTE", "weight"),
col = c("purple", "brown", "pink"), lwd = 2)
# 最佳方法比较
roc_best1 <- roc_min
roc_best2 <- roc_smote
roc_best3 <- roc_weighted
plot(roc_best1, main = "comparasion", col = "blue")
lines(roc_best2, col = "green")
lines(roc_best3, col = "orange")
legend("bottomright", legend = c("LASSO-min", "SMOTE-LASSO", "weight-LASSO"),
col = c("blue", "green", "orange"), lwd = 2)
# 保存重要结果
final_results <- list(
数据信息 = list(
训练集样本量 = nrow(xtr_final),
测试集样本量 = nrow(xte_final),
特征数量 = ncol(xtr_final),
训练集分布 = table(y_train),
测试集分布 = table(y_test)
),
方法比较 = summary_df,
详细结果 = results
)
cat("\n分析完成!所有方法的结果已保存。\n")
# 显示每个方法选择的特征
cat("\n各方法选择的特征:\n")
cat("方法2 (基于lambda.min):", paste(results$method2_min$features, collapse = ", "), "\n")
cat("方法2 (基于lambda.1se):", paste(results$method2_1se$features, collapse = ", "), "\n")
cat("方法4 (SMOTE后):", paste(results$method4$features, collapse = ", "), "\n")
cat("方法6 (加权后):", paste(results$method6$features, collapse = ", "), "\n")
计算各个方法的power以及模型中各个代谢物的power