> # 设置工作目录,即代码和数据的存储位置
> setwd("/Users/jiaqingjun/Desktop/31机器学习算法构建诊断模型")
> min.selected.var = 6 # 设置变量选择的最小数目
>
> # 加载所需的库
> library(openxlsx) # 用于Excel文件操作
> library(seqinr) # 用于序列分析
> library(plyr) # 数据处理工具
Attaching package: ‘plyr’
The following object is masked from ‘package:seqinr’:
count
> library(randomForestSRC) # 随机森林
randomForestSRC 3.4.0
Type rfsrc.news() to see new features, changes, and bug fixes.
> library(glmnet) # 广义线性模型
Loading required package: Matrix
Loaded glmnet 4.1-8
> library(plsRglm) # 偏最小二乘回归
> library(gbm) # 梯度提升机
Loaded gbm 2.2.2
This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
> library(caret) # 机器学习算法训练和测试
Loading required package: ggplot2
Loading required package: lattice
Attaching package: ‘caret’
The following object is masked from ‘package:seqinr’:
dotPlot
> library(mboost) # 增强模型
Loading required package: parallel
Loading required package: stabs
Attaching package: ‘stabs’
The following object is masked from ‘package:randomForestSRC’:
subsample
Attaching package: ‘mboost’
The following object is masked from ‘package:ggplot2’:
%+%
The following object is masked from ‘package:glmnet’:
Cindex
> library(e1071) # SVM等算法
Attaching package: ‘e1071’
The following objects are masked from ‘package:randomForestSRC’:
impute, tune
> library(BART) # 贝叶斯加法回归树
Loading required package: nlme
Attaching package: ‘nlme’
The following object is masked from ‘package:seqinr’:
gls
Loading required package: survival
Attaching package: ‘survival’
The following object is masked from ‘package:caret’:
cluster
> library(MASS) # 包含广泛应用的统计方法
> library(snowfall) # 并行计算支持
Loading required package: snow
Attaching package: ‘snow’
The following objects are masked from ‘package:parallel’:
closeNode, clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap, clusterSplit, makeCluster,
parApply, parCapply, parLapply, parRapply, parSapply, recvData, recvOneData, sendData, splitIndices, stopCluster
> library(xgboost) # 极端梯度提升模型
> library(ComplexHeatmap) # 复杂热图绘制
Loading required package: grid
========================================
ComplexHeatmap version 2.20.0
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
If you use it in published research, please cite either one:
- Gu, Z. Complex Heatmap Visualization. iMeta 2022.
- Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
genomic data. Bioinformatics 2016.
The new InteractiveComplexHeatmap package can directly export static
complex heatmaps into an interactive Shiny app with zero effort. Have a try!
This message can be suppressed by:
suppressPackageStartupMessages(library(ComplexHeatmap))
========================================
> library(RColorBrewer) # 颜色选择
> library(pROC) # ROC曲线工具
Type 'citation("pROC")' for a citation.
Attaching package: ‘pROC’
The following objects are masked from ‘package:stats’:
cov, smooth, var
>
>
> # 定义一个函数RunML,用于运行机器学习算法
> RunML <- function(method, Train_set, Train_label, mode = "Model", classVar){
+ # 清理和准备算法名称和参数
+ method = gsub(" ", "", method) # 去除方法名称中的空格,例如 'Enet [alpha=0.4]' 变为 'Enet[alpha=0.4]'
+ method_name = gsub("(\\w+)\\[(.+)\\]", "\\1", method) # 从方法名称中提取算法名称,如从 'Enet[alpha=0.4]' 得到 'Enet'
+ method_param = gsub("(\\w+)\\[(.+)\\]", "\\2", method) # 从方法名称中提取参数,如从 'Enet[alpha=0.4]' 得到 'alpha=0.4'
+
+ # 根据提取的算法名称,准备相应的参数
+ method_param = switch(
+ EXPR = method_name,
+ "Enet" = list("alpha" = as.numeric(gsub("alpha=", "", method_param))),
+ "Stepglm" = list("direction" = method_param),
+ NULL # 如果没有匹配到任何名称,返回NULL
+ )
+
+ # 输出正在运行的算法和使用的变量数
+ message("Run ", method_name, " algorithm for ", mode, "; ",
+ method_param, ";",
+ " using ", ncol(Train_set), " Variables")
+
+ # 将传入的参数和提取的参数组合成一个新的参数列表
+ args = list("Train_set" = Train_set,
+ "Train_label" = Train_label,
+ "mode" = mode,
+ "classVar" = classVar)
+ args = c(args, method_param)
+
+ # 使用do.call动态调用相应的算法实现函数
+ obj <- do.call(what = paste0("Run", method_name),
+ args = args)
+
+ # 根据模式,输出不同的信息
+ if(mode == "Variable"){
+ message(length(obj), " Variables retained;\n")
+ }else{message("\n")}
+ return(obj)
+ }
>
> # 定义用于运行Elastic Net正则化线性模型的函数
> RunEnet <- function(Train_set, Train_label, mode, classVar, alpha){
+ # 使用交叉验证找到最优的正则化参数
+ cv.fit = cv.glmnet(x = Train_set,
+ y = Train_label[[classVar]],
+ family = "binomial", alpha = alpha, nfolds = 10)
+ # 建立最终模型
+ fit = glmnet(x = Train_set,
+ y = Train_label[[classVar]],
+ family = "binomial", alpha = alpha, lambda = cv.fit$lambda.min)
+ fit$subFeature = colnames(Train_set)
+ if (mode == "Model") return(fit)
+ if (mode == "Variable") return(ExtractVar(fit))
+ }
>
> # 定义用于运行Lasso正则化线性模型的函数
> RunLasso <- function(Train_set, Train_label, mode, classVar){
+ RunEnet(Train_set, Train_label, mode, classVar, alpha = 1)
+ }
>
> # 定义用于运行Ridge正则化线性模型的函数
> RunRidge <- function(Train_set, Train_label, mode, classVar){
+ RunEnet(Train_set, Train_label, mode, classVar, alpha = 0)
+ }
>
> # 定义用于运行逐步广义线性模型的函数
> RunStepglm <- function(Train_set, Train_label, mode, classVar, direction){
+ # 使用glm函数和step函数逐步选择模型
+ fit <- step(glm(formula = Train_label[[classVar]] ~ .,
+ family = "binomial",
+ data = as.data.frame(Train_set)),
+ direction = direction, trace = 0)
+ fit$subFeature = colnames(Train_set)
+ if (mode == "Model") return(fit)
+ if (mode == "Variable") return(ExtractVar(fit))
+ }
>
> # 定义用于运行支持向量机的函数
> RunSVM <- function(Train_set, Train_label, mode, classVar){
+ # 将数据框转换为因子类型,适合SVM模型输入
+ data <- as.data.frame(Train_set)
+ data[[classVar]] <- as.factor(Train_label[[classVar]])
+ # 建立SVM模型
+ fit = svm(formula = eval(parse(text = paste(classVar, "~."))),
+ data= data, probability = T)
+ fit$subFeature = colnames(Train_set)
+ if (mode == "Model") return(fit)
+ if (mode == "Variable") return(ExtractVar(fit))
+ }
>
> # 定义用于运行线性判别分析的函数
> RunLDA <- function(Train_set, Train_label, mode, classVar){
+ # 准备数据,将类变量转换为因子类型
+ data <- as.data.frame(Train_set)
+ data[[classVar]] <- as.factor(Train_label[[classVar]])
+ # 使用train函数建立LDA模型
+ fit = train(eval(parse(text = paste(classVar, "~."))),
+ data = data,
+ method="lda",
+ trControl = trainControl(method = "cv"))
+ fit$subFeature = colnames(Train_set)
+ if (mode == "Model") return(fit)
+ if (mode == "Variable") return(ExtractVar(fit))
+ }
>
> # 定义用于运行梯度提升机的函数
> RunglmBoost <- function(Train_set, Train_label, mode, classVar){
+ # 准备数据,将类变量和训练集绑定
+ data <- cbind(Train_set, Train_label[classVar])
+ data[[classVar]] <- as.factor(data[[classVar]])
+ # 建立初步的GLMBoost模型
+ fit <- glmboost(eval(parse(text = paste(classVar, "~."))),
+ data = data,
+ family = Binomial())
+ # 使用交叉验证调整模型停止迭代的次数
+ cvm <- cvrisk(fit, papply = lapply,
+ folds = cv(model.weights(fit), type = "kfold"))
+ fit <- glmboost(eval(parse(text = paste(classVar, "~."))),
+ data = data,
+ family = Binomial(),
+ control = boost_control(mstop = max(mstop(cvm), 40)))
+ fit$subFeature = colnames(Train_set)
+ if (mode == "Model") return(fit)
+ if (mode == "Variable") return(ExtractVar(fit))
+ }
>
> # 定义用于运行偏最小二乘回归和广义线性模型的函数
> RunplsRglm <- function(Train_set, Train_label, mode, classVar){
+ # 使用交叉验证评估模型参数
+ cv.plsRglm.res = cv.plsRglm(formula = Train_label[[classVar]] ~ .,
+ data = as.data.frame(Train_set),
+ nt=10, verbose = FALSE)
+ # 建立PLSRGLM模型
+ fit <- plsRglm(Train_label[[classVar]],
+ as.data.frame(Train_set),
+ modele = "pls-glm-logistic",
+ verbose = F, sparse = T)
+ fit$subFeature = colnames(Train_set)
+ if (mode == "Model") return(fit)
+ if (mode == "Variable") return(ExtractVar(fit))
+ }
>
> # 定义用于运行随机森林的函数
> RunRF <- function(Train_set, Train_label, mode, classVar){
+ # 设置随机森林参数,如树的最小节点大小
+ rf_nodesize = 5 # 可根据需要调整
+ # 准备数据,将类变量转换为因子
+ Train_label[[classVar]] <- as.factor(Train_label[[classVar]])
+ # 建立随机森林模型
+ fit <- rfsrc(formula = formula(paste0(classVar, "~.")),
+ data = cbind(Train_set, Train_label[classVar]),
+ ntree = 1000, nodesize = rf_nodesize,
+ importance = T,
+ proximity = T,
+ forest = T)
+ fit$subFeature = colnames(Train_set)
+ if (mode == "Model") return(fit)
+ if (mode == "Variable") return(ExtractVar(fit))
+ }
>
> # 定义用于运行梯度提升机的函数
> RunGBM <- function(Train_set, Train_label, mode, classVar){
+ # 建立初步的GBM模型
+ fit <- gbm(formula = Train_label[[classVar]] ~ .,
+ data = as.data.frame(Train_set),
+ distribution = 'bernoulli',
+ n.trees = 10000,
+ interaction.depth = 3,
+ n.minobsinnode = 10,
+ shrinkage = 0.001,
+ cv.folds = 10,n.cores = 6)
+ # 选择最优的迭代次数
+ best <- which.min(fit$cv.error)
+ fit <- gbm(formula = Train_label[[classVar]] ~ .,
+ data = as.data.frame(Train_set),
+ distribution = 'bernoulli',
+ n.trees = best,
+ interaction.depth = 3,
+ n.minobsinnode = 10,
+ shrinkage = 0.001, n.cores = 8)
+ fit$subFeature = colnames(Train_set)
+ if (mode == "Model") return(fit)
+ if (mode == "Variable") return(ExtractVar(fit))
+ }
>
> # 定义用于运行XGBoost的函数
> RunXGBoost <- function(Train_set, Train_label, mode, classVar){
+ # 创建交叉验证折叠
+ indexes = createFolds(Train_label[[classVar]], k = 5, list=T)
+ # 计算每折的最优模型参数
+ CV <- unlist(lapply(indexes, function(pt){
+ dtrain = xgb.DMatrix(data = Train_set[-pt, ],
+ label = Train_label[-pt, ])
+ dtest = xgb.DMatrix(data = Train_set[pt, ],
+ label = Train_label[pt, ])
+ watchlist <- list(train=dtrain, test=dtest)
+
+ bst <- xgb.train(data=dtrain,
+ max.depth=2, eta=1, nthread = 2, nrounds=10,
+ watchlist=watchlist,
+ objective = "binary:logistic", verbose = F)
+ which.min(bst$evaluation_log$test_logloss)
+ }))
+
+ # 使用最常用的轮数建立最终模型
+ nround <- as.numeric(names(which.max(table(CV))))
+ fit <- xgboost(data = Train_set,
+ label = Train_label[[classVar]],
+ max.depth = 2, eta = 1, nthread = 2, nrounds = nround,
+ objective = "binary:logistic", verbose = F)
+ fit$subFeature = colnames(Train_set)
+
+ if (mode == "Model") return(fit)
+ if (mode == "Variable") return(ExtractVar(fit))
+ }
>
> # 定义用于运行朴素贝叶斯分类器的函数
> RunNaiveBayes <- function(Train_set, Train_label, mode, classVar){
+ # 准备数据
+ data <- cbind(Train_set, Train_label[classVar])
+ data[[classVar]] <- as.factor(data[[classVar]])
+ # 建立朴素贝叶斯模型
+ fit <- naiveBayes(eval(parse(text = paste(classVar, "~."))),
+ data = data)
+ fit$subFeature = colnames(Train_set)
+ if (mode == "Model") return(fit)
+ if (mode == "Variable") return(ExtractVar(fit))
+ }
>
> # DRF模型的注释已被删除,因为此部分代码被注释掉了
> # RunDRF <- function(Train_set, Train_label, mode, classVar){
> # Train_label <- data.frame(
> # "0" = as.numeric(Train_label == 0),
> # "1" = as.numeric(Train_label == 1)
> # )
> # fit <- drf(X = Train_set,
> # Y = Train_label,
> # compute.variable.importance = F)
> # fit$subFeature = colnames(Train_set)
> #
> # summary(predict(fit, functional = "mean", as.matrix(Train_set))$mean)
> #
> # if (mode == "Model") return(fit)
> # if (mode == "Variable") return(ExtractVar(fit))
> # }
>
> # 定义一个函数用于在执行过程中抑制输出
> quiet <- function(..., messages=FALSE, cat=FALSE){
+ if(!cat){
+ sink(tempfile()) # 将输出重定向到临时文件
+ on.exit(sink()) # 确保在函数退出时恢复正常输出
+ }
+ # 根据参数决定是否抑制消息
+ out <- if(messages) eval(...) else suppressMessages(eval(...))
+ out
+ }
>
> # 定义一个函数用于标准化数据
> standarize.fun <- function(indata, centerFlag, scaleFlag) {
+ scale(indata, center=centerFlag, scale=scaleFlag)
+ }
>
> # 定义一个函数用于批量处理数据的标准化
> scaleData <- function(data, cohort = NULL, centerFlags = NULL, scaleFlags = NULL){
+ samplename = rownames(data) # 保存原始样本名称
+ # 如果没有指定队列,将所有数据视为一个队列
+ if (is.null(cohort)){
+ data <- list(data); names(data) = "training"
+ }else{
+ data <- split(as.data.frame(data), cohort) # 根据队列分割数据
+ }
+
+ # 如果没有提供中心化标志,默认不进行中心化
+ if (is.null(centerFlags)){
+ centerFlags = F; message("No centerFlags found, set as FALSE")
+ }
+ # 如果中心化标志是单一值,应用于所有数据
+ if (length(centerFlags)==1){
+ centerFlags = rep(centerFlags, length(data)); message("set centerFlags for all cohort as ", unique(centerFlags))
+ }
+ # 如果中心化标志没有命名,按顺序匹配
+ if (is.null(names(centerFlags))){
+ names(centerFlags) <- names(data); message("match centerFlags with cohort by order\n")
+ }
+
+ # 如果没有提供缩放标志,默认不进行缩放
+ if (is.null(scaleFlags)){
+ scaleFlags = F; message("No scaleFlags found, set as FALSE")
+ }
+ # 如果缩放标志是单一值,应用于所有数据
+ if (length(scaleFlags)==1){
+ scaleFlags = rep(scaleFlags, length(data)); message("set scaleFlags for all cohort as ", unique(scaleFlags))
+ }
+ # 如果缩放标志没有命名,按顺序匹配
+ if (is.null(names(scaleFlags))){
+ names(scaleFlags) <- names(data); message("match scaleFlags with cohort by order\n")
+ }
+
+ centerFlags <- centerFlags[names(data)]; scaleFlags <- scaleFlags[names(data)]
+ # 使用mapply函数对每个数据队列应用标准化函数
+ outdata <- mapply(standarize.fun, indata = data, centerFlag = centerFlags, scaleFlag = scaleFlags, SIMPLIFY = F)
+ # lapply(out.data, function(x) summary(apply(x, 2, var)))
+ # 将处理后的数据按原始顺序重新组合
+ outdata <- do.call(rbind, outdata)
+ outdata <- outdata[samplename, ]
+ return(outdata)
+ }
>
> # 定义一个函数用于从模型中提取重要的变量
> ExtractVar <- function(fit){
+ Feature <- quiet(switch(
+ EXPR = class(fit)[1],
+ "lognet" = rownames(coef(fit))[which(coef(fit)[, 1]!=0)], # 从Elastic Net模型中提取非零系数的变量
+ "glm" = names(coef(fit)), # 从广义线性模型中提取变量
+ "svm.formula" = fit$subFeature, # SVM模型中未进行变量选择,使用所有变量
+ "train" = fit$coefnames, # 训练集中使用的变量
+ "glmboost" = names(coef(fit)[abs(coef(fit))>0]), # 从GLMBoost模型中提取系数非零的变量
+ "plsRglmmodel" = rownames(fit$Coeffs)[fit$Coeffs!=0], # 从PLSRGLM模型中提取系数非零的变量
+ "rfsrc" = var.select(fit, verbose = F)$topvars, # 从随机森林中选择重要的变量
+ "gbm" = rownames(summary.gbm(fit, plotit = F))[summary.gbm(fit, plotit = F)$rel.inf>0], # 从GBM模型中提取重要的变量
+ "xgb.Booster" = fit$subFeature, # XGBoost模型中使用的所有变量
+ "naiveBayes" = fit$subFeature # 朴素贝叶斯模型中使用的所有变量
+ # "drf" = fit$subFeature # DRF模型中使用的所有变量,当前版本已注释
+ ))
+
+ # 从提取的变量中移除截距项
+ Feature <- setdiff(Feature, c("(Intercept)", "Intercept"))
+ return(Feature)
+ }
>
> # 定义一个函数用于计算预测得分
> CalPredictScore <- function(fit, new_data, type = "lp"){
+ # 仅使用模型中涉及的变量
+ new_data <- new_data[, fit$subFeature]
+ # 根据模型类型使用不同的预测函数
+ RS <- quiet(switch(
+ EXPR = class(fit)[1],
+ "lognet" = predict(fit, type = 'response', as.matrix(new_data)), # 使用Elastic Net模型进行响应预测
+ "glm" = predict(fit, type = 'response', as.data.frame(new_data)), # 使用广义线性模型进行响应预测
+ "svm.formula" = predict(fit, as.data.frame(new_data), probability = T), # 使用SVM模型进行概率预测
+ "train" = predict(fit, new_data, type = "prob")[[2]], # 使用train函数进行概率预测
+ "glmboost" = predict(fit, type = "response", as.data.frame(new_data)), # 使用GLMBoost进行响应预测
+ "plsRglmmodel" = predict(fit, type = "response", as.data.frame(new_data)), # 使用PLSRGLM进行响应预测
+ "rfsrc" = predict(fit, as.data.frame(new_data))$predicted[, "1"], # 使用随机森林进行预测
+ "gbm" = predict(fit, type = 'response', as.data.frame(new_data)), # 使用GBM进行响应预测
+ "xgb.Booster" = predict(fit, as.matrix(new_data)), # 使用XGBoost进行预测
+ "naiveBayes" = predict(object = fit, type = "raw", newdata = new_data)[, "1"] # 使用朴素贝叶斯进行原始概率预测
+ # "drf" = predict(fit, functional = "mean", as.matrix(new_data))$mean # 使用DRF进行预测,当前版本已注释
+ ))
+ # 将预测结果转换为数值类型,并赋予原始样本名称
+ RS = as.numeric(as.vector(RS))
+ names(RS) = rownames(new_data)
+ return(RS)
+ }
>
> # 定义一个函数用于预测类别
> PredictClass <- function(fit, new_data){
+ # 仅使用模型中涉及的变量
+ new_data <- new_data[, fit$subFeature]
+ # 根据模型类型使用不同的分类预测函数
+ label <- quiet(switch(
+ EXPR = class(fit)[1],
+ "lognet" = predict(fit, type = 'class', as.matrix(new_data)), # 使用Elastic Net进行类别预测
+ "glm" = ifelse(test = predict(fit, type = 'response', as.data.frame(new_data))>0.5,
+ yes = "1", no = "0"), # 使用广义线性模型进行响应预测,并根据阈值分类
+ "svm.formula" = predict(fit, as.data.frame(new_data), decision.values = T), # 使用SVM进行决策值预测
+ "train" = predict(fit, new_data, type = "raw"), # 使用train函数进行原始预测
+ "glmboost" = predict(fit, type = "class", as.data.frame(new_data)), # 使用GLMBoost进行类别预测
+ "plsRglmmodel" = ifelse(test = predict(fit, type = 'response', as.data.frame(new_data))>0.5,
+ yes = "1", no = "0"), # 使用PLSRGLM进行响应预测,并根据阈值分类
+ "rfsrc" = predict(fit, as.data.frame(new_data))$class, # 使用随机森林进行类别预测
+ "gbm" = ifelse(test = predict(fit, type = 'response', as.data.frame(new_data))>0.5,
+ yes = "1", no = "0"), # 使用GBM进行响应预测,并根据阈值分类
+ "xgb.Booster" = ifelse(test = predict(fit, as.matrix(new_data))>0.5,
+ yes = "1", no = "0"), # 使用XGBoost进行预测,并根据阈值分类
+ "naiveBayes" = predict(object = fit, type = "class", newdata = new_data) # 使用朴素贝叶斯进行类别预测
+ # "drf" = predict(fit, functional = "mean", as.matrix(new_data))$mean # 使用DRF进行预测,当前版本已注释
+ ))
+ # 将预测结果转换为字符类型,并赋予原始样本名称
+ label = as.character(as.vector(label))
+ names(label) = rownames(new_data)
+ return(label)
+ }
>
> # 定义一个函数用于评估模型性能
> RunEval <- function(fit,
+ Test_set = NULL,
+ Test_label = NULL,
+ Train_set = NULL,
+ Train_label = NULL,
+ Train_name = NULL,
+ cohortVar = "Cohort",
+ classVar){
+
+ # 检查测试标签中是否存在队列指标
+ if(!is.element(cohortVar, colnames(Test_label))) {
+ stop(paste0("There is no [", cohortVar, "] indicator, please fill in one more column!"))
+ }
+
+ # 如果提供了训练集和训练标签,将它们与测试集合并
+ if((!is.null(Train_set)) & (!is.null(Train_label))) {
+ new_data <- rbind.data.frame(Train_set[, fit$subFeature],
+ Test_set[, fit$subFeature])
+
+ # 如果提供了训练名称,将其作为队列名称
+ if(!is.null(Train_name)) {
+ Train_label$Cohort <- Train_name
+ } else {
+ Train_label$Cohort <- "Training"
+ }
+ # 更新训练标签的列名,包括队列变量和类变量
+ colnames(Train_label)[ncol(Train_label)] <- cohortVar
+ Test_label <- rbind.data.frame(Train_label[,c(cohortVar, classVar)],
+ Test_label[,c(cohortVar, classVar)])
+ Test_label[,1] <- factor(Test_label[,1],
+ levels = c(unique(Train_label[,cohortVar]), setdiff(unique(Test_label[,cohortVar]),unique(Train_label[,cohortVar]))))
+ } else {
+ new_data <- Test_set[, fit$subFeature]
+ }
+
+ # 计算预测得分
+ RS <- suppressWarnings(CalPredictScore(fit = fit, new_data = new_data))
+
+ # 准备输出数据,包括预测得分
+ Predict.out <- Test_label
+ Predict.out$RS <- as.vector(RS)
+ # 按队列分组
+ Predict.out <- split(x = Predict.out, f = Predict.out[,cohortVar])
+ # 计算每个队列的AUC值
+ unlist(lapply(Predict.out, function(data){
+ as.numeric(auc(suppressMessages(roc(data[[classVar]], data$RS))))
+ }))
+ }
>
> # 定义一个简单的热图绘制函数
> SimpleHeatmap <- function(Cindex_mat, avg_Cindex,
+ CohortCol, barCol,
+ cellwidth = 1, cellheight = 0.5,
+ cluster_columns, cluster_rows){
+ # 定义列注释
+ col_ha = columnAnnotation("Cohort" = colnames(Cindex_mat),
+ col = list("Cohort" = CohortCol),
+ show_annotation_name = F)
+
+ # 定义行注释,包括平均C指数的条形图
+ row_ha = rowAnnotation(bar = anno_barplot(avg_Cindex, bar_width = 0.8, border = FALSE,
+ gp = gpar(fill = barCol, col = NA),
+ add_numbers = T, numbers_offset = unit(-10, "mm"),
+ axis_param = list("labels_rot" = 0),
+ numbers_gp = gpar(fontsize = 9, col = "white"),
+ width = unit(3, "cm")),
+ show_annotation_name = F)
+
+ # 创建热图
+ Heatmap(as.matrix(Cindex_mat), name = "AUC",
+ right_annotation = row_ha,
+ top_annotation = col_ha,
+ col = c("#4195C1", "#FFFFFF", "#FFBC90"), # 定义颜色,从蓝到红通过白色渐变
+ rect_gp = gpar(col = "black", lwd = 1), # 设置边框颜色为黑色
+ cluster_columns = cluster_columns, cluster_rows = cluster_rows, # 是否对行和列进行聚类
+ show_column_names = FALSE,
+ show_row_names = TRUE,
+ row_names_side = "left",
+ width = unit(cellwidth * ncol(Cindex_mat) + 2, "cm"),
+ height = unit(cellheight * nrow(Cindex_mat), "cm"),
+ column_split = factor(colnames(Cindex_mat), levels = colnames(Cindex_mat)),
+ column_title = NULL,
+ cell_fun = function(j, i, x, y, w, h, col) { # 在每个格子中添加文本
+ grid.text(label = format(Cindex_mat[i, j], digits = 3, nsmall = 3),
+ x, y, gp = gpar(fontsize = 10))
+ }
+ )
+ }
>
> # 读取训练数据文件
> Train_data <- read.table("data.train.txt", header = T, sep = "\t", check.names=F, row.names=1, stringsAsFactors=F)
> # 提取表达量数据和类别标签
> Train_expr=Train_data[,1:(ncol(Train_data)-1),drop=F]
> Train_class=Train_data[,ncol(Train_data),drop=F]
>
> # 读取测试数据文件
> Test_data <- read.table("data.test.txt", header=T, sep="\t", check.names=F, row.names=1, stringsAsFactors = F)
> # 提取表达量数据和类别标签
> Test_expr=Test_data[,1:(ncol(Test_data)-1),drop=F]
> Test_class=Test_data[,ncol(Test_data),drop=F]
> # 从行名中提取队列信息
> Test_class$Cohort=gsub("(.*)\\_(.*)\\_(.*)", "\\1", row.names(Test_class))
> Test_class=Test_class[,c("Cohort", "Type")]
>
> # 交叉验证训练集和测试集的共同基因
> comgene <- intersect(colnames(Train_expr), colnames(Test_expr))
> # 根据共同基因过滤数据
> Train_expr <- as.matrix(Train_expr[,comgene])
> Test_expr <- as.matrix(Test_expr[,comgene])
> # 对数据进行标准化处理
> Train_set = scaleData(data=Train_expr, centerFlags=T, scaleFlags=T)
set centerFlags for all cohort as TRUE
match centerFlags with cohort by order
set scaleFlags for all cohort as TRUE
match scaleFlags with cohort by order
>
>
> # 在标准化后的训练集上添加噪声
> set.seed(123) # 为了可重复性,设置随机种子
> noise <- matrix(rnorm(n = nrow(Train_set) * ncol(Train_set), mean = 1.5, sd = 4),
+ nrow = nrow(Train_set), ncol = ncol(Train_set))
> Train_set <- Train_set + noise
> # 测试集也可以添加噪声,如果希望影响模型在测试集上的表现
> Test_set = scaleData(data = Test_expr, cohort = Test_class$Cohort, centerFlags = T, scaleFlags = T)
set centerFlags for all cohort as TRUE
match centerFlags with cohort by order
set scaleFlags for all cohort as TRUE
match scaleFlags with cohort by order
> noise_test <- matrix(rnorm(n = nrow(Test_set) * ncol(Test_set), mean = 1.5, sd = 4),
+ nrow = nrow(Test_set), ncol = ncol(Test_set))
> Test_set <- Test_set + noise_test
> names(x = split(as.data.frame(Test_expr), f = Test_class$Cohort))
[1] "GSE28623" "GSE83456"
> Test_set = scaleData(data = Test_expr, cohort = Test_class$Cohort, centerFlags = T, scaleFlags = T)
set centerFlags for all cohort as TRUE
match centerFlags with cohort by order
set scaleFlags for all cohort as TRUE
match scaleFlags with cohort by order
>
> # 读取机器学习方法列表
> methodRT <- read.table("refer.methodLists.txt", header=T, sep="\t", check.names=F)
> methods=methodRT$Model
> methods <- gsub("-| ", "", methods) # 清理方法名称中的连字符和空格
>
> # 准备运行机器学习模型的参数
> classVar = "Type" # 设置类变量的名称
> Variable = colnames(Train_set)
> preTrain.method = strsplit(methods, "\\+") # 分解方法名称中的组合
> preTrain.method = lapply(preTrain.method, function(x) rev(x)[-1]) # 反转并移除第一个元素
> preTrain.method = unique(unlist(preTrain.method)) # 去除重复的方法名称
>
> ###################### 根据训练数据运行机器学习模型 ######################
> # 第一阶段:使用机器学习方法选择变量
> preTrain.var <- list() # 初始化保存变量选择结果的列表
> set.seed(seed = 123) # 设置随机种子以保证结果的可重复性
> for (method in preTrain.method){
+ preTrain.var[[method]] = RunML(method = method, # 指定机器学习方法
+ Train_set = Train_set, # 提供训练数据
+ Train_label = Train_class, # 提供类别标签
+ mode = "Variable", # 设置模式为变量选择
+ classVar = classVar) # 指定类变量
+ }
Run Lasso algorithm for Variable; ; using 19 Variables
15 Variables retained;
Run glmBoost algorithm for Variable; ; using 19 Variables
15 Variables retained;
Run RF algorithm for Variable; ; using 19 Variables
Error in var.select(fit, verbose = F) :
could not find function "var.select"请问最后这个报错怎么解决
最新发布