# 加载必要的包
library(readxl)
library(GSVA)
library(limma)
library(tidyverse)
library(edgeR)
library(BiocParallel)
# 步骤1:读取病人信息并预处理
patient_info <- read_excel("patient_info.xlsx") %>%
select(RNAID, Recurrence) %>%
filter(!is.na(RNAID))
# 步骤2:读取基因表达矩阵并匹配样本
gene_exp <- read.delim("Gene_exp.txt", row.names = 1, check.names = FALSE)
# 匹配表达矩阵列名与RNAID
matched_samples <- intersect(colnames(gene_exp), patient_info$RNAID)
gene_exp_matched <- gene_exp[, matched_samples, drop = FALSE]
# 获取对应的分组信息
group_info <- patient_info %>%
filter(RNAID %in% matched_samples) %>%
arrange(match(RNAID, colnames(gene_exp_matched))) %>%
mutate(Group = factor(Recurrence, levels = c(0, 1), labels = c("NonRecur", "Recur")))
# 步骤3:数据转换
data_type <- "counts" # 请根据实际数据类型修改:"counts" 或 "fpkm"
if(data_type == "counts") {
dge <- DGEList(counts = gene_exp_matched)
dge <- calcNormFactors(dge, method = "TMM")
expr_matrix <- cpm(dge, log = TRUE, prior.count = 1)
cat("数据转换完成:原始counts -> log2(CPM+1)\n")
} else if(data_type == "fpkm") {
expr_matrix <- as.matrix(log2(gene_exp_matched + 1))
cat("数据转换完成:FPKM/TPM -> log2(FPKM+1)\n")
} else {
stop("请指定正确的数据类型:'counts' 或 'fpkm'")
}
# 步骤4:读取SASP通路基因集
sasp_gmt <- readLines("SASP.gmt")
sasp_gene_sets <- list()
for (line in sasp_gmt) {
elements <- unlist(strsplit(line, "\t"))
pathway_name <- elements[1]
genes <- elements[3:length(elements)]
genes <- genes[genes != ""] # 移除空基因名
sasp_gene_sets[[pathway_name]] <- genes
}
# 步骤5:创建GSVA参数对象
param <- gsvaParam(
exprData = expr_matrix,
geneSets = sasp_gene_sets,
minSize = 5, # 最小基因集大小
maxSize = 500, # 最大基因集大小
kcdf = "Gaussian", # 适用于log转换的数据
absRanking = FALSE
)
# 步骤6:稳健执行GSVA分析(带错误处理和回退机制)
run_gsva <- function() {
# 尝试并行计算(如果可用)
if (requireNamespace("BiocParallel", quietly = TRUE)) {
tryCatch({
# 尝试自动检测最佳并行方法
bpparam <- bpparam()
# 设置进度条
if (interactive()) {
bpparam$progressbar <- TRUE
}
cat("尝试并行计算...\n")
return(gsva(param, verbose = TRUE, BPPARAM = bpparam))
}, error = function(e) {
cat("并行计算失败,错误信息:", e$message, "\n")
cat("尝试单线程计算...\n")
return(gsva(param, verbose = TRUE))
})
} else {
cat("BiocParallel不可用,使用单线程计算...\n")
return(gsva(param, verbose = TRUE))
}
}
# 执行GSVA分析
gsva_results <- run_gsva()
# 步骤7:差异分析
design <- model.matrix(~ 0 + group_info$Group)
colnames(design) <- levels(group_info$Group)
fit <- lmFit(gsva_results, design)
cont_matrix <- makeContrasts(
Recur_vs_Non = Recur - NonRecur,
levels = design
)
fit2 <- contrasts.fit(fit, cont_matrix)
fit2 <- eBayes(fit2)
# 获取差异结果
diff_results <- topTable(fit2, number = Inf, adjust.method = "BH", coef = "Recur_vs_Non")
# 步骤8:结果可视化
# 火山图
volcano_data <- as.data.frame(diff_results) %>%
rownames_to_column("Pathway") %>%
mutate(Significant = adj.P.Val < 0.05)
p_volcano <- ggplot(volcano_data, aes(x = logFC, y = -log10(adj.P.Val), color = Significant)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
geom_vline(xintercept = 0, linetype = "dashed") +
scale_color_manual(values = c("grey", "red")) +
labs(title = "SASP Pathways Differential Enrichment",
x = "Log2 Fold Change (Recur vs NonRecur)",
y = "-Log10(Adjusted P-value)") +
theme_minimal()
# 添加标签(仅当有显著结果时)
if (sum(volcano_data$Significant) > 0) {
p_volcano <- p_volcano +
ggrepel::geom_text_repel(
data = subset(volcano_data, Significant),
aes(label = Pathway), size = 3, max.overlaps = 20
)
}
ggsave("GSVA_volcano_plot.pdf", plot = p_volcano, width = 10, height = 8)
# 热图(仅显示显著通路)
significant_pathways <- rownames(diff_results)[diff_results$adj.P.Val < 0.05]
if (length(significant_pathways) > 0) {
heatmap_data <- gsva_results[significant_pathways, ]
# 创建注释数据框
annotation_col <- data.frame(
Group = group_info$Group,
row.names = colnames(heatmap_data)
)
# 绘制热图
pheatmap::pheatmap(
heatmap_data,
annotation_col = annotation_col,
show_colnames = FALSE,
scale = "row",
clustering_method = "complete",
color = colorRampPalette(c("blue", "white", "red"))(100),
main = "Differential SASP Pathways (adj.P.Val < 0.05)",
filename = "GSVA_heatmap.pdf",
width = 10,
height = max(6, length(significant_pathways) * 0.5)
)
} else {
cat("未发现显著差异的通路(adj.P.Val < 0.05)\n")
}
# 步骤9:保存结果
write.csv(diff_results, "GSVA_Differential_Results.csv")
write.csv(as.data.frame(gsva_results), "GSVA_Enrichment_Scores.csv")
saveRDS(gsva_results, "GSVA_Scores.rds")
# 生成报告摘要
report <- list(
analysis_date = Sys.Date(),
input_data = c(
expression_matrix = paste(nrow(expr_matrix), "genes ×", ncol(expr_matrix), "samples"),
gene_sets = paste(length(sasp_gene_sets), "pathways"),
samples = paste(nrow(group_info), "samples (",
sum(group_info$Group == "NonRecur"), "NonRecur, ",
sum(group_info$Group == "Recur"), "Recur)")
),
gsva_results = paste(nrow(gsva_results), "pathways ×", ncol(gsva_results), "samples"),
significant_pathways = if(length(significant_pathways) > 0)
significant_pathways else "None"
)
cat("\n===== 分析摘要 =====\n")
cat("分析日期:", report$analysis_date, "\n")
cat("输入数据:\n -", report$input_data["expression_matrix"], "\n")
cat(" -", report$input_data["gene_sets"], "\n")
cat(" -", report$input_data["samples"], "\n")
cat("GSVA结果维度:", report$gsva_results, "\n")
cat("显著差异通路数:",
if(is.character(report$significant_pathways)) report$significant_pathways
else length(report$significant_pathways), "\n")
cat("结果已保存到工作目录\n") 可视化部分我只需要热图,复发一列、非复发一列,只看SASP通路,需要标记哪一列是复发还是非复发,样本名也需要写出
最新发布