Enrichment plot的另一种展示

在这里插入图片描述
上图大家都不陌生,GSEA分析出来的一个结果图。这里想像大家介绍的不是这个图是什么意思,而是这个图换一种形式展示!

#安装R包

library(plyr)
library(ggplot2)
library(grid)
library(gridExtra)
#设置工作目录
setwd("C:\\Users\\lexb4\\Desktop\\singleGene\\21.multipleGSEA") 
#获取目录下的所有xls文件
files=grep(".xls",dir(),value=T)
 #读取每个文件                                        
data = lapply(files,read.delim)                                        
names(data) = files
                               该目录下的所有文件

该目录下的所有文件
以其中一个文件为例
在这里插入图片描述

dataSet = ldply(data, data.frame)
#将文件后缀删掉
dataSet$pathway = gsub(".xls","",dataSet$.id)                            
gseaCol=c("#58CDD9","#7A142C","
#makeOrgPackage # 查看当前工作目录 getwd() setwd("D:/A File/Data/ORG") # 确保路径存在且有权限访问 library(dplyr) library(stringr) library(jsonlite) library(purrr) library(RCurl) library(clusterProfiler) library(AnnotationForge) library(tidyr) #创建Orgb-------------------------------------------------------------- # 1. 读取注释文件 - 使用更可靠的方式 # 建议明确指定分隔符和列名 egg <- read.csv("Annotation.csv", header = TRUE, stringsAsFactors = FALSE) #egg <- read.csv("Annotation.csv", header = TRUE, sep = "\t", stringsAsFactors = FALSE, na.strings = c("", "NA")) # 处理缺失值 egg[egg == ""] <- NA # 2. 基因信息提取 - 添加唯一性检查 gene_info <- egg %>% dplyr::select(GID = GeneID, GENENAME = Name) %>% na.omit() %>% distinct(GID, .keep_all = TRUE) # 确保GID唯一 # 3. GO注释处理 - 优化分割逻辑 goterms <- egg %>% dplyr::select(GeneID, GO) %>% filter(!is.na(GO) & str_detect(GO, "GO:")) # 使用更高效的分割方式 gene2go <- goterms %>% mutate(GO = str_split(GO, ";\\s*")) %>% unnest(GO) %>% filter(str_detect(GO, "^GO:\\d+")) %>% # 更严格的GO格式验证 transmute(GID = GeneID, GO = GO, EVIDENCE = "IEA") # 4. KEGG注释处理 - 添加格式验证 gene2ko <- egg %>% dplyr::select(GID = GeneID, KO = KEGG) %>% filter(!is.na(KO)) %>% mutate(KO = str_split(KO, ",\\s*")) %>% unnest(KO) %>% filter(str_detect(KO, "^K\\d{5}$")) # 严格匹配KO编号格式 # 5. Pathway处理 - 优化正则表达式 gene2pathway <- egg %>% dplyr::select(GID = GeneID, Pathway = KEGGPathway) %>% filter(!is.na(Pathway)) %>% mutate(Pathway = str_extract_all(Pathway, "ko\\d{5}")) %>% unnest(Pathway) %>% filter(nchar(Pathway) == 7) # 确保ko后接5位数字 # 6. KEGG信息下载 - 添加错误处理和文件检查 if (!file.exists('ko00001.json')) { download.file("https://rest.kegg.jp/get/ko00001/json", 'ko00001.json') } update_kegg <- function(json = "ko00001.json") { if (!file.exists(json)) stop("KEGG JSON file not found") kegg <- fromJSON(json) pathway2name <- tibble() ko2pathway <- tibble() # 使用更安全的遍历方式 for (a in seq_along(kegg$children$children)) { level1 <- kegg$children$children[[a]] for (b in seq_along(level1$children)) { level2 <- level1$children[[b]] for (c in seq_along(level2$children)) { pathway_info <- level2$name[[c]] # 提取通路ID和名称 pathway_id <- str_extract(pathway_info, "ko\\d{5}") pathway_name <- str_remove(pathway_info, "\\s*\\[PATH:ko\\d{5}\\]$") if (!is.na(pathway_id)) { pathway2name <- bind_rows(pathway2name, tibble(Pathway = pathway_id, Name = pathway_name)) } # 处理KO条目 kos <- level2$children[[c]]$name if (length(kos) > 0) { ko_ids <- str_extract(kos, "K\\d{5}") valid_ko <- !is.na(ko_ids) ko2pathway <- bind_rows(ko2pathway, tibble(Ko = ko_ids[valid_ko], Pathway = rep(pathway_id, sum(valid_ko)))) } } } } save(pathway2name, ko2pathway, file = "kegg_info.RData") } # 仅当文件不存在时更新 if (!file.exists('kegg_info.RData')) { update_kegg() } # 7. 创建OrgDb - 添加必要参数检查 makeOrgPackage( gene_info = gene_info, go = gene2go, ko = gene2ko, pathway = gene2pathway, version = "0.1.0", # 建议使用语义化版本 maintainer = 'Sungy <577548001@qq.com>', author = 'Sungy <577548001@qq.com>', outputDir = ".", tax_id = "4113", # 确认甘薯的NCBI分类ID (Ipomoea batatas) genus = "Ipomoea", species = "batatas", goTable = "go", verbose = TRUE # 显示详细过程 ) #差异基因 ---------------------------------------------------------------------------------------------------------------- # 加载所需库 library(dplyr) library(ggplot2) library(DESeq2) library(clusterProfiler) library(org.Ibatatas.eg.db) # 假设已创建的OrgDb包 library(DOSE) library(pathview) library(gridExtra) library(tibble) # 设置工作目录 setwd("D:/A File/Data/ORG") getwd() # ---------------------------- # 1. 读取并准备表达数据 # ---------------------------- # 1. 读取数据 count_data <- read.csv("Gene_countb.csv", row.names = 1, header = TRUE, check.names = FALSE) sample_info <- read.csv("Sample_info.csv", row.names = 1, header = TRUE, colClasses = "factor") # 检查样本匹配 message("表达矩阵样本名: ", paste(colnames(count_data), collapse = ", ")) message("分组文件样本名: ", paste(rownames(sample_info), collapse = ", ")) # 确保样本顺序一致 if (!all(colnames(count_data) %in% rownames(sample_info))) { stop("样本名称不匹配! 请检查表达矩阵和分组文件") } sample_info <- sample_info[colnames(count_data), , drop = FALSE] # ---------------------------- # 2. 差异基因分析 (使用DESeq2) # ---------------------------- # 创建DESeq2对象 dds <- DESeqDataSetFromMatrix( countData = count_data, colData = sample_info, design = ~ group # 假设分组列名为Group ) # 过滤低表达基因 dds <- dds[rowSums(counts(dds)) > 10, ] # 执行差异表达分析 dds <- DESeq(dds) # 获取差异分析结果 (假设比较组为"Treatment" vs "Control") # 请根据实际分组名称修改 res <- results(dds, contrast = c("group", "HIOF", "HICF")) # 整理结果,使用tidyr的rownames_to_column函数 diff_results <- as.data.frame(res) %>% tibble::rownames_to_column("GeneID") %>% # 明确指定包名,避免函数冲突 filter(!is.na(pvalue)) %>% # 去除NA值 arrange(pvalue) # 定义显著差异基因的阈值 pvalue_threshold <- 0.05 log2fc_threshold <- 1 # 筛选显著差异基因 sig_genes <- diff_results %>% filter(pvalue < pvalue_threshold & abs(log2FoldChange) > log2fc_threshold) %>% pull(GeneID) # 保存差异基因结果 write.csv(diff_results, "differential_expression_results.csv", row.names = FALSE) write.csv(data.frame(GeneID = sig_genes), "significant_genes.csv", row.names = FALSE) cat("符合条件的显著差异基因数量: ", length(sig_genes), "\n")# KEGG2---------------------------------------------------------------------------------------------------------------- # 3. 基因ID映射 # ---------------------------- if (length(sig_genes) > 0) { # 尝试多种ID转换方式 possible_from_types <- c("GID", "GENENAME", "SYMBOL") gene_list <- data.frame() for (from_type in possible_from_types) { tryCatch({ temp_mapping <- bitr( sig_genes, fromType = from_type, toType = "KO", OrgDb = org.Ibatatas.eg.db ) if (nrow(temp_mapping) > 0) { gene_list <- bind_rows(gene_list, temp_mapping) cat("使用", from_type, "成功映射", nrow(temp_mapping), "个基因\n") } }, error = function(e) { cat(from_type, "转换失败: ", e$message, "\n") }) } gene_list <- distinct(gene_list) if (nrow(gene_list) > 0) { cat("总共成功映射的基因数量: ", nrow(gene_list), "\n") # ---------------------------- # 4. 完全离线的KEGG富集分析 # ---------------------------- # 检查是否有本地KEGG数据文件 if (!file.exists("kegg_info.RData")) { stop("未找到本地KEGG数据文件!请先在有网络的环境下运行以下代码获取数据: source('https://bioconductor.org/biocLite.R') biocLite('org.Hs.eg.db') library(clusterProfiler) download.file('https://rest.kegg.jp/get/ko00001/json', 'ko00001.json') # 然后运行之前提供的update_kegg()函数生成kegg_info.RData ") } # 加载本地KEGG数据 load("kegg_info.RData") # 创建自定义的富集分析函数(完全离线) custom_enrich_kegg <- function(gene, ko2pathway, pathway2name, pvalue_cutoff = 0.05) { # 计算每个通路的基因数 pathway_counts <- ko2pathway %>% filter(Ko %in% gene) %>% group_by(Pathway) %>% summarise(Count = n()) %>% arrange(desc(Count)) # 合并通路名称 pathway_counts <- pathway_counts %>% left_join(pathway2name, by = "Pathway") %>% filter(!is.na(Name)) # 计算背景基因数(所有通路的总基因数) background_counts <- ko2pathway %>% group_by(Pathway) %>% summarise(BgCount = n_distinct(Ko)) # 合并数据 enrichment_results <- pathway_counts %>% left_join(background_counts, by = "Pathway") %>% mutate( BgRatio = paste0(Count, "/", BgCount), RichFactor = Count / BgCount, # 使用超几何检验计算p值 pvalue = phyper(Count - 1, BgCount, nrow(ko2pathway) - BgCount, length(gene), lower.tail = FALSE) ) %>% arrange(pvalue) %>% filter(pvalue < pvalue_cutoff) %>% rename(Description = Name, ID = Pathway) return(enrichment_results) } # 执行离线富集分析 kegg_results <- custom_enrich_kegg( gene = gene_list$KO, ko2pathway = ko2pathway, pathway2name = pathway2name, pvalue_cutoff = 0.05 ) write.csv(kegg_results, "kegg_enrichment_results.csv", row.names = FALSE) # ---------------------------- # 5. 绘制KEGG富集图表 # ---------------------------- if (nrow(kegg_results) > 0) { top20_kegg <- kegg_results %>% arrange(pvalue) %>% head(20) %>% mutate(Description = factor(Description, levels = rev(Description))) kegg_bar <- ggplot(top20_kegg, aes(x = Description, y = Count, fill = pvalue)) + geom_bar(stat = "identity", width = 0.8) + scale_fill_gradientn(colors = c("red", "orange", "yellow", "blue")) + labs( title = "Top 20 Significant KEGG Pathways", subtitle = "筛选标准: |log2FoldChange| > 1 且 P-value < 0.05", x = "Pathway", y = "Number of Genes", fill = "P-value" ) + theme_minimal() + theme( plot.title = element_text(hjust = 0.5, size = 14, face = "bold"), plot.subtitle = element_text(hjust = 0.5, size = 12), axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 10), axis.text.y = element_text(size = 10) ) + coord_flip() print(kegg_bar) ggsave("kegg_enrichment_barplot.png", plot = kegg_bar, width = 12, height = 8, dpi = 300) top20_kegg_bubble <- kegg_results %>% arrange(pvalue) %>% head(20) %>% mutate( RichFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)), Description = factor(Description, levels = rev(Description)) ) kegg_bubble <- ggplot(top20_kegg_bubble, aes(x = RichFactor, y = Description, size = Count, color = pvalue)) + geom_point(alpha = 0.7) + scale_size(range = c(3, 10), name = "Gene Count") + scale_color_gradientn(colors = c("red", "orange", "yellow", "blue"), name = "P-value") + labs( title = "Top 20 Significant KEGG Pathways", subtitle = "筛选标准: |log2FoldChange| > 1 且 P-value < 0.05", x = "Rich Factor", y = "Pathway" ) + theme_minimal() + theme( plot.title = element_text(hjust = 0.5, size = 14, face = "bold"), plot.subtitle = element_text(hjust = 0.5, size = 12), axis.text.x = element_text(size = 10), axis.text.y = element_text(size = 10) ) print(kegg_bubble) ggsave("kegg_enrichment_bubbleplot.png", plot = kegg_bubble, width = 12, height = 8, dpi = 300) } else { cat("没有显著富集的KEGG通路\n") } } else { cat("所有基因ID转换失败,请检查ID类型是否正确\n") write.csv(data.frame(Unmapped_GeneID = sig_genes), "unmapped_genes.csv", row.names = FALSE) } } else { cat("没有符合条件的显著差异基因,无法进行富集分析\n") }。检查以上代码,为何绘制的气泡图richfactor均为1,修改要求:根据 KEGG 富集结果,通过 Rich factor、FDR 值和富集到此 pathway上的基因个数来衡量富集的程度,挑选 FDR 值最小的即富集最显著的前20条 KEGG pathway 进行展示,圆圈颜色表示为FDR,圆圈大小反应gene count,横坐标为richfactor
最新发布
07-25
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值