#makeOrgPackage
# 查看当前工作目录
getwd()
setwd("D:/A File/Data/ORG") # 确保路径存在且有权限访问
library(dplyr)
library(stringr)
library(jsonlite)
library(purrr)
library(RCurl)
library(clusterProfiler)
library(AnnotationForge)
library(tidyr)
#创建Orgb--------------------------------------------------------------
# 1. 读取注释文件 - 使用更可靠的方式
# 建议明确指定分隔符和列名
egg <- read.csv("Annotation.csv", header = TRUE, stringsAsFactors = FALSE)
#egg <- read.csv("Annotation.csv", header = TRUE, sep = "\t",
stringsAsFactors = FALSE, na.strings = c("", "NA"))
# 处理缺失值
egg[egg == ""] <- NA
# 2. 基因信息提取 - 添加唯一性检查
gene_info <- egg %>%
dplyr::select(GID = GeneID, GENENAME = Name) %>%
na.omit() %>%
distinct(GID, .keep_all = TRUE) # 确保GID唯一
# 3. GO注释处理 - 优化分割逻辑
goterms <- egg %>%
dplyr::select(GeneID, GO) %>%
filter(!is.na(GO) & str_detect(GO, "GO:"))
# 使用更高效的分割方式
gene2go <- goterms %>%
mutate(GO = str_split(GO, ";\\s*")) %>%
unnest(GO) %>%
filter(str_detect(GO, "^GO:\\d+")) %>% # 更严格的GO格式验证
transmute(GID = GeneID,
GO = GO,
EVIDENCE = "IEA")
# 4. KEGG注释处理 - 添加格式验证
gene2ko <- egg %>%
dplyr::select(GID = GeneID, KO = KEGG) %>%
filter(!is.na(KO)) %>%
mutate(KO = str_split(KO, ",\\s*")) %>%
unnest(KO) %>%
filter(str_detect(KO, "^K\\d{5}$")) # 严格匹配KO编号格式
# 5. Pathway处理 - 优化正则表达式
gene2pathway <- egg %>%
dplyr::select(GID = GeneID, Pathway = KEGGPathway) %>%
filter(!is.na(Pathway)) %>%
mutate(Pathway = str_extract_all(Pathway, "ko\\d{5}")) %>%
unnest(Pathway) %>%
filter(nchar(Pathway) == 7) # 确保ko后接5位数字
# 6. KEGG信息下载 - 添加错误处理和文件检查
if (!file.exists('ko00001.json')) {
download.file("https://rest.kegg.jp/get/ko00001/json", 'ko00001.json')
}
update_kegg <- function(json = "ko00001.json") {
if (!file.exists(json)) stop("KEGG JSON file not found")
kegg <- fromJSON(json)
pathway2name <- tibble()
ko2pathway <- tibble()
# 使用更安全的遍历方式
for (a in seq_along(kegg$children$children)) {
level1 <- kegg$children$children[[a]]
for (b in seq_along(level1$children)) {
level2 <- level1$children[[b]]
for (c in seq_along(level2$children)) {
pathway_info <- level2$name[[c]]
# 提取通路ID和名称
pathway_id <- str_extract(pathway_info, "ko\\d{5}")
pathway_name <- str_remove(pathway_info, "\\s*\\[PATH:ko\\d{5}\\]$")
if (!is.na(pathway_id)) {
pathway2name <- bind_rows(pathway2name,
tibble(Pathway = pathway_id, Name = pathway_name))
}
# 处理KO条目
kos <- level2$children[[c]]$name
if (length(kos) > 0) {
ko_ids <- str_extract(kos, "K\\d{5}")
valid_ko <- !is.na(ko_ids)
ko2pathway <- bind_rows(ko2pathway,
tibble(Ko = ko_ids[valid_ko],
Pathway = rep(pathway_id, sum(valid_ko))))
}
}
}
}
save(pathway2name, ko2pathway, file = "kegg_info.RData")
}
# 仅当文件不存在时更新
if (!file.exists('kegg_info.RData')) {
update_kegg()
}
# 7. 创建OrgDb - 添加必要参数检查
makeOrgPackage(
gene_info = gene_info,
go = gene2go,
ko = gene2ko,
pathway = gene2pathway,
version = "0.1.0", # 建议使用语义化版本
maintainer = 'Sungy <577548001@qq.com>',
author = 'Sungy <577548001@qq.com>',
outputDir = ".",
tax_id = "4113", # 确认甘薯的NCBI分类ID (Ipomoea batatas)
genus = "Ipomoea",
species = "batatas",
goTable = "go",
verbose = TRUE # 显示详细过程
)
#差异基因 ----------------------------------------------------------------------------------------------------------------
# 加载所需库
library(dplyr)
library(ggplot2)
library(DESeq2)
library(clusterProfiler)
library(org.Ibatatas.eg.db) # 假设已创建的OrgDb包
library(DOSE)
library(pathview)
library(gridExtra)
library(tibble)
# 设置工作目录
setwd("D:/A File/Data/ORG")
getwd()
# ----------------------------
# 1. 读取并准备表达数据
# ----------------------------
# 1. 读取数据
count_data <- read.csv("Gene_countb.csv", row.names = 1, header = TRUE, check.names = FALSE)
sample_info <- read.csv("Sample_info.csv", row.names = 1, header = TRUE, colClasses = "factor")
# 检查样本匹配
message("表达矩阵样本名: ", paste(colnames(count_data), collapse = ", "))
message("分组文件样本名: ", paste(rownames(sample_info), collapse = ", "))
# 确保样本顺序一致
if (!all(colnames(count_data) %in% rownames(sample_info))) {
stop("样本名称不匹配! 请检查表达矩阵和分组文件")
}
sample_info <- sample_info[colnames(count_data), , drop = FALSE]
# ----------------------------
# 2. 差异基因分析 (使用DESeq2)
# ----------------------------
# 创建DESeq2对象
dds <- DESeqDataSetFromMatrix(
countData = count_data,
colData = sample_info,
design = ~ group # 假设分组列名为Group
)
# 过滤低表达基因
dds <- dds[rowSums(counts(dds)) > 10, ]
# 执行差异表达分析
dds <- DESeq(dds)
# 获取差异分析结果 (假设比较组为"Treatment" vs "Control")
# 请根据实际分组名称修改
res <- results(dds, contrast = c("group", "HIOF", "HICF"))
# 整理结果,使用tidyr的rownames_to_column函数
diff_results <- as.data.frame(res) %>%
tibble::rownames_to_column("GeneID") %>% # 明确指定包名,避免函数冲突
filter(!is.na(pvalue)) %>% # 去除NA值
arrange(pvalue)
# 定义显著差异基因的阈值
pvalue_threshold <- 0.05
log2fc_threshold <- 1
# 筛选显著差异基因
sig_genes <- diff_results %>%
filter(pvalue < pvalue_threshold & abs(log2FoldChange) > log2fc_threshold) %>%
pull(GeneID)
# 保存差异基因结果
write.csv(diff_results, "differential_expression_results.csv", row.names = FALSE)
write.csv(data.frame(GeneID = sig_genes), "significant_genes.csv", row.names = FALSE)
cat("符合条件的显著差异基因数量: ", length(sig_genes), "\n")# KEGG2----------------------------------------------------------------------------------------------------------------
# 3. 基因ID映射
# ----------------------------
if (length(sig_genes) > 0) {
# 尝试多种ID转换方式
possible_from_types <- c("GID", "GENENAME", "SYMBOL")
gene_list <- data.frame()
for (from_type in possible_from_types) {
tryCatch({
temp_mapping <- bitr(
sig_genes,
fromType = from_type,
toType = "KO",
OrgDb = org.Ibatatas.eg.db
)
if (nrow(temp_mapping) > 0) {
gene_list <- bind_rows(gene_list, temp_mapping)
cat("使用", from_type, "成功映射", nrow(temp_mapping), "个基因\n")
}
}, error = function(e) {
cat(from_type, "转换失败: ", e$message, "\n")
})
}
gene_list <- distinct(gene_list)
if (nrow(gene_list) > 0) {
cat("总共成功映射的基因数量: ", nrow(gene_list), "\n")
# ----------------------------
# 4. 完全离线的KEGG富集分析
# ----------------------------
# 检查是否有本地KEGG数据文件
if (!file.exists("kegg_info.RData")) {
stop("未找到本地KEGG数据文件!请先在有网络的环境下运行以下代码获取数据:
source('https://bioconductor.org/biocLite.R')
biocLite('org.Hs.eg.db')
library(clusterProfiler)
download.file('https://rest.kegg.jp/get/ko00001/json', 'ko00001.json')
# 然后运行之前提供的update_kegg()函数生成kegg_info.RData
")
}
# 加载本地KEGG数据
load("kegg_info.RData")
# 创建自定义的富集分析函数(完全离线)
custom_enrich_kegg <- function(gene, ko2pathway, pathway2name, pvalue_cutoff = 0.05) {
# 计算每个通路的基因数
pathway_counts <- ko2pathway %>%
filter(Ko %in% gene) %>%
group_by(Pathway) %>%
summarise(Count = n()) %>%
arrange(desc(Count))
# 合并通路名称
pathway_counts <- pathway_counts %>%
left_join(pathway2name, by = "Pathway") %>%
filter(!is.na(Name))
# 计算背景基因数(所有通路的总基因数)
background_counts <- ko2pathway %>%
group_by(Pathway) %>%
summarise(BgCount = n_distinct(Ko))
# 合并数据
enrichment_results <- pathway_counts %>%
left_join(background_counts, by = "Pathway") %>%
mutate(
BgRatio = paste0(Count, "/", BgCount),
RichFactor = Count / BgCount,
# 使用超几何检验计算p值
pvalue = phyper(Count - 1, BgCount, nrow(ko2pathway) - BgCount, length(gene), lower.tail = FALSE)
) %>%
arrange(pvalue) %>%
filter(pvalue < pvalue_cutoff) %>%
rename(Description = Name, ID = Pathway)
return(enrichment_results)
}
# 执行离线富集分析
kegg_results <- custom_enrich_kegg(
gene = gene_list$KO,
ko2pathway = ko2pathway,
pathway2name = pathway2name,
pvalue_cutoff = 0.05
)
write.csv(kegg_results, "kegg_enrichment_results.csv", row.names = FALSE)
# ----------------------------
# 5. 绘制KEGG富集图表
# ----------------------------
if (nrow(kegg_results) > 0) {
top20_kegg <- kegg_results %>%
arrange(pvalue) %>%
head(20) %>%
mutate(Description = factor(Description, levels = rev(Description)))
kegg_bar <- ggplot(top20_kegg, aes(x = Description, y = Count, fill = pvalue)) +
geom_bar(stat = "identity", width = 0.8) +
scale_fill_gradientn(colors = c("red", "orange", "yellow", "blue")) +
labs(
title = "Top 20 Significant KEGG Pathways",
subtitle = "筛选标准: |log2FoldChange| > 1 且 P-value < 0.05",
x = "Pathway",
y = "Number of Genes",
fill = "P-value"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 12),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 10),
axis.text.y = element_text(size = 10)
) +
coord_flip()
print(kegg_bar)
ggsave("kegg_enrichment_barplot.png", plot = kegg_bar, width = 12, height = 8, dpi = 300)
top20_kegg_bubble <- kegg_results %>%
arrange(pvalue) %>%
head(20) %>%
mutate(
RichFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)),
Description = factor(Description, levels = rev(Description))
)
kegg_bubble <- ggplot(top20_kegg_bubble,
aes(x = RichFactor, y = Description, size = Count, color = pvalue)) +
geom_point(alpha = 0.7) +
scale_size(range = c(3, 10), name = "Gene Count") +
scale_color_gradientn(colors = c("red", "orange", "yellow", "blue"), name = "P-value") +
labs(
title = "Top 20 Significant KEGG Pathways",
subtitle = "筛选标准: |log2FoldChange| > 1 且 P-value < 0.05",
x = "Rich Factor",
y = "Pathway"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 12),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10)
)
print(kegg_bubble)
ggsave("kegg_enrichment_bubbleplot.png", plot = kegg_bubble, width = 12, height = 8, dpi = 300)
} else {
cat("没有显著富集的KEGG通路\n")
}
} else {
cat("所有基因ID转换失败,请检查ID类型是否正确\n")
write.csv(data.frame(Unmapped_GeneID = sig_genes), "unmapped_genes.csv", row.names = FALSE)
}
} else {
cat("没有符合条件的显著差异基因,无法进行富集分析\n")
}。检查以上代码,为何绘制的气泡图richfactor均为1,修改要求:根据 KEGG 富集结果,通过 Rich factor、FDR 值和富集到此 pathway上的基因个数来衡量富集的程度,挑选 FDR 值最小的即富集最显著的前20条 KEGG pathway 进行展示,圆圈颜色表示为FDR,圆圈大小反应gene count,横坐标为richfactor
最新发布