rm(list = ls())
options(stringsAsFactors = F)
Sys.setenv(LANGUAGE = "en")
# 如果尚未安装 BiocManager,先安装它
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
# 使用 BiocManager 来安装 impute 包
if (!requireNamespace("impute", quietly = TRUE))
BiocManager::install("impute")
library(WGCNA)
library(FactoMineR)
library(factoextra)
library(tidyverse)
library(data.table)
library(openxlsx)
### 启用WGCNA多核计算
enableWGCNAThreads(nThreads = 0.75*parallel::detectCores())
################################# 0.输入数据准备 ################################
### 读取表达矩阵
cat("正在读取表达矩阵...\n")
if (T) {
fpkm00 <- fread("FPKM.txt", data.table = F)
# 检查数据结构和内容
cat("表达矩阵维度:", dim(fpkm00), "\n")
cat("前几行基因名:", head(fpkm00$gene), "\n")
### 合并具有相同基因名的行
table(duplicated(fpkm00$gene))
gene <- fpkm00$gene
fpkm00 <- fpkm00[,-1]
# 确保数据是矩阵而不是列表
fpkm00 <- as.matrix(fpkm00)
# 检查并处理非数值数据
if(!is.numeric(fpkm00)) {
cat("检测到非数值数据,正在转换...\n")
fpkm00 <- matrix(as.numeric(fpkm00), nrow = nrow(fpkm00), ncol = ncol(fpkm00))
}
# 处理NA和无穷大
fpkm00[is.na(fpkm00)] <- 0
fpkm00[is.infinite(fpkm00)] <- 0
# 使用矩阵进行聚合
fpkm0 <- aggregate(fpkm00, by=list(gene), FUN=sum)
fpkm <- as.matrix(fpkm0[,-1])
rownames(fpkm) <- fpkm0[,1]
}
# 确保fpkm是数值矩阵
if(!is.matrix(fpkm) || !is.numeric(fpkm)) {
fpkm <- as.matrix(fpkm)
fpkm <- matrix(as.numeric(fpkm), nrow = nrow(fpkm), ncol = ncol(fpkm))
}
# 处理NA和无穷大 - 确保是对矩阵操作
fpkm[is.na(fpkm)] <- 0
fpkm[is.infinite(fpkm)] <- 0
data <- log2(fpkm+1)
# 再次检查数据质量 - 确保data是矩阵
if(any(is.na(data)) | any(is.infinite(data))) {
data[is.na(data)] <- 0
data[is.infinite(data)] <- 0
}
### 筛选MAD前5000的基因
cat("筛选MAD前8000的基因...\n")
# 确保data是矩阵
if(!is.matrix(data)) {
data <- as.matrix(data)
}
keep_data <- data[order(apply(data,1,mad, na.rm = TRUE), decreasing = T)[1:min(8000, nrow(data))],]
### 创建datTraits,包含分组、表型等信息
cat("读取分组和表型数据...\n")
# 读取分组信息
samplegroup <- read.table("samplegroup.txt", header = TRUE, sep = "\t")
# 读取表型数据
trait_data <- read.table("Trait.txt", header = TRUE, sep = "\t")
# 检查表型数据结构
cat("表型数据维度:", dim(trait_data), "\n")
cat("表型数据列名:", colnames(trait_data), "\n")
# 仔细处理表型数据中的非数值
trait_data_clean <- trait_data
for(col in colnames(trait_data)[-1]) { # 跳过Sample列
if(!is.numeric(trait_data[[col]])) {
cat("列", col, "不是数值型,正在转换...\n")
# 转换为字符再转换为数值,处理可能的因子或字符类型
trait_data_clean[[col]] <- as.numeric(as.character(trait_data[[col]]))
# 检查转换后的NA值
na_count <- sum(is.na(trait_data_clean[[col]]))
if(na_count > 0) {
cat("警告: 列", col, "有", na_count, "个NA值,用均值替换\n")
trait_data_clean[[col]][is.na(trait_data_clean[[col]])] <- mean(trait_data_clean[[col]], na.rm = TRUE)
}
}
}
# 合并分组和表型数据
datTraits <- merge(samplegroup, trait_data_clean, by.x = "sample", by.y = "Sample")
rownames(datTraits) <- datTraits$sample
datTraits <- datTraits[,-1] # 移除sample列
### 给分组加上编号(用于后续分析)
grouptype <- data.frame(group=sort(unique(datTraits$group)),
groupNo=1:length(unique(datTraits$group)))
datTraits$groupNo <- NA
for(i in 1:nrow(grouptype)){
datTraits[which(datTraits$group == grouptype$group[i]),'groupNo'] <- grouptype$groupNo[i]
}
### 确保表达矩阵和表型数据的样本顺序一致
common_samples <- intersect(colnames(keep_data), rownames(datTraits))
cat("共同样本数量:", length(common_samples), "\n")
keep_data <- keep_data[, common_samples]
datTraits <- datTraits[common_samples, ]
### 转置并确保数据类型正确
cat("转置表达矩阵...\n")
# 确保keep_data是矩阵
if(!is.matrix(keep_data)) {
keep_data <- as.matrix(keep_data)
}
datExpr0 <- as.data.frame(t(keep_data))
# 确保datExpr0的所有列都是数值型
for(col in names(datExpr0)) {
if(!is.numeric(datExpr0[[col]])) {
datExpr0[[col]] <- as.numeric(as.character(datExpr0[[col]]))
}
# 处理任何NA值
if(any(is.na(datExpr0[[col]]))) {
datExpr0[[col]][is.na(datExpr0[[col]])] <- 0
}
}
############################## 1.判断数据质量 ################################
### 判断数据质量--缺失值
cat("检查数据质量...\n")
gsg <- goodSamplesGenes(datExpr0,verbose = 3)
gsg$allOK
if (!gsg$allOK){
cat("数据质量问题:\n")
if (sum(!gsg$goodGenes)>0)
printFlush(paste("移除基因:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")));
if (sum(!gsg$goodSamples)>0)
printFlush(paste("移除样本:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
datTraits = datTraits[gsg$goodSamples, ]
}
gsg <- goodSamplesGenes(datExpr0,verbose = 3)
cat("数据质量检查通过:", gsg$allOK, "\n")
### 绘制样品的系统聚类树
if(T){
sampleTree <- hclust(dist(datExpr0), method = "average")
par(mar = c(0,5,2,0))
plot(sampleTree, main = "Sample clustering", sub="", xlab="", cex.lab = 2,
cex.axis = 1, cex.main = 1,cex.lab=1)
sample_colors <- numbers2colors(as.numeric(factor(datTraits$group)),
colors = rainbow(length(table(datTraits$group))),
signed = FALSE)
par(mar = c(1,4,3,1),cex=0.8)
pdf("step1_Sample dendrogram and trait.pdf",width = 8,height = 6)
plotDendroAndColors(sampleTree, sample_colors,
groupLabels = "trait",
cex.dendroLabels = 0.8,
marAll = c(1, 4, 3, 1),
cex.rowText = 0.01,
main = "Sample dendrogram and trait" )
dev.off()
}
### 判断数据质量 : PCA进行分组查看
group_list <- datTraits$group
# 确保datExpr0是数值矩阵
datExpr0_numeric <- datExpr0
for(i in 1:ncol(datExpr0)) {
if(!is.numeric(datExpr0[[i]])) {
datExpr0_numeric[[i]] <- as.numeric(as.character(datExpr0[[i]]))
}
}
dat.pca <- PCA(datExpr0_numeric, graph = F)
pca <- fviz_pca_ind(dat.pca,
title = "Principal Component Analysis",
legend.title = "Groups",
geom.ind = c("point","text"),
pointsize = 2,
labelsize = 4,
repel = TRUE,
col.ind = group_list,
axes.linetype=NA,
mean.point=F
) +
theme(legend.position = "none") +
coord_fixed(ratio = 1)
pca
ggsave(pca,filename= "step1_Sample PCA analysis.pdf", width = 8, height = 8)
##保存数据
datExpr <- datExpr0
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)
save(nGenes,nSamples,datExpr,datTraits,file="step1_input.Rdata")
############################### 2.挑选最佳阈值power ###################################
rm(list = ls())
load("step1_input.Rdata")
R.sq_cutoff = 0.8
# 检查数据是否全是数值
cat("检查datExpr数据类型:\n")
print(summary(sapply(datExpr, class)))
if(T){
powers <- c(seq(1,20,by = 1), seq(22,30,by = 2))
# 确保数据是数值型
datExpr_numeric <- datExpr
for(i in 1:ncol(datExpr)) {
if(!is.numeric(datExpr[[i]])) {
datExpr_numeric[[i]] <- as.numeric(as.character(datExpr[[i]]))
datExpr_numeric[[i]][is.na(datExpr_numeric[[i]])] <- 0
}
}
sft <- pickSoftThreshold(datExpr_numeric,
networkType = "unsigned",
powerVector = powers,
RsquaredCut = R.sq_cutoff,
verbose = 5)
pdf("step2_power-value.pdf",width = 16,height = 12)
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n")
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
abline(h=R.sq_cutoff ,col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n")
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
abline(h=100,col="red")
dev.off()
}
sft$powerEstimate
power = sft$powerEstimate
if(is.na(power)){
type = "unsigned"
nSamples=nrow(datExpr)
power = ifelse(nSamples<20, ifelse(type == "unsigned", 9, 18),
ifelse(nSamples<30, ifelse(type == "unsigned", 8, 16),
ifelse(nSamples<40, ifelse(type == "unsigned", 7, 14),
ifelse(type == "unsigned", 6, 12))
)
)
cat("使用经验power值:", power, "\n")
}
save(sft, power, file = "step2_power_value.Rdata")
##################### 3.一步法构建加权共表达网络,识别基因模块 ####################
load(file = "step1_input.Rdata")
load(file = "step2_power_value.Rdata")
# 确保数据是数值型
datExpr_numeric <- datExpr
for(i in 1:ncol(datExpr)) {
if(!is.numeric(datExpr[[i]])) {
datExpr_numeric[[i]] <- as.numeric(as.character(datExpr[[i]]))
datExpr_numeric[[i]][is.na(datExpr_numeric[[i]])] <- 0
}
}
if(T){
cat("开始构建加权共表达网络...\n")
net <- blockwiseModules(
datExpr_numeric,
power = power,
maxBlockSize = ncol(datExpr_numeric),
corType = "pearson",
networkType = "unsigned",
TOMType = "unsigned",
minModuleSize = 50,
mergeCutHeight = 0.25,
pamRespectsDendro = FALSE, # 允许更灵活的模块检测
numericLabels = TRUE,
saveTOMs = F,
verbose = 3
)
table(net$colors)
}
## 模块可视化,层级聚类树展示各个模块
if(T){
moduleColors <- labels2colors(net$colors)
table(moduleColors)
pdf("step3_genes-modules_ClusterDendrogram.pdf",width = 16,height = 12)
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
dev.off()
}
##################### 新增:导出每个模块的基因数量和基因列表 #####################
cat("导出模块基因信息...\n")
# 获取基因名
gene_names <- colnames(datExpr_numeric)
# 创建模块-基因对应表
module_genes <- data.frame(
Gene = gene_names,
Module = moduleColors,
ModuleNumber = net$colors,
stringsAsFactors = FALSE
)
# 统计每个模块的基因数量
module_summary <- module_genes %>%
group_by(Module, ModuleNumber) %>%
summarise(GeneCount = n(), .groups = 'drop') %>%
arrange(desc(GeneCount))
# 保存模块统计信息
write.csv(module_summary, "step3_module_gene_count.csv", row.names = FALSE)
cat("模块基因数量统计已保存到: step3_module_gene_count.csv\n")
# 保存完整的模块-基因对应表
write.csv(module_genes, "step3_all_genes_modules.csv", row.names = FALSE)
cat("完整模块-基因对应表已保存到: step3_all_genes_modules.csv\n")
# 为每个模块创建单独的基因列表文件
unique_modules <- unique(moduleColors)
module_gene_lists <- list()
for(module in unique_modules) {
module_genes_subset <- module_genes[module_genes$Module == module, "Gene"]
filename <- paste0("step3_module_", module, "_genes.txt")
write.table(module_genes_subset, filename, row.names = FALSE, col.names = FALSE, quote = FALSE)
module_gene_lists[[module]] <- module_genes_subset
cat("模块", module, "的基因列表已保存到:", filename, "\n")
}
# 创建Excel工作簿,每个模块一个工作表
if(require(openxlsx)) {
wb <- createWorkbook()
# 添加模块统计工作表
addWorksheet(wb, "Module Summary")
writeData(wb, "Module Summary", module_summary)
# 添加完整基因列表工作表
addWorksheet(wb, "All Genes")
writeData(wb, "All Genes", module_genes)
# 为每个模块添加单独的工作表
for(module in unique_modules) {
module_data <- module_genes[module_genes$Module == module, c("Gene", "ModuleNumber")]
addWorksheet(wb, paste0("Module_", module))
writeData(wb, paste0("Module_", module), module_data)
}
saveWorkbook(wb, "step3_module_genes_detailed.xlsx", overwrite = TRUE)
cat("详细模块基因信息已保存到: step3_module_genes_detailed.xlsx\n")
}
# 创建模块基因数量条形图
module_plot_data <- module_summary[module_summary$Module != "grey", ] # 排除未分类基因
if(nrow(module_plot_data) > 0) {
p <- ggplot(module_plot_data, aes(x = reorder(Module, -GeneCount), y = GeneCount, fill = Module)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = module_plot_data$Module) +
labs(title = "各模块基因数量分布",
x = "模块",
y = "基因数量") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none")
ggsave("step3_module_gene_count_plot.pdf", p, width = 10, height = 6)
cat("模块基因数量分布图已保存到: step3_module_gene_count_plot.pdf\n")
}
# 显示模块统计信息
cat("\n=== 模块统计信息 ===\n")
print(module_summary)
cat("总基因数量:", nrow(module_genes), "\n")
cat("模块数量:", length(unique_modules), "\n")
cat("灰色模块(未分类)基因数量:", sum(moduleColors == "grey"), "\n")
save(net, moduleColors, file = "step3_genes_modules.Rdata")
####################### 4.关联基因模块与表型 #####################################
rm(list = ls())
load(file = "step1_input.Rdata")
load(file = "step2_power_value.Rdata")
load(file = "step3_genes_modules.Rdata")
# 确保数据是数值型
datExpr_numeric <- datExpr
for(i in 1:ncol(datExpr)) {
if(!is.numeric(datExpr[[i]])) {
datExpr_numeric[[i]] <- as.numeric(as.character(datExpr[[i]]))
datExpr_numeric[[i]][is.na(datExpr_numeric[[i]])] <- 0
}
}
## 模块与表型的相关性热图
if(T){
datTraits$group <- as.factor(datTraits$group)
design <- model.matrix(~0+datTraits$group)
colnames(design) <- levels(datTraits$group)
MES0 <- moduleEigengenes(datExpr_numeric, moduleColors)$eigengenes
MEs <- orderMEs(MES0)
traits_for_cor <- datTraits[, !colnames(datTraits) %in% "groupNo"]
# 确保表型数据是数值型
traits_for_cor_numeric <- traits_for_cor
for(col in colnames(traits_for_cor)) {
if(!is.numeric(traits_for_cor[[col]])) {
traits_for_cor_numeric[[col]] <- as.numeric(as.character(traits_for_cor[[col]]))
traits_for_cor_numeric[[col]][is.na(traits_for_cor_numeric[[col]])] <- 0
}
}
moduleTraitCor <- cor(MEs, traits_for_cor_numeric, use = "p")
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples)
textMatrix <- paste0(signif(moduleTraitCor,2),"\n(",
signif(moduleTraitPvalue,1),")")
dim(textMatrix) <- dim(moduleTraitCor)
pdf("step4_Module-trait-relationship_heatmap.pdf",
width = 0.6*ncol(traits_for_cor_numeric) + 2,
height = 0.6*nrow(moduleTraitCor) + 2)
par(mar=c(8, 10, 3, 3))
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = colnames(traits_for_cor_numeric),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = F,
colors = blueWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = F,
cex.text = 0.5,
zlim = c(-1,1),
main = "Module-trait relationships")
dev.off()
save(design, file = "step4_design.Rdata")
}
### 模块与表型的相关性boxplot图(针对分组)
if(T){
# 将MEs转换为数据框并添加行名作为列
MEs_with_rownames <- MEs
MEs_with_rownames$Sample <- rownames(MEs)
# 将datTraits转换为数据框并添加行名作为列
datTraits_with_rownames <- as.data.frame(datTraits)
datTraits_with_rownames$Sample <- rownames(datTraits)
# 按Sample列合并
mes_group <- merge(MEs_with_rownames, datTraits_with_rownames, by = "Sample")
library(ggpubr)
library(gridExtra)
draw_ggboxplot <- function(data, Module="Module", group="group"){
ggboxplot(data, x=group, y=Module,
ylab = paste0(Module),
xlab = group,
fill = group,
palette = "jco",
legend = "") +
stat_compare_means()
}
colorNames <- names(MEs)[!names(MEs) %in% "Sample"]
pdf("step4_Module-trait-relationship_boxplot.pdf", width = 7.5, height = 1.6*length(colorNames))
p <- lapply(colorNames, function(x) {
draw_ggboxplot(mes_group, Module = x, group = "group")
})
do.call(grid.arrange, c(p, ncol=2))
dev.off()
}
### 基因与模块、表型的相关性散点图
if(T){
modNames <- substring(names(MEs)[!names(MEs) %in% "Sample"], 3)
geneModuleMembership <- as.data.frame(cor(datExpr_numeric, MEs[,!names(MEs) %in% "Sample"], use = "p"))
MMPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
names(geneModuleMembership) <- paste0("MM", modNames)
names(MMPvalue) <- paste0("p.MM", modNames)
trait_names <- colnames(datTraits)[!colnames(datTraits) %in% c("group", "groupNo")]
for(trait_name in trait_names){
trait <- as.data.frame(datTraits[,trait_name])
names(trait) <- trait_name
geneTraitSignificance <- as.data.frame(cor(datExpr_numeric, trait, use = "p"))
GSPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
names(geneTraitSignificance) <- paste0("GS.", trait_name)
names(GSPvalue) <- paste0("p.GS.", trait_name)
selectModule <- modNames
pdf(paste0("step4_gene-Module-trait-significance_", trait_name, ".pdf"),
width=7, height=1.5*length(selectModule))
par(mfrow=c(ceiling(length(selectModule)/2),2))
for(module in selectModule){
column <- match(module, selectModule)
moduleGenes <- moduleColors == module
if(sum(moduleGenes) > 1){
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, 1]),
xlab = paste("Module Membership in", module, "module"),
ylab = paste("Gene significance for", trait_name),
main = paste("Module membership vs. gene significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2,
col = labels2colors(as.numeric(module)))
}
}
dev.off()
}
}
######################### 5. WGCNA可视化 ##################################
rm(list = ls())
load(file = 'step1_input.Rdata')
load(file = "step2_power_value.Rdata")
load(file = "step3_genes_modules.Rdata")
load(file = "step4_design.Rdata")
# 确保数据是数值型
datExpr_numeric <- datExpr
for(i in 1:ncol(datExpr)) {
if(!is.numeric(datExpr[[i]])) {
datExpr_numeric[[i]] <- as.numeric(as.character(datExpr[[i]]))
datExpr_numeric[[i]][is.na(datExpr_numeric[[i]])] <- 0
}
}
if(T){
TOM <- TOMsimilarityFromExpr(datExpr_numeric, power=power)
dissTOM <- 1-TOM
if(T){
geneTree = net$dendrograms[[1]]
plotTOM = dissTOM^7
diag(plotTOM)=NA
png("step5_TOMplot_Network-heatmap.png",width = 800, height=600)
TOMplot(plotTOM,geneTree,moduleColors,
col=gplots::colorpanel(250,'red',"orange",'lemonchiffon'),
main="Network heapmap plot")
dev.off()
}
}
### 模块相关性展示
if(T){
MEs = moduleEigengenes(datExpr_numeric, moduleColors)$eigengenes
MET = orderMEs(MEs)
if(T){
TF = as.data.frame(datTraits$TF)
names(TF) = "TF"
MET = orderMEs(cbind(MEs, TF))
}
pdf("step5_module_cor_Eigengene-dendrogram.pdf",width = 8,height = 10)
plotEigengeneNetworks(MET, setLabels="",
marDendro = c(0,4,1,4),
marHeatmap = c(5,5,1,2),
cex.lab = 0.8,
xLabelsAngle = 90)
dev.off()
}
cat("WGCNA分析完成!\n")。该代码结果模块与表型TF、TP只有强负相关的,没有强正相关,如何解决