基因集变异分析(Gene Set Variation Analysis,GSVA)是一种用于揭示基因集(通路)在不同组中的差异性的计算方法。GSVA的作用是将单个基因表达水平转化为整个基因集的活跃度得分,并比较不同样本以及组间基因集的变异程度。相较于传统的基因水平的差异检验,GSVA能够捕捉到整个基因集在样本组之间可能存在的差异,上调还是抑制。这有助于更好地理解基因集在生物学过程中的功能,有助于发现与特定生理或病理状态相关的重要基因集。GSVA可以应用于多种研究领域,例如癌症研究、药物发现和生物标记物鉴定等。
GSVA教程如下:
library(GSVA)
#1
#computeGeneSetsOverlap:计算输入的每对基因集的重叠
geneSets <- list(set1=as.character(1:4), set2=as.character(4:10))
computeGeneSetsOverlap(geneSets, unique(unlist(geneSets)))#得出25%的基因是两组共有的
#2
#filterGeneSets:在最大和最小范围内挑选基因
geneSets <- list(set1=as.character(1:4), set2=as.character(4:10))
filterGeneSets(geneSets, min.sz=6,max.sz=7)#挑选最小值小于6,最大值大于7的基因集
#3
#gsva:GSVA富集分析
library(limma)
p <- 10 ## 基因数量
n <- 30 ## 样本数量
nGrp1 <- 15 ## 组1中样本数量
nGrp2 <- n - nGrp1 ## 组2中样本数量
## 3个分开的基因集
geneSets <- list(set1=paste("g", 1:3, sep=""),
set2=paste("g", 4:6, sep=""),
set3=paste("g", 7:10, sep=""))
##从一个满足均数为0,标准差为1的正态分布数据中抽样
y <- matrix(rnorm(n*p), nrow=p, ncol=n,
dimnames=list(paste("g", 1:p, sep="") , paste("s", 1:n, sep="")))#行名和列名
## 在组2中基因集1的基因表达水平较高
y[geneSets$set1, (nGrp1+1):n] <- y[geneSets$set1, (nGrp1+1):n] + 2
##构件设计矩阵
design <- cbind(sampleGroup1=1, sampleGroup2vs1=c(rep(0, nGrp1), rep(1, nGrp2)))
## 拟合线性模型
fit <- lmFit(y, design)
## 估计调节t-统计量
fit <- eBayes(fit)
##基因集1中基因的差异表达
topTable(fit, coef="sampleGroup2vs1")
##估计3个基因集的GSVA富集评分
gsva_es <- gsva(y, geneSets, mx.diff=1)
## 对GSVA富集分数进行线性拟合
fit <- lmFit(gsva_es, design)
## 估计调节t-统计量
fit <- eBayes(fit)
## 基因集1中基因的差异表达
topTable(fit, coef="sampleGroup2vs1")
#4
#igsva():启动交互式GSVA的web应用程序
res <- igsva()
实例操作:对TCGA-COAD数据进行GSVA分析
#绘制热图
#读取基因表达数据
load("排列有序的临床信息和基因表达矩阵.RData")
#导入特定基因集
#免疫相关基因集
immo <- msigdbr(species = "Homo sapiens",
category = "C7",
subcategory = "IMMUNESIGDB")
immo_df<-dplyr::select(immo,gs_name,gs_exact_source,gene_symbol)
immo_list<-split(immo_df$gene_symbol,immo_df$gs_name) ##按照gs_name给gene_symbol分组
#gsva分析
class(matrix_2)
data1<-as.matrix(matrix_2)[,1:50]
str(data1)
gsva_mat <- gsva(data1,
immo_list,
kcdf="Gaussian" ,#"Gaussian" for logCPM,logRPKM,logTPM, "Poisson" for counts
verbose=T,
parallel.sz = parallel::detectCores())#调用所有核
#limma差异分析
#读取分组信息
load("分组信息.Rdata")
identical(rownames(moxing),colnames(data1))
group_list<-moxing$group
design <- model.matrix(~0+factor(group_list))#制作混淆矩阵
colnames(design) <- levels(factor(group_list))
rownames(design) <- colnames(gsva_mat)
contrast.matrix <- makeContrasts(contrasts=paste0("high",'-',"low"),
levels = design)#制作比较矩阵
fit1 <- lmFit(gsva_mat,design) #拟合模型
fit2 <- contrasts.fit(fit1, contrast.matrix) #统计检验
efit <- eBayes(fit2) #修正
summary(decideTests(efit, p.value=0.05)) #统计查看差异结果
tempOutput<- topTable(efit, coef=paste0("high",'-',"low"), n=Inf)#提取数据
degs <- na.omit(tempOutput)
#绘制热图
##设置筛选阈值
padj_cutoff=0.05
keep <- rownames(degs[degs$adj.P.Val<padj_cutoff, ])#提选出p值小于0.05的通路
length(keep)
dat <- gsva_mat[keep[1:20],] #选取前20进行展示
gl<-data.frame(group=factor(group_list))
rownames(gl)<-colnames(dat)
#对dat进行重排列
identical(rownames(gl),colnames(dat))
sort_index <- order(gl$group)
group_sorted <- gl[sort_index,,drop = FALSE]#对行排列
dat_ordered <- dat[,rownames(group_sorted), drop = FALSE]#对列重排
identical(rownames(group_sorted),colnames(dat_ordered))
pheatmap::pheatmap(dat_ordered,
fontsize_row = 8,
height = 10,
width=18,
annotation_col =group_sorted,
cluster_col = FALSE,
show_colnames = F,
show_rownames = T,filename = 'GSVA_heatmap.pdf')
#绘制火山图
degs$significance<-as.factor(ifelse(degs$adj.P.Val < padj_cutoff & abs(degs$logFC) > 0,
ifelse(degs$logFC > 0 ,'UP','DOWN'),'NOT'))
table(degs$significance)
title <- paste0(' Up : ',nrow(degs[degs$significance =='UP',]) ,
'\n Down : ',nrow(degs[degs$significance =='DOWN',]),
'\n adj.P.Val < ',padj_cutoff,
'\n logFC >0 ')
ggplot(data=degs,
aes(x=logFC, y=-log10(adj.P.Val),
color=significance)) +
geom_point(alpha=0.4, size=1) +#绘制点
theme_classic()+ #无网格线
xlab("logFC") + #x轴
ylab("-log10(adj.P.Val)") +
#标题
ggtitle( title ) +
#分区颜色
scale_colour_manual(values = c('green','black','red'))+
#辅助线
geom_vline(xintercept = 0,lty=4,col="black",lwd=0.8) +
geom_hline(yintercept = -log10(padj_cutoff),lty=4,col="black",lwd=0.8) +
#图例标题间距等设置
theme(plot.title = element_text(hjust = 0.5),
plot.margin=unit(c(2,2,2,2),'lines'), #上右下左
legend.title = element_blank(),
legend.position="right")
#绘制条形图
library(magrittr)
library(ggthemes)
library(ggprism)
cutoff <- 0.0000000001
#挑选出前30个通路
path <- rbind(
subset(degs, logFC > 0) %>% arrange(adj.P.Val) %$% .[1:30,],
subset(degs, logFC < 0) %>% arrange(adj.P.Val) %$% .[1:30,]
)
#生成绘图所需的数据
plot_data <- data.frame(
id = row.names(path),
p = path$adj.P.Val,
logFC = path$logFC
)
plot_data$group <- ifelse(plot_data$logFC > 0, 1, -1)
plot_data$lg_p <- -log10(plot_data$p) * plot_data$group
plot_data$threshold <- factor(
ifelse(abs(plot_data$p) < cutoff, ifelse(plot_data$logFC > 0, 'Up', 'Down'), 'Not'),
levels = c('Up', 'Down', 'Not')
)
plot_data <- plot_data %>% arrange(lg_p)#按照lg_p排列
plot_data$id <- factor(plot_data$id, levels = plot_data$id)#x坐标转化为因子
down2 <- plot_data %>% filter(lg_p < log10(cutoff)) %>% nrow()
down1 <- plot_data %>% filter(lg_p < 0) %>% nrow()
up1 <- plot_data %>% filter(lg_p < -log10(cutoff)) %>% nrow()
up2 <- nrow(plot_data)
#绘图
ggplot(data = plot_data, aes(x = id, y = lg_p, fill = threshold)) +
geom_col() +
coord_flip() +
scale_fill_manual(values = c('Up' = 'red', 'Not' = 'grey', 'Down' = 'green')) +#填充颜色
geom_hline(yintercept = c(-log10(cutoff), log10(cutoff)), color = 'white', size = 0.5, lty = 'dashed') +#辅助线
xlab('Pathway') +
ylab('-log10(adj.P.Val) of GSVA') +
guides(fill = "none") +
theme_prism(border = T) +
theme(
axis.text.y = element_blank(),
axis.ticks.y = element_blank()
) +
geom_text(data = plot_data[1:down1, ], aes(x = id, y = 0.1, label = id), hjust = 0, color = 'black') +
geom_text(data = plot_data[(down1 + 1):down2, ], aes(x = id, y = 0.1, label = id), hjust = 0, color = 'grey') +
geom_text(data = plot_data[(down2 + 1):up1, ], aes(x = id, y = -0.1, label = id), hjust = 1, color = 'grey') +
geom_text(data = plot_data[(up1 + 1):up2, ], aes(x = id, y = -0.1, label = id), hjust = 1, color = 'black')
#绘制弦图
#引入包
library(circlize)
#导入数据:列名为通路,行名为分组,填充值为平均GSVA结果
#分组信息
xuan1<-t(gsva_mat[rownames(path),]) %>% data.frame
identical(rownames(xuan1),rownames(moxing))
xuan1$group<-moxing$group
str(xuan1)
# 按照分组计算不同通路的平均值
library(data.table)
asd<-data.table::dcast(xuan1,group~,fun=mean)
# 按照分组计算不同通路的平均值
result <- xuan1 %>%
group_by(group) %>%
summarize(across(1:20, mean)) %>% data.frame
result1<-result
rownames(result1)<-result1$group
str(result1)
result1<-result1[,-1] %>% as.matrix()
chordDiagram(result1)
chordDiagram(result1, grid.col = grid.col,annotationTrack = "grid",transparency = 0.8)
参考文献: