本文的重点在于箱线图的调整,对ggplot2函数各个参数进行比较详细的剖析,方便绘图。
1 准备文件
准备文件包括基因列表、表达矩阵和分组信息:基因列表的第一列为待分析的基因名;表达矩阵行名为基因名,列名为样本名;分组信息包含样本名和分组两列(分组取值为Normal或Tumor)。
2 运行代码
2.1 加载R包、设定输入输出
# 加载必要的包
library(limma)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(tidyr)
# Output location. The path below is the author's own working directory and
# serves only as an example; adjust it to your machine.
output_folder <- "D:/Bio_Project/DEseq_batch/output"
# recursive = TRUE also creates any missing parent folders. Without it,
# dir.create() only warns and returns FALSE when a parent does not exist yet,
# leaving the output folder missing.
if (!dir.exists(output_folder)) {
  dir.create(output_folder, recursive = TRUE)
}
# Intended path of the figure file inside the output folder.
output_file <- file.path(output_folder, "boxplot.tiff")
# Input: switch to the folder holding the input files so that they can be
# referenced by bare file name below.
setwd("D:\\Bio_Project\\DEseq_batch\\input_GEO")
genelist <- "genelist.txt" # gene list file
exp <- "GEO.txt"           # expression matrix file
group <- "group.txt"       # sample grouping file
2.2 读取数据与预处理
# Gene list: the file has a header row; only its first column is used.
gene_list <- read.table(
  file = genelist,
  header = TRUE,
  stringsAsFactors = FALSE
)[[1]]
# Expression matrix: the first column supplies the row names;
# check.names = FALSE keeps the column names exactly as written in the file.
exp_data <- read.table(
  file = exp,
  header = TRUE,
  row.names = 1,
  stringsAsFactors = FALSE,
  check.names = FALSE
)
# Grouping table: whatever the header says, the columns are renamed to
# "sample" and "group" for the steps that follow.
group_info <- setNames(
  read.table(file = group, header = TRUE, stringsAsFactors = FALSE),
  c("sample", "group")
)
# Preprocessing ----
# Keep only the rows of the expression matrix whose row name is in gene_list.
exp_filtered <- exp_data[is.element(rownames(exp_data), gene_list), ]
# Transpose so that each row is a sample and each column a gene, then expose
# the row names as a "sample" column to serve as the merge key.
exp_t <- as.data.frame(t(exp_filtered))
exp_t[["sample"]] <- row.names(exp_t)
# Attach the group label of every sample to its expression values.
merged_data <- merge(x = exp_t, y = group_info, by = "sample")
2.3 差异分析
# Differential analysis: t-test per gene between the "Normal" and "Tumor"
# samples. The result, diff_results, has one row per tested gene with the
# columns `gene` and `p.value`.

# Empty template; guarantees the column layout even when no gene is testable.
diff_results <- data.frame(gene = character(),
                           p.value = numeric(),
                           stringsAsFactors = FALSE)
# Test each listed gene once, and only if it is present in merged_data.
# intersect() also removes duplicated entries of gene_list, which would
# otherwise yield duplicated result rows.
testable_genes <- intersect(gene_list, colnames(merged_data))
is_normal <- merged_data$group == "Normal"
is_tumor <- merged_data$group == "Tumor"
# Collect one single-row data frame per gene and bind them once at the end,
# instead of growing diff_results with rbind() inside a loop.
per_gene_rows <- lapply(testable_genes, function(gene) {
  normal_data <- merged_data[[gene]][is_normal]
  tumor_data <- merged_data[[gene]][is_tumor]
  # t.test() stops on too few observations or constant data. Such a gene is
  # skipped with a warning so that one bad gene does not abort the analysis.
  p_value <- tryCatch(
    t.test(normal_data, tumor_data)$p.value,
    error = function(e) {
      warning("Skipping gene '", gene, "': ", conditionMessage(e),
              call. = FALSE)
      NA_real_
    }
  )
  if (!isTRUE(is.finite(p_value))) {
    return(NULL)
  }
  data.frame(gene = gene, p.value = p_value, stringsAsFactors = FALSE)
})
diff_results <- do.call(
  rbind,
  c(list(diff_results), Filter(Negate(is.null), per_gene_rows))
)
# Translate each p-value into a significance symbol. The conditions are
# checked from the strictest threshold down; the first match wins, and
# anything not below 0.1 is labelled "ns".
diff_results$significance <- with(
  diff_results,
  case_when(
    p.value < 0.001 ~ "***",
    p.value < 0.01 ~ "**",
    p.value < 0.05 ~ "*",
    p.value < 0.1 ~ ".",
    TRUE ~ "ns"
  )
)
# Names of the genes regarded as significantly different (p.value < 0.05).
sig_genes <- diff_results$gene[diff_results$p.value < 0.05]
2.4 绘图
绘图部分代码我会拆解得比较细致(帮助大家绘出好看、标准的图),如果大家觉得我的绘图风格还可以的话可以直接抄我的代码(图太大上传不了,大家运行代码后就可以得到tiff)。
接下来是代码的拆解:
# Build the long-format plotting table: keep the significant genes, turn the
# gene columns into gene/expression pairs (one row per sample and gene), and
# attach the significance symbol of each gene.
plot_data <- merged_data %>%
  select(all_of(c("sample", "group")), all_of(sig_genes)) %>%
  pivot_longer(
    cols = !c(sample, group),
    names_to = "gene",
    values_to = "expression"
  ) %>%
  left_join(select(diff_results, gene, significance), by = "gene")
这段代码完成的是绘图输入数据的准备。%>% 是管道操作符,它可以将前一个函数的输出作为后一个函数的输入。
比如在这个例子中,如果不使用管道(merged_data %>% ...),就需要借助中间变量,写成:
# Step 1: keep the sample and group columns plus the significant genes.
selected_data <- select(merged_data, sample, group, all_of(sig_genes))
# Step 2: reshape from wide to long; every gene column becomes a pair of
# "gene" (the column name) and "expression" (its value).
long_data <- pivot_longer(
  selected_data,
  cols = !c(sample, group),
  names_to = "gene",
  values_to = "expression"
)
# Step 3: take the gene and significance columns of the test results.
significance_data <- select(diff_results, gene, significance)
# Step 4: left join, so every row of long_data receives its gene's symbol.
plot_data <- left_join(long_data, significance_data, by = "gene")
随后是最后的画图部分。首先是精简版本,不含多余的注释:
# Largest expression value over all plotted genes; the significance labels
# are placed relative to it so that they all sit at the same height.
# na.rm = TRUE: a single missing expression value would otherwise turn the
# maximum into NA and break both the label position and the y-axis limit.
global_y_max <- max(plot_data$expression, na.rm = TRUE)
# Label height: 5% of the maximum's magnitude above the maximum. For a
# positive maximum this equals global_y_max * 1.05; abs() keeps the label
# (and the upper axis limit) above the data when the maximum is negative.
sig_label_y <- global_y_max + 0.05 * abs(global_y_max)
p <- ggplot(plot_data, aes(x = gene, y = expression, color = group)) +
  geom_boxplot(outlier.shape = NA, fill = NA, width = 0.6) +
  geom_jitter(
    position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.6),
    alpha = 0.6, size = 0.75
  ) +
  # Significance labels, all at one common height
  geom_text(
    data = diff_results %>% filter(gene %in% sig_genes),
    aes(x = gene, y = sig_label_y, label = significance),
    inherit.aes = FALSE, size = 5, vjust = 0.5
  ) +
  scale_color_manual(values = c("Normal" = "#2076b2", "Tumor" = "#af280a")) +
  labs(x = "Genes", y = "Expression Value", color = "Group") +
  # Extend the y axis so that the labels fit inside the panel
  scale_y_continuous(limits = c(NA, sig_label_y)) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 20),
    legend.text = element_text(size = 20),
    legend.title = element_text(size = 20),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(fill = NA, color = "black"),
    # Legend on the right-hand side, vertically centred
    legend.position = "right",
    legend.justification = "center",
    # Optional: widen the right margin to leave room for the legend
    # plot.margin = margin(1, 3, 1, 1, "cm"),
    # Legend box style. No border is drawn (color = NA), so the deprecated
    # `size` argument of element_rect() is not needed and has been dropped.
    legend.box.background = element_rect(color = NA, fill = "white"),
    legend.box.margin = margin(0, 0, 0, 10)
  ) +
  # Legend layout.
  # NOTE(review): recent ggplot2 releases appear to deprecate these
  # guide_legend() arguments in favour of theme() settings - verify against
  # the installed version before migrating.
  guides(color = guide_legend(
    title.position = "top",
    title.hjust = 0.5,
    label.position = "right",
    keywidth = unit(0.5, "cm"),
    override.aes = list(size = 2) # size of the symbols shown in the legend
  ))
print(p)
随后是注释版本,逐一说明了ggplot中每个函数及其参数的作用:
# Height of the significance labels: the largest expression value plus 5% of
# its magnitude. na.rm = TRUE ignores missing expression values, which would
# otherwise make the maximum NA; abs() keeps the label above the data even
# when the maximum is negative (for a positive maximum this is max * 1.05).
global_y_max <- max(plot_data$expression, na.rm = TRUE)
sig_label_y <- global_y_max + 0.05 * abs(global_y_max)
# Create the ggplot object with the global mappings: gene on the x axis,
# expression on the y axis, colour mapped to the group
p <- ggplot(plot_data, aes(x = gene, y = expression, color = group)) +
  # Box plot layer: outliers hidden, boxes not filled, width 0.6
  geom_boxplot(outlier.shape = NA, fill = NA, width = 0.6) +
  # Jittered point layer: horizontal jitter of 0.2 to reduce overlap, points
  # dodged by group with a dodge width of 0.6;
  # points are semi-transparent (alpha = 0.6) and small (size = 0.75)
  geom_jitter(
    position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.6),
    alpha = 0.6, size = 0.75
  ) +
  # Significance labels: one symbol per gene, all at the same height
  # (sig_label_y). The layer uses its own data (diff_results) and does not
  # inherit the global mappings; text size 5, vertically centred
  geom_text(
    data = diff_results %>% filter(gene %in% sig_genes),
    aes(x = gene, y = sig_label_y, label = significance),
    inherit.aes = FALSE, size = 5, vjust = 0.5
  ) +
  # Manual colours: Normal in blue (#2076b2), Tumor in red (#af280a)
  scale_color_manual(values = c("Normal" = "#2076b2", "Tumor" = "#af280a")) +
  # Axis titles and legend title
  labs(x = "Genes", y = "Expression Value", color = "Group") +
  # Y-axis range: upper limit raised to the label height so the significance
  # labels fit; the lower limit is left to ggplot (NA)
  scale_y_continuous(limits = c(NA, sig_label_y)) +
  # Minimal base theme
  theme_minimal() +
  # Custom theme elements
  theme(
    # x-axis labels rotated by 45 degrees and right-aligned, size 12
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    # y-axis labels, size 12
    axis.text.y = element_text(size = 12),
    # Axis titles, size 20
    axis.title = element_text(size = 20),
    # Legend text and legend title, size 20
    legend.text = element_text(size = 20),
    legend.title = element_text(size = 20),
    # Hide the grid lines
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # Black border around the panel
    panel.border = element_rect(fill = NA, color = "black"),
    # Legend on the right-hand side, vertically centred
    legend.position = "right",
    legend.justification = "center",
    # Legend background and margin. No border is drawn (color = NA), so the
    # deprecated `size` argument of element_rect() has been dropped.
    legend.box.background = element_rect(color = NA, fill = "white"),
    legend.box.margin = margin(0, 0, 0, 10)
  ) +
  # Legend layout.
  # NOTE(review): recent ggplot2 releases appear to deprecate these
  # guide_legend() arguments in favour of theme() settings - verify against
  # the installed version before migrating.
  guides(color = guide_legend(
    title.position = "top",     # legend title above the keys
    title.hjust = 0.5,          # legend title horizontally centred
    label.position = "right",   # labels to the right of the keys
    keywidth = unit(0.5, "cm"), # width of the legend keys
    # Size of the symbols shown in the legend
    override.aes = list(size = 2)
  ))
ggplot中起主要作用的函数: