20250108记录:第一次跟着视频完成复现,期间仍然遇到了各种各样的问题,开始总是很艰难的,这点流程花费了我两个半天才啃下来。“路漫漫其修远兮”,不过没关系,慢慢来,总会有收获~
dataset from GSE224528, the title of article "Single-cell RNA sequencing reveals distinct tumor microenvironmental patterns in lung adenocarcinoma". Steps as follows:
#step1:读入数据,构建单细胞对象(参数counts是前面的表达矩阵,project是自定义项目的名称,min.cells是一个基因至少在3个细胞表达,min.features是一个细胞至少包含200个基因)
sce <- CreateSeuratObject(counts = gse, project = "pro1_run", min.cells = 3, min.features = 200)
sce
#step2: QC (根据文章200<nFeature_RNA<4000, percent.mt<10) 并小提琴图展示
sce[["percent.mt"]] <- PercentageFeatureSet(sce, pattern = "^MT-") #在sce矩阵中增加一列线粒体基因所占百分比,用作筛选条件之一(存在于meta.data)
sce
sce2 <- subset(sce, subset = nFeature_RNA>200 & nFeature_RNA<4000 & percent.mt<10)
sce2
VlnPlot(sce2, features = c("nCount_RNA", "nFeature_RNA", "percent.mt")) #VlnPlot是Seurat函数的画图函数,参数features指定三个变量
#step3: 降维聚类分群(单个样品无需merge,此处利用SCTransform可以替代”筛选高变基因-缩放-标准化三步骤)
sce2 <- SCTransform(sce2, verbose = F) # verbose参数为F是为了只显示最终结果,简化输出
sce2 <- RunPCA(sce2, verbose = F) #线性降维,得到一系列主成分(文章未给定参数,一般就自定义为20个PC)
sce2 <- RunUMAP(sce2, dims = 1:20, verbose = F) #非线性降维,dims参数是指定umap降维时所使用的数据维度
sce2 <- RunTSNE(sce2, dims = 1:20) #非线性降维,dims参数是指定umap降维时所使用的数据维度
sce2 <- FindNeighbors(sce2, dims = 1:20, verbose = F) #通过计算细胞间的距离进行细胞分群—依据细胞间的“共享邻居”。邻居越多,则两个细胞群越近。所以dims值越大,则分出的细胞群越多。
sce2 <- FindClusters(sce2, verbose = F) #识别细胞类群
DimPlot(sce2, reduction = "umap") +DimPlot(sce2, reduction = "tsne") #展示非线性降维的细胞分群结果
DotPlot(sce2, features = c("CD3G", "CD4")) #散点图画出感兴趣的基因主要分布在哪个细胞群 (文章分群是T cell所以根据T细胞的marker就能知道T细胞主要分布在那些群)
table(sce2$seurat_clusters) # 计算每个群的细胞数量如何
#step4: 从seurat_clusters中提取感兴趣的细胞亚群
sce_T <- subset(sce2, subset = (seurat_clusters %in% c(0,8,10,11))) #依据前面的散点图知道T细胞主要是在细胞群0/8/10/11中
sce_T
{
sce_T <- CreateSeuratObject(counts = sce_T@assays$RNA$counts, meta.data = sce_T@meta.data[,1:4])
sce_T <- NormalizeData(sce_T, normalization.method = "LogNormalize", scale.factor = 1e4)
GetAssay(sce_T, assay = "RNA")
sce_T <- FindVariableFeatures(sce_T, selection.method = "vst", nfeatures = 2000) #设定高变基因数量为2000
?FindClusters
sce_T <- ScaleData(sce_T)
sce_T <- RunPCA(object = sce_T, pc.genes = VariableFeatures(sce_T))
sce_T
sce_T <- FindNeighbors(sce_T, dims = 1:15)
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.8, 1)) {
sce_T = FindClusters(sce_T,
resolution = res, algorithm = 1)
}
sce_T <- RunUMAP(sce_T, dims = 1:15)
}
table(sce_T$RNA_snn_res.0.8)
sel.clust <- "RNA_snn_res.0.8" #可以将0.8换为0.5 或1 都试一下43,44这两行代码,看看区别(一般认为resolution取值在0.4~1.2较好)
DefaultAssay(sce_T) <- "RNA"
#step5: 手动命名单细胞亚群并画图,通过将基因名写到genes_to_check 就可以对其进行可视化
check_by_ident <- function(iden){
sce_T <- SetIdent(sce_T,value = iden)
p = DimPlot(sce_T,reduction = "umap",label = T,repel = T)+
VlnPlot(sce_T,
features = c("nCount_RNA","nFeature_RNA","percent.mt"))
ggsave(paste0(iden,"_DimPlot_qc.pdf"),plot = p,
width = 15,height = 6)
markers <- FindAllMarkers(sce_T, only.pos = T, min.pct = 0.25, logfc.threshold = 0.25)
all.markers =markers %>% dplyr::select(gene, everything()) %>% subset(p_val<0.05)
dim(all.markers)
table(all.markers$cluster)
top10 =all.markers %>% group_by(cluster) %>% top_n(n=10, wt = avg_log2FC)
DoHeatmap(sce_T,features = top10$gene)
ggsave(paste0(iden,"_heatmap_to10_genes.pdf"),width = 6, height = 9)
genes_to_check <- c("TCF7", "LEF1", "IL7R", "CD28", "FAS", "CXCR5", "CCR7", "GPR183",
"BCL6", "IL21", "CXCL13", "TNFSF8",
"E2F8", "MKI67", "PCNA", "MCM2",
"EOMES", "NKG7", "PRF1", "SLAMF7", "GZMB", "CCL4", "GZMK")
DotPlot(sce_T,features=genes_to_check)+coord_flip()
ggsave(paste0(iden,"_genes_to_check_DotPlot.pdf"),width = 6,height = 6)
FeaturePlot(sce_T,genes_to_check)
ggsave(paste0(iden,"_genes_to_check_FeaturePlot.pdf"),width = 15,height = 15)
}
check_by_ident('RNA_snn_res.0.8')
#check_by_ident('RNA_snn_res.0.5')
#check_by_ident('RNA_snn_res.1')
##注释,根据dotplot将相似的群进行合并注释(可以自定义,这里根据文章),然后依据markergene对每个细胞群进行命名
{
genes_to_check <- c("TCF7", "LEF1", "IL7R", "CD28", "FAS", "CXCR5", "CCR7", "GPR183",
"BCL6", "IL21", "CXCL13", "TNFSF8",
"E2F8", "MKI67", "PCNA", "MCM2",
"EOMES", "NKG7", "PRF1", "SLAMF7", "GZMB", "CCL4", "GZMK")
pdf("dotplot_DimPlot.pdf", 10,6)
DimPlot(sce_T, reduction = "umap", label = T)+
DotPlot(sce_T, features = genes_to_check)+coord_flip()
dev.off()
celltype=data.frame(ClusterID=0:6, celltype=0:6) #参数是看细胞分群0到6群
celltype[celltype$ClusterID %in% c(0),2]="TCF-hi"
celltype[celltype$ClusterID %in% c(2),2]="TFH-like"
celltype[celltype$ClusterID %in% c(1,5),2]="Tcyto"
celltype[celltype$ClusterID %in% c(4),2]="TCycling1"
celltype[celltype$ClusterID %in% c(3,6),2]="TCycling2"
VlnPlot(sce_T, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"))
for (i in 1:nrow(celltype)) {
sce_T@meta.data[which(sce_T@meta.data$RNA_snn_res.0.8==celltype$ClusterID[i]), 'celltype'] <- celltype$celltype[i]
}
table(sce_T$celltype)
Idents(sce_T) <- sce_T$celltype
check_by_ident('celltype')
100*table(sce_T$celltype)/ncol(sce_T) #计算每个细胞群所占百分比(影响因素甚多,如阈值或者p)
}
#step6:检查通路在这些亚群的打分----
library(msigdbr)
library(RColorBrewer)
gene_sets <- msigdbr(species = "Homo sapiens", category = "C2") %>%
dplyr::filter(gs_name=="REACTOME_TCR_SIGNALING")
#这里选择的是MSigDB数据库中REACTOME_TCR_SIGNALING基因集(版本变更也会变,或GOCC_T_CELL_RECEPTOR_COMPLEX或KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY等基因集不唯一去网站搜索)
#category参数需要去搜。MSigDB中信号通路相关的基因集会被划分到不同的类别(category)中。
unique(gene_sets$gene_symbol)
TCR_l <- list(TCR_signaling=unique(gene_sets$gene_symbol)) #对基因集进行list
TCR_l
sce_T <- AddModuleScore(sce_T,TCR_l) #进行打分,打分方法不唯一
FeaturePlot(sce_T,"Cluster1") 展示sce_T对象中meta.data中的Cluster1(细胞亚群标记)进行可视化
VlnPlot(sce_T,"Cluster1")
ggsave("REACTOME_TCR_SIGNALING signaling.pdf") #将小提琴图保存为pdf
#cell cycling signaling
gene_sets2 <- msigdbr(species = "Homo sapiens", category = "C5") %>%
dplyr::filter(gs_name=="GOBP_CELL_CYCLE") #或BIOCARTA_G1_PATHWAY等
unique(gene_sets2$gene_symbol)
cellcycling_l <- list(cell_cycle=unique(gene_sets2$gene_symbol)) #对基因集进行list
sce_T <- AddModuleScore(sce_T,cellcycling_l) #进行打分
FeaturePlot(sce_T,"Cluster1") 展示sce_T对象中meta.data中的Cluster1(细胞亚群标记)进行可视化
VlnPlot(sce_T,"Cluster1") #去和前面的dotplot结果进行比对(由于文章没有提供基因列表,有出入)
ggsave("cellcycling.pdf")