开始文章复现啦～_sce.markers <- findallmarkers(sce.all, only.pos = -优快云博客

本文链接：https://blog.youkuaiyun.com/Dipsy_Xiaoyu/article/details/144996850
20250108记录：第一次跟着视频完成复现，期间仍然遇到了各种各样的问题，开始总是很艰难的，这点流程花费了我两个半天才啃下来。“路漫漫其修远兮”，不过没关系，慢慢来，总会有收获～
dataset from GSE224528, the title of article "Single-cell RNA sequencing reveals distinct tumor microenvironmental patterns in lung adenocarcinoma". Steps as follows:
#step1:读入数据，构建单细胞对象(参数counts是前面的表达矩阵，project是自定义项目的名称，min.cells是一个基因至少在3个细胞表达，min.features是一个细胞至少包含200个基因)
sce <- CreateSeuratObject(counts = gse, project = "pro1_run", min.cells = 3, min.features = 200)
sce

#step2: QC (根据文章200<nFeature_RNA<4000, percent.mt<10) 并小提琴图展示
sce[["percent.mt"]] <- PercentageFeatureSet(sce, pattern = "^MT-") #在sce矩阵中增加一列线粒体基因所占百分比，用作筛选条件之一(存在于meta.data)
sce
sce2 <- subset(sce, subset = nFeature_RNA>200 & nFeature_RNA<4000 & percent.mt<10)
sce2
VlnPlot(sce2, features = c("nCount_RNA", "nFeature_RNA", "percent.mt")) #VlnPlot是Seurat函数的画图函数，参数features指定三个变量

#step3: 降维聚类分群（单个样品无需merge，此处利用SCTransform可以替代”筛选高变基因-缩放-标准化三步骤）
sce2 <- SCTransform(sce2, verbose = F) # verbose参数为F是为了只显示最终结果，简化输出
sce2 <- RunPCA(sce2, verbose = F)  #线性降维，得到一系列主成分（文章未给定参数，一般就自定义为20个PC）
sce2 <- RunUMAP(sce2, dims = 1:20, verbose = F) #非线性降维，dims参数是指定umap降维时所使用的数据维度
sce2 <- RunTSNE(sce2, dims = 1:20) #非线性降维，dims参数是指定umap降维时所使用的数据维度
sce2 <- FindNeighbors(sce2, dims = 1:20, verbose = F) #通过计算细胞间的距离进行细胞分群—依据细胞间的“共享邻居”。邻居越多，则两个细胞群越近。所以dims值越大，则分出的细胞群越多。
sce2 <- FindClusters(sce2, verbose = F) #识别细胞类群
DimPlot(sce2, reduction = "umap") +DimPlot(sce2, reduction = "tsne") #展示非线性降维的细胞分群结果
DotPlot(sce2, features = c("CD3G", "CD4")) #散点图画出感兴趣的基因主要分布在哪个细胞群 （文章分群是T cell所以根据T细胞的marker就能知道T细胞主要分布在那些群）
table(sce2$seurat_clusters) # 计算每个群的细胞数量如何

#step4: 从seurat_clusters中提取感兴趣的细胞亚群

sce_T <- subset(sce2, subset = (seurat_clusters %in% c(0,8,10,11))) #依据前面的散点图知道T细胞主要是在细胞群0/8/10/11中
sce_T
{
sce_T <- CreateSeuratObject(counts = sce_T@assays$RNA$counts, meta.data = sce_T@meta.data[,1:4])
sce_T <- NormalizeData(sce_T, normalization.method = "LogNormalize", scale.factor = 1e4)
GetAssay(sce_T, assay = "RNA")
sce_T <- FindVariableFeatures(sce_T, selection.method = "vst", nfeatures = 2000) #设定高变基因数量为2000
?FindClusters
sce_T <- ScaleData(sce_T)
sce_T <- RunPCA(object = sce_T, pc.genes = VariableFeatures(sce_T))
sce_T
sce_T <- FindNeighbors(sce_T, dims = 1:15)
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.8, 1)) {
  sce_T = FindClusters(sce_T, 
                       resolution = res, algorithm = 1)
}
 sce_T <- RunUMAP(sce_T, dims = 1:15)
}
table(sce_T$RNA_snn_res.0.8) 
sel.clust <- "RNA_snn_res.0.8" #可以将0.8换为0.5 或1 都试一下43，44这两行代码，看看区别（一般认为resolution取值在0.4～1.2较好）
DefaultAssay(sce_T) <- "RNA"

#step5: 手动命名单细胞亚群并画图，通过将基因名写到genes_to_check 就可以对其进行可视化
check_by_ident <- function(iden){
  sce_T <- SetIdent(sce_T,value = iden)
  p = DimPlot(sce_T,reduction = "umap",label = T,repel = T)+
  VlnPlot(sce_T, 
          features = c("nCount_RNA","nFeature_RNA","percent.mt"))
  ggsave(paste0(iden,"_DimPlot_qc.pdf"),plot = p,
         width = 15,height = 6)
  
  markers <- FindAllMarkers(sce_T, only.pos = T, min.pct = 0.25, logfc.threshold = 0.25)
  all.markers =markers %>% dplyr::select(gene, everything()) %>% subset(p_val<0.05)
  dim(all.markers)
  table(all.markers$cluster)
  top10 =all.markers %>% group_by(cluster) %>% top_n(n=10, wt = avg_log2FC)
  DoHeatmap(sce_T,features = top10$gene)
  ggsave(paste0(iden,"_heatmap_to10_genes.pdf"),width = 6, height = 9)
         
  genes_to_check <- c("TCF7", "LEF1", "IL7R", "CD28", "FAS", "CXCR5", "CCR7", "GPR183", 
                      "BCL6", "IL21", "CXCL13", "TNFSF8",
                      "E2F8", "MKI67", "PCNA", "MCM2",
                      "EOMES", "NKG7", "PRF1", "SLAMF7", "GZMB", "CCL4", "GZMK")
  DotPlot(sce_T,features=genes_to_check)+coord_flip()
  ggsave(paste0(iden,"_genes_to_check_DotPlot.pdf"),width = 6,height = 6)
  
  FeaturePlot(sce_T,genes_to_check)
  ggsave(paste0(iden,"_genes_to_check_FeaturePlot.pdf"),width = 15,height = 15)
  
  }

check_by_ident('RNA_snn_res.0.8')
#check_by_ident('RNA_snn_res.0.5')
#check_by_ident('RNA_snn_res.1')

##注释，根据dotplot将相似的群进行合并注释（可以自定义，这里根据文章），然后依据markergene对每个细胞群进行命名
{
genes_to_check <- c("TCF7", "LEF1", "IL7R", "CD28", "FAS", "CXCR5", "CCR7", "GPR183", 
                    "BCL6", "IL21", "CXCL13", "TNFSF8",
                    "E2F8", "MKI67", "PCNA", "MCM2",
                    "EOMES", "NKG7", "PRF1", "SLAMF7", "GZMB", "CCL4", "GZMK")
pdf("dotplot_DimPlot.pdf", 10,6)
DimPlot(sce_T, reduction = "umap", label = T)+
  DotPlot(sce_T, features = genes_to_check)+coord_flip()
dev.off()
celltype=data.frame(ClusterID=0:6, celltype=0:6)  #参数是看细胞分群0到6群
celltype[celltype$ClusterID %in% c(0),2]="TCF-hi"
celltype[celltype$ClusterID %in% c(2),2]="TFH-like"
celltype[celltype$ClusterID %in% c(1,5),2]="Tcyto"

celltype[celltype$ClusterID %in% c(4),2]="TCycling1"
celltype[celltype$ClusterID %in% c(3,6),2]="TCycling2"

VlnPlot(sce_T, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"))

for (i in 1:nrow(celltype)) {
  sce_T@meta.data[which(sce_T@meta.data$RNA_snn_res.0.8==celltype$ClusterID[i]), 'celltype'] <- celltype$celltype[i]
}
table(sce_T$celltype)
Idents(sce_T) <- sce_T$celltype
check_by_ident('celltype')
100*table(sce_T$celltype)/ncol(sce_T) #计算每个细胞群所占百分比(影响因素甚多，如阈值或者p)
}

#step6:检查通路在这些亚群的打分----
library(msigdbr)
library(RColorBrewer)
gene_sets <- msigdbr(species = "Homo sapiens", category = "C2") %>%
  dplyr::filter(gs_name=="REACTOME_TCR_SIGNALING")   
#这里选择的是MSigDB数据库中REACTOME_TCR_SIGNALING基因集（版本变更也会变，或GOCC_T_CELL_RECEPTOR_COMPLEX或KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY等基因集不唯一去网站搜索）
#category参数需要去搜。MSigDB中信号通路相关的基因集会被划分到不同的类别（category）中。
unique(gene_sets$gene_symbol)
TCR_l <- list(TCR_signaling=unique(gene_sets$gene_symbol)) #对基因集进行list
TCR_l
sce_T <- AddModuleScore(sce_T,TCR_l)  #进行打分，打分方法不唯一
FeaturePlot(sce_T,"Cluster1") 展示sce_T对象中meta.data中的Cluster1（细胞亚群标记）进行可视化
VlnPlot(sce_T,"Cluster1")
ggsave("REACTOME_TCR_SIGNALING signaling.pdf") #将小提琴图保存为pdf

#cell cycling signaling
gene_sets2 <- msigdbr(species = "Homo sapiens", category = "C5") %>%
  dplyr::filter(gs_name=="GOBP_CELL_CYCLE") #或BIOCARTA_G1_PATHWAY等
unique(gene_sets2$gene_symbol)
cellcycling_l <- list(cell_cycle=unique(gene_sets2$gene_symbol)) #对基因集进行list
sce_T <- AddModuleScore(sce_T,cellcycling_l)  #进行打分
FeaturePlot(sce_T,"Cluster1") 展示sce_T对象中meta.data中的Cluster1（细胞亚群标记）进行可视化
VlnPlot(sce_T,"Cluster1") #去和前面的dotplot结果进行比对（由于文章没有提供基因列表，有出入）
ggsave("cellcycling.pdf")