生信技能树 数据挖掘课程结合公众号推文学习笔记
TCGA数据获取方式
1. 如何使用R语言的cgdsr
包来获取任意TCGA数据
查看有多少不同的癌症数据集
cBioPortal
是按照发表文章的方式来组织TCGA数据的,当然,里面也还有很多非TCGA的数据集,所有的数据集如下所示:
library(cgdsr)
library(DT)
# Get list of cancer studies at server ## 获取有哪些数据集
mycgds <- CGDS("http://www.cbioportal.org/public-portal/")
all_TCGA_studies <- getCancerStudies(mycgds)
# all_TCGA_studies[1:3, 1:2]
#write.csv(all_TCGA_studies,paste0(Sys.time(),"all_TCGA_studies.csv"),row.names = F)
DT::datatable(all_TCGA_studies)
也可以去网站上面查看这些数据集的详细信息http://www.cbioportal.org/data_sets.jsp
查看任意数据集的样本列表方式
cancer_study_id
就是数据集的名字,选一个stad_tcga_pub
数据集为例
stad2014 <- "stad_tcga_pub"
# 获取在数据集中有哪些表格(每个表格都是一个样本列表)
all_tables <- getCaseLists(mycgds, stad2014)
dim(all_tables) # 共11种样本列表方式
DT::datatable(all_tables[,1:3])
查看任意数据集的数据形式
all_dataset <- getGeneticProfiles(mycgds, stad2014)
DT::datatable(all_dataset,
extensions = 'FixedColumns',
options = list( # dom = 't',
scrollX = TRUE,
fixedColumns = TRUE
))
选定数据形式及样本列表后获取感兴趣基因的信息
my_dataset <- 'stad_tcga_pub_rna_seq_v2_mrna'
my_table <- "stad_tcga_pub_rna_seq_v2_mrna"
# 获得一个基因的一个profile
BRCA1 <- getProfileData(mycgds, "BRCA1", my_dataset, my_table)
dim(BRCA1)
DT::datatable(BRCA1)
# 获取多个基因的相同profile
data <- getProfileData(mycgds,c("BRCA1","BRCA2"),my_dataset,my_table)
DT::datatable(data)
选定样本列表获取临床信息
如果需要绘制survival curve
,就需要获取临床数据
clinicaldata <- getClinicalData(mycgds, my_table)
DT::datatable(clinicaldata,
extensions = 'FixedColumns',
options = list( # dom = 't',
scrollX = TRUE,
fixedColumns = TRUE
))
综合性获取
只需要根据癌症列表选择自己感兴趣的研究数据集即可,然后选择好感兴趣的数据形式及对应的样本量。就可以获取对应的信息:
library(cgdsr)
library(DT)
mycgds <- CGDS("http://www.cbioportal.org/public-portal/")
## mycancerstudy = getCancerStudies(mycgds)[25,1]
mycancerstudy = 'brca_tcga'
getCaseLists(mycgds,mycancerstudy)[,1]
getGeneticProfiles(mycgds,mycancerstudy)[,1]
mycaselist ='brca_tcga_rna_seq_v2_mrna'
mygeneticprofile = 'brca_tcga_rna_seq_v2_mrna'
# Get data slices for a specified list of genes, genetic profile and case liste
xpr=getProfileData(mycgds,c('BRCA1','BRCA2'),mygeneticprofile,mycaselist)
DT::datatable(expr) # 就获得了指定基因在指定癌症中的表达量
# Get clinical data for the case list
myclinicaldata = getClinicalData(mycgds,mycaselist)
DT::datatable(myclinicaldata,
extensions = 'FixedColumns',
options = list( # dom = 't',
scrollX = TRUE,
fixedColumns = TRUE
))
从cBioPortal下载点突变信息
library(cgdsr)library(DT)
mycgds <- CGDS("http://www.cbioportal.org/public-portal/")
mutGene=c("EGFR", "PTEN", "TP53", "ATRX")
mut_df <- getProfileData(mycgds,
caseList ="gbm_tcga_sequenced",
geneticProfile = "gbm_tcga_mutations",
genes = mutGene
)
mut_df <- apply(mut_df,2,as.factor)
mut_df[mut_df == "NaN"] = ""
mut_df[is.na(mut_df)] = ""
mut_df[mut_df != ''] = "MUT"
DT::datatable(mut_df)
从cBioPortal下载拷贝数变异数据
library(cgdsr)
library(DT)
mycgds <- CGDS("http://www.cbioportal.org/public-portal/")
mutGene <- c("EFGR","PTEN","TP53","ATRX")
cna <- getProfileData(mycgds,
caseList ="gbm_tcga_sequenced",
geneticProfile = "gbm_tcga_gistic",
genes = mutGene)
cna <- apply(cna,2,function(x)as.character(factor(x,levels=c(-2,2),
labels=c("HOMDEL","HETLOSS","DIPLOID","GAIN","AMP"))))
cna[is.na(cna)]=""
cna[cna=="DIPLOID"]=""
DT::datatable(cna)
把拷贝数及点突变信息结合画热图
先把两个信息结合起来
library(ComplexHeatmap)
library(cgdsr)
comb <- data.frame(matrix(paste(as.matrix(cna),as.matrix(mut_df),sep";"),
nrow=nrow(cna),ncol=ncol(cna),
dimnames=list(row,names(mut_df),colnames(cna))))
mat <- as.matrix(t(comb))
DT::datatable(mat)
画热图,主要是配色复杂,原理其实很简单
alt <- apply(mat,1,function(x)strsplit(x,";"))
alt <- unique(unlist(alt))
alt <- alt[which(alt !="")]
alt <- c("background",alt)
alter_fun=list(
background=function(x,y,w,h){
grid.rect(x,y,w-unit(0.5,"mm"),h-unit(0.5,"mm"),gp=gpar(fill="#CCCCCC",col=NA))
},
HOMDEL=function(x,y,w,h){
grid.rect(x,y,w-unit(0.5,"mm"),h-unit(0.5,"mm"),gp=gpar(fill="blue3",col=NA))
},
HETLOSS=function(x,y,w,h){
grid.rect(x,y,w-unit(0.5,"mm"),h-unit(0.5,"mm"),gp=gpar(fill="cadetblue1",col=NA))
},
GAIN=function(x,y,w,h){
grid.rect(x,y,w-unit(0.5,"mm"),h-unit(0.5,"mm"),gp=gpar(fill="pink",col=NA))
},
AMP=function(x,y,w,h){
grid.rect(x,y,w-unit(0.5,"mm"),h-unit(0.5,"mm"),gp=gpar(fill="red",col=NA))
},
MUT=function(x,y,w,h){
grid.rect(x,y,w-unit(0.5,"mm"),h-unit(0.5,"mm"),gp=gpar(fill="#008000",col=NA))
})
col <- c("MUT"="#008000","AMP"="red","GAIN"="pink"," HETLOSS"="cadetblue1","HOMDEL"="blue3")
alt <- intersect(names(alter_fun),alt)
alt_fun_list <- alter_fun[alt]
col <- col[alt]
oncoPrint(mat=mat,alter_fun=alt_fun_list,
get_type=function(x)strsplit(x,";")[[1]],
col=col)
2. 如何使用R语言的RTCGAToolbox
包来获取任意TCGA数据
RTCGAToolbox
是Bioconductor
上的一个软件包,它的作用就是查询、下载和组织TCGA Firehose的数据,还提供一些简单的数据分析和可视化工具。除此之外,下载好的数据也可以很方便的导入到Bioconductor
的其他分析流程中。对于R用户来说,所有的TCGA数据分析工作(从数据下载一直到可视化图表)都可在一个pipeline中完成,能够极大地提高工作效率。
了解并获取FireBrowse
的数据
# 包下载
# source("https://bioconductor.org/biocLite.R")
# biocLite("RTCGAToolbox")
# 加载包
library(RTCGAToolbox)
# 哪些癌症数据可以下载
getFirehoseDatasets()
## [1] "ACC" "BLCA" "BRCA" "CESC" "CHOL" "COADREAD"
## [7] "COAD" "DLBC" "ESCA" "FPPP" "GBMLGG" "GBM"
## [13] "HNSC" "KICH" "KIPAN" "KIRC" "KIRP" "LAML"
## [19] "LGG" "LIHC" "LUAD" "LUSC" "MESO" "OV"
## [25] "PAAD" "PCPG" "PRAD" "READ" "SARC" "SKCM"
## [31] "STAD" "STES" "TGCT" "THCA" "THYM" "UCEC"
## [37] "UCS" "UVM"
# 数据库中更新时间
getFirehoseRunningDates()
getFirehoseAnalyzeDates()
# 下面以乳腺癌为例,下载数据
## 下载数据,需要选择癌症种类,数据分析时间,还有数据的种类
brcaData = getFirehoseData (dataset="BRCA", runDate="20160128",
forceDownload = TRUE,
clinical=TRUE, Mutation=TRUE)
save(brcaData,file='brcaData.RTCGAToolbox.Rdata')
# 临床信息和突变信息的数据下载,因为它们比较小,所以下载速度会很快
trying URL 'http://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/BRCA/20160128/gdac.broadinstitute.org_BRCA.Clinical_Pick_Tier1.Level_4.2016012800.0.0.tar.gz'
Content type 'application/x-gzip' length 164047 bytes (160 KB)
trying URL 'http://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/BRCA/20160128/gdac.broadinstitute.org_BRCA.Mutation_Packager_Calls.Level_3.2016012800.0.0.tar.gz' # 包含somatic mutation信息
Content type 'application/x-gzip' length 10454503 bytes (10.0 MB)
trying URL 'http://gdac.broadinstitute.org/runs/analyses__2016_01_28/data/BRCA/20160128/gdac.broadinstitute.org_BRCA-TP.CopyNumber_Gistic2.Level_4.2016012800.0.0.tar.gz'
Content type 'application/x-gzip' length 53856803 bytes (51.4 MB)
其实就是根据参数拼接了两个URL而已,原理非常简单,但是它有个好处就是,不仅仅是下载了数据,而且返回了包含这些数据的S4对象
还有很多其它参数可以控制下载的数据类型,包括:
- clinical - Get the clinical data slot
- RNASeqGene - RNASeqGene
- RNASeq2GeneNorm - Normalized
- miRNASeqGene - micro RNA SeqGene
- CNASNP - Copy Number Alteration
- CNVSNP - Copy Number Variation
- CNASeq - Copy Number Alteration
- CNACGH - Copy Number Alteration
- Methylation - Methylation
- mRNAArray - Messenger RNA
- miRNAArray - micro RNA
- RPPAArray - Reverse Phase Protein Array
- Mutation - Mutations
- GISTICA - GISTIC v2 (’AllByGene’ only)
- GISTICT - GISTIC v2 (’ThresholdedByGene’ only)
- GISTIC - GISTIC v2 scores and probabilities (both)
了解从FireBrowse
下载到的S4对象
load(file='brcaData.RTCGAToolbox.Rdata')
brcaData
## BRCA FirehoseData objectStandard run date: 20160128
## Analysis running date: 20160128
## Available data types:
## clinical: A data frame of phenotype data, dim: 1097 x 18 # 临床信息
## GISTIC: A FirehoseGISTIC for copy number data # 拷贝数变异信息
## Mutation: A data.frame, dim: 90490 x 67
## To export data, use the 'getData' function.
# 用`biocExtract()`函数来从S4对象里面获取数据
biocExtract(object, type = c("clinical", "RNASeqGene", "miRNASeqGene",
"RNASeq2GeneNorm", "CNASNP", "CNVSNP", "CNASeq", "CNACGH", "Methylation",
"Mutation", "mRNAArray", "miRNAArray", "RPPAArray", "GISTIC", "GISTICA",
"GISTICT"))
# 举例,提取临床信息
clinicData=biocExtract(brcaData,'clinical')
## working on: clinical
colnames(clinicData)
## [1] "Composite Element REF"
## [2] "years_to_birth"
## [3] "vital_status"
## [4] "days_to_death"
## [5] "days_to_last_followup"
## [6] "tumor_tissue_site"
## [7] "pathologic_stage"
## [8] "pathology_T_stage"
## [9] "pathology_N_stage"
## [10] "pathology_M_stage"
## [11] "gender"
## [12] "date_of_initial_pathologic_diagnosis"
## [13] "days_to_last_known_alive"
## [14] "radiation_therapy"
## [15] "histological_type"
## [16] "number_of_lymph_nodes"
## [17] "race"
## [18] "ethnicity"
DT::datatable(clinicData,
extensions = 'FixedColumns',
options = list(
#dom = 't',
scrollX = TRUE,
fixedColumns = TRUE
))
mutationData = biocExtract(brcaData,'Mutation')
## working on: Mutation
length(mutationData@assays)
## [1] 993
class(mutationData@assays[[1]])
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
对于 GRanges 对象,就按照 GRanges的操作手册来即可
RTCGAToolbox
提供了5个基本的数据分析工具:
- 差异分析
getDiffExpressedGenes(brcaData)
corRes <- getCNGECorrelation(brcaData)
corRes
showResults(corRes[[1]])
-
拷贝数和基因表达量的相关性分析
-
基因突变率分析
mt=getMutationRate(brcaData)
head(mt)
## Genes MutationRatio
## ACPP ACPP 0.006042296
## ALG13 ALG13 0.007049345
## AMY2A AMY2A 0.006042296
## B4GALT1 B4GALT1 0.004028197
## CARD6 CARD6 0.009063444
## CCDC114 CCDC114 0.005035247
tail(mt[order(mt$MutationRatio),])
## Genes MutationRatio
## GATA3 GATA3 0.09969789
## MUC16 MUC16 0.10070493
## CDH1 CDH1 0.11581067
## TTN TTN 0.19436052
## TP53 TP53 0.31117825
## PIK3CA PIK3CA 0.32628399
- 生存分析
head(clinicData[,1:4])
## Composite Element REF years_to_birth vital_status
## tcga.5l.aat0 value 42 0
## tcga.5l.aat1 value 63 0
## tcga.a1.a0sp value 40 0
## tcga.a2.a04v value 39 1
## tcga.a2.a04y value 53 0
## tcga.a2.a0cq value 62 0
## days_to_death
## tcga.5l.aat0 <NA>
## tcga.5l.aat1 <NA>
## tcga.a1.a0sp <NA>
## tcga.a2.a04v 1920
## tcga.a2.a04y <NA>
## tcga.a2.a0cq <NA>
survData <- data.frame(Samples=rownames(clinicData),
Time=as.numeric(clinicData[,4]),
Censor=as.numeric(clinicData[,3])
)
library(survival)
table(survData$Censor)
##
## 0 1
## 945 152
attach(survData)
my.surv <- Surv(Time,Censor)
kmfit1 <- survfit(my.surv~1)
plot(kmfit1)
- 可视化报告