生信老学长
“师兄师兄,我要研究泛癌,怎么批量下载TCGA的数据啊?”嗨,上次就告诉你要关注我,这次再告诉你一次。
# Folder that receives the downloaded files; edit this to suit your machine.
download_path <- "./TCGA"
# setwd() stops with an error when the folder is missing, so create it first.
# showWarnings = FALSE keeps this silent when the folder already exists.
dir.create(download_path, showWarnings = FALSE, recursive = TRUE)
# The download/prepare calls further down are made without a directory
# argument, so the script switches into the download folder here.
setwd(download_path)
# GDC project identifier to work on (see the accompanying post if unsure).
project <- "TCGA-OV"
library(TCGAbiolinks)
library(SummarizedExperiment)
## Downloading is split into three stages:
# 1. query    - GDCquery()
# 2. download - GDCdownload()
# 3. read     - GDCprepare()

# 1. Query - GDCquery()
# Argument overview. This is a reference template, not runnable code: most of
# the names below are not defined as objects in this script, so evaluating the
# call would abort with "object not found". It is therefore kept commented out.
#
# GDCquery(
#   project,               # project name, e.g. TCGA-LIHC (liver cancer)
#   data.category,         # data category, see the accompanying post
#   data.type,             # finer-grained type within the category, see the post
#   workflow.type,         # software or method used to produce the data
#   access,                # access level (original note: defaults to open)
#   platform,              # sequencing platform, see
#                          # https://rdrr.io/bioc/TCGAbiolinks/man/GDCquery.html and TCGA
#   file.type,             # rarely needed
#   barcode,               # restrict to sample barcodes, e.g. TARGET-20-PADZCG-04A-01R
#   data.format,           # file format, e.g. "VCF", "TXT", "BAM", "SVS", "BCR XML",
#                          # "BCR SSF XML", "TSV", "BCR Auxiliary XML", "BCR OMF XML",
#                          # "BCR Biotab", "MAF", "BCR PPS XML", "XLSX"
#   experimental.strategy, # sequencing type
#   sample.type            # sample type, see the post, e.g. Primary Solid Tumor (TP)
# )
# List the values that can be supplied for the query arguments.
# Both helpers are exported by TCGAbiolinks, so they are reached with `::`
# rather than `:::` (which is meant for non-exported internals).

# Candidate values for `project`
TCGAbiolinks::getGDCprojects()$project_id
# Candidate values for `data.category` within the chosen project
TCGAbiolinks::getProjectSummary(project)$data_categories
# Example query: transcriptome data for primary tumours and controls.
# NOTE(review): "Control Analyte" is kept exactly as written; if normal tissue
# was intended, "Solid Tissue Normal" may be the value wanted - confirm.
query <- GDCquery(
  project       = project,
  sample.type   = c("Primary Tumor", "Control Analyte"),
  workflow.type = "STAR - Counts",
  data.type     = "Gene Expression Quantification", # gene expression values
  data.category = "Transcriptome Profiling"
)
# 2. Download - GDCdownload()
# Fetch the files matched by `query` through the API method,
# 100 files per chunk.
GDCdownload(
  query = query,
  files.per.chunk = 100,
  method = "api"
)

# 3. Read - GDCprepare()
# Load the downloaded files for `query` into a single R object.
pre.exp <- GDCprepare(query)
# Names of the assays stored in the prepared object
assay_list <- assays(pre.exp)
names(assay_list)

# Pull out the assay named "fpkm_uq_unstrand" (FPKM-UQ, going by its name)
# and show its first five rows.
fpkm_matrix <- assay(pre.exp, "fpkm_uq_unstrand")
fpkm_matrix[seq_len(5), ]

# Column-level (per-sample) metadata, which holds the clinical annotations
colData(pre.exp)

# Row-level (per-feature) metadata, used to look up the matching gene symbols
rowData(pre.exp)
## Clinical data
# Notes carried over from the original post on the available flavours:
#   indexed clinical - refined clinical data built from the XML files
#   XML              - the raw clinical data
#   BCR Biotab       - TSV files produced by parsing the XML files
query_clin <- GDCquery(
  project       = project,
  data.format   = "BCR Biotab",
  data.type     = "Clinical Supplement",
  data.category = "Clinical"
)
# Download the matched files, then load them.
GDCdownload(query = query_clin)
clinical <- GDCprepare(query = query_clin)
# Names of the elements in the loaded result
names(clinical)
# Sampling (biospecimen) information, requested in BCR Biotab format
query_Biospecimen <- GDCquery(
  project       = project,
  data.format   = "BCR Biotab",
  data.type     = "Biospecimen Supplement",
  data.category = "Biospecimen"
)
# Download the matched files, then load them.
GDCdownload(query = query_Biospecimen)
Biospecimen <- GDCprepare(query = query_Biospecimen)
# Slide images
# Build a query for the project's slide images in SVS format. Only the query
# object is created in this block.
# The stray second comma after "Biospecimen" was removed: it passed an
# unintended empty argument into the call.
query_slide <- GDCquery(
  project = project,
  data.category = "Biospecimen",
  data.type = "Slide Image",
  data.format = "SVS"
)
# Look up the molecular subtypes for one tumour type.
# Per the original note, these are only available for some tumour types;
# "HNSC" is the example used here.
subtype <- TCGAquery_subtype(tumor = "HNSC")
# Columns available in the subtype table
colnames(subtype)
参考文献
Antonio Colaprico, Tiago Chedraoui Silva, Catharina Olsen, Luciano Garofano, Claudia Cava, Davide Garolini, Thais Sabedot, Tathiane Malta, Stefano M. Pagnotta, Isabella Castiglioni, Michele Ceccarelli, Gianluca Bontempi, Houtan Noushmehr. TCGAbiolinks: An R/Bioconductor package for integrative analysis of TCGA data. Nucleic Acids Research (05 May 2016) 44 (8): e71.
生信老学长 专业方案撰写和生信支持
7713

被折叠的 条评论
为什么被折叠?



