作业内容
重复这个散点图
背景知识
TNBC-三阴性乳腺癌
三阴乳腺癌是指乳腺癌免疫组化结果:雌激素受体(ER)、孕激素受体(PR)和人表皮生长因子受体2(Her-2)均为阴性的乳腺癌。
三阴性乳腺癌容易早期发生广泛转移,如脑转移、骨转移等情况。所以,化疗在三阴性乳腺癌当中地位很重要。当然,复发高峰在手术以后头三年,过了复发高峰以后,预后也还是非常好的一种恶性肿瘤,具体要看分期如何、病人身体状况,以及接受什么样治疗等。
下载数据并提取
1.从TCGA.BRCA.sampleMap%2FBRCA_clinicalMatrix中提取TNBC的TCGA ID,发现一共123个
#https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/TCGA.BRCA.sampleMap%2FBRCA_clinicalMatrix;
#phenotype;存储BRCA的样品临床信息
#TCGA.BRCA.sampleMap%2FBRCA_clinicalMatrix;
cut -f 1,8,10,22 TCGA.BRCA.sampleMap%2FBRCA_clinicalMatrix |awk '$2=="Negative" && $3=="Negative" && $4=="Negative"' |cut -f 1 >TNBC_id
TCGA-A1-A0SK-01,TCGA-A1-A0SO-01,TCGA-A1-A0SP-01,TCGA-A2-A04P-01,TCGA-A2-A04Q-01,TCGA-A2-A04T-01,TCGA-A2-A04U-01,TCGA-A2-A0CM-01,TCGA-A2-A0D0-01,TCGA-A2-A0D2-01,TCGA-A2-A0ST-01,TCGA-A2-A0SX-01,TCGA-A2-A0T0-01,TCGA-A2-A0T2-01,TCGA-A2-A0YE-01,TCGA-A2-A0YM-01,TCGA-A2-A1G6-01,TCGA-A7-A0CE-01,TCGA-A7-A0DA-01,TCGA-A7-A26F-01,TCGA-A7-A26G-01,TCGA-A7-A26I-01,TCGA-A8-A07C-01,TCGA-A8-A07O-01,TCGA-A8-A08R-01,TCGA-A8-A09X-01,TCGA-AN-A04D-01,TCGA-AN-A0AL-01,TCGA-AN-A0AR-01,TCGA-AN-A0AT-01,TCGA-AN-A0FL-01,TCGA-AN-A0FX-01,TCGA-AN-A0G0-01,TCGA-AN-A0XU-01,TCGA-AO-A03U-01,TCGA-AO-A0J2-01,TCGA-AO-A0J4-01,TCGA-AO-A0J6-01,TCGA-AO-A0JL-01,TCGA-AO-A124-01,TCGA-AO-A128-01,TCGA-AO-A129-01,TCGA-AO-A12F-01,TCGA-AQ-A04J-01,TCGA-AR-A0TS-01,TCGA-AR-A0TU-01,TCGA-AR-A0U0-01,TCGA-AR-A0U1-01,TCGA-AR-A0U4-01,TCGA-AR-A1AI-01,TCGA-AR-A1AQ-01,TCGA-AR-A1AR-01,TCGA-AR-A1AY-01,TCGA-AR-A256-01,TCGA-B6-A0IE-01,TCGA-B6-A0IK-01,TCGA-B6-A0IQ-01,TCGA-B6-A0RE-01,TCGA-B6-A0RG-01,TCGA-B6-A0RN-01,TCGA-B6-A0RS-01,TCGA-B6-A0RT-01,TCGA-B6-A0RU-01,TCGA-B6-A0WX-01,TCGA-BH-A0AV-01,TCGA-BH-A0B3-01,TCGA-BH-A0B9-01,TCGA-BH-A0BG-01,TCGA-BH-A0BL-01,TCGA-BH-A0BW-01,TCGA-BH-A0E0-01,TCGA-BH-A0E6-01,TCGA-BH-A0RX-01,TCGA-BH-A0WA-01,TCGA-BH-A18G-01,TCGA-BH-A18Q-01,TCGA-BH-A18T-01,TCGA-BH-A18V-01,TCGA-BH-A1EW-01,TCGA-BH-A1FC-01,TCGA-C8-A12V-01,TCGA-C8-A131-01,TCGA-C8-A134-01,TCGA-C8-A1HJ-01,TCGA-C8-A26X-01,TCGA-C8-A26Y-01,TCGA-C8-A27B-01,TCGA-D8-A13Z-01,TCGA-D8-A142-01,TCGA-D8-A143-01,TCGA-D8-A147-01,TCGA-D8-A1JF-01,TCGA-D8-A1JG-01,TCGA-D8-A1JL-01,TCGA-D8-A1XK-01,TCGA-D8-A1XQ-01,TCGA-D8-A27F-01,TCGA-D8-A27H-01,TCGA-D8-A27M-01,TCGA-E2-A14N-01,TCGA-E2-A14R-01,TCGA-E2-A14X-01,TCGA-E2-A150-01,TCGA-E2-A158-01,TCGA-E2-A159-01,TCGA-E2-A1AZ-01,TCGA-E2-A1B6-01,TCGA-E2-A1L7-01,TCGA-E2-A1LG-01,TCGA-E2-A1LH-01,TCGA-E2-A1LI-01,TCGA-E2-A1LK-01,TCGA-E2-A1LL-01,TCGA-E2-A1LS-01,TCGA-E9-A1ND-01,TCGA-E9-A22G-01,TCGA-EW-A1OV-01,TCGA-EW-A1OW-01,TCGA-EW-A1P1-01,TCGA-EW-A1P4-01,TCGA-EW-A1P7-01,TCGA-EW-A1P8-01,TCGA-EW-A1PB-01
#R语言操作,效果和上面一致
library(data.table)
BRCA_info = fread('TCGA.BRCA.sampleMap%2FBRCA_clinicalMatrix',na.strings="NA")[,c(1,8,10,22)]
BRCA_TNBC_info = dplyr::filter(BRCA_info,ER_Status_nature2012=="Negative",HER2_Final_Status_nature2012=="Negative",PR_Status_nature2012=="Negative")
TNBC_id = unique(BRCA_TNBC_info$sampleID)
> dim(BRCA_TNBC_info)
[1] 123 4
> head(BRCA_TNBC_info)
sampleID ER_Status_nature2012 HER2_Final_Status_nature2012 PR_Status_nature2012
1: TCGA-A1-A0SK-01 Negative Negative Negative
2: TCGA-A1-A0SO-01 Negative Negative Negative
3: TCGA-A1-A0SP-01 Negative Negative Negative
4: TCGA-A2-A04P-01 Negative Negative Negative
5: TCGA-A2-A04Q-01 Negative Negative Negative
6: TCGA-A2-A04T-01 Negative Negative Negative
2.从mc3%2FBRCA_mc3.txt.gz文件中提取 tp53突变 且突变类型为 Missense_Mutation的TCGA ID
#https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/mc3%2FBRCA_mc3.txt.gz;
#mc3%2FBRCA_mc3.txt.gz;somatic mutation (SNP and INDEL),保存的是BRCA样品的突变信息
zcat mc3%2FBRCA_mc3.txt.gz |awk '$7=="TP53" && $8=="Missense_Mutation"' |cut -f 1 > tp53_Missense_Mutation
#R语言版本
BRCA_mutation_info = fread('mc3%2FBRCA_mc3.txt.gz')
BRCA_TP53_Missense_Mutation = dplyr::filter(BRCA_mutation_info,gene=="TP53",effect=="Missense_Mutation")
TP53_MMT_id = unique(BRCA_TP53_Missense_Mutation$sample)
#结果展示
> dim(BRCA_TP53_Missense_Mutation)
[1] 166 12
> head(BRCA_TP53_Missense_Mutation)
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen
1: TCGA-A1-A0SI-01 17 7578406 7578406 C T TP53 Missense_Mutation p.R175H 0.30 tolerated(0.11) benign(0.308)
2: TCGA-A1-A0SK-01 17 7578532 7578532 A T TP53 Missense_Mutation p.M133K 0.98 deleterious(0) benign(0.122)
3: TCGA-A1-A0SO-01 17 7578190 7578190 T C TP53 Missense_Mutation p.Y220C 0.87 deleterious(0) probably_damaging(1)
4: TCGA-A2-A04W-01 17 7577124 7577124 C T TP53 Missense_Mutation p.V272M 0.34 deleterious(0) probably_damaging(0.997)
5: TCGA-A2-A0CL-01 17 7577120 7577120 C A TP53 Missense_Mutation p.R273L 0.18 deleterious(0) probably_damaging(0.993)
6: TCGA-A2-A0ST-01 17 7578442 7578442 T C TP53 Missense_Mutation p.Y163C 0.09 deleterious(0) probably_damaging(0.999)
> length(TP53_MMT_id)
[1] 165
#TNBC中TP53 missense mutation subgroups样品id个数为38
BRCA_TNBC_mmtp53 = intersect(TNBC_id, TP53_MMT_id)
> length(BRCA_TNBC_mmtp53)
[1] 38
#TNBC中TP53 wild type subgroups样品id个数为
TNBC_TP53_WT = setdiff(TNBC_id, unique(BRCA_mutation_info$sample))
> length(TNBC_TP53_WT)
[1] 34
3.分别提取 TP53 missense mutation subgroups, TP53 missense mutation subgroups 的表达量矩阵
#一共1218个样品的gene expression RNAseq;
#log2(norm_count+1)
#nohup wget -c 'https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/TCGA.BRCA.sampleMap%2FHiSeqV2.gz' &
TCGA_sample_expression = fread('TCGA.BRCA.sampleMap%2FHiSeqV2.gz')
TCGA_sample_expression = as.data.frame(TCGA_sample_expression)
rownames(TCGA_sample_expression) = TCGA_sample_expression$sample
#TP53 wild type subgroups
TNBC_TP53_WT_expression = TCGA_sample_expression[,TNBC_TP53_WT]
#TP53 missense mutation subgroups
BRCA_TNBC_mmtp53_expression = TCGA_sample_expression[,BRCA_TNBC_mmtp53]
4.对上述得到的两个表达量矩阵计算相关性
计算方法-表达量矩阵中所有基因与TP53基因计算相关性
TNBC_TP53_WT_expression = as.data.frame( t(TNBC_TP53_WT_expression) )
BRCA_TNBC_mmtp53_expression = as.data.frame(t(BRCA_TNBC_mmtp53_expression))
sink('gene_tp53_cor.txt')
for(gene in colnames(TNBC_TP53_WT_expression) ){
cat( paste0(gene,'\t', cor(TNBC_TP53_WT_expression[,gene], TNBC_TP53_WT_expression$TP53)),'\t',cor(BRCA_TNBC_mmtp53_expression[,gene], BRCA_TNBC_mmtp53_expression$TP53) ,"\n")
}
sink()
tp53_correlation = read.table('gene_tp53_cor.txt',header = F)
colnames(tp53_correlation) = c('gene', 'tnbc_wt_tp53', 'tnbc_mt_tp53')
tp53_correlation = na.omit(tp53_correlation)
rownames(tp53_correlation) = tp53_correlation$gene
tp53_correlation = tp53_correlation[, 2:3]
> head(tp53_correlation)
tnbc_wt_tp53 tnbc_mt_tp53
ARHGEF10L 0.28587482 0.169284500
HIF3A 0.16243489 0.115790900
RNF17 -0.19910027 0.039481480
RNF10 0.21917671 -0.310539300
RNF11 0.18367476 -0.486836800
RNF13 0.02319396 0.005537374
library(dplyr)
del_gene_id = rownames(tp53_correlation) %>% grep('^\\?', . ,perl = T)
> del_gene_id %>% rownames(tp53_correlation)[.]
[1] "?|26823" "?|100133144" "?|90288" "?|645851" "?|653553"
[6] "?|10357" "?|388795" "?|100134869" "?|729884" "?|57714"
[11] "?|652919" "?|553137" "?|390284" "?|728603" "?|155060"
[16] "?|340602" "?|10431" "?|317712" "?|8225" "?|391714"
[21] "?|280660" "?|391343" "?|728788"
tp53_correlation = tp53_correlation[-del_gene_id,]
tp53_correlation = tp53_correlation[sort( rownames(tp53_correlation) ),]
> head(tp53_correlation,30)
tnbc_wt_tp53 tnbc_mt_tp53
A1BG 0.013355972 0.065416980
A1CF -0.096826609 0.033157820
A2BP1 -0.163385210 0.010062910
A2LD1 0.246807729 0.212874200
A2M 0.147999058 -0.129446500
A2ML1 0.341158546 -0.005538465
A4GALT 0.004229104 0.127372200
A4GNT 0.105848555 0.115709900
AAAS -0.064430839 0.296455500
AACS -0.137412948 -0.432603200
AACSL 0.049008076 -0.474494000
AADAC 0.076812779 -0.308284300
AADACL2 0.191125198 0.233111900
AADACL3 -0.191370449 0.033848400
AADACL4 0.230454755 0.121381600
AADAT -0.052448739 -0.164513200
AAGAB 0.278612704 -0.045500850
AAK1 0.093400379 -0.362386100
AAMP -0.079047101 0.095738200
AANAT 0.074936621 -0.009138630
AARS -0.013959717 0.032320990
AARS2 0.099614889 -0.015314100
AARSD1 -0.245608837 0.428401600
AASDH -0.091453632 -0.189170200
AASDHPPT -0.572624010 -0.003107184
AASS -0.109802198 -0.115977400
AATF -0.310146852 0.262189300
AATK -0.179822766 0.235807200
ABAT 0.289620484 -0.513028100
ABCA1 0.109771473 -0.249502800
select_gene = c("CCL1","CCL18","CCL28","CCL4","CCL7","CCL8","CX3CL1","CXCL10","CXCL11","IFIT1","IFIT2","IFIT3","IFNB1","IFNG","IL27","IL29")
tp53_correlation[select_gene,]
> tp53_correlation[select_gene,]
tnbc_wt_tp53 tnbc_mt_tp53
CCL1 0.2514942 -0.12348200
CCL18 -0.0649739 -0.09928178
CCL28 0.2138325 -0.13331330
CCL4 0.1815558 -0.08629248
CCL7 0.1008837 -0.14959010
CCL8 0.1171175 -0.15947400
CX3CL1 0.1885921 0.25851010
CXCL10 0.2533631 -0.05307441
CXCL11 0.2534424 -0.15895260
IFIT1 0.1210111 -0.10309130
IFIT2 0.1538091 -0.09959147
IFIT3 0.2242451 -0.17122550
IFNB1 0.1161510 -0.14915090
IFNG 0.1704638 -0.14866620
IL27 0.3561543 0.06605169
IL29 -0.1146054 -0.22295270
散点图
scatter_data = tp53_correlation[select_gene,]
if(T){
plot(scatter_data$tnbc_wt_tp53, scatter_data$tnbc_mt_tp53,pch=19, cex=1.5, cex.lab =1.3,xlab="Correlation coefficient in WTp53", ylab="Correlation coefficient in mtp53",main="A scatterplot")
abline(h=0.1,lty=3)
abline(v=0.1,lty=3)
points(0.116151,-0.1491509,pch=15,col='red',cex=1.6)
text(0.133,-0.145, 'IFNB1' ,col = 'red')
}
ps:欢迎批评指正
参考
https://www.youlai.cn/ask/5C971DMTn00.html
笔记15-TCGA三阴乳腺癌样本的下载