作业链接
1.下载CCLE数据库的RNA-seq的表达矩阵
https://depmap.org/portal/download/?release=CCLE+2019&release=Fusion&release=DNA+Copy+Number
下载CCLE_RNAseq_rsem_genes_tpm_20180929.txt.gz这个数据集
2.提取BREAST的细胞系TPM数据
library(data.table)
library(dplyr)
CCLE_RNAseq_tpm = fread('CCLE_RNAseq_rsem_genes_tpm_20180929.txt.gz',sep = '\t')
CCLE_RNAseq_tpm = as.data.frame(CCLE_RNAseq_tpm)
breast_cols = colnames(CCLE_RNAseq_tpm) %>% grepl('_BREAST', ., perl = T) %>% which(.)
CCLE_RNAseq_breast_tpm = CCLE_RNAseq_tpm[,breast_cols]
rownames(CCLE_RNAseq_breast_tpm)= CCLE_RNAseq_tpm$gene_id
#一共51个BREAST的细胞系
> length(colnames(CCLE_RNAseq_breast_tpm))
[1] 51
> colnames(CCLE_RNAseq_breast_tpm)
[1] "AU565_BREAST" "BT20_BREAST" "BT474_BREAST"
[4] "BT483_BREAST" "BT549_BREAST" "CAL120_BREAST"
[7] "CAL148_BREAST" "CAL51_BREAST" "CAL851_BREAST"
[10] "CAMA1_BREAST" "DU4475_BREAST" "EFM192A_BREAST"
[13] "EFM19_BREAST" "HCC1143_BREAST" "HCC1187_BREAST"
[16] "HCC1395_BREAST" "HCC1419_BREAST" "HCC1428_BREAST"
[19] "HCC1500_BREAST" "HCC1569_BREAST" "HCC1599_BREAST"
[22] "HCC1806_BREAST" "HCC1937_BREAST" "HCC1954_BREAST"
[25] "HCC202_BREAST" "HCC2157_BREAST" "HCC2218_BREAST"
[28] "HCC38_BREAST" "HCC70_BREAST" "HDQP1_BREAST"
[31] "HMC18_BREAST" "HMEL_BREAST" "HS578T_BREAST"
[34] "JIMT1_BREAST" "KPL1_BREAST" "MCF7_BREAST"
[37] "MDAMB134VI_BREAST" "MDAMB157_BREAST" "MDAMB175VII_BREAST"
[40] "MDAMB231_BREAST" "MDAMB361_BREAST" "MDAMB415_BREAST"
[43] "MDAMB436_BREAST" "MDAMB453_BREAST" "MDAMB468_BREAST"
[46] "SKBR3_BREAST" "T47D_BREAST" "UACC812_BREAST"
[49] "UACC893_BREAST" "ZR751_BREAST" "ZR7530_BREAST"
3.随机分组
把这些细胞系按9:1随机分成两组,一组45个细胞系,一组6个细胞系
set.seed(100)
part2 = sample(colnames(CCLE_RNAseq_breast_tpm),size=6,replace=F) #随机抽取6个细胞系为一组
part1 = setdiff(colnames(CCLE_RNAseq_breast_tpm),part2) #剩下的45个细胞系为一组
breast_tpm_part1 = CCLE_RNAseq_breast_tpm[,part1]
breast_tpm_part2 = CCLE_RNAseq_breast_tpm[,part2]
4.从两个分组中提取符合要求的id
在part1中完全不表达的基因个数:14488
> table(apply(breast_tpm_part1, 1, sum)==0)
FALSE TRUE
43332 14488
提取这14488个完全不表达的基因id;
并通过这些基因id去提取part2这部分id的表达量矩阵mid;
最后得到part1完全不表达,而part2有表达的基因id个数为319个
#names(which(apply(breast_tpm_part1, 1, sum)==0)) #提取part1完全不表达的基因id
mid = breast_tpm_part2[which(apply(breast_tpm_part1, 1, sum)==0),]
> table(apply(mid, 1, sum)!=0)
FALSE TRUE
14169 319
提取这319个基因的ensemble id并去掉版本号
> head(names(which(apply(mid, 1, sum)!=0)))
[1] "ENSG00000127515.1" "ENSG00000127529.7" "ENSG00000153498.7"
[4] "ENSG00000166787.3" "ENSG00000167822.1" "ENSG00000169325.9"
part1_no_part2_have_id = names(which(apply(mid, 1, sum)!=0)) %>% sub('\\.\\d+','',.)
> head(part1_no_part2_have_id)
[1] "ENSG00000127515" "ENSG00000127529" "ENSG00000153498" "ENSG00000166787"
[5] "ENSG00000167822" "ENSG00000169325"
5.结果
319个基因的ensemble id
ENSG00000127515,ENSG00000127529,ENSG00000153498,ENSG00000166787,ENSG00000167822,ENSG00000169325,ENSG00000172769,ENSG00000174407,ENSG00000175319,ENSG00000175619,ENSG00000177143,ENSG00000180723,ENSG00000181374,ENSG00000181780,ENSG00000183559,ENSG00000186803,ENSG00000186886,ENSG00000186971,ENSG00000188585,ENSG00000197674,ENSG00000200237,ENSG00000201358,ENSG00000201407,ENSG00000201737,ENSG00000201875,ENSG00000203396,ENSG00000203858,ENSG00000204670,ENSG00000205653,ENSG00000205821,ENSG00000205858,ENSG00000205865,ENSG00000206937,ENSG00000211939,ENSG00000212237,ENSG00000212413,ENSG00000212479,ENSG00000213016,ENSG00000213736,ENSG00000214042,ENSG00000214326,ENSG00000214871,ENSG00000215430,ENSG00000216365,ENSG00000216518,ENSG00000216904,ENSG00000218716,ENSG00000220184,ENSG00000220913,ENSG00000221996,ENSG00000222231,ENSG00000222845,ENSG00000223457,ENSG00000223512,ENSG00000223614,ENSG00000223647,ENSG00000223962,ENSG00000223982,ENSG00000223999,ENSG00000224122,ENSG00000224128,ENSG00000224359,ENSG00000224366,ENSG00000224410,ENSG00000224590,ENSG00000224610,ENSG00000224879,ENSG00000224896,ENSG00000225280,ENSG00000225560,ENSG00000225594,ENSG00000225625,ENSG00000225638,ENSG00000225881,ENSG00000225911,ENSG00000225923,ENSG00000226034,ENSG00000227043,ENSG00000227173,ENSG00000227240,ENSG00000227352,ENSG00000227459,ENSG00000227657,ENSG00000227711,ENSG00000228303,ENSG00000228392,ENSG00000228488,ENSG00000228646,ENSG00000228679,ENSG00000228851,ENSG00000228955,ENSG00000229066,ENSG00000229209,ENSG00000229242,ENSG00000229671,ENSG00000229824,ENSG00000229982,ENSG00000230029,ENSG00000230051,ENSG00000230301,ENSG00000230328,ENSG00000230362,ENSG00000230493,ENSG00000230571,ENSG00000230866,ENSG00000230906,ENSG00000230990,ENSG00000231149,ENSG00000231252,ENSG00000231563,ENSG00000231781,ENSG00000231934,ENSG00000231947,ENSG00000232193,ENSG00000232208,ENSG00000232249,ENSG00000232287,ENSG00000232495,ENSG00000232540,ENSG00000232597,ENSG00000232723,ENSG00000232728,ENSG00000232873,ENSG00000232930,ENSG00000232939,ENSG00000233228,ENSG00000233357,ENSG00000233574,ENSG00000233755,ENSG00000233814,ENSG00000233909,ENSG00000233951,ENSG00000234138,ENSG00000234227,ENSG00000234898,ENSG00000235266,ENSG00000235306,ENSG00000235626,ENSG00000235699,ENSG00000235797,ENSG00000235845,ENSG00000235892,ENSG00000236339,ENSG00000236520,ENSG00000236656,ENSG00000236823,ENSG00000236834,ENSG00000236915,ENSG00000237109,ENSG00000237249,ENSG00000237597,ENSG00000239314,ENSG00000239600,ENSG00000239718,ENSG00000240432,ENSG00000240613,ENSG00000240777,ENSG00000241073,ENSG00000241172,ENSG00000241183,ENSG00000241695,ENSG00000241754,ENSG00000241869,ENSG00000242280,ENSG00000242522,ENSG00000242657,ENSG00000242671,ENSG00000242810,ENSG00000243125,ENSG00000243593,ENSG00000243828,ENSG00000244024,ENSG00000244249,ENSG00000244318,ENSG00000248105,ENSG00000248106,ENSG00000248152,ENSG00000248197,ENSG00000248227,ENSG00000248322,ENSG00000248338,ENSG00000248363,ENSG00000248374,ENSG00000248600,ENSG00000248616,ENSG00000248634,ENSG00000248824,ENSG00000249128,ENSG00000249200,ENSG00000249236,ENSG00000249255,ENSG00000249382,ENSG00000249488,ENSG00000249798,ENSG00000249837,ENSG00000250164,ENSG00000250298,ENSG00000250416,ENSG00000250524,ENSG00000250630,ENSG00000250677,ENSG00000250706,ENSG00000250858,ENSG00000251033,ENSG00000251055,ENSG00000251264,ENSG00000251266,ENSG00000251372,ENSG00000251557,ENSG00000253006,ENSG00000253108,ENSG00000253110,ENSG00000253189,ENSG00000253260,ENSG00000253332,ENSG00000253397,ENSG00000253418,ENSG00000253424,ENSG00000253434,ENSG00000253488,ENSG00000253491,ENSG00000253777,ENSG00000253792,ENSG00000253957,ENSG00000254001,ENSG00000254048,ENSG00000254193,ENSG00000254299,ENSG00000254344,ENSG00000254515,ENSG00000254767,ENSG00000254926,ENSG00000254927,ENSG00000255088,ENSG00000255206,ENSG00000255444,ENSG00000255551,ENSG00000255772,ENSG00000255870,ENSG00000255973,ENSG00000256212,ENSG00000256422,ENSG00000256889,ENSG00000257259,ENSG00000257677,ENSG00000257807,ENSG00000257835,ENSG00000258035,ENSG00000258290,ENSG00000258487,ENSG00000258567,ENSG00000258690,ENSG00000258705,ENSG00000258992,ENSG00000258993,ENSG00000259154,ENSG00000259255,ENSG00000259708,ENSG00000259819,ENSG00000259888,ENSG00000260413,ENSG00000260723,ENSG00000260986,ENSG00000261093,ENSG00000261232,ENSG00000261244,ENSG00000261273,ENSG00000261429,ENSG00000261627,ENSG00000261651,ENSG00000262090,ENSG00000262961,ENSG00000263305,ENSG00000263935,ENSG00000264072,ENSG00000264123,ENSG00000264212,ENSG00000264232,ENSG00000264236,ENSG00000264659,ENSG00000264933,ENSG00000265477,ENSG00000266227,ENSG00000266627,ENSG00000266774,ENSG00000266906,ENSG00000267286,ENSG00000267349,ENSG00000267399,ENSG00000267517,ENSG00000267537,ENSG00000267687,ENSG00000267824,ENSG00000267905,ENSG00000268711,ENSG00000268847,ENSG00000268957,ENSG00000269271,ENSG00000269804,ENSG00000269808,ENSG00000270092,ENSG00000270347,ENSG00000270380,ENSG00000270455,ENSG00000270516,ENSG00000270664,ENSG00000270856,ENSG00000270902,ENSG00000270969,ENSG00000271053,ENSG00000271205,ENSG00000271373,ENSG00000271419,ENSG00000271612,ENSG00000271686,ENSG00000272085,ENSG00000272139,ENSG00000273098,ENSG00000273370
转为gene symbol id
利用https://biit.cs.ut.ee/gprofiler/gost这个网页工具,发现差不多只有一半(157)的ensemble id能转换为symbol id
OR7A10,OR7C2,SPACA7,SAA3P,OR8J3,GUSBP12,OR5B3,MIR1-1HG,NF1P5,OR4B1,CETN1,OR51A9P,CCL13,OR5J1P,C10orf120,IFNA10,OR5D3P,KRTAP13-4,CLEC20A,OR51C1P,SNORA70,RN7SKP193,RNU1-133P,RN7SKP178,HSD3BP2,IGKV1OR2-3,LRRC72,FAM99B,SNORA70B,RNA5SP18,RNU11-3P,U3,BRI3P2,IFNA7,RPL31P1,RPL37P15,EEF1GP6,SOCS5P5,RPL12P23,HMGB3P18,OR52B4,RNU2-54P,RN7SKP42,HTATSF1P1,ZNF735,LINC01946,UBBP3,SNRPD2P2,POU6F2-AS1,LINC01248,MTND3P16,SIGLEC30P,FAM197Y8,PABPC1P12,NANOGNBP1,LINC01682,MYL6P3,ARHGEF7-AS1,SERPINA7P1,RPL31P13,MTCO3P4,LINC01150,RPL12P34,GTF3AP6,CDY11P,OR5H6,ACTG1P23,LINC02558,FAM242A,MTND2P24,SLC6A1-AS1,RPL36P19,RPS17P7,PHB2P1,RPL23AP93,LPCAT2BP,RPL30P15,H3P32,UBE2V1P4,RCC2P3,RPL7L1P1,CHEK2P3,RPL22P17,GAPDHP34,KRT18P66,CXorf51B,PKMP2,POM121L13P,GPC6-AS1,LINC00421,CLCA4-AS1,RPL21P111,MORC1-AS1,HLTF-AS1,KRTAP13-3,RN7SL98P,RN7SL70P,GM2AP1,RN7SL363P,SLC16A1P1,KLHL6-AS1,RN7SL581P,LINC02010,MRPL42P6,RN7SL807P,LINC02049,LARP7P4,RN7SL252P,ANK2-AS1,LINC00290,LINC02513,LINC02472,ANKRD49P3,LINC02215,PGAM1P9,LINC02619,NACAP5,HSD3BP3,GIMD1,SEC63P2,MTCO1P35,IGBP1P5,LINC02429,LINC00499,HNRNPKP3,RN7SKP283,SNX18P27,LINC02237,SINHCAFP3,IGHVII-30-1,IGHV3-22,LINC01944,LINC01288,OR2AL1P,CYCSP27,LINC01479,LINC02552,LINC02388,LINC02464,BRWD1P2,LINC02277,DUX4L16,TSPY1,DUX4L17,DPPA2P4,TOMM20P3,LINC01916,RN7SL190P,NPM1P2,RN7SL202P,LINC01855,VSIG10L-AS1,SIGLEC31P,PABPC1P5,NANOGP3,HSPE1P14,TNPO3P1
剩下的一半没能转化的大部分的描述为:novel transcript
进一步筛选
有小伙伴指出这319基因里绝大部分的表达量很低,聊胜于无呀!
回头一看,还真是,一拍大腿,果断的添加过滤条件,最后只留下8个满足条件的基因,当然这个条件可以放宽点
part1_no_part2_have_id_high_expr = names(which(sort(apply(breast_tpm_part2[part1_no_part2_have_id,],1 ,sum)>=6)))
breast_tpm_part2[part1_no_part2_have_id_high_expr,]
> breast_tpm_part2[part1_no_part2_have_id_high_expr,]
CAMA1_BREAST MDAMB157_BREAST UACC812_BREAST HCC202_BREAST HCC1143_BREAST MDAMB453_BREAST
ENSG00000201407.1 0 7.04 0.00 0 0.00 0
ENSG00000201737.1 0 0.00 0.00 0 6.36 0
ENSG00000212237.1 0 36.63 0.00 0 0.00 0
ENSG00000241073.1 0 0.00 7.48 0 0.00 0
ENSG00000250677.1 0 7.66 0.00 0 0.00 0
ENSG00000270092.2 0 8.15 0.00 0 0.00 0
ENSG00000270856.1 0 0.00 7.58 0 0.00 0
ENSG00000271419.1 0 0.03 14.35 0 0.04 0
从留下的满足条件的表达矩阵,发现一个基因也只有在某个细胞系里有明显的表达,其余几乎都为0
得到初步结论,通过上述过滤得到的基因,都只在某个细胞系里有明显的表达,而在其它的细胞系里几乎都无表达。
放宽过滤条件至sum>1
> part1_no_part2_have_id_high_expr = names(which(sort(apply(breast_tpm_part2[part1_no_part2_have_id,],1 ,sum)>=1)))
> breast_tpm_part2[part1_no_part2_have_id_high_expr,]
CAMA1_BREAST MDAMB157_BREAST UACC812_BREAST HCC202_BREAST HCC1143_BREAST MDAMB453_BREAST
ENSG00000181374.3 0.00 0.00 0.00 0 2.99 0
ENSG00000200237.1 0.00 4.04 0.00 0 0.00 0
ENSG00000201407.1 0.00 7.04 0.00 0 0.00 0
ENSG00000201737.1 0.00 0.00 0.00 0 6.36 0
ENSG00000205821.4 0.00 0.00 1.47 0 0.00 0
ENSG00000206937.1 0.00 5.76 0.00 0 0.00 0
ENSG00000212237.1 0.00 36.63 0.00 0 0.00 0
ENSG00000212413.1 4.34 0.00 0.00 0 0.00 0
ENSG00000212479.1 0.00 0.00 0.00 0 1.63 0
ENSG00000228488.1 0.00 0.00 0.00 0 1.05 0
ENSG00000232208.2 0.00 0.00 0.00 0 4.07 0
ENSG00000241073.1 0.00 0.00 7.48 0 0.00 0
ENSG00000248322.1 0.00 1.29 0.00 0 0.00 0
ENSG00000250677.1 0.00 7.66 0.00 0 0.00 0
ENSG00000254193.1 2.23 0.00 0.00 0 0.00 0
ENSG00000255772.1 0.00 0.00 0.00 0 1.18 0
ENSG00000261093.1 1.16 0.00 0.00 0 0.00 0
ENSG00000262090.1 0.00 4.08 0.00 0 0.00 0
ENSG00000269804.1 0.00 0.00 4.45 0 0.00 0
ENSG00000269808.1 0.00 0.00 2.81 0 0.00 0
ENSG00000270092.2 0.00 8.15 0.00 0 0.00 0
ENSG00000270856.1 0.00 0.00 7.58 0 0.00 0
ENSG00000270902.1 0.00 1.50 0.00 0 0.00 0
ENSG00000271205.1 0.00 2.61 0.00 0 0.00 0
ENSG00000271419.1 0.00 0.03 14.35 0 0.04 0
ENSG00000272085.1 0.00 1.99 0.00 0 0.00 0
发现规律和上述是一致的。这是为啥呢?为啥会是这样子的呢?