在使用下面的部分函数之前,需要加载一下R包。
library(data.table) #loading packages
library(dplyr)
library(tidyr)
1. 按照特定的分割符,拆分某列。
(1)按照单一的分隔符。
# Split the `gene.TE` column of `data` on ":" into four columns named
# "transcript", "gene", "family" and "class".
ExtensionDat <- separate(
  data,
  col = gene.TE,
  into = c("transcript", "gene", "family", "class"),
  sep = ":"
)
(2)按照多于1个的分隔符。
# Split column `V2` of `gtf` into "chromsome", "start", "stop" and "strand".
# The separator is a regular-expression character class, so any one of "-",
# "|" or ":" acts as a delimiter.
gtff <- separate(
  gtf,
  col = V2,
  into = c("chromsome", "start", "stop", "strand"),
  sep = "[-|:]"
)
2. 宽数据变成长数据,但在转换的过程中不改变某些列。
# Reshape `LengthDat` from wide to long. Every column except the nine excluded
# below is stacked into a key column "sample" (holding the former column name)
# and a value column "expression" (holding the former cell value). The
# excluded columns, such as `transcript`, are carried along unchanged.
LengthDat_long <- LengthDat %>%
  gather(
    key = "sample",
    value = "expression",
    -c(transcript, chromsome, start, stop, strand, length, gene, family, class)
  )
3. 分组统计(计数、求均值等)
# For each (sample, gene) group compute two counts:
#   notZero - number of rows whose `expression` is non-missing and not 0
#   total   - number of rows whose `expression` is not NA
# `expression != 0` yields TRUE/FALSE, so summing it COUNTS the non-zero
# entries; it does not add up the expression values. `na.rm = TRUE` keeps a
# single NA in a group from turning that group's notZero into NA, which makes
# notZero consistent with total (already NA-aware).
CountTable <- FilterDat_long %>%
  group_by(sample, gene) %>%
  summarize(
    notZero = sum(expression != 0, na.rm = TRUE),
    total = sum(!is.na(expression))
  )
# Group mean (not a count): average `fa.bed$length` within every combination
# of `TE_subfamily` and `class_family`.
mean_length <- aggregate(
  x = fa.bed$length,
  by = list(fa.bed$TE_subfamily, fa.bed$class_family),
  FUN = mean
)
# Count the rows of `StructureDat` for each (gene, class.x) combination; the
# tally is stored in a column named "count".
CountTable <- count(StructureDat, gene, class.x, name = "count")
4.当数据集过大时,用fread()进行快速读取。
library(data.table) #该函数位于data.table包里。
# Read "hg38.fa.out" with data.table::fread(). `fill = TRUE` pads rows that
# have fewer fields than the others, and `header = TRUE` treats the first line
# as column names. TRUE is spelled out because `T` is an ordinary variable
# that can be reassigned, whereas TRUE is a reserved word.
fa.bed.o <- fread("hg38.fa.out", fill = TRUE, header = TRUE)
5.按条件赋值
(1)大于小于
# Bin `length` into four labelled ranges stored in `Length_label`. The ranges
# do not overlap, so the order of the four assignments does not matter.
LengthDat_long$Length_label[LengthDat_long$length <= 2000] <- "<=2k"
LengthDat_long$Length_label[
  LengthDat_long$length > 2000 & LengthDat_long$length <= 4000
] <- "2k-4k"
LengthDat_long$Length_label[
  LengthDat_long$length > 4000 & LengthDat_long$length <= 6000
] <- "4k-6k"
LengthDat_long$Length_label[LengthDat_long$length > 6000] <- ">6k"
(2)等于
# Strip the stray ")" from strand values: "+)" becomes "+" and "-)" becomes
# "-". The column is indexed directly (`df$col[rows] <- value`) rather than
# through `df[rows, ]$col <- value`, because the latter stops with
# "replacement has 1 row, data has 0" whenever no row matches.
data_ext3$strand[which(data_ext3$strand == "+)")] <- "+"
data_ext3$strand[which(data_ext3$strand == "-)")] <- "-"

# Replace missing `class` values with a placeholder label; a data set without
# any NA is left untouched instead of raising an error.
annotationDat$class[is.na(annotationDat$class)] <- "not-enough-length"
(3)包含
# Tag rows by substring match: rows whose `subject.acc.ver` contains "ORF2"
# get ORF = "ORF2", then rows containing "ORF1" get ORF = "ORF1" (a row
# matching both ends up as "ORF1" because that assignment runs last).
data_ext3[
  grepl(pattern = "ORF2", x = data_ext3$subject.acc.ver),
  "ORF"
] <- "ORF2"
data_ext3[
  grepl(pattern = "ORF1", x = data_ext3$subject.acc.ver),
  "ORF"
] <- "ORF1"

# Tag rows by set membership: each listed `sample` name is mapped to its
# `group` label.
FilterDat_noZero_L1$group[
  FilterDat_noZero_L1$sample %in% c("ESC-1", "ESC-2", "ESC-3")
] <- "ESC"
FilterDat_noZero_L1$group[
  FilterDat_noZero_L1$sample %in%
    c("Neuron_pro-1", "Neuron_pro-2", "Neuron_pro-3")
] <- "Neuron_pro"
FilterDat_noZero_L1$group[
  FilterDat_noZero_L1$sample %in% c("Neuron-1", "Neuron-2", "Neuron-3")
] <- "Neuron"
(4)新列赋值
# Fill the `label` column with the placeholder "null" in every row.
data_ext4[, "label"] <- "null"
# Then overwrite the label at two specific positions, rows i and i + 1.
data_ext4[i, "label"] <- "1rd"
data_ext4[i + 1, "label"] <- "1rd"
6. rbind之前,保证列数和列名相同
# Assemble a new row `b`: columns 1-6 copied from row i of `data_ext4`, then
# min(a) and max(a), eight NA placeholders, the value from column 17 of row i,
# and the label "2rd".
b <- as.data.frame(
  c(
    data_ext4[i, 1:6],
    min(a),
    max(a),
    rep(NA, 8),
    data_ext4[i, 17],
    "2rd"
  )
)
# rbind() on data frames requires matching column names, so copy them across
# before appending the new row.
names(b) <- names(data_ext4)
data_ext4 <- rbind(data_ext4, b)
7.判断两行的对应列的内容是否相同
# Condition header only (the body of this `if` is not shown here). The
# element-wise `==` compares columns 1-6 of row m with the same columns of
# row m + 1; the sum of the resulting logicals equals 6 only when all six
# cells match.
# NOTE(review): an NA in any compared cell makes the sum NA, and `if` then
# fails with an error - confirm these columns never contain NA.
if(sum(data_ext8[m,c(1:6)]==data_ext8[m+1,c(1:6)])==6)
8.paste合并列,按照特定的连接符
# Build "chromsome:start-stop(strand)" for every row.
data_ext8$label <- paste0(
  data_ext8$chromsome, ":", data_ext8$start, "-", data_ext8$stop,
  "(", data_ext8$strand, ")"
)
# Build "chromsome:start-stop:strand" for every row.
data_ext8$label2 <- paste0(
  data_ext8$chromsome, ":", data_ext8$start, "-", data_ext8$stop,
  ":", data_ext8$strand
)
9.筛选特定的数据子集
# Keep the rows of `FilterDat` whose `family` is "L1".
FilterDat2 <- FilterDat[FilterDat$family == "L1", ]

# Keep the rows of `CountTable` whose `gene` is one of the eight listed names.
CountTable2 <- filter(
  CountTable,
  gene %in% c(
    "L1HS", "L1PA2", "L1PA3", "L1PA4", "L1PA5", "L1PA6", "L1PA7", "L1PA8"
  )
)

# Keep the rows of `repFilterDat_noZero_L1` whose `n` column equals 3.
Savedfiles_repFilterDat_noZero_L1 <- filter(repFilterDat_noZero_L1, n == 3)
# Keep only the rows of `data` whose row name does NOT contain "ENSG", then
# prepend those row names as the first column.
# The original pattern "ENSG*" was written like a shell glob, but grepl()
# reads it as a regular expression meaning "ENS" followed by zero or more
# "G", so any row name merely containing "ENS" was excluded as well. Matching
# the literal text "ENSG" (fixed = TRUE) implements the stated intent. The
# filtered subset is computed once and reused instead of being built twice.
# NOTE(review): the first column's name is auto-generated from the expression
# text; refer to it by position or rename it explicitly.
TE.esc <- data[!grepl("ENSG", row.names(data), fixed = TRUE), ]
TE.esc <- cbind(row.names(TE.esc), TE.esc)