1.R代码练习1
rm(list=ls())
library(stringr)
library(plyr)
library(xlsx)
library(Biostrings)
library(splitstackshape)
#质谱检测肽段长度 8到25个氨基酸,比较合适。
MS_seq_length_min = 8
MS_seq_length_max = 25
# input files
in_phospho_file = "Phosphorylation_site_dataset" #调控数据
in_MS_file = "huangyu_proteomicsDB_2.txt" # MS 质谱数据
in_protein_fasta_file = "Phosphosite_PTM_seq.fasta" # protein sequence data
# output files
out_phospho_antibody_human_p_file = "4.phospho_antibody_human_p.txt" # containing Human regulatoryhuman phospho sites
out_fasta_reg_file = "4.df_fasta_phospho_antibody.txt" # human protein sequences including human regulatoryhuman phospho sites
out_MS_data_reg_frequency_file = "4.MS_data_phospho_antibody_frequency.txt" # frequency of MS peptides that contain human regulatoryhuman phospho sites
out_phospho_antibody_MS_comb_file = "4.phospho_antibody_MS_comb.txt" # combination of regulatory phosphosite and MS peptides
out_phospho_antibody_MS_comb_MS_file = "4.phospho_antibody_MS_comb_seq_MS.txt" # # combination of regulatory phosphosite and MS peptides (length: 8-25)
# Read regulatory_site file (读取调控数据)
setwd("D:/LQ/phosphosite_2017_12_20/")
phospho_data = read.table(in_phospho_file, header=T, sep="\t", skip=3, comment.char="", stringsAsFactors=F) #读入磷酸化位点数据
phospho_antibody_human_p = subset(phospho_data, ORGANISM %in% "human" & grepl("p$", MOD_RSD) & nchar(CST_CAT.)>2) # 找出人类的磷酸化修饰的位点
phospho_antibody_human_p$pos = as.numeric(substr(phospho_antibody_human_p$MOD_RSD, 2, nchar(phospho_antibody_human_p$MOD_RSD)-2)) # 修饰位点数字化定位
write.table(phospho_antibody_human_p, out_phospho_antibody_human_p_file, quote=FALSE, sep="\t", row.names=FALSE) #读出phospho_antibody_human_p
#read human protein sequence file (fasta format)
fastaFile <- readAAStringSet(in_protein_fasta_file) #读入蛋白质序列
seq_name = names(fastaFile)
sequence = paste(fastaFile)
df_fasta <- data.frame(seq_name, sequence,stringsAsFactors=F)
df_fasta = cSplit(df_fasta,"seq_name","|",direction="wide")
colnames(df_fasta)[ncol(df_fasta)] = "PROTEIN_ID"
df_fasta[] <- lapply(df_fasta, as.character) # change factor class to character class
df_fasta = unique(df_fasta) # remove replicate rows
# retain protein sequences that contain antibody-targeted phosphosites
df_fasta_antibody = subset(df_fasta, PROTEIN_ID %in% phospho_antibody_human_p$ACC_ID) #根据蛋白质ID,找到磷酸化修饰位点所在蛋白质序列
#add 7x"_" before and after protein sequence for matching
df_fasta_antibody$SEQ = paste("_______", toupper(df_fasta_antibody$sequence), "_______",sep="") #将序列改为大写,并加上7个"-",便于匹配
write.table(df_fasta_antibody, out_fasta_reg_file, quote=FALSE, sep="\t", row.names=FALSE) #读出带有抗体的蛋白质序列df_fasta_antibody
#debug code: check if all antibody-targeted sites are included in the protein sequences
phospho_antibody_human_p_tmp = subset(phospho_antibody_human_p, ACC_ID %in% df_fasta_antibody$PROTEIN_ID )
if(nrow(phospho_antibody_human_p_tmp) != nrow(phospho_antibody_human_p)){stop("Error! the number of rows are different from Phosphosite ! track back!")}
# read peptide sequences detected using MS machine
MS_data = read.table(in_MS_file, header=T, sep="\t",na.strings = "NA", comment.char="", stringsAsFactors=F) #读入质谱数据
# Let both MS_data and phospho_dis_reg have the same swisprot ID
MS_data_anti = subset(MS_data, PROTEIN_ID %in% df_fasta_antibody$PROTEIN_ID) #根据蛋白质ID,筛选带有抗体的肽段
df_fasta_antibody = subset(df_fasta_antibody, PROTEIN_ID %in% MS_data$PROTEIN_ID) #从带有抗体的蛋白质序列中选出能与肽段匹配的蛋白质序列
# count the frequency of each MS peptide (#the more frequency, the easier to be detected)
MS_data_anti_frequency=as.data.frame(table(MS_data_reg$SEQUENCE), stringsAsFactors=F) #根据肽段出现频率,读出肽段丰度
MS_data_anti_frequency = subset(MS_data_reg_frequency, Freq>0)
write.table(MS_data_reg_frequency, out_MS_data_reg_frequency_file, quote=FALSE, sep="\t", row.names=FALSE) #读出肽段丰度
#找出蛋白质长度、带有抗体的磷酸化修饰的肽段序列,序列最大长度,及最大丰度
phospho_antibody_human_p$prot_length = NA # store protein length
phospho_antibody_human_p$MS_seq = NA # store the MS peptides that contain the antibody-targeted phosphosites
phospho_antibody_human_p$MS_seq_max = NA # store the MS peptide detected with the most times
phospho_antibody_human_p$MS_seq_max_length = NA # store the length of MS peptide detected with the most times
phospho_antibody_human_p$MS_seq_max_freq = NA # store the frequency of the MS peptide with the most times
if( nrow(phospho_antibody_human_p) ==0 ){exit("Error! Line 71. There is no content in the df phospho_antibody_human_p!")}
#填入数据
for(i in 1:nrow(phospho_antibody_human_p)){
phospho = phospho_antibody_human_p[i,]
df_fasta = subset(df_fasta_antibody, PROTEIN_ID %in% phospho$ACC_ID)
MS_data_tmp = subset(MS_data_anti, PROTEIN_ID %in% phospho$ACC_ID)
if(length(MS_data_tmp$SEQUENCE)<10){next} # omit the proteins if protein sequence length <10
MS_data_freq=as.data.frame(table(MS_data_tmp$SEQUENCE), stringsAsFactors=F) # summerize the MS peptides and frequency
pos = unlist(gregexpr(pattern =phospho$SITE_...7_AA,df_fasta$SEQ, ignore.case = T)) # position of phosphosite in protein sequence
if( phospho$pos %in% pos){ # if both positions are same (position stored in phosphosite and position by matching with protein sequence)
# Set the locations of MS peptides in protein sequence
seq_locations=str_locate(toupper(df_fasta$sequence), MS_data_freq$Var1) # return start and end position for each MS peptide
df_location_range=data.frame(matrix(seq_locations, ncol=2, byrow=F),stringsAsFactors=FALSE) # change matrix format to df
colnames(df_location_range) = c("pep_start", "pep_end")
MS_data_freq = cbind(MS_data_freq, df_location_range)
# Judge if the phosphosite is located within the MS peptides
MS_data_freq$pos_minus_start = phospho$pos - MS_data_freq$pep_start
MS_data_freq$pos_minus_end = phospho$pos - MS_data_freq$pep_end
MS_data_freq_inc_phospho = subset(MS_data_freq, pos_minus_start>0 & pos_minus_end<0)
# order the df if df has multiple MS peptides containing phosphosite, the one with max freq is ordered at the top
MS_data_freq_inc_phospho = MS_data_freq_inc_phospho[with(MS_data_freq_inc_phospho, order(-Freq)), ]
phospho_antibody_human_p$prot_length[i] = nchar(df_fasta$sequence)
phospho_antibody_human_p$MS_seq[i] = paste(MS_data_freq_inc_phospho$Var1, MS_data_freq_inc_phospho$Freq, collapse=";")
phospho_antibody_human_p$MS_seq_max[i] = MS_data_freq_inc_phospho$Var1[1]
phospho_antibody_human_p$MS_seq_max_length[i] = nchar(MS_data_freq_inc_phospho$Var1[1])
phospho_antibody_human_p$MS_seq_max_freq[i] = MS_data_freq_inc_phospho$Freq[1]
}
}
write.table(phospho_antibody_huma