R语言编程实践-优快云博客

本文链接：https://blog.youkuaiyun.com/Doris_xixi/article/details/80196531

本文详细介绍了如何使用R语言进行数据分析和可视化，包括数据导入、清洗、统计分析以及绘制精美图表的步骤，适合初学者入门。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

1.R代码练习1

rm(list=ls())
library(stringr)
library(plyr)
library(xlsx)
library(Biostrings)
library(splitstackshape)

#质谱检测肽段长度 8到25个氨基酸，比较合适。
MS_seq_length_min = 8
MS_seq_length_max = 25

# input files
in_phospho_file = "Phosphorylation_site_dataset" #调控数据
in_MS_file = "huangyu_proteomicsDB_2.txt" # MS 质谱数据
in_protein_fasta_file = "Phosphosite_PTM_seq.fasta" # protein sequence data

# output files
out_phospho_antibody_human_p_file = "4.phospho_antibody_human_p.txt" # containing Human regulatoryhuman phospho sites 
out_fasta_reg_file = "4.df_fasta_phospho_antibody.txt" # human protein sequences including human regulatoryhuman phospho sites 
out_MS_data_reg_frequency_file = "4.MS_data_phospho_antibody_frequency.txt" # frequency of MS peptides that contain human regulatoryhuman phospho sites 
out_phospho_antibody_MS_comb_file = "4.phospho_antibody_MS_comb.txt" # combination of regulatory phosphosite and MS peptides
out_phospho_antibody_MS_comb_MS_file = "4.phospho_antibody_MS_comb_seq_MS.txt" # # combination of regulatory phosphosite and MS peptides (length: 8-25)

# Read regulatory_site file （读取调控数据）
setwd("D:/LQ/phosphosite_2017_12_20/")
phospho_data = read.table(in_phospho_file, header=T, sep="\t", skip=3, comment.char="", stringsAsFactors=F)    #读入磷酸化位点数据
phospho_antibody_human_p = subset(phospho_data, ORGANISM %in% "human" & grepl("p$", MOD_RSD) & nchar(CST_CAT.)>2) # 找出人类的磷酸化修饰的位点

phospho_antibody_human_p$pos = as.numeric(substr(phospho_antibody_human_p$MOD_RSD, 2, nchar(phospho_antibody_human_p$MOD_RSD)-2)) # 修饰位点数字化定位
write.table(phospho_antibody_human_p, out_phospho_antibody_human_p_file, quote=FALSE, sep="\t", row.names=FALSE)    #读出phospho_antibody_human_p

#read human protein sequence file (fasta format)
fastaFile <- readAAStringSet(in_protein_fasta_file)            #读入蛋白质序列
seq_name = names(fastaFile)
sequence = paste(fastaFile)
df_fasta <- data.frame(seq_name, sequence,stringsAsFactors=F)
df_fasta = cSplit(df_fasta,"seq_name","|",direction="wide")
colnames(df_fasta)[ncol(df_fasta)] = "PROTEIN_ID"
df_fasta[] <- lapply(df_fasta, as.character) # change factor class to character class
df_fasta = unique(df_fasta) # remove replicate rows

# retain protein sequences that contain antibody-targeted phosphosites
df_fasta_antibody = subset(df_fasta, PROTEIN_ID %in% phospho_antibody_human_p$ACC_ID)   #根据蛋白质ID，找到磷酸化修饰位点所在蛋白质序列  
#add 7x"_" before and after protein sequence for matching
df_fasta_antibody$SEQ = paste("_______", toupper(df_fasta_antibody$sequence), "_______",sep="")  #将序列改为大写，并加上7个"-",便于匹配
write.table(df_fasta_antibody, out_fasta_reg_file, quote=FALSE, sep="\t", row.names=FALSE)      #读出带有抗体的蛋白质序列df_fasta_antibody

#debug code: check if all antibody-targeted sites are included in the protein sequences
phospho_antibody_human_p_tmp = subset(phospho_antibody_human_p, ACC_ID %in% df_fasta_antibody$PROTEIN_ID )
if(nrow(phospho_antibody_human_p_tmp) != nrow(phospho_antibody_human_p)){stop("Error! the number of rows are different from Phosphosite ! track back!")}

# read peptide sequences detected using MS machine 
MS_data = read.table(in_MS_file, header=T, sep="\t",na.strings = "NA", comment.char="", stringsAsFactors=F)   #读入质谱数据

# Let both MS_data and phospho_dis_reg have the same swisprot ID
MS_data_anti = subset(MS_data, PROTEIN_ID %in% df_fasta_antibody$PROTEIN_ID)    #根据蛋白质ID，筛选带有抗体的肽段
df_fasta_antibody = subset(df_fasta_antibody, PROTEIN_ID %in% MS_data$PROTEIN_ID)     #从带有抗体的蛋白质序列中选出能与肽段匹配的蛋白质序列

# count the frequency of each MS peptide (#the more frequency, the easier to be detected)
MS_data_anti_frequency=as.data.frame(table(MS_data_reg$SEQUENCE), stringsAsFactors=F)   #根据肽段出现频率，读出肽段丰度
MS_data_anti_frequency = subset(MS_data_reg_frequency, Freq>0)
write.table(MS_data_reg_frequency, out_MS_data_reg_frequency_file, quote=FALSE, sep="\t", row.names=FALSE)   #读出肽段丰度

#找出蛋白质长度、带有抗体的磷酸化修饰的肽段序列，序列最大长度，及最大丰度
phospho_antibody_human_p$prot_length = NA  # store protein length
phospho_antibody_human_p$MS_seq = NA  # store the MS peptides that contain the antibody-targeted phosphosites
phospho_antibody_human_p$MS_seq_max = NA  # store the MS peptide detected with the most times
phospho_antibody_human_p$MS_seq_max_length = NA # store the length of MS peptide detected with the most times
phospho_antibody_human_p$MS_seq_max_freq = NA # store the frequency of the MS peptide with the most times

if( nrow(phospho_antibody_human_p) ==0 ){exit("Error! Line 71. There is no content in the df phospho_antibody_human_p!")}
#填入数据
for(i in 1:nrow(phospho_antibody_human_p)){
  phospho = phospho_antibody_human_p[i,]
  df_fasta = subset(df_fasta_antibody, PROTEIN_ID %in% phospho$ACC_ID)
  MS_data_tmp = subset(MS_data_anti, PROTEIN_ID %in% phospho$ACC_ID)  
  if(length(MS_data_tmp$SEQUENCE)<10){next} # omit the proteins if protein sequence length <10 
  MS_data_freq=as.data.frame(table(MS_data_tmp$SEQUENCE), stringsAsFactors=F) # summerize the MS peptides and frequency
  pos = unlist(gregexpr(pattern =phospho$SITE_...7_AA,df_fasta$SEQ, ignore.case = T)) # position of phosphosite in protein sequence
  if( phospho$pos %in% pos){ # if both positions are same (position stored in phosphosite and position by matching with protein sequence)
    
    # Set the locations of MS peptides in protein sequence
    seq_locations=str_locate(toupper(df_fasta$sequence), MS_data_freq$Var1) # return start and end position for each MS peptide
    df_location_range=data.frame(matrix(seq_locations, ncol=2, byrow=F),stringsAsFactors=FALSE) # change matrix format to df
    colnames(df_location_range) = c("pep_start", "pep_end")
    MS_data_freq = cbind(MS_data_freq, df_location_range)
    
    # Judge if the phosphosite is located within the MS peptides
    MS_data_freq$pos_minus_start = phospho$pos - MS_data_freq$pep_start
    MS_data_freq$pos_minus_end = phospho$pos - MS_data_freq$pep_end
    MS_data_freq_inc_phospho = subset(MS_data_freq, pos_minus_start>0 & pos_minus_end<0)
    # order the df if df has multiple MS peptides containing phosphosite, the one with max freq is ordered at the top
    MS_data_freq_inc_phospho = MS_data_freq_inc_phospho[with(MS_data_freq_inc_phospho, order(-Freq)), ]  

    phospho_antibody_human_p$prot_length[i] = nchar(df_fasta$sequence)
    phospho_antibody_human_p$MS_seq[i] = paste(MS_data_freq_inc_phospho$Var1, MS_data_freq_inc_phospho$Freq, collapse=";")
    phospho_antibody_human_p$MS_seq_max[i] = MS_data_freq_inc_phospho$Var1[1]
    phospho_antibody_human_p$MS_seq_max_length[i] = nchar(MS_data_freq_inc_phospho$Var1[1])
    phospho_antibody_human_p$MS_seq_max_freq[i] = MS_data_freq_inc_phospho$Freq[1]
  }
}

write.table(phospho_antibody_huma