# central dogma FILE = "seq5.fasta" output = open("central_dogma_report.txt", "w") # 报告文档 # 解析提取fasta格式中纯序列:real_seq with open(FILE, 'r') as fp: real_seq = '' for line in fp.readlines(): if line.startswith('>'): seq_name = line output.write(seq_name) # 序列名 else: real_seq += line output.write("DNA:\n%s" % real_seq) # 序列本体 real_seq = real_seq.replace('\n', '') fp.close() # 统计碱基组成 nt_count = [] for nt in "ATCG": nt_count.append(real_seq.count(nt)) A, T, C, G = nt_count[0:4] others = len(real_seq) - A - T - C - G output.write("\nseq_length:%d\n" % (A + C + T + G + others)) output.write("nt_composition:\n" "A:%d(%.2f%%)\n" "T:%d(%.2f%%)\n" "C:%d(%.2f%%)\n" "G:%d(%.2f%%)\n" "others:%d(%.2f%%)" % (A, (A / len(real_seq)) * 100, T, (T / len(real_seq)) * 100, C, (C / len(real_seq)) * 100, G, (G / len(real_seq)) * 100, others, others / len(real_seq) * 100)) # 碱基组成 output.write("\nGC:%.2f%%\n" % (((C + G) / len(real_seq)) * 100)) # 转录获得mRNA transcript_table = {'A': 'U', 'T': 'A', 'C': 'G', 'G': 'C', 'R': '(A/G)', 'Y': '(T/C)', 'W': '(A/T)', 'S': '(C/G)', 'M': '(A/C)', 'K': '(G/T)', 'B': '(C/G/T)', 'D': '(A/G/T)', 'H': '(A/C/T)', 'V': '(A/C/G)', 'N': '(A/T/C/G)'} mRNA = '' for nt in real_seq.upper(): mRNA += transcript_table[nt] output.write("mRNA:\n") for index in range(0, len(mRNA), 70): output.write(mRNA[index:index + 70]) output.write("\n") # 解析密码子和翻译以及计数氨基酸个数 translate_table = { 'GCU': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'AGA': 'R', 'AGG': 'R', 'UCU': 'S', 'UCC': 'S', 'UCA': 'S', 'UCG': 'S', 'AGU': 'S', 'AGC': 'S', 'AUU': 'I', 'AUC': 'I', 'AUA': 'I', 'UUA': 'L', 'UUG': 'L', 'CUU': 'L', 'CUC': 'L', 'CUA': 'L', 'CUG': 'L', 'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', 'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V', 'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'AAU': 'N', 'AAC': 'N', 'GAU': 'D', 'GAC': 'D', 'UGU': 'C', 'UGC': 'C', 'CAA': 'Q', 'CAG': 'Q', 'GAA': 'E', 'GAG': 'E', 'CAU': 'H', 'CAC': 'H', 'AAA': 'K', 'AAG': 'K', 'UUU': 'F', 'UUC': 'F', 'UAU': 'Y', 'UAC': 'Y', 'AUG': 'M', 'UGG': 'W', 'UAG': 'STOP', 'UGA': 'STOP', 'UAA': 'STOP'} output.write("\naa_seq:\n") for ORF in range(3): orf = '' output.write("\nORF%d:" % (ORF + 1)) for index in range(ORF, len(mRNA), 3): codon = mRNA[index:index + 3] if codon in translate_table.keys(): if translate_table[codon] == "STOP": orf += '*' else: orf += translate_table[codon] else: orf += '-' for index in range(0, len(orf), 60): output.write("\n%s" % (orf[index:index + 60])) output.write("\naa number:\n") for amino_acid in "ACDEFGHIKLMNPQRSTVWY*-": output.write("%s:%d;" % (amino_acid, orf.count(amino_acid))) output.close()
python新手写了个低配版中心法则:)
于 2022-08-22 10:58:44 首次发布