如果第一次阅读,请查看写在前面
#暂时发现读取fasta成为字典会让后续数据处理操作很困难,暂时以后慎用!!!
#用列表进行处理将会很简单,下文copy——作者:未琢 https://www.bilibili.com/read/cv2010611 出处:bilibili
#将fasta存为字典后,可以将字典转化为列表更方便处理!!!!!
import re
from collections import defaultdict
fasta = {} #存入fasta文件
#读取fasta文件内容
with open("../examples/ros_bio10_CONS.txt") as f:
file = f.readlines()
for line in file:
line = re.sub(r'\n', "", line)
m = re.match(r'^>.*', line)
if m:
name = m.group()
fasta[name] = ''
else:
fasta[name] = fasta[name] + line
#将profile中字典空值设为0
profile = {'A': {}, 'C': {}, 'G': {}, 'T': {}}
for key in profile.keys():
profile[key] = defaultdict(int)
#A,C,G,T出现次数写入profile字典
for value in fasta.values():
i = 0
length = len(value)
while i < len(value):
if value[i] == 'A':
profile['A'][i] += 1
elif value[i] == 'C':
profile['C'][i] += 1
elif value[i] == 'G':
profile['G'][i] += 1
else:
profile['T'][i] += 1
i += 1
#打印输出
output = {}
for key in profile.keys():
name = key
dic = profile[key]
i = 0
A = ''
while i < length:
A = A + str(dic[i])
i += 1
output[key] = A
#四个碱基的个数放入列表,并根据最大得出序列
A = list(output['A'])
C = list(output['C'])
G = list(output['G'])
T = list(output['T'])
i = 0
sequence = ''
while i < len(A):
if max(A[i], C[i], G[i], T[i]) == A[i]:
sequence += 'A'
elif max(A[i], C[i], G[i], T[i]) == C[i]:
sequence += 'C'
elif max(A[i], C[i], G[i], T[i]) == G[i]:
sequence += 'G'
else:
sequence += 'T'
i += 1
A = ' '.join(A)
C = ' '.join(C)
G = ' '.join(G)
T = ' '.join(T)
print(sequence)
print("A:", A)
print("C:", C)
print("G:", G)
print("T:", T)
'''
def readfasta(lines): # 用封装好的函数读取FASTA文件
seq = []
index = []
seqplast = ""
numlines = 0
for i in lines:
if '>' in i:
index.append(i.replace("\n", "").replace(">", ""))
seq.append(seqplast.replace("\n", ""))
seqplast = ""
numlines += 1
else:
seqplast = seqplast + i.replace("\n", "")
numlines += 1
if numlines == len(lines):
seq.append(seqplast.replace("\n", ""))
seq = seq[1:]
return index, seq
f = open('rosalind_cons.txt', 'r')
lines = f.readlines()
f.close()
[index, seq] = readfasta(lines)
A = []
G = []
T = []
C = []
i = 0
while i < len(seq[0]): # 第一层循环,扫描位点
j = 0
a = 0
g = 0
c = 0
t = 0
while j < len(seq): # 第二层循环,扫描各序列
if seq[j][i] == 'A': # 判断该序列该位点是否为”A”
a += 1 # 是则相应计数加一
elif seq[j][i] == 'G':
g += 1
elif seq[j][i] == 'C':
c += 1
elif seq[j][i] == 'T':
t += 1
j += 1
i +=1
A.append(a) # 记录该位点出现”A”的次数
G.append(g)
C.append(c)
T.append(t)
# 推出一致序列
common = ''
base = 0
while base < len(A):
if max(A[base],G[base],C[base],T[base]) == A[base]:
common += 'A'
elif max(A[base],G[base],C[base],T[base]) == G[base]:
common += 'G'
elif max(A[base],G[base],C[base],T[base]) == T[base]:
common += 'T'
elif max(A[base],G[base],C[base],T[base]) == C[base]:
common += 'C'
base += 1
# 把计数各项从整数改为字符串型,方便写入输出文件
i = 0
while i < len(A):
A[i] = str(A[i])
G[i] = str(G[i])
C[i] = str(C[i])
T[i] = str(T[i])
i += 1
f = open('output.txt','a')
f.write(common + '\n')
f.write("A: ")
f.write(' '.join(A) + '\n')
f.write("C: ")
f.write(' '.join(C) + '\n')
f.write("G: ")
f.write(' '.join(G) + '\n')
f.write("T: ")
f.write(' '.join(T))
f.close()
'''