import re
#定义一个计算hamming distance的函数
def hamm(seq1, seq2):
dis = 0
for i in range(len(seq1)):
if seq1[i] == seq2[i]:
continue
else:
dis += 1
return dis
#序列的反向补码
def reverse(s):
dic = {'A': 'T', 'G': 'C', 'C': 'G', 'T': 'A'}
rcom = ''.join([dic[x] for x in s[::-1]])
return rcom
#读取fasta中的序列
with open('../examples/ros_bio34_CORR.txt') as f:
file = f.readlines()
table = {}
for line in file:
line = re.sub(r'\n', '', line)
m = re.match(r'^>.*', line)
if m:
name = m.group()
table[name] = ''
else:
table[name] += line
seq = [x for x in table.values()]
#统计所有序列中的错误序列以及正确序列
correct = []
wrong = []
for i in range(len(seq)):
remain = seq[:i] + seq[i+1:]
length = len(correct)
for line in remain:
if hamm(seq[i], line) == 0 or hamm(seq[i], reverse(line)) == 0:
correct.append(seq[i])
break
if len(correct) == length:
wrong.append(seq[i])
#遍历错误序列,计算哈明距离为一的序列即为修正后的序列
result = {}
for i in wrong:
for j in correct:
if hamm(i, j) == 1:
result[i] = j
elif hamm(i, reverse(j)) == 1:
result[i] = reverse(j)
for k, v in result.items():
print(k, '->', v, sep='')