import pandas as pd
# Build an accession -> accession.version lookup from the nucl.accession2taxid
# table, translate the accessions listed in the search file, and write the
# matching accession.version values to the output file, one per line.

# Only the two columns used for the lookup are loaded; no other column of
# this table is read by the visible script, so parsing them would only cost
# time and memory.
df_ref = pd.read_csv(
    '/all_test/nucl.accession2taxid',
    header=0,
    sep='\t',
    usecols=['accession', 'accession.version'],
)
accession_to_accessionversion_dict = dict(
    zip(df_ref['accession'], df_ref['accession.version'])
)

search_file1 = '/seqkit/提取的耐药基因对应的nt完整序列id3+accession.txt'
# header=0: the first row of the search file is consumed as column names.
# NOTE(review): confirm the search file really starts with a header row;
# otherwise its first accession is dropped here.
df_search = pd.read_csv(search_file1, sep='\t', header=0)

# Translate the first column of the search file. Accessions that are absent
# from the reference table are skipped silently.
list2 = [
    accession_to_accessionversion_dict[accession]
    for accession in df_search.iloc[:, 0]
    if accession in accession_to_accessionversion_dict
]

# One accession.version per line, with no trailing newline after the last one.
line = '\n'.join(list2)
# The context manager closes the file; no explicit close() is needed.
with open(r'/seqkit/提取的耐药基因对应的nt完整序列final_accession.version.txt', 'w', encoding='utf-8') as f:
    f.write(line)
# # Old approach: real 24m45.064s
# import pandas as pd
# accession_file1 = 'nucl.accession2taxid'
# df_accession = pd.read_csv(accession_file1, sep='\t')
#
# accession_to_taxid_dict = dict(zip(df_accession['accession'], df_accession['taxid']))