import pandas as pd
from translate import Translator
# ---------------------------------------------------------------------------
# Build one row per taxid: hit count plus kingdom / genus / species columns.
#
# Input: a tab-separated table that must provide the columns `taxid` and
# `taxnomy_k_g_s` (a ';'-separated lineage string; presumably
# "kingdom;genus;species" judging by the column name -- verify against the
# file that produces it).
# Result: `df6` with columns taxid, count, kingdom, genus, species, type.
# ---------------------------------------------------------------------------
file1 = r"2021-10\SRR1370913_nr_virus_blastx_match_taxid_lineage_add_kgs2"
df1 = pd.read_csv(file1, sep='\t')
print(df1)

# Number of rows per taxid; value_counts() already sorts most-frequent first,
# so a single call replaces the two identical ones.
taxid_counts = df1['taxid'].value_counts()
print(type(taxid_counts))
print(taxid_counts)

df4 = pd.DataFrame({'taxid': taxid_counts.index, 'count': taxid_counts.values})
print(df4)

# taxid -> lineage string. When a taxid occurs on several rows the last row
# wins, exactly as dict(zip(...)) has always behaved here.
dict_id_to_taxnomy = dict(zip(df1['taxid'], df1['taxnomy_k_g_s']))
print(dict_id_to_taxnomy)
df4['taxnomy_k_g_s'] = df4['taxid'].map(dict_id_to_taxnomy)
print(df4)

# Split the lineage into exactly three columns. n=2 caps the number of splits
# (any extra ';' stays inside the last field) and the reindex pads missing
# fields, so assigning the three column names can no longer fail with a
# length mismatch when a lineage has fewer or more than three parts.
lineage_parts = df4['taxnomy_k_g_s'].str.split(';', n=2, expand=True)
df5 = lineage_parts.reindex(columns=[0, 1, 2])
df5.columns = ['kingdom', 'genus', 'species']

# `names=` only labels hierarchical index levels created via `keys=`, so it
# had no effect on this plain column-wise concat and is omitted.
df6 = pd.concat([df4, df5], axis=1)
print(df6)
del df6['taxnomy_k_g_s']
print(df6)

# Placeholder value 0 for rows that never receive a label. The column is kept
# as object dtype because string labels are written into it afterwards;
# writing strings into an int64 column triggers pandas' incompatible-dtype
# deprecation.
df6['type'] = 0
df6['type'] = df6['type'].astype(object)

# Spot-check of the second row; skipped when the table has fewer than two
# rows instead of raising KeyError.
if len(df6) > 1:
    print(df6.loc[1, 'kingdom'])
print(df6.loc[:, 'type'])
# print("_____________"*3)
# for i in range(len(df6)):
# translator = Translator(to_lang="zh")
# # translation = translator.translate(df6['kingdom'][0])
# # translation1 = translator.translate(df6['kingdom'][i])
# translation1 = translator.translate(df6.loc[i,'kingdom'])
# # df6['type'][i] =translation1
# df6.loc[i,'type']= translation1
#
#
# df6.to_csv(r"2021-10\output",index=None,sep='\t')
# print(df6.loc[:,'type'])
# print("_____________"*3)
#
#
# file2=r'2021-10\output'
# df6=pd.read_csv(file2, sep='\t')
# print(df6)
# print(df6.columns)
# # print(translation)
# for i in range(len(df6)):
# translator = Translator(from_lang='english',to_lang="chinese")
# translation2 = translator.translate(df6.loc[i,'genus'])
# df6.loc[i,'genus_Cname'] = translation2
# print(df6)
# df6.to_csv(r"2021-10\output1",index=None,sep='\t')
# Label every row of df6 with a Chinese category name derived from its kingdom.
# Keys are case-folded kingdom names; values are the labels written to `type`.
_KINGDOM_TYPE_LABELS = {
    'viruses': '病毒',
    'bacteria': '细菌',
    'fungi': '真菌',
}


def _kingdom_type_label(kingdom, fallback):
    """Return the category label for *kingdom*, or *fallback* if unrecognised.

    The comparison is an exact match after stripping surrounding whitespace
    and case-folding. The previous `kingdom in 'Viruses'` / `in 'fungi'`
    checks were substring tests: they accepted '' or fragments such as 'rus',
    never matched the capitalised 'Fungi', and raised TypeError for missing
    (non-string) kingdoms.
    """
    if not isinstance(kingdom, str):
        # Missing lineage (None / NaN): keep whatever the row already holds.
        return fallback
    return _KINGDOM_TYPE_LABELS.get(kingdom.strip().casefold(), fallback)


# Replace the whole column in one assignment: rows without a recognised
# kingdom keep their current `type` value, and no string is written cell by
# cell into a numeric column. Pairing values with zip() also removes the
# assumption that the index is 0..n-1.
df6['type'] = pd.Series(
    [
        _kingdom_type_label(kingdom, current)
        for kingdom, current in zip(df6['kingdom'], df6['type'])
    ],
    index=df6.index,
    dtype=object,
)
print(df6)
# Final report layout: `Name` duplicates the species column and `level 1`
# duplicates the kingdom column; both are appended to df6 in that order.
for new_column, source_column in (('Name', 'species'), ('level 1', 'kingdom')):
    df6[new_column] = df6[source_column]

current_columns = list(df6.columns)
print(current_columns)

# Column order of the exported table.
output_columns = [
    'Name', 'level 1', 'taxid', 'count',
    'kingdom', 'genus', 'species', 'type',
]
df7 = df6.reindex(output_columns, axis='columns')
print(df7)

# Tab-separated export; index=None is kept as in the original call.
df7.to_csv(r"2021-10\output4.txt", sep='\t', index=None)
# Output identification format (bacterial kingdom, phylum, class, order, etc.)_202112
# Latest recommended articles published on 2025-06-10 20:34:35