import pandas as pd
# Load the tab-separated hit table (a blastn result, judging by the file
# name). NOTE: the path is relative and uses a Windows-style separator.
file1 = r"2021-10\K200005137_L01_126_nt_virus_blastn_match_taxid_lineage_add_kgs1.txt"
df1 = pd.read_csv(file1, sep='\t')
print(df1)
print(df1.loc[:, '% identity'])
print(df1['% identity'].value_counts(ascending=False))

# Report once if any row fails the "> 80" identity cut-off. A single
# vectorised comparison replaces the row-by-row scan; missing values compare
# as False and therefore count as failing, and an empty table prints nothing.
if not (df1['% identity'] > 80).all():
    print("unmeet")
# Distinct query IDs, in order of first appearance in the table.
list1 = df1['Query_id'].unique().tolist()
print(list1)
# Number of distinct queries versus total number of hit rows.
print(len(list1), df1.shape[0])

# Dump (row label, query ID, species) for every hit. Zipping the columns
# avoids materialising a Series per row the way iterrows() does.
for row_label, query_id, species in zip(df1.index, df1['Query_id'], df1['species']):
    print(row_label, query_id, species)

# items() has to be called (and materialised) to show the (label, query ID)
# pairs; printing the bare attribute only shows the bound-method repr.
print(list(df1['Query_id'].items()))
# Collect the species of every hit per query ID with one groupby instead of
# re-filtering the whole table once per query. Rows whose Query_id is missing
# are left out of the mapping, so such an ID resolves to an empty list below.
species_by_query = {
    query_id: group.tolist()
    for query_id, group in df1.groupby('Query_id', sort=False)['species']
}

# list3 gathers the query IDs whose hits disagree on the first
# space-delimited word of the species name (presumably the genus -- confirm
# against the input format).
list3 = []
for query_id in list1:
    list2 = species_by_query.get(query_id, [])
    print(list2)
    # str() keeps non-string cells (e.g. NaN) comparable as text.
    leading_words = {str(name).split(' ', 1)[0] for name in list2}
    if len(leading_words) > 1:
        print(query_id)
        list3.append(query_id)
print(list3)
# Remove every hit that belongs to a flagged query in one pass. The IDs are
# passed to isin() unchanged so they are compared in the column's own type;
# converting them to str first would never match a numeric Query_id column.
df1 = df1[~df1['Query_id'].isin(list3)]
print(df1)

# Renumber the surviving rows from 0 and save them without the row labels.
df2 = df1.reset_index(drop=True)
print(df2)
df2.to_csv(r"2021-10\output5.txt", index=False, sep='\t')
# Reload the filtered table from disk and keep only the first row seen for
# each Query_id.
file2 = r"2021-10\output5.txt"
df3 = pd.read_csv(file2, sep='\t')
repeated_query = df3.duplicated(subset='Query_id')
first_hits = df3.loc[~repeated_query]

# reset_index() is called without drop=True, so the previous row labels are
# kept as a new "index" column, which is also written to output6.txt.
# NOTE(review): confirm that extra column is wanted downstream.
df4 = first_hits.reset_index()

# Number of retained queries per species, most frequent first (the default
# ordering of value_counts).
df5 = df4['species'].value_counts()
print(df4)
print(df5)
df4.to_csv(r"2021-10\output6.txt", sep='\t', index=False)