import pandas as pd
import glob
# df1=pd.read_csv(r'C:\Users\Downloads\clincal_com_database_v2.csv',sep=',',encoding='gbk')
# print(df1)
#
# list_have=[]
# list_no_have=[]
# list_have_path=[]
#
# for i in range(len(df1)):
# if str(df1.loc[i,'type'])=='真菌':
# if len(glob.glob('/data_download/Fungi/*%s*' % (str(df1.loc[i,'species_Ename']).replace(' ','_'))))==0:
# # if len(list_Fungi)==0:
# list_no_have.append((str(df1.loc[i,'species_Ename'])))
# else:
# list_have.append((str(df1.loc[i,'species_Ename'])))
# list_have_path.append(glob.glob('/data_download/Fungi/*%s*' % (str(df1.loc[i,'species_Ename']).replace(' ','_'))))
#
# elif str(df1.loc[i,'type'])=='细菌':
# if len(glob.glob('/data_download/Bacteria/*%s*' % (str(df1.loc[i, 'species_Ename']).replace(' ', '_'))))==0:
# list_no_have.append((str(df1.loc[i, 'species_Ename'])))
# else:
# list_have.append((str(df1.loc[i, 'species_Ename'])))
# list_have_path.append(glob.glob('/data_download/Bacteria/*%s*' % (str(df1.loc[i, 'species_Ename']).replace(' ', '_'))))
#
# elif df1.loc[i,'type']=='寄生虫':
# if len(glob.glob('/data_download/parasite/*%s*' % (str(df1.loc[i, 'species_Ename']).replace(' ', '_'))))==0:
# list_no_have.append((str(df1.loc[i, 'species_Ename'])))
# else:
# list_have.append((str(df1.loc[i, 'species_Ename'])))
# list_have_path.append(glob.glob('/data_download/parasite/*%s*' % (str(df1.loc[i, 'species_Ename']).replace(' ', '_'))))
#
# elif df1.loc[i,'type']=='病毒':
# if len(glob.glob('/data_download/virus/*%s*' % (str(df1.loc[i, 'species_Ename']).replace(' ', '_'))))==0:
# list_no_have.append((str(df1.loc[i, 'species_Ename'])))
# else:
# list_have.append((str(df1.loc[i, 'species_Ename'])))
# list_have_path.append(glob.glob('/data_download/virus/*%s*' % (str(df1.loc[i, 'species_Ename']).replace(' ', '_'))))
#
# # list_no_have=sum(list_no_have,[])
# # list_have=sum(list_have,[])
# list_have_path=sum(list_have_path,[])
#
# with open(r'/data_download/no_have_species1.txt','w') as f:
# for i in list_no_have:
# f.write(i+'\n')
# f.close()
#
# with open(r'/data_download/have_species_path.txt','w') as f:
# for i in list_have_path:
# f.write(i+'\n')
# f.close()
import pandas as pd
import gzip
import sys
# Build a tab-separated "species<TAB>accession" table from a list of gzipped
# sequence files whose header lines start with '>' (FASTA-style).
#
# The list file holds one path per line; each listed file name is expected to
# look like "<prefix>_<Genus>_<species>.<ext>...", and the species label is
# derived from that name.

PATH_LIST_FILE = r'C:\Users\Downloads\have_species_path.txt'
OUTPUT_FILE = 'out_file.txt'
TABLE_HEADER = 'species' + '\t' + 'accession' + '\n'


def read_path_list(list_file):
    """Return the non-blank lines of *list_file*, stripped of whitespace.

    The file is read as plain text (one path per line) rather than through a
    CSV parser, so paths containing commas or quotes are kept intact.
    """
    with open(list_file, encoding='utf-8') as handle:
        return [line.strip() for line in handle if line.strip()]


def species_from_path(fasta_path):
    """Derive the species label from the file name of *fasta_path*.

    Takes the last '/'-separated component, drops everything from the first
    '.', drops the leading "<prefix>_" and turns the remaining underscores
    into spaces, e.g. ".../GCF_Candida_albicans.fna.gz" -> "Candida albicans".

    Raises:
        ValueError: if the file name stem contains no underscore, i.e. there
            is no "<prefix>_" part to remove.
    """
    file_name = fasta_path.rsplit('/', 1)[-1]
    stem = file_name.split('.', 1)[0]
    _prefix, separator, species = stem.partition('_')
    if not separator:
        raise ValueError(
            'cannot derive species from %r: expected "<prefix>_<species>" '
            'in the file name' % (fasta_path,)
        )
    return species.replace('_', ' ')


def iter_accessions(fasta_path):
    """Yield the accession of every '>' header line in a gzipped text file.

    The accession is the text between the leading '>' and the first space of
    the header line.
    """
    # Text mode with an explicit encoding: decoding happens once at the I/O
    # boundary and does not depend on the platform's locale.
    with gzip.open(fasta_path, 'rt', encoding='utf-8') as fasta:
        for line in fasta:
            if line.startswith('>'):
                yield line.rstrip('\n')[1:].split(' ', 1)[0]


def write_accession_table(fasta_paths, out_path):
    """Append one "species<TAB>accession" row per header line to *out_path*.

    The output is opened once, in append mode, so existing content is never
    discarded. The column header is written only when the file is empty, so
    it appears once instead of once per input file.
    """
    with open(out_path, 'a') as out:
        # In append mode the stream starts at end-of-file, so a position of
        # 0 means the file is new or empty.
        if out.tell() == 0:
            out.write(TABLE_HEADER)
        for fasta_path in fasta_paths:
            # The species depends only on the file name: compute it once per
            # file, not once per header line.
            species = species_from_path(fasta_path)
            out.writelines(
                species + '\t' + accession + '\n'
                for accession in iter_accessions(fasta_path)
            )


def main():
    """Read the path list and write the species/accession table."""
    fasta_paths = read_path_list(PATH_LIST_FILE)
    if fasta_paths:
        # Show the first path as a quick sanity check of the list file.
        print(fasta_paths[0])
    write_accession_table(fasta_paths, OUTPUT_FILE)


if __name__ == '__main__':
    main()