import click
import pandas as pd
import os
# Column names assigned to the 12-column table written by `gsnapl -A m8`.
_M8_COLUMNS = ['Query id', 'accession.version', '% identity', 'alignment length',
               'mismatches', 'gap openings',
               'q. start', 'q. end', 's. start', 's. end', 'e-value', 'bit score']

# Names given to the three ';'-separated fields of the '{k};{g};{s}' lineage.
_RANK_COLUMNS = ['kingdom', 'genus', 'species']

# Exact kingdom name -> label written to the 'type' column (anything else -> 0).
# NOTE(review): depending on the taxonkit version, '{k}' may yield superkingdom
# names (e.g. 'Eukaryota'), in which case fungi are never labelled - confirm.
_KINGDOM_TYPE_LABELS = {
    'Viruses': '病毒',
    'Bacteria': '细菌',
    'Fungi': '真菌',
    'fungi': '真菌',
}


def _run_gsnap_in_docker(input_hostfilter_file, input_hostfilter_file_name, docker,
                         gsnap_aligndatabase, gsnap_aligndatabase_index, outputpath):
    """Copy the reads into the container, run gsnapl there and copy temp_gsnap.m8 back."""
    # NOTE(review): `docker exec -it` requests a TTY and fails when stdin is not
    # a terminal (cron, nohup); confirm whether `-i` alone would be sufficient.
    status = os.system(
        "echo '复制文件进入docker进行gsnap比对' && docker cp %s %s:/ && "
        "docker exec -it %s /bin/bash -c 'gsnapl -A m8 --batch=0 --use-shared-memory=0 "
        "--gmap-mode=none --npaths=100 --ordered -t 36 --max-mismatches=40 "
        "-D %s -d %s /%s > temp_gsnap.m8' && docker cp %s:/temp_gsnap.m8 %s " % (
            input_hostfilter_file, docker, docker, gsnap_aligndatabase,
            gsnap_aligndatabase_index, input_hostfilter_file_name, docker, outputpath))
    # The steps are chained with `&&`, so a non-zero status means no fresh
    # temp_gsnap.m8 was copied out; stop instead of reading a stale/missing file.
    if status != 0:
        raise click.ClickException(
            'gsnap alignment in docker failed (exit status %s)' % status)
    os.system("echo '比对文件输出' ")


def _write_hits_with_taxid(reference, outputpath):
    """Append a taxid column to the gsnap hits and write the two taxid work files."""
    accession_table = pd.read_csv(reference, sep='\t').iloc[:, 1:3]
    accession_to_taxid = dict(zip(accession_table['accession.version'],
                                  accession_table['taxid']))

    hits = pd.read_csv('%s/temp_gsnap.m8' % outputpath, sep='\t', header=None)
    hits.columns = _M8_COLUMNS

    # Fail with a readable message instead of a bare KeyError when the
    # reference table does not cover every subject accession.
    unknown = [accession for accession in hits['accession.version'].unique()
               if accession not in accession_to_taxid]
    if unknown:
        raise click.ClickException(
            '%d accession(s) are missing from %s, e.g. %s'
            % (len(unknown), reference, ', '.join(str(a) for a in unknown[:5])))

    hits['taxid'] = [accession_to_taxid[accession]
                     for accession in hits['accession.version']]
    hits.to_csv('%s/tmp_blastn_match_taxid' % outputpath,
                index=False, sep='\t', header=None)
    hits['taxid'].to_csv('%s/tmp_blastn_match_taxid_totaxid' % outputpath,
                         index=False, sep='\t', header=None)


def _attach_lineage(outputpath):
    """Resolve taxids to 'kingdom;genus;species' with taxonkit and write the *_add_kgs table."""
    lineage_status = os.system("taxonkit lineage %s/tmp_blastn_match_taxid_totaxid | taxonkit reformat -f '{k};{g};{s}' |cut -f 1,3 > %s/tmp_blastn_match_taxid_totaxid_lineage " % (outputpath, outputpath))
    paste_status = os.system('paste %s/tmp_blastn_match_taxid_totaxid_lineage %s/tmp_blastn_match_taxid > %s/tmp_blastn_match_taxid_lineage' % (outputpath, outputpath, outputpath))
    if lineage_status == 0 and paste_status == 0:
        print("taxid匹配上lineage成功")
    else:
        print("taxid匹配上lineage失败")
        # Continuing would parse an incomplete or stale lineage table.
        raise click.ClickException(
            'taxonkit/paste failed (exit statuses %s, %s)'
            % (lineage_status, paste_status))

    merged = pd.read_csv('%s/tmp_blastn_match_taxid_lineage' % outputpath,
                         sep='\t', header=None)
    # `paste` put (taxid, lineage) in front of the hit columns; the last column
    # is the taxid that was already appended to the hits, so drop it.
    merged = merged.iloc[:, :-1]
    merged.columns = ['taxid', 'taxnomy_k_g_s'] + _M8_COLUMNS

    ranks = merged['taxnomy_k_g_s'].str.split(';', expand=True)
    ranks.columns = _RANK_COLUMNS
    pd.concat([merged, ranks], axis=1).to_csv(
        '%s/tmp_blastn_match_taxid_lineage_add_kgs' % outputpath,
        index=False, sep='\t')


def _drop_ambiguous_reads(outputpath):
    """Discard reads whose hits span several genera, then keep one hit per read."""
    base = '%s/tmp_blastn_match_taxid_lineage_add_kgs' % outputpath
    hits = pd.read_csv(base, sep='\t')

    print(hits['% identity'].value_counts(ascending=False))
    # Diagnostic only: report once if any hit is not above 80 % identity.
    if not (hits['% identity'] > 80).all():
        print("unmeet")

    # The first word of the species name is used as the genus of a hit; a read
    # is ambiguous when its hits disagree on that word.
    genus_word = hits['species'].map(lambda name: str(name).split(' ', 2)[0])
    genera_per_read = genus_word.groupby(hits['Query id']).nunique()
    ambiguous_reads = genera_per_read.index[genera_per_read > 1]
    # Compare ids in their native dtype (the previous str() round trip never
    # matched non-string ids, so such reads were silently kept).
    unambiguous = hits[~hits['Query id'].isin(ambiguous_reads)].reset_index(drop=True)
    unambiguous.to_csv(base + '_delrepeat', index=None, sep='\t')

    # Re-read the file that was just written so values go through the same
    # CSV round trip as before, then keep the first hit of every read.
    # reset_index() (without drop) is kept on purpose: it adds the 'index'
    # column that the *_delrepeat1 file has always contained.
    first_hits = (pd.read_csv(base + '_delrepeat', sep='\t')
                  .drop_duplicates(subset='Query id')
                  .reset_index())
    first_hits.to_csv(base + '_delrepeat1', index=None, sep='\t')


def _summarise_per_taxid(outputpath):
    """Count reads per taxid and write the per-taxid summary (*_delrepeat1_out.txt)."""
    base = '%s/tmp_blastn_match_taxid_lineage_add_kgs_delrepeat1' % outputpath
    first_hits = pd.read_csv(base, sep='\t')

    taxid_counts = first_hits['taxid'].value_counts(ascending=False)
    summary = pd.DataFrame({'taxid': taxid_counts.index, 'count': taxid_counts.values})

    taxid_to_lineage = dict(zip(first_hits['taxid'], first_hits['taxnomy_k_g_s']))
    lineage = summary['taxid'].apply(lambda taxid: taxid_to_lineage[taxid])
    ranks = lineage.str.split(';', expand=True)
    ranks.columns = _RANK_COLUMNS
    summary = pd.concat([summary, ranks], axis=1)

    # Exact-match lookup: a missing or empty kingdom (and any other kingdom)
    # gets 0 instead of being matched as a substring of 'Viruses' / 'fungi'.
    summary['type'] = [_KINGDOM_TYPE_LABELS.get(kingdom, 0)
                       for kingdom in summary['kingdom']]
    summary['Name'] = summary['species']
    summary['level 1'] = summary['kingdom']

    summary = summary.reindex(
        columns=['Name', 'level 1', 'taxid', 'count', 'kingdom', 'genus', 'species', 'type'])
    summary.to_csv(base + '_out.txt', index=None, sep='\t')


def _summarise_per_species(outputpath, id_of_data):
    """Collapse the per-taxid summary to one row per (Name, level 1) and write the final table."""
    per_taxid = pd.read_csv(
        '%s/tmp_blastn_match_taxid_lineage_add_kgs_delrepeat1_out.txt' % outputpath,
        sep='\t')

    per_name = (per_taxid.drop(columns=['count'])
                .drop_duplicates(subset=['Name', 'level 1'], keep='first')
                .reset_index(drop=True))

    # Total read count per Name, looked up by name rather than by row position,
    # so it neither depends on chained assignment nor on the two tables having
    # the same row order. Rows without a Name keep a count of 0, as before.
    name_totals = per_taxid.groupby('Name')['count'].sum()
    per_name['count'] = per_name['Name'].map(name_totals).fillna(0).astype(int)

    per_name = per_name[['Name', 'level 1', 'taxid', 'count',
                         'kingdom', 'genus', 'species', 'type']]
    print(per_name)
    per_name.to_csv(
        '%s/%s_blastn_match_taxid_lineage_add_kgs_delrepeat1_out.txt' % (outputpath, id_of_data),
        index=None, sep='\t')


@click.command()
@click.option('-r', '--reference', help='accession_file(nucl_gb.accession2taxid/prot.accession2taxid /...)', required=True)
@click.option('-op', '--outputpath', help='outputpath_name(/home/lijing/all_test/test_202112/ /...)', required=True)
@click.option('-id', '--id_of_data', help='data_id_name(K200004870_L01_60 /...)', required=True)
@click.option('-i', '--input_hostfilter_file', help='input_hostfilter_file_path_name(/home/lijing/all_test/test_202112/tmp_star_trimmomatic_priceseq_dedup_lzw_bowtie2_gsnap.fasta |...)', required=True)
@click.option('-d', '--docker', help='docker_name(823fa78ed660 |...)', required=True)
@click.option('-ga', '--gsnap_aligndatabase',
              help='gsnap_aligndatabase_name(/idseqflow-dockerfile-container-share/20211109/GMAP-GSNAP_data/nt_bacteria_db |...)',
              required=True)
@click.option('-gai', '--gsnap_aligndatabase_index',
              help='gsnap_aligndatabase_index_name(nt_bacteria_k16 | nt_virus_k16 | nt_standard_db |...)',
              required=True)
@click.option('-n', '--input_hostfilter_file_name', help='input_hostfilter_file_name(tmp_star_trimmomatic_priceseq_dedup_lzw_bowtie2_gsnap.fasta |...)', required=True)
def get_blastn_match_taxid_lineage(reference, outputpath, id_of_data, input_hostfilter_file, docker, gsnap_aligndatabase_index, gsnap_aligndatabase, input_hostfilter_file_name):
    """Align reads with gsnap in Docker and summarise the hits per species.
    \f

    Pipeline (every step communicates through files under ``outputpath``):

    1. copy ``input_hostfilter_file`` into the container, run ``gsnapl`` and
       copy ``temp_gsnap.m8`` back;
    2. map each subject accession to a taxid using ``reference``;
    3. resolve taxids to ``kingdom;genus;species`` with ``taxonkit``;
    4. drop reads whose hits disagree on the genus word and keep one hit per
       read;
    5. count reads per taxid, then per species name, and write
       ``<id_of_data>_blastn_match_taxid_lineage_add_kgs_delrepeat1_out.txt``.

    Args:
        reference: Tab-separated accession table; its 2nd and 3rd columns must
            be named ``accession.version`` and ``taxid``.
        outputpath: Directory receiving the intermediate and final files.
        id_of_data: Prefix of the final output file name.
        input_hostfilter_file: Host path of the reads file copied into Docker.
        docker: Container name or id passed to ``docker cp`` / ``docker exec``.
        gsnap_aligndatabase_index: Value passed to ``gsnapl -d``.
        gsnap_aligndatabase: Value passed to ``gsnapl -D``.
        input_hostfilter_file_name: File name of the reads inside the
            container root (``/<name>``).

    Raises:
        click.ClickException: If the gsnap, taxonkit or paste command exits
            with a non-zero status, or if an accession is missing from
            ``reference``.
    """
    _run_gsnap_in_docker(input_hostfilter_file, input_hostfilter_file_name, docker,
                         gsnap_aligndatabase, gsnap_aligndatabase_index, outputpath)
    _write_hits_with_taxid(reference, outputpath)
    _attach_lineage(outputpath)
    _drop_ambiguous_reads(outputpath)
    _summarise_per_taxid(outputpath)
    _summarise_per_species(outputpath, id_of_data)
# Script entry point: click parses the command-line options and invokes the command.
if __name__ == "__main__":
    get_blastn_match_taxid_lineage()