import pandas as pd
# from collections import Counter
# # df_all_reads=pd.read_excel(r'20220505_tNGS\result\filter\qc.xls',engine='xlrd')
# # print(df_all_reads)
#
# with open(r'20220505_tNGS\result\filter\qc.xls','r') as f:
# f.readline()
# for line in f:
# line=line.strip('\n')
# list=line.split('\t')
# if list[0]=='total_reads':
# all_reads=int(list[1])
# print(all_reads)
#
# df_map=pd.read_csv(r'20220505_tNGS\result\amp.xls',engine='python',sep='\t',header=None)
#
# df_map.columns=['accession','count']
# print(df_map)
# print(sum(df_map['count']))
#
# # capture_rate=str(sum(df_map['count'])/all_reads*100)+'%'
# capture_rate=sum(df_map['count'])/all_reads
# capture_rate='%.2f%%' %(capture_rate*100)
# # print('%.2f%%' %(capture_rate*100))
# # capture_rate=format(capture_rate,'.00%')
# print(capture_rate)
#
# df_mNGS_all_reads=pd.read_csv(r'qc.xls',engine='python',sep='\t')
#
# print(df_mNGS_all_reads)
#
# df_tngs_species_rpm=pd.read_csv(r'report.xls',engine='python',sep='\t')
# df_tngs_species_rpm['tngs_species_rpm']=''
# print(df_tngs_species_rpm)
# for i in range(len(df_tngs_species_rpm)):
# df_tngs_species_rpm.loc[i,'tngs_species_rpm']=int(df_tngs_species_rpm.loc[i,'S_Count'])/all_reads*1000000
# df_tngs_species_rpm=df_tngs_species_rpm[['species_Cname','tngs_species_rpm']]
# print(df_tngs_species_rpm)
# df_tngs_species_rpm['mngs_species_rpm']=''
#
# df_mngs_species_rpm=pd.read_csv(r'classify_abundance_result.anno.xls',engine='python',sep='\t')
# print(df_mngs_species_rpm)
# dict_mngs_species_rpm=dict(zip(df_mngs_species_rpm['species_Cname'],df_mngs_species_rpm['sam_RPM']))
# print(dict_mngs_species_rpm)
# for i in range(len(df_tngs_species_rpm)):
# if df_tngs_species_rpm.loc[i,'species_Cname'] in dict_mngs_species_rpm:
# df_tngs_species_rpm.loc[i,'mngs_species_rpm']=dict_mngs_species_rpm[df_tngs_species_rpm.loc[i,'species_Cname']]
# else:
# df_tngs_species_rpm.loc[i, 'mngs_species_rpm'] ='-'
# print(df_tngs_species_rpm)
#
# print('-----'*5)
#
#
#
# ###统计捕获效率
# import os
# records_tngs_qc_xls=[]
# records_tngs_qc_xls_name=[]
# targetpath=r'比较'
# for root,dirs,files in os.walk(targetpath):
# for name in files:
# if 'T.qc.xls' in name:
# records_tngs_qc_xls.append(root+'\\'+name)
# records_tngs_qc_xls_name.append(name)
# print(records_tngs_qc_xls)
# print(records_tngs_qc_xls_name)
#
# records_tngs_amp_xls=[]
# records_tngs_amp_xls_name=[]
# targetpath=r'比较'
# for root,dirs,files in os.walk(targetpath):
# for name in files:
# if 'T.amp.xls' in name:
# records_tngs_amp_xls.append(root+'\\'+name)
# records_tngs_amp_xls_name.append(name)
# print(records_tngs_amp_xls)
# print(records_tngs_amp_xls_name)
#
# dict_capture_rate={}
# for i in records_tngs_qc_xls:
# with open(i, 'r') as f:
# f.readline()
# for line in f:
# line = line.strip('\n')
# list = line.split('\t')
# if list[0] == 'total_reads':
# all_reads = int(list[1])
# print(all_reads)
#
# for j in records_tngs_amp_xls:
# if i.rsplit('\\',1)[1].split('.',1)[0] in j:
# df_map = pd.read_csv(j,engine='python', sep='\t', header=None)
#
# df_map.columns = ['accession', 'count']
# # print(df_map)
# # print(sum(df_map['count']))
# capture_rate = sum(df_map['count']) / all_reads
# capture_rate = '%.2f%%' % (capture_rate * 100)
# dict_capture_rate[i.rsplit('\\',1)[1].split('.',1)[0]]=capture_rate
#
# print(dict_capture_rate)
#
# df_capture_rate=pd.DataFrame([dict_capture_rate])
# print(df_capture_rate)
#
# ### 统计q20,q30
# records_tngs_qc_xls=[]
# records_tngs_qc_xls_name=[]
# targetpath=r'比较'
# for root,dirs,files in os.walk(targetpath):
# for name in files:
# if 'T.qc.xls' in name:
# records_tngs_qc_xls.append(root+"\\"+name)
# records_tngs_qc_xls_name.append(name)
# print(records_tngs_qc_xls)
# print(records_tngs_qc_xls_name)
#
# dict_Qvalue={}
# for i in records_tngs_qc_xls:
# with open(i, 'r') as f:
# f.readline()
# for line in f:
# line = line.strip('\n')
# list = line.split('\t')
# if list[0] == 'Q20':
# Q_20 = list[1]
# elif list[0] == 'Q30':
# Q_30 = list[1]
# # print(Q_20)
# # print(Q_30)
# dict_Qvalue[i.rsplit('\\',1)[1].split('.',1)[0]]={'Q20':Q_20,'Q30':Q_30}
# print(dict_Qvalue)
# df_Qvalue=pd.DataFrame(dict_Qvalue)
# print(df_Qvalue)
### 统计tngs和mngs的rpm
print('第三步,统计rpm')
import os
records_tngs_qc_xls=[]
records_tngs_qc_xls_name=[]
targetpath=r'比较'
for root,dirs,files in os.walk(targetpath):
for name in files:
if 'T.qc.xls' in name:
records_tngs_qc_xls.append(root+'\\'+name)
records_tngs_qc_xls_name.append(name)
print(records_tngs_qc_xls)
print(records_tngs_qc_xls_name)
records_tngs_amp_xls=[]
records_tngs_amp_xls_name=[]
targetpath=r'比较'
for root,dirs,files in os.walk(targetpath):
for name in files:
if 'T.amp.xls' in name:
records_tngs_amp_xls.append(root+'\\'+name)
records_tngs_amp_xls_name.append(name)
print(records_tngs_amp_xls)
print(records_tngs_amp_xls_name)
records_tngs_report_xls=[]
records_tngs_report_xls_name=[]
targetpath=r'比较'
for root,dirs,files in os.walk(targetpath):
for name in files:
if 'T.report.xls' in name:
records_tngs_report_xls.append(root+'\\'+name)
records_tngs_report_xls_name.append(name)
print(records_tngs_report_xls)
print(records_tngs_report_xls_name)
records_mngs_result_xls=[]
records_mngs_result_xls_name=[]
targetpath=r'比较'
for root,dirs,files in os.walk(targetpath):
for name in files:
if '.classify_abundance_result.anno.xls' in name:
records_mngs_result_xls.append(root+'\\'+name)
records_mngs_result_xls_name.append(name)
print(records_mngs_result_xls)
print(records_mngs_result_xls_name)
#
# df_add_species_rpm=pd.DataFrame(columns=['species_Cname'])
df_add_species_rpm=pd.DataFrame()
for i in records_tngs_qc_xls:
with open(i, 'r') as f:
f.readline()
for line in f:
line = line.strip('\n')
list = line.split('\t')
if list[0] == 'total_reads':
all_reads = int(list[1])
print(all_reads)
for j in records_tngs_report_xls:
if i.rsplit('\\',1)[1].split('.',1)[0] in j:
df_tngs_species_rpm = pd.read_csv(j,engine='python', sep='\t')
df_tngs_species_rpm[i.rsplit('\\',1)[1].split('.',1)[0]+'-tngs_species_rpm'] = ''
print(df_tngs_species_rpm)
for k in range(len(df_tngs_species_rpm)):
df_tngs_species_rpm.loc[k,i.rsplit('\\',1)[1].split('.',1)[0]+'-tngs_species_rpm']=int(df_tngs_species_rpm.loc[k,'S_Count'])/all_reads*1000000
df_tngs_species_rpm=df_tngs_species_rpm[['species_Cname',i.rsplit('\\',1)[1].split('.',1)[0]+'-tngs_species_rpm']]
df_tngs_species_rpm[str(i.rsplit('\\',1)[1].split('.',1)[0]+'-mngs_species_rpm').replace('T','')] = ''
# print(i.rsplit('\\',1)[1].split('.',1)[0])
for m in records_mngs_result_xls:
if i.rsplit('\\',1)[1].split('.',1)[0].replace('T','') in m:
df_mngs_species_rpm =pd.read_csv(m,engine='python',sep='\t')
dict_mngs_species_rpm = dict(zip(df_mngs_species_rpm['species_Cname'], df_mngs_species_rpm['sam_RPM']))
# print(dict_mngs_species_rpm)
for n in range(len(df_tngs_species_rpm)):
if df_tngs_species_rpm.loc[n,'species_Cname'] in dict_mngs_species_rpm:
df_tngs_species_rpm.loc[n,i.rsplit('\\',1)[1].split('.',1)[0].replace('T','')+'-mngs_species_rpm']=dict_mngs_species_rpm[df_tngs_species_rpm.loc[n,'species_Cname']]
else:
df_tngs_species_rpm.loc[n, i.rsplit('\\',1)[1].split('.',1)[0].replace('T','')+'-mngs_species_rpm'] ='-'
# print(df_tngs_species_rpm)
print(df_tngs_species_rpm)
# df_add_species_rpm=pd.merge(df_add_species_rpm,df_tngs_species_rpm,on='species_Cname')
# df_add_species_rpm = pd.merge(df_add_species_rpm, df_tngs_species_rpm,left_on=None, right_on=None, left_index=False, right_index=False)
df_add_species_rpm = pd.concat([df_add_species_rpm, df_tngs_species_rpm])
print(df_add_species_rpm)
# df_mngs_species_rpm=pd.read_csv(r'比较\result\classify_abundance_result.anno.xls',engine='python',sep='\t')
# # print(df_mngs_species_rpm)
# dict_mngs_species_rpm=dict(zip(df_mngs_species_rpm['species_Cname'],df_mngs_species_rpm['sam_RPM']))
# # print(dict_mngs_species_rpm)
# for i in range(len(df_tngs_species_rpm)):
# if df_tngs_species_rpm.loc[i,'species_Cname'] in dict_mngs_species_rpm:
# df_tngs_species_rpm.loc[i,'mngs_species_rpm']=dict_mngs_species_rpm[df_tngs_species_rpm.loc[i,'species_Cname']]
# else:
# df_tngs_species_rpm.loc[i, 'mngs_species_rpm'] ='-'
# print(df_tngs_species_rpm)
# df_map.columns = ['accession', 'count']
# # print(df_map)
# # print(sum(df_map['count']))
# capture_rate = sum(df_map['count']) / all_reads
# capture_rate = '%.2f%%' % (capture_rate * 100)
# dict_capture_rate[i.rsplit('\\',1)[1].split('.',1)[0]]=capture_rate
# print(dict_capture_rate)
#
# df_capture_rate=pd.DataFrame([dict_capture_rate])
# print(df_capture_rate)