统计TNGS的捕获效率+Q20、Q30+各菌的检测效率

本文链接：https://blog.youkuaiyun.com/m0_50542529/article/details/135082877
import pandas as pd
# from collections import Counter
# # df_all_reads=pd.read_excel(r'20220505_tNGS\result\filter\qc.xls',engine='xlrd')
# # print(df_all_reads)
#
# with open(r'20220505_tNGS\result\filter\qc.xls','r') as f:
#     f.readline()
#     for line in f:
#         line=line.strip('\n')
#         list=line.split('\t')
#         if list[0]=='total_reads':
#             all_reads=int(list[1])
# print(all_reads)
#
# df_map=pd.read_csv(r'20220505_tNGS\result\amp.xls',engine='python',sep='\t',header=None)
#
# df_map.columns=['accession','count']
# print(df_map)
# print(sum(df_map['count']))
#
# # capture_rate=str(sum(df_map['count'])/all_reads*100)+'%'
# capture_rate=sum(df_map['count'])/all_reads
# capture_rate='%.2f%%' %(capture_rate*100)
# # print('%.2f%%' %(capture_rate*100))
# # capture_rate=format(capture_rate,'.00%')
# print(capture_rate)
#
# df_mNGS_all_reads=pd.read_csv(r'qc.xls',engine='python',sep='\t')
#
# print(df_mNGS_all_reads)
#
# df_tngs_species_rpm=pd.read_csv(r'report.xls',engine='python',sep='\t')
# df_tngs_species_rpm['tngs_species_rpm']=''
# print(df_tngs_species_rpm)
# for i in range(len(df_tngs_species_rpm)):
#     df_tngs_species_rpm.loc[i,'tngs_species_rpm']=int(df_tngs_species_rpm.loc[i,'S_Count'])/all_reads*1000000
# df_tngs_species_rpm=df_tngs_species_rpm[['species_Cname','tngs_species_rpm']]
# print(df_tngs_species_rpm)
# df_tngs_species_rpm['mngs_species_rpm']=''
#
# df_mngs_species_rpm=pd.read_csv(r'classify_abundance_result.anno.xls',engine='python',sep='\t')
# print(df_mngs_species_rpm)
# dict_mngs_species_rpm=dict(zip(df_mngs_species_rpm['species_Cname'],df_mngs_species_rpm['sam_RPM']))
# print(dict_mngs_species_rpm)
# for i in range(len(df_tngs_species_rpm)):
#     if df_tngs_species_rpm.loc[i,'species_Cname'] in dict_mngs_species_rpm:
#         df_tngs_species_rpm.loc[i,'mngs_species_rpm']=dict_mngs_species_rpm[df_tngs_species_rpm.loc[i,'species_Cname']]
#     else:
#         df_tngs_species_rpm.loc[i, 'mngs_species_rpm'] ='-'
# print(df_tngs_species_rpm)
#
# print('-----'*5)
#
#
#
# ###统计捕获效率
# import os
# records_tngs_qc_xls=[]
# records_tngs_qc_xls_name=[]
# targetpath=r'比较'
# for root,dirs,files in os.walk(targetpath):
#     for name in files:
#         if 'T.qc.xls' in name:
#             records_tngs_qc_xls.append(root+'\\'+name)
#             records_tngs_qc_xls_name.append(name)
# print(records_tngs_qc_xls)
# print(records_tngs_qc_xls_name)
#
# records_tngs_amp_xls=[]
# records_tngs_amp_xls_name=[]
# targetpath=r'比较'
# for root,dirs,files in os.walk(targetpath):
#     for name in files:
#         if 'T.amp.xls' in name:
#             records_tngs_amp_xls.append(root+'\\'+name)
#             records_tngs_amp_xls_name.append(name)
# print(records_tngs_amp_xls)
# print(records_tngs_amp_xls_name)
#
# dict_capture_rate={}
# for i in records_tngs_qc_xls:
#     with open(i, 'r') as f:
#         f.readline()
#         for line in f:
#             line = line.strip('\n')
#             list = line.split('\t')
#             if list[0] == 'total_reads':
#                 all_reads = int(list[1])
#     print(all_reads)
#
#     for j in records_tngs_amp_xls:
#         if i.rsplit('\\',1)[1].split('.',1)[0] in j:
#             df_map = pd.read_csv(j,engine='python', sep='\t', header=None)
#
#             df_map.columns = ['accession', 'count']
#             # print(df_map)
#             # print(sum(df_map['count']))
#             capture_rate = sum(df_map['count']) / all_reads
#             capture_rate = '%.2f%%' % (capture_rate * 100)
#             dict_capture_rate[i.rsplit('\\',1)[1].split('.',1)[0]]=capture_rate
#
# print(dict_capture_rate)
#
# df_capture_rate=pd.DataFrame([dict_capture_rate])
# print(df_capture_rate)
#
# ### 统计q20，q30
# records_tngs_qc_xls=[]
# records_tngs_qc_xls_name=[]
# targetpath=r'比较'
# for root,dirs,files in os.walk(targetpath):
#     for name in files:
#         if 'T.qc.xls' in name:
#             records_tngs_qc_xls.append(root+"\\"+name)
#             records_tngs_qc_xls_name.append(name)
# print(records_tngs_qc_xls)
# print(records_tngs_qc_xls_name)
#
# dict_Qvalue={}
# for i in records_tngs_qc_xls:
#     with open(i, 'r') as f:
#         f.readline()
#         for line in f:
#             line = line.strip('\n')
#             list = line.split('\t')
#             if list[0] == 'Q20':
#                 Q_20 = list[1]
#             elif list[0] == 'Q30':
#                 Q_30 = list[1]
#     # print(Q_20)
#     # print(Q_30)
#     dict_Qvalue[i.rsplit('\\',1)[1].split('.',1)[0]]={'Q20':Q_20,'Q30':Q_30}
# print(dict_Qvalue)
# df_Qvalue=pd.DataFrame(dict_Qvalue)
# print(df_Qvalue)

### 统计tngs和mngs的rpm
print('第三步，统计rpm')
import os
records_tngs_qc_xls=[]
records_tngs_qc_xls_name=[]
targetpath=r'比较'
for root,dirs,files in os.walk(targetpath):
    for name in files:
        if 'T.qc.xls' in name:
            records_tngs_qc_xls.append(root+'\\'+name)
            records_tngs_qc_xls_name.append(name)
print(records_tngs_qc_xls)
print(records_tngs_qc_xls_name)

records_tngs_amp_xls=[]
records_tngs_amp_xls_name=[]
targetpath=r'比较'
for root,dirs,files in os.walk(targetpath):
    for name in files:
        if 'T.amp.xls' in name:
            records_tngs_amp_xls.append(root+'\\'+name)
            records_tngs_amp_xls_name.append(name)
print(records_tngs_amp_xls)
print(records_tngs_amp_xls_name)

records_tngs_report_xls=[]
records_tngs_report_xls_name=[]
targetpath=r'比较'
for root,dirs,files in os.walk(targetpath):
    for name in files:
        if 'T.report.xls' in name:
            records_tngs_report_xls.append(root+'\\'+name)
            records_tngs_report_xls_name.append(name)
print(records_tngs_report_xls)
print(records_tngs_report_xls_name)

records_mngs_result_xls=[]
records_mngs_result_xls_name=[]
targetpath=r'比较'
for root,dirs,files in os.walk(targetpath):
    for name in files:
        if '.classify_abundance_result.anno.xls' in name:
            records_mngs_result_xls.append(root+'\\'+name)
            records_mngs_result_xls_name.append(name)
print(records_mngs_result_xls)
print(records_mngs_result_xls_name)



#
# df_add_species_rpm=pd.DataFrame(columns=['species_Cname'])
df_add_species_rpm=pd.DataFrame()
for i in records_tngs_qc_xls:
    with open(i, 'r') as f:
        f.readline()
        for line in f:
            line = line.strip('\n')
            list = line.split('\t')
            if list[0] == 'total_reads':
                all_reads = int(list[1])
    print(all_reads)

    for j in records_tngs_report_xls:
        if i.rsplit('\\',1)[1].split('.',1)[0] in j:
            df_tngs_species_rpm = pd.read_csv(j,engine='python', sep='\t')

            df_tngs_species_rpm[i.rsplit('\\',1)[1].split('.',1)[0]+'-tngs_species_rpm'] = ''
            print(df_tngs_species_rpm)
            for k in range(len(df_tngs_species_rpm)):
                df_tngs_species_rpm.loc[k,i.rsplit('\\',1)[1].split('.',1)[0]+'-tngs_species_rpm']=int(df_tngs_species_rpm.loc[k,'S_Count'])/all_reads*1000000
            df_tngs_species_rpm=df_tngs_species_rpm[['species_Cname',i.rsplit('\\',1)[1].split('.',1)[0]+'-tngs_species_rpm']]
            df_tngs_species_rpm[str(i.rsplit('\\',1)[1].split('.',1)[0]+'-mngs_species_rpm').replace('T','')] = ''

            # print(i.rsplit('\\',1)[1].split('.',1)[0])
            for m in records_mngs_result_xls:
                if i.rsplit('\\',1)[1].split('.',1)[0].replace('T','') in m:
                    df_mngs_species_rpm =pd.read_csv(m,engine='python',sep='\t')
                    dict_mngs_species_rpm = dict(zip(df_mngs_species_rpm['species_Cname'], df_mngs_species_rpm['sam_RPM']))
                    # print(dict_mngs_species_rpm)
                    for n in range(len(df_tngs_species_rpm)):
                        if df_tngs_species_rpm.loc[n,'species_Cname'] in dict_mngs_species_rpm:
                            df_tngs_species_rpm.loc[n,i.rsplit('\\',1)[1].split('.',1)[0].replace('T','')+'-mngs_species_rpm']=dict_mngs_species_rpm[df_tngs_species_rpm.loc[n,'species_Cname']]
                        else:
                            df_tngs_species_rpm.loc[n, i.rsplit('\\',1)[1].split('.',1)[0].replace('T','')+'-mngs_species_rpm'] ='-'
                    # print(df_tngs_species_rpm)
    print(df_tngs_species_rpm)
    # df_add_species_rpm=pd.merge(df_add_species_rpm,df_tngs_species_rpm,on='species_Cname')
    # df_add_species_rpm = pd.merge(df_add_species_rpm, df_tngs_species_rpm,left_on=None, right_on=None, left_index=False, right_index=False)
    df_add_species_rpm = pd.concat([df_add_species_rpm, df_tngs_species_rpm])

print(df_add_species_rpm)




# df_mngs_species_rpm=pd.read_csv(r'比较\result\classify_abundance_result.anno.xls',engine='python',sep='\t')
# # print(df_mngs_species_rpm)
# dict_mngs_species_rpm=dict(zip(df_mngs_species_rpm['species_Cname'],df_mngs_species_rpm['sam_RPM']))
# # print(dict_mngs_species_rpm)
# for i in range(len(df_tngs_species_rpm)):
#     if df_tngs_species_rpm.loc[i,'species_Cname'] in dict_mngs_species_rpm:
#         df_tngs_species_rpm.loc[i,'mngs_species_rpm']=dict_mngs_species_rpm[df_tngs_species_rpm.loc[i,'species_Cname']]
#     else:
#         df_tngs_species_rpm.loc[i, 'mngs_species_rpm'] ='-'
# print(df_tngs_species_rpm)



#             df_map.columns = ['accession', 'count']
#             # print(df_map)
#             # print(sum(df_map['count']))
#             capture_rate = sum(df_map['count']) / all_reads
#             capture_rate = '%.2f%%' % (capture_rate * 100)
#             dict_capture_rate[i.rsplit('\\',1)[1].split('.',1)[0]]=capture_rate
# print(dict_capture_rate)
#
# df_capture_rate=pd.DataFrame([dict_capture_rate])
# print(df_capture_rate)