20230328_count_pathogen_seq
import os
import click
import re
import pandas as pd
@click.command()
# @click.option('--inputpath1','-ip1',require=True,type=str,help='anorv-crpm002-single-cap.r2.out.mapped.sam')
# @click.option('--reference','-r',require=True,type=str,help='accession_to_Ename.xls')
# @click.option('--outputpath1','-op1',require=True,type=str,help='count.xls')
@click.option('-ip1', '--inputpath1', help='inputpath1(out.mapped.sam |...)', required=True)
@click.option('-r', '--reference', help='reference(accession_to_Ename.xls |...)', required=True)
@click.option('-op1', '--outputpath1', help='outputpath1(count.xls |...)', required=True)
def count_species(inputpath1,reference,outputpath1):
# referenece = r'C:\Users\Administrator\Desktop\20230327需要查看的\accession_to_Ename.xls'
# inputpath1 = r'C:\Users\Administrator\Desktop\20230327需要查看的\anorv-crpm002-single-cap.r2.out.mapped.sam'
# outputpath1 = r'C:\Users\Administrator\Desktop\20230327需要查看的\count.xls'
df_ref = pd.read_csv(reference, sep='\t')
dict_ref = dict(zip(df_ref['accession'], df_ref['Ename']))
list_accession = []
with open(inputpath1, 'r') as f2:
for line in f2:
line = line.strip('\n')
list1 = line.split('\t')
if sum([int(item.replace('M', '')) for item in re.findall('\d+M', list1[5])]) >= 145:
list_accession.append(dict_ref[list1[2]])
value = pd.value_counts(list_accession)
# print(value)
# print(value.index,value.values)
#
dict1 = dict(zip(value.index, value.values))
# print(dict1)
df1 = pd.DataFrame.from_dict(dict1, orient='index', columns=['count'])
df1 = df1.rename_axis('Ename').reset_index()
# print(df_ref)
# df1=pd.merge(df1,df_ref,how='left',on=['accession'])
# # df1=df1[[accession]]
print(df1)
df1.to_csv(outputpath1, sep='\t', index=None)
if __name__=='__main__':
count_species()
20230328_merge_pathogen_seq
import os
import pandas as pd
import click
@click.command()
@click.option('-i','--inputpath',help='inputpath(dir|...)',required=True)
@click.option('-o','--outputpath',help='outputpath(merge.count.xls|...)',required=True)
def merge_count(inputpath,outputpath):
# inputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的'
# outputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\merge.count.xls'
list_need=[item for item in os.listdir(inputpath) if item.endswith('.count.xls')]
df_add=pd.DataFrame(columns=['Ename'])
for i in list_need:
df1=pd.read_csv(inputpath+'/'+i,sep='\t')
df1=df1.rename(columns={'count':i.split('.')[0]+'_count'})
print(df1)
df_add=pd.merge(df_add,df1,how='outer',on=['Ename'])
df_add.fillna(0,inplace=True)
print(df_add)
df_add.to_csv(outputpath,sep='\t',index=None)
if __name__=='__main__':
merge_count()
20230329_count_seq_rpm
import os
import click
import re
import pandas as pd
import math
@click.command()
# @click.option('--inputpath1','-ip1',require=True,type=str,help='anorv-crpm002-single-cap.r2.out.mapped.sam')
# @click.option('--reference','-r',require=True,type=str,help='accession_to_Ename.xls')
# @click.option('--outputpath1','-op1',require=True,type=str,help='count.xls')
@click.option('-ip1', '--inputpath1', help='inputpath1(anorv-crpm002-single-cap.r2.out.mapped.sam |...)', required=True)
@click.option('-r', '--reference', help='reference(accession_to_Ename.xls |...)', required=True)
@click.option('-op1', '--outputpath1', help='outputpath1(count.xls |...)', required=True)
@click.option('-ip2', '--inputpath2', help='inputpath2(anorv-crpm002-single-cap.qc.xls |...)', required=True)
def count_species(inputpath1,reference,outputpath1,inputpath2):
# referenece = r'C:\Users\Administrator\Desktop\20230327需要查看的\accession_to_Ename.xls'
# inputpath1 = r'C:\Users\Administrator\Desktop\20230327需要查看的\anorv-crpm002-single-cap.r2.out.mapped.sam'
# outputpath1 = r'C:\Users\Administrator\Desktop\20230327需要查看的\count.xls'
df_ref = pd.read_csv(reference, sep='\t')
dict_ref = dict(zip(df_ref['accession'], df_ref['Ename']))
df_qc=pd.read_csv(inputpath2,sep='\t')
dict_qc={}
# df_qc['value']
for i in range(len(df_qc)):
dict_qc[df_qc.loc[i,'item']]=int(df_qc.loc[i,'value'])
list_accession = []
with open(inputpath1, 'r') as f2:
for line in f2:
line = line.strip('\n')
list1 = line.split('\t')
if sum([int(item.replace('M', '')) for item in re.findall('\d+M', list1[5])]) >= 145:
list_accession.append(dict_ref[list1[2]])
if len(list_accession)==0:
df1=pd.DataFrame(columns=['Ename','count','rpm'])
df1.to_csv(outputpath1, sep='\t', index=None)
else:
value = pd.value_counts(list_accession)
# print(value)
# print(value.index,value.values)
#
dict1 = dict(zip(value.index, value.values))
# print(dict1)
df1 = pd.DataFrame.from_dict(dict1, orient='index', columns=['count'])
df1 = df1.rename_axis('Ename').reset_index()
# print(df_ref)
# df1=pd.merge(df1,df_ref,how='left',on=['accession'])
# # df1=df1[[accession]]
df1['rpm']=df1['count'].apply(lambda x: math.ceil(x/dict_qc['clean_reads']*1000000))
print(df_qc)
print(dict_qc)
print(type(dict_qc['clean_reads']),df1)
df1.to_csv(outputpath1, sep='\t', index=None)
if __name__=='__main__':
count_species()
20230329_estimate_qc
import json
import os
import click
@click.command()
@click.option('-i','--inputpath',help='inputpath(nanorv-crpm002-single-cap.json)',required=True)
@click.option('-o','--outputpath',help='outputpath(nanorv-crpm002-single-cap.qc.xls)',required=True)
def estimate_qc(inputpath,outputpath):
# inputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\nanorv-crpm002-single-cap.json'
file=open(inputpath)
dict1=json.load(file)
# file=json.loads(inputpath)
print(dict1)
print(dict1['summary'])
# outputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\nanorv-crpm002-single-cap.qc.xls'
with open(outputpath,'w') as f1:
f1.write('{}\t{}\n'.format('item','value'))
# for item in dict1['summary']:
# if item == 'before_filtering':
# if
f1.write('%s\t%s\n' %('raw_reads',str(dict1['summary']['before_filtering']['total_reads'])))
f1.write('%s\t%s\n' % ('raw_base', str(dict1['summary']['before_filtering']['total_bases'])))
f1.write('%s\t%s\n' %('clean_reads',str(dict1['summary']['after_filtering']['total_reads'])))
f1.write('%s\t%s\n' % ('clean_base', str(dict1['summary']['after_filtering']['total_bases'])))
f1.write('%s\t%s\n' %('q20(%)',str(round(dict1['summary']['after_filtering']['q20_rate']*100,2))))
f1.write('%s\t%s\n' % ('q30(%)', str(round(dict1['summary']['after_filtering']['q30_rate']*100,2))))
if __name__=='__main__':
estimate_qc()
20230329_merge_seq_rpm
import os
import pandas as pd
import click
@click.command()
@click.option('-i','--inputpath',help='inputpath(dir|...)',required=True)
@click.option('-o','--outputpath',help='outputpath(merge.count.xls|...)',required=True)
def merge_count(inputpath,outputpath):
# inputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的'
# outputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的\merge.count.xls'
list_need=[item for item in os.listdir(inputpath) if item.endswith('.count.rpm.xls')]
df_add=pd.DataFrame(columns=['Ename'])
for i in list_need:
df1=pd.read_csv(inputpath+'/'+i,sep='\t')
df1=df1.rename(columns={'count':i.split('.')[0]+'_count','rpm':i.split('.')[0]+'_rpm'})
print(df1)
df_add=pd.merge(df_add,df1,how='outer',on=['Ename'])
df_add.fillna(0,inplace=True)
print(df_add)
df_add.to_csv(outputpath,sep='\t',index=None)
if __name__=='__main__':
merge_count()
20230330_merge_qc
import pandas as pd
import os
import numpy as np
import click
@click.command()
@click.option('-i','--inputpath',help='inputpath(dir)|..',required=True)
@click.option('-o','--outputpath',help='outputpath(qc.merge.xls)|..',required=True)
def qc_merge(inputpath,outputpath):
# inputpath=r'C:\Users\Administrator\Desktop\20230327需要查看的'
list_need=[item for item in os.listdir(inputpath) if item.endswith('.qc.xls')]
# outputpath=r''
df_add=pd.DataFrame()
for i in list_need:
# with open()
df1=pd.read_csv(inputpath+'/'+i,sep='\t')
print(df1)
df2=df1.T
# print(df1.T)
df2=df2.rename_axis('id').reset_index(drop=True)
array=np.array(df2)
list_1=array.tolist()
print(list_1)
list_name=list_1[0]
df2.columns=list_name
df2.drop([0],inplace=True,axis=0)
df2=df2.reset_index(drop=True)
df2['sample_name']=i.split('.qc.xls',1)[0]
print(df2)
# df_add=df_add.append(df2)
df_add=pd.concat([df_add,df2],axis=0).reset_index(drop=True)
df_part=df_add.pop('sample_name')
df_add.insert(0,'sample_name',df_part)
print(df_add)
df_add.to_csv(outputpath,sep='\t',index=None)
if __name__=='__main__':
qc_merge()
run.capture_line
echo start at `date`
## step 1
path=/data/lijing/data
outpath=/data/lijing/test_result
# id=nanorv-crpm002-single-cap
ls ${path}/*.gz |while read id; do echo $(basename $id)>>${outpath}/name.txt;done
cut -d'.' -f 1 ${outpath}/name.txt |sort|uniq > ${outpath}/name.uniq.txt
## step 2
# for item in `cat ${outpath}/name.uniq.txt`; do echo $item; done
for item in `cat ${outpath}/name.uniq.txt`; do /home/biosoft/fastp/fastp \
--thread 20 -W 4 -q 20 -l 150 -y 30 --detect_adapter_for_pe -x \
-i ${path}/${item}.r1.fq.gz \
-I ${path}/${item}.r2.fq.gz \
-o ${outpath}/${item}.r1.out.fq.gz \
-O ${outpath}/${item}.r2.out.fq.gz \
-j ${outpath}/${item}.json \
-h ${outpath}/${item}.html && \
/usr/bin/python3 /data/lijing/download/20230329_estimate_qc.py \
-i ${outpath}/${item}.json \
-o ${outpath}/${item}.qc.xls; done
## step 3
for item in `cat ${outpath}/name.uniq.txt`; do /home/biosoft/bwa-0.7.17/bwa mem /data/lijing/download/Severe_acute_respiratory_syndrome_coronavirus_2.fa \
${outpath}/${item}.r1.out.fq.gz \
${outpath}/${item}.r2.out.fq.gz > \
${outpath}/${item}.out.sam && \
/usr/bin/samtools view -@ 20 -F 4 ${outpath}/${item}.out.sam > \
${outpath}/${item}.out.mapped.sam ; done
## step 4
for item in `cat ${outpath}/name.uniq.txt`; do /usr/bin/python3 /data/lijing/download/20230329_count_seq_rpm.py \
-ip1 ${outpath}/${item}.out.mapped.sam \
-r /data/lijing/download/accession_to_Ename.xls \
-ip2 ${outpath}/${item}.qc.xls \
-op1 ${outpath}/${item}.count.rpm.xls ; done
## srep 5
/usr/bin/python3 /data/lijing/download/20230329_merge_seq_rpm.py \
-i ${outpath} \
-o ${outpath}/count.merge.count.rpm.xls && \
/usr/bin/python3 /data/lijing/download/20230330_merge_qc.py \
-i ${outpath} \
-o ${outpath}/count.merge.qc.xls
echo finish at `date`