cCupcake---ToFu

Iso-Seq数据处理流程
 1 ############human brain______FL
 2 cd /zs32/data-analysis/liucy_group/llhuang/PacBio-publicdata/Human_brain
 3 source /opt/VENV_TOFU2/bin/activate
 4 export PATH=$PATH:/opt/tools/cDNA_Cupcake/annotation
 5 export PATH=$PATH:/opt/tools/cDNA_Cupcake/sequence
 6 export PATH=/opt/VENV_TOFU2/bin:$PATH
 7 gmap -D /zs32/data-analysis/liucy_group/llhuang/Reflib -d Gmapindexhg19-2017 -f samse -n 0 -t 12 brain.flnc.fasta > PB_brain_FL.sam 2> PB_brain_FL.sam.log
 8 ##sort by samtools
 9 samtools view -Sb PB_brain_FL.sam > PB_brain_FL.bam
10 samtools view -bF 4 PB_brain_FL.bam > PB_brain_FL.F.bam
11 samtools sort PB_brain_FL.F.bam PB_brain_FL.F.sort
12 samtools view -h -o PB_brain_FL.F.sort.sam PB_brain_FL.F.sort.bam
13 1##Collapse identical isoforms
14 source /opt/VENV_TOFU2/bin/activate
15 export PYTHONPATH=""
16 collapse_isoforms_by_sam.py --input brain.FL.Normal.fa -s PB_brain_FL.F.sort.sam --dun-merge-5-shorter -o PB_brain_FL
17 2##Chaining samples
18 注意:GFF文件必须是unique mapped的reads,且转化而来的gtf文件需要sort。
19 ######################chaining samples
20 #AD,事先把原始数据gmap比对得到sam文件,提取NH:i:1的reads。
21 awk 'BEGIN{flag=0}NR==FNR{a[$1]=$0}NR>FNR{if(flag==1){print $0; flag=0}if($1 in a){print a[$1]; flag=1}}' AD_rep.uniqueMapped.txt AD.rep.fa > AD_rep_uniqueM.fa
22 cd /zs32_2/Linglhuang/Alzheimer_final_confident
23 export PATH=/opt/VENV_TOFU2/bin:$PATH
24 gmap -D /zs32/data-analysis/liucy_group/llhuang/Reflib/ -d Gmapindexhg19-2017 -f gff3_gene -n 0 -t 24 --min-identity=0.9 --min-trimmed-coverage=0.9 AD_rep_uniqueM.fa > AD_rep_uniqueM.fa.gff 2> AD_rep_uniqueM.fa.gff.log
25 
26 #把gmap产生的GFF文件转换为Gtf文件
27 (1)
28 cd /zs32_2/Linglhuang/Alzheimer_final_confident
29 grep -E "mRNA|exon" AD_rep_uniqueM.fa.gff | cut -d ';' -f 1,2 > AD_rep_uniqueM.R.gff
30 sed -i 's/mRNA/transcript/;s/ID=/gene_id /;s/Name=/transcript_id /g' AD_rep_uniqueM.R.gff
31 sed -i 's/[.][0-9]*[.]mrna1.exon[0-9]*//' AD_rep_uniqueM.R.gff
32 sed -i 's/[.][0-9]*[.]mrna1//' AD_rep_uniqueM.R.gff
33 sed -i 's/PB/"PB/g;s/;/"; /' AD_rep_uniqueM.R.gff
34 sed -i 's/$/";/' AD_rep_uniqueM.R.gff
35 sed -i 's/Gmapindexhg19-2017/PacBio/g' AD_rep_uniqueM.R.gff
36 awk 'BEGIN{FS="\t"}{printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n",$1,$2,$3,$4,$5,$7,$8,$9}' AD_rep_uniqueM.R.gff > AD_rep_uniqueM.gtf
37 
38 (2)#gffread
39 cd /zs32_2/Linglhuang/ALL_datasets/AD
40 /opt/tools/seq-analysis/cufflinks-2.2.1.Linux_x86_64/gffread AD_rep.gff -T -o AD_rep.R2.gff
41 sed 's/.mrna1//;s/[.][0-9]*[.]path1//;s/ gene_name "PB[.][0-9]*[.][0-9]*";//' AD_rep.R2.gff > AD_rep2.gtf
42 sed -i 's/Gmapindexhg19-2017/PacBio/g' AD_rep2.gtf
43 awk 'BEGIN{FS="\t"}{printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n",$1,$2,$3,$4,$5,$7,$8,$9}' AD_rep2.gtf > Isoseq_rep2.gtf
44 
45 #######################chaining samples
46 #sort using AS toools
47 cd /zs32_2/Linglhuang/ALL_datasets/AD
48 /opt/tools/barna/barna.astalavista/build/distributions/astalavista-4.0.1-SNAPSHOT/bin/astalavista -t asta -i AD_rep_uniqueM.gtf
49 #chaining
50 cd /zs32_2/Linglhuang/ALL_datasets/AD_PB
51 source /opt/VENV_TOFU2/bin/activate
52 export PYTHONPATH=""
53 chain_samples.py AD_PB.config norm_nfl --allow_5merge
54 chain_samples.py AD_PB.config norm_nfl --fuzzy_junction 0

 

转载于:https://www.cnblogs.com/linglingtingfei/p/7613935.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值