文章目录
WES全外显子数据分析流程
WES相关软件下载安装
#conda安装前要先search一下
conda search snpeff
conda install sra-tools
conda install samtools
conda install bcftools
conda install snpeff
conda install multiqc
conda install qualimap
下载GATK4.X版本
wget https://github.com/broadinstitute/gatk/releases/download/4.1.2.0/gatk-4.1.2.0.zip
unzip gatk-4.1.2.0.zip
cd gatk-4.1.2.0
./gatk
配置环境变量
echo 'export PATH=/home/kelly/biosoft/gatk/gatk-4.1.2.0/:$PATH' >>~/.bashrc
source ~/.bashrc
gatk --help
下载GATK需要的参考文件
需要到GATK官网下载
location: ftp.broadinstitute.org/bundle
username: gsapubftp-anonymous
password:
或者安装ftp在线下载
sudo apt-get install lftp
#首先lftp,后面跟用户名,然后at符号,ftp服务器地址
lftp ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/
#这里密码是空的,我们直接敲回车即可
#下载文件夹直接用mirror
mirror hg38
#耐心等待
#退出当前环境
exit
tree -h
#查看当前文件夹内容及大小
下载好后用bwa构建索引
#先解压human.fa文件,然后构建索引
gzip -d Homo_sapiens_assembly38.fasta.gz
#解压前是890MB,解压后大小是3.0GB
#构建索引
bwa index Homo_sapiens_assembly38.fasta -p bwa_index/gatk_hg38
bwa index ref.fa -p genome###可以不加-p genome,这样建立索引都是以ref.fa为前缀
##大概3小时
构建好后生成这几个文件
测序数据质量控制
fastqc和multiqc用法
find *.gz|xargs fastqc -t 20
#或者multiqc
multiqc -n wes ./
质控
一般都选择trim_galore
ls /path/to/your/directory/*_1.fastq.gz >1
ls /path/to/your/directory/*_2.fastq.gz >2
paste 1 2 > config
写脚本
#vim qc.sh
bin_trim_galore=trim_galore
dir='/path'
cat config |while read id
do
arr=(${id})
fq1=${arr[0]}
fq2=${arr[1]}
nohup $bin_trim_galore -q 25 --phred33 --length 36 -e 0.1 --stringency 3 --paired -o $dir/clean $fq1 $fq2 &
done
比对
#单个样本比对
bwa mem -t 5 -R "@RG\tID:$sample\tSM:$sample\tLB:WGS\tPL:Illumina" /home/meiling/baiduyundisk/wes/hg38/bwa_index/Homo_sapiens_assembly38.fasta wes.1_val_1.fq.gz wes.2_val_2.fq.gz | samtools sort -@ 5 -o wes.bam -
#多样本比对,循环,批量
#clean目录
ls *1.fastq.gz > 1
ls *2.fastq.gz > 2
paste 1 2 > config
vim config
#增加第一列文件名,记得不能空格,要Tab分隔
#7E5239 7E5239.L1_1.fastq 7E5239.L1_2.fastq
#7E5240 7E5240_L1_A001.L1_1.fastq 7E5240_L1_A001.L1_2.fastq
#7E5241 7E5241.L1_1.fastq 7E5241.L1_2.fastq
INDEX=/home/meiling/baiduyundisk/wes/hg38/bwa_index/Homo_sapiens_assembly38.fasta
cat config |while read id
do
arr=($id)
fq1=${arr[1]}
fq2=${arr[2]}
sample=${arr[0]}
bwa mem -t 5 -R "@RG\tID:$sample\tSM:$sample\tLB:WGS\tPL:Illumina" $INDEX $fq1 $fq2 | samtools sort -@ 5 -o $sample.bam -
done
查看bam文件
samtools view -H SRR8517853.bam |grep -v "SQ"
bam文件质控
samtools stat
## stats.sh
cat config | while read id
do
bam=./4.align/${id}.bam
samtools stats -@ 16 --reference ~/wes_cancer/data/Homo_sapiens_assembly38.fasta ${bam} > ./4.align/stats/${id}.stat
plot-bamstats -p ./4.align/stats/${id} ./4.align/stats/${id}.stat
done
这一步质控也会生成HTML报告。可以下载查看。
qualimap
用qualimap软件查看基因组覆盖深度等信息。
cat config | while read id
do
qualimap bamqc --java-mem-size=10G -gff ~/wes_cancer/data/hg38.exon.bed -nr 100000 -nw 500 -nt 16 -bam ./4.align/${id}.bam -outdir ./4.align/qualimap/${id}
done
一步法找变异流程
先查看bam文件
samtools mpileup SRR8517854.bam |head -95|tail -3
最简单的找变异流程
ref=/home/meiling/baiduyundisk/wes/hg38/Homo_sapiens_assembly38.fasta
time samtools mpileup -ugf $ref *.bam|bcftools call -vmO z -o wxs_out.vcf.gz
GATK完整找变异流程
定义变量
#GATK=/opt/software/gatk-4.1.2.0/gatk
ref=/home/meiling/baiduyundisk/wes/hg38/Homo_sapiens_assembly38.fasta
snp=/home/meiling/baiduyundisk/wes/hg38/dbsnp_146.hg38.vcf.gz
indel=/home/meiling/baiduyundisk/wes/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
GATK标准找体细胞变异流程(没有normal配对)
##制作config文件
for i in $(ls $dir/align/*.bam)
do
sample=$(basename $i|sed s/.bam//g)
echo ${sample} >> config
done
##只需一次
标记PCR重复reads
cat config |while read id
do
arr=($id)
sample=${arr[0]}
#mark dupulicates
$gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" MarkDuplicates \
-I $dir/align/$sample.bam \
-O $dir/align/${sample}_marked.bam \
-M $dir/align/$sample.metrics \
1>log.mark 2>&1
FixMateInformation
#fixmateinformation
$gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" FixMateInformation \
-I $dir/align/${sample}_marked.bam \
-O $dir/align/${sample}_marked_fixed.bam \
-SO coordinate \
1>log.fix 2>&1
#index
samtools index $dir/align/${sample}_marked_fixed.bam
Baserecalibrator碱基矫正
$gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" BaseRecalibrator \
-R $ref \
-I $dir/align/${sample}_marked_fixed.bam \
--known-sites $snp \
--known-sites $indel \
-O $dir/align/${sample}_recal.table \
1>${sample}_log.recal 2>&1
$gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" ApplyBQSR \
-R $ref \
-I $dir/align/${sample}_marked_fixed.bam \
-bqsr $dir/align/${sample}_recal.table \
-O $dir/align/${sample}_bqsr.bam \
1>${sample}_log.ApplyBQSR 2>&1
使用GATK的Mutect2命令 可以检测tumor-normal配对,或者tumor-only模式
$gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" Mutect2 \
-R $ref \
-I $dir/align/${sample}_bqsr.bam \
-tumor ${sample} \
--disable-read-filter MateOnSameContigOrNoMappedMateReadFilter \
-O $dir/align/${sample}_mutect2.vcf \
1>${sample}_log.HC 2>&1
##使用GATK的FilterMutectCalls过滤
$gatk FilterMutectCalls \
-R $ref \
-V $dir/align/${sample}_mutect2.vcf \
-O $dir/mutation/${sample}_somatic.vcf
cat $dir/mutation/${sample}_somatic.vcf \
| perl -alne '{if(/^#/){print}else{next unless $F[6] eq "PASS";next if $F[0] =~/_/;print } }' \
> $dir/mutation/${sample}_filter.vcf
done
注释
annovar使用
annovar是一款常用的注释软件,可在其官网注册后下载。
annovar无需安装,下载后解压即可直接使用。还需要下载数据库文件。在安装包下有常用数据库储存在humandb目录下。
Download database
# 这里下载三个常用数据库 refgene、dbsnp、1000genomes
perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar refGene humandb/
perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar avsnp150 humandb
perl annotate_variation.pl -buildver hg38 -downdb -webfrom annovar 1000g2015aug humandb
annovar软件里面是几个perl写的脚本:
annovar=/gluster/home/gaomeiling/software/annovar
#首先进行格式转换
cat config | while read id
do
perl $annovar/convert2annovar.pl -format vcf4 $dir/mutation/${id}_filter.vcf > $dir/mutation/${id}_filter.avinput
perl $annovar/table_annovar.pl \
$dir/mutation/${id}_filter.avinput $annovar/humandb/ \
-buildver hg38 \
-out $dir/annotation/annovar/${id} \
-remove \
-protocol refGene,knownGene,clinvar_20200316 \
-operation g,g,f \
-nastring . \
-vcfinput
done
VCF格式转maf
将annovar注释后的txt文件合并,并在后面添加一列样本名称,Tumor_Sample_Barcode
#利用maftools分析annovar的注释结果
cat config | while read id
do
grep -v '^Chr' ${id}.hg38_multianno.txt | cut -f 1-20 | awk -v T=${id} -v N=${id}_somatic '{print $0"\t"T"\t"N}' > ./vcf/${id}.annovar.vcf
done
head -1 11356.hg38_multianno.txt| sed 's/Otherinfo/Tumor_Sample_Barcode\tMatched_Norm_Sample_Barcode/' > header
cat header ./vcf/*annovar.vcf > ./vcf/annovar_merge.vcf
开始maftools可视化分析。
Mutsig分析驱动基因。
Mutsig
安装MATLABMCR
首先需要下载 MATLABMCR:https://ww2.mathworks.cn/products/compiler/matlab-runtime.html,并且版本要对应上,这里选择的是 2016a
cd /opt/software
mkdir MatlabMCR
cd MatlabMCR
wget -c http://ssd.mathworks.com/supportfiles/downloads/R2016a/deployment_files/R2016a/installers/glnxa64/MCR_R2016a_glnxa64_installer.zip
unzip MCR_R2016a_glnxa64_installer.zip
sudo ./install
安装过程需要选择一个有权限的安装路径,可以选择sudo .安装在/usr/local里面就不需要在单独配置环境变量。如果没有sudo权限,安装在普通路径下,安装好了之后可能要手动赋值一个变量:
根据安装提示添加即可。
export LD_LIBRARY_PATH=...:$LD_LIBRARY_PATH
安装 MutSigCV
#下载 MutSigCV
cd /opt/software
wget -c https://software.broadinstitute.org/cancer/cga/sites/default/files/data/tools/mutsig/MutSigCV_1.41.zip
unzip MutSigCV_1.41.zip
cd MutSigCV_1.41
接下来需要下载几个必备的文件:
wget -c http://www.broadinstitute.org/cancer/cga/sites/default/files/data/tools/mutsig/reference_files/gene.covariates.txt
wget -c http://www.broadinstitute.org/cancer/cga/sites/default/files/data/tools/mutsig/reference_files/exome_full192.coverage.zip
wget -c http://www.broadinstitute.org/cancer/cga/sites/default/files/data/tools/mutsig/reference_files/mutation_type_dictionary_file.txt
按照官网要求,还需要参考基因组的序列文件,但是官网只提供了 hg18 和 hg19 版本,我需要自己制作 hg38 版本的:
sed -n '1,/chr1_KI270706v1_random/p' ../data/Homo_sapiens_assembly38.fasta | grep -v '^>' >../data/chr_files_hg38.txt
最后运行结果:
cd /opt/software/MutSigCV_1.41
./run_MutSigCV.sh /usr/local/MATLAB/MATLAB_Runtime/v901/ \
/home/meiling/baiduyundisk/wes/annotation/annovar/all/var_annovar_maf2 \
exome_full192.coverage.txt gene.covariates.txt mutation_type_dictionary_file.txt \
output \
/home/meiling/baiduyundisk/wes/hg38/chr_files_hg38.txt
输入文件:maf格式文件,exome_full192.coverage.txt gene.covariates.txt mutation_type_dictionary_file.txt chr_files_hg38.txt,输出路径加前缀
其中gatk_merge_mutsig.sig_genes.txt 是 MutSigCV 的结果报告。根据P值判断显著性。