deepvirfinder安装
# codna 安装依赖
conda create -n dvf python=3.6 numpy theano=1.0.3 keras=2.2.4 scikit-learn Biopython h5py r-base
source activate dvf
# github下载程序
git clone https://github.com/jessieren/DeepVirFinder
cd DeepVirFinder
#安装R q-value包(该方法安装的qvalue包版本较旧,可参考后文的安装方法)
R
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("qvalue")
q()
调试
#预测 crAssphage 基因组
conda activate dvf
python dvf.py -i ./test/crAssphage.fa -o ./test/ -l 300
#输出日志
Using Theano backend.
1. Loading Models.
model directory /auto/cmb-panasas2/renj/software/DeepVirFinder/models
2. Encoding and Predicting Sequences.
processing line 1
processing line 1389
3. Done. Thank you for using DeepVirFinder.
output in ./test/crAssphage.fa_gt300bp_dvfpred.txt
#预测一组宏基因组组装的重叠群
python dvf.py -i ./test/CRC_meta.fa -l 1000 -c 2
#计算 q 值(错误发现率)
R
library(qvalue)
result <- read.csv("./test/CRC_meta.fa_gt1000bp_dvfpred.txt", sep='\t')
result$qvalue <- qvalue(result$pvalue)$qvalues
result[order(result$qvalue),]
q()
运行
# 指定程序路径和模型路径(绝对路径)
dvf=./DeepVirFinder/dvf.py
model=./DeepVirFinder/models
mkdir 03_virfind
#运行deepvirfinder
python ${dvf} -m ${model} \
-i 02_assembly/XXX/scaffolds.fasta -l 1000 -c 80 \
-o 03_virfind/dvf/XXX
关于q值的计算
#q值计算
R
library(qvalue)
result <- read.csv(".03_virfind/dvf/XXX/XXX.txt", sep='\t')
result$qvalue <- qvalue(result$pvalue)$qvalues
#若有如下报错
Error in smooth.spline(lambda, pi0, df = smooth.df) :
missing or infinite values in inputs are not allowed
#解决方法,更新qvalue,使用qvalue_truncp()计算q值
install.packages("devtools")
library("devtools")
install_github("jdstorey/qvalue")
#使用qvalue_truncp进行计算
result$qvalue <- qvalue_truncp(result$pvalue)$qvalues
关于阈值:deepvirfinder官方推荐q-value<0.01的序列认定为病毒序列,也有文献(Metagenomic features of traditional fermented milk products)采用score>0.9,p-value <0.01或p-value<0.05来界定病毒序列,没有具体标准,一般对p-value进行调整。
帮助文档
dvf.py
Options:
-h, --help show this help message and exit
-i INPUT_FA, --in=INPUT_FA
input fasta file
-m MODDIR, --mod=MODDIR
model directory (default ./models)
-o OUTPUT_DIR, --out=OUTPUT_DIR
output directory
-l CUTOFF_LEN, --len=CUTOFF_LEN
predict only for sequence >= L bp (default 1)
-c CORE_NUM, --core=CORE_NUM
number of parallel cores (default 1)
参考文献
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8172088/
https://github.com/jessieren/DeepVirFinder
Error in calculating q-value · Issue #2 · jessieren/DeepVirFinder · GitHub
https://github.com/StoreyLab/qvalue/issues/9
Metagenomic features of traditional fermented milk products - ScienceDirect
The ecogenomics of dsDNA bacteriophages in feces of stabled and feral horses - ScienceDirect