批量从NCBI相应数据库检索和下载数据:
conda install entrez-direct
esearch -db gene -query |efetch -db protein -format fasta >result.ffa
#!/bin/bash
function install_blast(){
#install local balst programs
wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.11.0+-x64-linux.tar.gz
tar -zxvf
export PATH=/home/xx/ncbi/bin:${PATH}
balstp -version #产看是否安装成功
}
function download_database(){
#download databases from NCBI websites
#wget ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz
if false;then
# 1.下载fasta格式序列文件,随后需要用makeblastdb格式化
time wget ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz
time unzip nr
time makeblastdb \
-in nr \
-parse_seqids \
-dbtype prot \
-out nr \
-logfile nr_logfile
#-parse_seqids根据序列ID来分裂
fi
# 2.下载NCBI已经格式化好的blast序列文件(推荐)
time for i in {00..38};do wget ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.${i}.tar.gz ; done
time for i in {00..38};do tar -zxvf nr.${i}.tar.gz ;done
time for i in {00..38};do rm nr.${i}.tar.gz;done #节省空间
}
function blast(){
#start blast+
nohup time blastp \
-num_threads 20 \
-query Bacillus_subtilis_GLB191_Genome_ORF_Prediction_translations.fasta \
-db ../nr/nr \
-out result.txt \
-max_hsps 1 \
-num_alignments 1 \
-evalue 1e-5 \
-outfmt "6 qseqid sseqid sgi stitle evalue bitscore pident qcovs length mismatch gapopen qstart qend sstart send" & #太耗时,利用nohup和&
if false;then
1.一般常用的就是-outfmt 5, 6 或者 7 参数,“5”输出XML格式,“6”输出TAB分割格式,“7”输出带注释的TAB分割格式
qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
2.如果像输出其他信息,比如比对覆盖度信息(qcovs:Query Coverage Per Subject),需要如在outfmt 6基础上加上 “qcovs”( 覆盖度信息),
添加位置和顺序就是输出TAB文件中列的排列形式:
-outfmt "6 qseqid sseqid sgi stitle evalue bitscore pident qcovs length mismatch gapopen qstart qend sstart send"
3.如果需要blast比对返回一个最优的比对结果,需要控制-max_target_seqs , -num_alignments 和 -max_hsps 选项:
-max_target_seqs <Integer, >=1>Maximum number of aligned sequences to keep
Not applicable for outfmt <= 4*
Incompatible with: num_descriptions, num_alignments
-num_alignments <Integer, >=0>Number of database sequences to show alignments for*
Incompatible with: max_target_seqs
4.NCB blast-2.8版本可支持用NCBI自带代码分割的NR子库的索引作为比对的库,使用比较方便
当然如果用这个版本的话,NR库也要重新下载了ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/
/pub/taxonomy/taxdump.tar.gz
/pub/taxonomy/accession2taxid/prot.accession2taxid.gz
I.使用方式也比较简单(至少比之前的方法方便了),如果只想比对单一物种(如人:9606的话),命令如下:
blastp –db nr –query query.fasta –taxids 9606 –outfmt 6 –out blast.outfm6
II.如果想比对NR子库哺乳动物的话,需要先建个哺乳动物子库tax_id索引:
get_species_taxids.sh -t 40674 > 40674.txids
II.然后再将序列比对至NR哺乳动物子库:
blastp –db nr –query query.fasta –taxidlist 40674.txids –outfmt 6 –out blast.outfm6
具体说明可看:https://ftp.ncbi.nlm.nih.gov/blast/db/v5/blastdbv5.pdf
fi
}
#main process
install_blast
download_database
blast
筛选outfmt6结果中最佳比对,并获取Gene_name和swiss_prot描述
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 15 11:40:29 2021
@author: Bio-windows
"""
import numpy as np
import pandas as pd
import os
import gzip
from Bio import SwissProt
import argparse
def outfmt6BestResults(blast_results_handle ):
'''
最佳比对结果:evalue最小, pident最大
'''
cou = ['qseqid','sseqid','pident','length','mismatch','gapopen','qstart','qend','sstart','send','evalue','bitscore']
blast = pd.read_csv(blast_results_handle, sep='\t', names=cou )
blast_best = blast.sort_values(by=['qseqid', 'evalue', 'pident'], ascending=[True, True, False]).drop_duplicates(subset=['qseqid'])
return blast_best
def extractInfo(blast_outfmt6, uniprot_swiss_prot_dat, output):
'''
提取Swiss-prot数据信息
'''
r = outfmt6BestResults(blast_outfmt6)
r.insert(2, 'gene_name', np.nan)
r.insert(3, 'SwissProtDescrition', np.nan)
#构建缓存字典
table_dic = {}
print('-'*50,'\n','构建缓存字典...')
for record in SwissProt.parse(gzip.open(os.path.join(uniprot_swiss_prot_dat, 'uniprot_sprot.dat.gz'))):
table_dic[record.entry_name] = [record.gene_name, record.description]
# =============================================================================
# table_dic_handle = open('tem_table_dic.txt', 'w')
# table_dic_handle.write(str(table_dic))
# table_dic_handle.close
# =============================================================================
print('完成并保存\n','-'*50)
#开始查找
for blast_best_record in range(len(r)):
key = r.iloc[blast_best_record, 1]
print('-'*50,'\n',key)
if key in table_dic.keys():
if table_dic[key][0] != '':
r.iloc[blast_best_record, 2] = table_dic[key][0].split(' ')[0].split('=')[1].split(';')[0]
else:
r.iloc[blast_best_record, 2] = ''
r.iloc[blast_best_record, 3] = table_dic[key][1]
print('完成')
else:
print(' 失败!!!\n\n\n')
r.to_excel(output+'swissprot_annotation.xls', index=False)
def param():
'''main function'''
parser = argparse.ArgumentParser(description='比对Uniprot-Swiss-Prot数据库的Blast outfmt6格式最优结果筛选,并提取Uniprot-Swiss-Prot相应注释')
parser.add_argument('-blast_outfmt6', help='blast outfmt6格式结果', type=str)
parser.add_argument('-uniprot_swiss_prot_dat', help='uniprot_swiss_prot.dat数据集', type=str)
parser.add_argument('-output', help='输出文件名称', type=str)
args = parser.parse_args()
extractInfo(args.blast_outfmt6, args.uniprot_swiss_prot_dat, args.output)
if __name__ == '__main__':
'''main function'''
# 执行
param()