import requests
import os
import re
def download_genome_first(gcf_id):
    # Choose the base URL according to whether the accession is GCF (RefSeq) or GCA (GenBank)
    if gcf_id.startswith("GCF"):
        base_url = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/"
    elif gcf_id.startswith("GCA"):
        base_url = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/"
    else:
        print(f"Invalid ID: {gcf_id}. Skipping...")
        return
    # Extract the numeric part of the accession (dropping the ".N" version suffix,
    # which may be absent) and split it into three-digit groups to build the FTP path
    digits = gcf_id.split('_')[1].split('.')[0]
    path_parts = [digits[i:i + 3] for i in range(0, len(digits), 3)]
    ftp_path = base_url + "/".join(path_parts)
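    # Illustration: for the accession "GCF_000001405.40", path_parts is
    # ["000", "001", "405"], so ftp_path becomes
    # https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405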
#print(f"Downloading from {ftp_path}")
# 下载文件
try:
kv = {'user-Agent': 'Mozilla/5.0'}
response = requests.get(ftp_path, headers=kv)
response.encoding = response.apparent_encoding
except Exception as e:
print(f"第一次爬取失败: {e}")
return
    html = response.text
    # Find the assembly directory whose name begins with the accession,
    # e.g. "GCF_000001405.40_GRCh38.p14"; re.escape keeps the dots literal
    pattern = rf'<a href="({re.escape(gcf_id)}[^/"]+)/">'
    matches = re.findall(pattern, html)
    if not matches:
        print(f"No matching directory found for: {gcf_id}")
        return
    url_2 = ftp_path + '/' + matches[0] + '/' + matches[0] + '_genomic.fna.gz'
    print(url_2)
    # Download the genomic FASTA file
    try:
        response_2 = requests.get(url_2, headers=kv)
    except Exception as e:
        print(f"Second request failed: {e}")
        return
    # Check whether the request succeeded
    if response_2.status_code == 200:
        # Derive the file name from the URL
        file_name = url_2.split("/")[-1]
        # Build the full output path (output_dir is the module-level setting below)
        file_path = os.path.join(output_dir, file_name)
        os.makedirs(output_dir, exist_ok=True)
        # Write the downloaded content to the file
        with open(file_path, 'wb') as file:
            file.write(response_2.content)
        print(f"Downloaded {file_name} to {output_dir}")
    else:
        print(f"Failed to download file from {url_2}. Status code: {response_2.status_code}")
def batch_download(gcf_file):
    # Read the list of GCF/GCA accessions, skipping blank lines
    with open(gcf_file, 'r') as file:
        gcf_ids = [line.strip() for line in file if line.strip()]
    # Download each genome in turn
    for gcf_id in gcf_ids:
        print(f"Processing: {gcf_id}")
        download_genome_first(gcf_id)

# Example usage
gcf_file = "./species-gcaids.txt"  # path to the text file holding the GCF/GCA accessions
output_dir = "./downloads"         # download directory; change as needed
batch_download(gcf_file)
print("All files have been downloaded!")