"""Batch-download papers from Sci-Hub using their DOIs.

Reads DOI identifiers from a text file and fetches the corresponding
PDFs concurrently with a small thread pool.
"""
import requests
from bs4 import BeautifulSoup
import os
import threading
import time
from queue import Queue
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# --- Module-level configuration --------------------------------------------
# Folder where downloaded papers are stored (change for your machine).
path = r"E:\Quantum computing folder"
# exist_ok=True avoids the check-then-create race of the
# `if not os.path.exists(path): os.makedirs(path)` pattern.
os.makedirs(path, exist_ok=True)

# Failed downloads are appended to this log file.
error_log_path = os.path.join(path, "error.log")

# Browser-like headers so Sci-Hub does not reject the client outright.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/129.0.0.0 Safari/537.36"
}

# Sci-Hub mirror; this domain changes frequently — verify it before running.
scihub_domain = "https://sci-hub.usualwant.com"

# Retry transient failures (rate limiting and 5xx) with exponential backoff.
retry_strategy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"],  # urllib3 >= 1.26 spelling
)
adapter = HTTPAdapter(max_retries=retry_strategy)

# One shared session: connection pooling plus the retry adapter on both schemes.
http = requests.Session()
http.headers.update(headers)
http.mount("https://", adapter)
http.mount("http://", adapter)
# 下载文献的函数
def download_paper(doi):
    """Fetch one paper from Sci-Hub by DOI and save it to `path` as a PDF.

    The Sci-Hub landing page embeds the PDF link in an <iframe> (or, on
    some mirrors, an <embed>) element; that src is normalized to an
    absolute URL and downloaded. Any failure is appended to error.log
    and the function returns normally so the batch keeps running.

    Args:
        doi: the DOI string, e.g. "10.1000/xyz123".
    """
    url = f"{scihub_domain}/{doi}"
    pdf_url = ""  # initialized so the except branch can reference it safely
    try:
        # Fetch and parse the landing page.
        response = http.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Locate the PDF link. `or ""` guards against a tag that exists
        # but has no src attribute — previously that left pdf_url as None
        # and later raised a confusing AttributeError on .startswith()
        # instead of the intended ValueError below.
        iframe = soup.find("iframe")
        if iframe:
            pdf_url = iframe.get("src") or ""
        else:
            embed = soup.find("embed")
            if embed:
                pdf_url = embed.get("src") or ""
        if not pdf_url:
            raise ValueError("文献下载链接获取失败.")

        # Normalize scheme-relative ("//host/...") and site-relative links.
        if pdf_url.startswith('//'):
            pdf_url = 'https:' + pdf_url
        elif not pdf_url.startswith('http'):
            pdf_url = f"{scihub_domain}/{pdf_url}"

        # Stream the PDF to disk in chunks so large files are not held
        # entirely in memory.
        pdf_filename = os.path.join(path, doi.replace("/", "_") + ".pdf")
        with http.get(pdf_url, timeout=20, stream=True) as pdf_response:
            pdf_response.raise_for_status()
            with open(pdf_filename, "wb") as pdf_file:
                for chunk in pdf_response.iter_content(chunk_size=65536):
                    pdf_file.write(chunk)
        print(f"{doi} 文献下载成功.")
        time.sleep(0.1)  # brief pause to avoid hammering Sci-Hub
    except Exception as e:
        # Best-effort failure log; one bad DOI must not stop the batch.
        with open(error_log_path, "a", encoding="utf-8") as error_log:
            error_log.write(f"{doi} 下载失败!\n")
            if pdf_url:
                error_log.write(f"下载url链接为: {pdf_url}\n")
            error_log.write(f"错误信息: {e}\n\n")
        print(f"{doi} 下载失败: {e}")
# 读取包含DOI号的txt文件并下载文献
def main():
    """Read DOIs from a text file and download each with a small thread pool."""
    doi_file_path = r"C:\Users\James\Desktop\doi.txt"  # absolute path to the DOI list
    with open(doi_file_path, "r", encoding="utf-8") as file:
        dois = [doi.strip() for doi in file.readlines()]

    thread_count = 4

    # Work queue: one DOI per item, plus one None sentinel per worker so
    # every thread terminates deterministically.
    queue = Queue()
    for doi in dois:
        if doi:  # skip blank lines
            queue.put(doi)
    for _ in range(thread_count):
        queue.put(None)

    def worker():
        # Blocking get() with sentinel shutdown. The previous pattern
        # (`while not queue.empty(): queue.get()`) could deadlock: a rival
        # thread may drain the last item between the empty() check and the
        # get(), leaving this thread blocked on get() forever and the
        # final thread.join() hanging.
        while True:
            doi = queue.get()
            try:
                if doi is None:
                    break  # sentinel: no more work
                download_paper(doi)
            finally:
                queue.task_done()

    # Create and start the worker threads.
    threads = []
    for _ in range(thread_count):
        thread = threading.Thread(target=worker)
        thread.daemon = True  # do not block interpreter exit
        threads.append(thread)
        thread.start()

    # Wait until every queued item (including sentinels) is processed...
    queue.join()
    # ...then reap the workers; the sentinels guarantee they all exit.
    for thread in threads:
        thread.join()


if __name__ == "__main__":
    main()