# Selector notes for the RSNA advanced search page (https://pubs.rsna.org/search/advanced),
# kept here for reference; the crawler below targets the ScienceDirect advanced search instead.
#   TYPE: Anywhere           //*[@id="advanced-search__input--type1"]
#   KEYWORD:                 //*[@id="advanced-search__input--keyword1"]
#   PUBLISHED IN:            //*[@id="publication"]
#   PUBLICATION DATE:
#     1. ALL DATES           //*[@id="publicationDate"]/div[1]/div/label
#     2. CUSTOM RANGE        //*[@id="publicationDate"]/div[3]/div[1]/label
#        FROM: Select Month  //*[@id="fromMonth"]
#              Select Year   //*[@id="fromYear"]
#        TO:   Select Month  //*[@id="toMonth"]
#              Select Year   //*[@id="toYear"]
#   SEARCH:                  //*[@id="advanced-search-submit"]
import time
import json
import os
import argparse
import traceback
import copy
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
TimeoutException,
ElementClickInterceptedException,
ElementNotInteractableException,
NoSuchElementException,
WebDriverException
)
import undetected_chromedriver as uc
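# Example invocation, assuming this script is saved as "mia_crawler.py" (the script and config
# filenames are illustrative; the config file must contain at least a "journals" list, see
# load_journal_config() below):
#   python mia_crawler.py --config journals_config.json --output-dir "d:\期刊网站爬虫\downloads_mia" --headless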
class MIACrawler:
def __init__(self, journal_name=None, download_dir=None, headless=False, use_profile=True,
pause_on_navigation=False, keywords=None, year=None):
"""
初始化Medical Image Analysis爬虫
Args:
journal_name: 期刊名称
download_dir: 下载目录
headless: 是否使用无头模式
use_profile: 是否使用配置文件
pause_on_navigation: 导航时是否暂停
keywords: 搜索关键词
year: 搜索年份
"""
        # Parameters passed in from the JSON config
self.journal_name = journal_name or "Medical Image Analysis"
self.keywords = keywords or '(ultrasound OR ultrasonic) AND (AI OR "artificial intelligence" OR "deep learning" OR "neural network")'
self.year = year or datetime.now().year
        # Path settings
self.default_save_dir = download_dir or r"d:\期刊网站爬虫\downloads_mia"
self.journal_url = "https://www.sciencedirect.com/search/entry"
self.cookie_file = r"d:\期刊网站爬虫\cookies\cookies_sciencedirect_mia.json"
        # Behaviour-control parameters
self.headless = headless
self.use_profile = use_profile
self.pause_on_navigation = pause_on_navigation
        # Create a dedicated folder for the current journal
self.journal_safe_name = self.journal_name.replace(' ', '_').replace('/', '_').replace('\\', '_')
self.journal_download_dir = os.path.join(self.default_save_dir, self.journal_safe_name)
os.makedirs(self.journal_download_dir, exist_ok=True)
        # Initialize instance variables
self.driver = None
self.wait = None
self.download_dir = self.journal_download_dir
print(f"✓ 初始化爬虫 - 期刊: {self.journal_name}")
print(f"✓ 独立下载目录: {self.journal_download_dir}")
def pause_if_enabled(self):
"""如果启用了暂停功能,则暂停执行"""
if self.pause_on_navigation:
try:
print("\n[PAUSE] 按Enter键继续...")
input()
except KeyboardInterrupt:
print("\n用户中断操作")
raise
def setup_driver(self):
"""设置Chrome驱动"""
try:
print("正在初始化Chrome浏览器驱动...")
chrome_options = uc.ChromeOptions()
prefs = {
"download.default_directory": self.download_dir,
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"safebrowsing.enabled": True
}
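            # These Chrome preferences route downloads straight into the journal-specific folder and
            # suppress the "Save as" prompt, so the exported ZIP can be picked up automatically.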
chrome_options.add_experimental_option("prefs", prefs)
            # Browser configuration
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--start-maximized")
if self.headless:
chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=1920,1080")
            # Initialize the driver
try:
self.driver = uc.Chrome(options=chrome_options, use_subprocess=True)
print(f"浏览器驱动初始化成功!下载目录设置为: {self.download_dir}")
except Exception as e:
print(f"首次初始化失败,尝试备用配置: {e}")
chrome_path = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
if os.path.exists(chrome_path):
chrome_options.binary_location = chrome_path
print(f"使用本地Chrome路径: {chrome_path}")
self.driver = uc.Chrome(
options=chrome_options,
use_subprocess=True,
version_main=None
)
print(f"备用配置初始化成功!下载目录设置为: {self.download_dir}")
            # Set up the shared explicit wait (30-second timeout)
self.wait = WebDriverWait(self.driver, 30)
return True
except Exception as e:
print(f"设置驱动失败: {e}")
return False
def take_screenshot(self, filename):
"""截取当前页面截图"""
try:
screenshot_dir = os.path.join(self.journal_download_dir, "screenshots")
os.makedirs(screenshot_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
screenshot_path = os.path.join(screenshot_dir, f"{filename}_{timestamp}.png")
self.driver.save_screenshot(screenshot_path)
print(f"✓ 截图已保存: {screenshot_path}")
return screenshot_path
except Exception as e:
print(f"✗ 截图失败: {e}")
return None
def save_page_source(self, filename):
"""保存当前页面源码"""
try:
source_dir = os.path.join(self.journal_download_dir, "page_sources")
os.makedirs(source_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
source_path = os.path.join(source_dir, f"{filename}_{timestamp}.html")
with open(source_path, 'w', encoding='utf-8') as f:
f.write(self.driver.page_source)
print(f"✓ 页面源码已保存: {source_path}")
return source_path
except Exception as e:
print(f"✗ 保存页面源码失败: {e}")
return None
def save_cookies(self):
"""保存浏览器cookies"""
try:
journal_cookie_file = self.cookie_file.replace('.json', f'_{self.journal_safe_name}.json')
os.makedirs(os.path.dirname(journal_cookie_file), exist_ok=True)
cookies = self.driver.get_cookies()
print(f"正在保存cookies,共{len(cookies)}个...")
with open(journal_cookie_file, 'w', encoding='utf-8') as f:
json.dump(cookies, f, ensure_ascii=False, indent=2)
print(f"✓ Cookies已保存到: {journal_cookie_file}")
return True
except Exception as e:
print(f"✗ 保存cookies时出错: {e}")
return False
def load_cookies(self):
"""加载cookies到浏览器"""
journal_cookie_file = self.cookie_file.replace('.json', f'_{self.journal_safe_name}.json')
if not os.path.exists(journal_cookie_file):
print(f"⚠️ 当前期刊Cookie文件不存在: {journal_cookie_file}")
if not os.path.exists(self.cookie_file):
print(f"⚠️ 默认Cookie文件也不存在: {self.cookie_file}")
return False
journal_cookie_file = self.cookie_file
try:
with open(journal_cookie_file, 'r', encoding='utf-8') as f:
cookies = json.load(f)
print(f"正在加载cookies,共{len(cookies)}个...")
for cookie in cookies:
if 'expirationDate' in cookie:
del cookie['expirationDate']
if 'storeId' in cookie:
del cookie['storeId']
if 'sameSite' in cookie and cookie['sameSite'] is None:
cookie['sameSite'] = 'Lax'
try:
self.driver.add_cookie(cookie)
except Exception as e:
print(f"⚠️ 添加cookie失败: {cookie.get('name')} - {e}")
print("✓ Cookies加载完成")
return True
except Exception as e:
print(f"✗ 加载cookies时出错: {e}")
return False
def navigate_to_search_page(self):
"""导航到搜索页面"""
try:
print(f"正在访问 {self.journal_url}")
self.driver.get(self.journal_url)
time.sleep(5)
page_title = self.driver.title
print(f"✓ 已成功访问搜索页面")
print(f"ℹ️ 页面标题: {page_title}")
return True
except Exception as e:
print(f"✗ 导航到搜索页面失败: {e}")
self.take_screenshot("navigate_error")
return False
def setup_search_criteria(self):
"""设置搜索条件(使用JSON传入的参数)"""
try:
print("\n正在设置搜索条件...")
            # Keyword box: use the keywords passed in from the JSON config
search_box = self.wait.until(EC.presence_of_element_located(
(By.XPATH, "//*[@id='qs']")
))
search_box.clear()
search_box.send_keys(self.keywords)
print(f"✓ 输入关键词搜索: {self.keywords}")
            # Journal box: use the journal name passed in from the JSON config
pub_box = self.wait.until(EC.presence_of_element_located(
(By.XPATH, "//*[@id='pub']")
))
pub_box.clear()
pub_box.send_keys(self.journal_name)
print(f"✓ 输入期刊: {self.journal_name}")
            # Year box: use the year passed in from the JSON config
year_box = self.wait.until(EC.presence_of_element_located(
(By.XPATH, "//*[@id='date']")
))
year_box.clear()
year_box.send_keys(str(self.year))
print(f"✓ 输入年份: {self.year}")
            # Click the search button
search_button = self.wait.until(EC.element_to_be_clickable(
(By.XPATH, "//*[@id='search-advanced-form']/div/div/div[4]/div/div[2]/button/span/span")
))
search_button.click()
print("✓ 点击搜索按钮")
            # Wait for the search results to finish loading
print("ℹ️ 等待搜索结果加载完成...")
time.sleep(8)
print("✓ 等待8秒完成")
self.pause_if_enabled()
return True
except Exception as e:
print(f"✗ 设置搜索条件失败: {str(e)}")
self.save_page_source("search_criteria_error")
self.take_screenshot("search_criteria_error")
return False
def sort_by_date(self):
"""按日期排序搜索结果"""
try:
print("\n⚡ 正在按日期排序搜索结果 - 开始监视...")
self.wait.until(EC.presence_of_element_located(
(By.XPATH, "//*[@id='srp-sorting-options']/div/a/span")
))
print("✓ 排序按钮元素已出现")
try:
pre_check_element = self.driver.find_element(By.XPATH, "//*[@id='srp-sorting-options']/div/a/span")
print(f"✓ 预检:排序按钮元素存在")
print(f"ℹ️ 预检元素状态 - 可见性: {pre_check_element.is_displayed()}, 可点击性: {pre_check_element.is_enabled()}")
except Exception as pre_check_e:
print(f"⚠️ 预检失败,可能元素暂时不可见: {str(pre_check_e)}")
start_wait_time = time.time()
sort_button = self.wait.until(EC.element_to_be_clickable(
(By.XPATH, "//*[@id='srp-sorting-options']/div/a/span")
))
wait_duration = time.time() - start_wait_time
print(f"✅ 成功定位可点击的排序按钮,等待用时: {wait_duration:.3f}秒")
before_url = self.driver.current_url
print(f"ℹ️ 点击前URL: {before_url}")
print("⚡ 执行排序按钮点击操作...")
click_start_time = time.time()
try:
sort_button.click()
click_duration = time.time() - click_start_time
print(f"✅ 排序按钮点击操作执行完成,用时: {click_duration:.3f}秒")
except ElementClickInterceptedException:
print(f"✗ 排序按钮点击被拦截,元素可能被覆盖")
print(f"⚡ 尝试使用JavaScript点击排序按钮")
self.driver.execute_script("arguments[0].click();", sort_button)
print(f"✅ JavaScript点击排序按钮执行完成")
except Exception as click_e:
print(f"✗ 排序按钮点击失败: {str(click_e)}")
raise
print("ℹ️ 等待排序操作完成...")
try:
after_url = self.driver.current_url
if after_url == before_url:
print("ℹ️ URL未变化,等待内容更新...")
try:
self.wait.until_not(EC.visibility_of_element_located(
(By.XPATH, "//*[@id='srp-sorting-options']//option")
))
print("✓ 排序选项列表已收起")
except Exception as dropdown_e:
print(f"ℹ️ 排序选项列表状态检查异常: {str(dropdown_e)}")
else:
print("ℹ️ URL已变化,等待页面加载完成...")
self.wait.until(EC.presence_of_element_located(
(By.XPATH, "//div[contains(@class, 'result-list') or @id='search-results-list']")
))
print("✓ 新页面内容已加载")
articles = self.wait.until(EC.presence_of_all_elements_located(
(By.XPATH, "//article[contains(@class, 'result-item')] | //li[contains(@class, 'search-result')]")
))
print(f"✓ 排序完成,找到 {len(articles)} 篇文章")
except Exception as sort_wait_e:
print(f"⚠️ 显式等待排序完成失败,使用备用等待: {str(sort_wait_e)}")
time.sleep(3)
after_url = self.driver.current_url
if after_url != before_url:
print(f"✓ 检测到URL变化,排序操作可能触发了页面刷新: {after_url}")
else:
print(f"⚠️ URL未变化,建议验证排序结果")
if self.pause_on_navigation:
self.pause_if_enabled()
print("ℹ️ 执行了暂停操作")
else:
print("ℹ️ pause_on_navigation为False,跳过暂停操作")
print("ℹ️ 排序完成后额外等待5秒,确保结果完全加载...")
time.sleep(5)
print("✓ 额外等待完成")
print("✅ 排序按钮定位和点击监视完成")
return True
except Exception as e:
print(f"✗ 排序失败: {str(e)}")
self.take_screenshot("sort_error")
return False
def get_results_count(self):
"""获取搜索结果数量"""
try:
count_element = self.wait.until(EC.presence_of_element_located(
(By.XPATH, "//*[@id='srp-facets']/div[1]/h1/span")
))
            count_text = count_element.text.strip()
            import re
            # Strip thousands separators so counts such as "1,234" parse correctly
            match = re.search(r'[\d,]+', count_text)
            count = int(match.group().replace(',', '')) if match else 0
            print(f"✓ 搜索结果数量: {count}")
return count
except Exception as e:
print(f"✗ 获取结果数量失败: {str(e)}")
return 0
def select_articles_by_date(self, max_results=50):
"""根据日期选择本月发表的文章"""
try:
print("\n正在选择本月发表的文章...")
selected_count = 0
current_month = datetime.now().strftime("%B")
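            # strftime("%B") yields the full English month name (e.g. "December"), which is matched
            # against the date text of each result so that only this month's articles are selected.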
if max_results > 24:
max_results = 24
            selected_articles = []
            extra_selected_article = None
            no_element_count = 0  # consecutive list items whose date element could not be found
for i in range(1, max_results + 1):
try:
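                    # NOTE: the positional XPaths below mirror the ScienceDirect results-list markup at the
                    # time of writing and will need updating if the page layout changes.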
                    date_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[2]/div[2]/span/span[2]"
                    date_element = self.driver.find_element(By.XPATH, date_xpath)
                    no_element_count = 0  # reset the miss counter once this entry is found
                    date_text = date_element.text.strip()
print(f" 文章{i}日期: {date_text}")
if current_month in date_text:
try:
article_container_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]"
print(f" ℹ️ 尝试获取文章容器: {article_container_xpath}")
article_container = self.driver.find_element(By.XPATH, article_container_xpath)
print(f" ✓ 成功获取文章容器")
article_link = None
try:
link_element = article_container.find_element(By.CSS_SELECTOR, "h2 a")
article_link = link_element.get_attribute('href')
print(f" ✓ 成功获取文章链接: {article_link}")
                            except Exception:
try:
link_elements = article_container.find_elements(By.XPATH, ".//a")
for link_elem in link_elements:
href = link_elem.get_attribute('href')
if href and '/science/article/' in href:
article_link = href
print(f" ✓ 使用备用方法获取文章链接: {article_link}")
break
except Exception as link_e:
print(f" ⚠️ 获取文章链接失败: {str(link_e)}")
article_title = None
try:
title_elements = article_container.find_elements(By.CSS_SELECTOR, "h2 a .anchor-text")
if title_elements:
article_title = title_elements[0].text.strip()
print(f" ✓ 成功获取文章标题: {article_title}")
else:
span_elements = article_container.find_elements(By.CSS_SELECTOR, "h2 a span span span")
if span_elements:
article_title = span_elements[0].text.strip()
print(f" ✓ 使用备用方法获取文章标题: {article_title}")
else:
h2_element = article_container.find_element(By.CSS_SELECTOR, "h2")
article_title = h2_element.text.strip()
print(f" ✓ 使用最终备用方法获取文章标题: {article_title}")
except Exception as title_e:
print(f" ⚠️ 获取文章标题失败: {str(title_e)}")
journal_name = None
try:
journal_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[2]/div[2]/span/span[1]/a/span/span/span"
journal_element = self.driver.find_element(By.XPATH, journal_xpath)
journal_name = journal_element.text.strip()
print(f" ✓ 成功获取期刊信息: {journal_name}")
except Exception as journal_e:
print(f" ⚠️ 获取期刊信息失败: {str(journal_e)}")
if article_title:
safe_title = article_title.replace('<', '').replace('>', '').replace(':', '').replace('"', '').replace('/', '').replace('\\', '').replace('|', '').replace('?', '').replace('*', '')
if not safe_title.strip():
safe_title = f"Article_{i}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
save_dir = os.path.join(self.journal_download_dir, safe_title)
os.makedirs(save_dir, exist_ok=True)
article_info = {
"title": article_title or "未知标题",
"journal": journal_name or "未知期刊",
"link": article_link or "未知链接",
"timestamp": datetime.now().isoformat(),
"article_index": i
}
json_file_path = os.path.join(save_dir, 'article_info.json')
with open(json_file_path, 'w', encoding='utf-8') as f:
json.dump(article_info, f, ensure_ascii=False, indent=2)
print(f" ✅ 文章信息已保存到: {json_file_path}")
except Exception as info_e:
print(f" ⚠️ 获取文章信息失败: {str(info_e)}")
select_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[1]/div[1]/label/span[1]"
try:
select_element = self.driver.find_element(By.XPATH, select_xpath)
print(f" ⚡ 准备点击选择框: 元素存在且可见={select_element.is_displayed()}")
click_before_state = select_element.get_attribute('class') or '无class属性'
print(f" ℹ️ 点击前状态: {click_before_state}")
select_element.click()
print(f" ✅ 点击操作执行完成")
click_after_state = select_element.get_attribute('class') or '无class属性'
print(f" ℹ️ 点击后状态: {click_after_state}")
if click_before_state != click_after_state:
print(f" ✓ 检测到状态变化,点击可能成功")
else:
print(f" ⚠️ 未检测到状态变化,建议验证点击效果")
selected_count += 1
selected_articles.append(i)
print(f" ✓ 选择了本月文章: {i}")
time.sleep(2)
except Exception as click_e:
print(f" ✗ 点击选择框失败: {str(click_e)}")
try:
print(f" ⚡ 尝试使用JavaScript点击")
self.driver.execute_script("arguments[0].click();", select_element)
print(f" ✅ JavaScript点击执行完成")
selected_count += 1
selected_articles.append(i)
time.sleep(2)
except Exception as js_e:
print(f" ✗ JavaScript点击也失败: {str(js_e)}")
                except NoSuchElementException:
                    no_element_count += 1
                    print(f" ⚠️ 未找到文章{i}的元素,可能已到达列表末尾")
                    print(f" ℹ️ 连续未找到元素次数: {no_element_count}")
                    if no_element_count >= 2:
                        print(f" ⚠️ 连续两次未找到元素,中断循环")
                        break
                    print(f" ℹ️ 第一次未找到元素,继续尝试下一个")
                    continue
except Exception as inner_e:
print(f" ⚠️ 处理文章{i}时出错: {str(inner_e)}")
print(f"✓ 共选择了 {selected_count} 篇本月发表的文章")
if selected_count == 1:
print(f"⚠️ 只选择了1篇文章,需要额外选择一篇凑数")
for i in range(1, max_results + 1):
if i not in selected_articles:
try:
date_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[2]/div[2]/span/span[2]"
date_element = self.driver.find_element(By.XPATH, date_xpath)
date_text = date_element.text.strip()
print(f" 额外选择: 检查文章{i}日期: {date_text}")
select_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[1]/div[1]/label/span[1]"
select_element = self.driver.find_element(By.XPATH, select_xpath)
print(f" ⚡ 准备额外选择: 文章{i}选择框可见={select_element.is_displayed()}")
click_before_state = select_element.get_attribute('class') or '无class属性'
print(f" ℹ️ 点击前状态: {click_before_state}")
time.sleep(1)
select_element.click()
print(f" ✅ 点击操作执行完成")
time.sleep(1)
click_after_state = select_element.get_attribute('class') or '无class属性'
print(f" ℹ️ 点击后状态: {click_after_state}")
is_really_selected = False
retry_count = 0
max_retries = 2
while retry_count <= max_retries and not is_really_selected:
try:
label_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[1]/div[1]/label"
label_element = self.driver.find_element(By.XPATH, label_xpath)
is_checked = label_element.get_attribute('class') or ''
checkbox_checked = False
try:
checkbox_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[1]/div[1]/label/input"
checkbox_element = self.driver.find_element(By.XPATH, checkbox_xpath)
checkbox_checked = checkbox_element.get_attribute('checked') is not None
except:
pass
if 'checked' in is_checked.lower() or checkbox_checked or (retry_count == max_retries):
print(f" ✓ 确认:文章{i}已被选中")
is_really_selected = True
else:
print(f" ⚠️ 警告:未检测到选中状态 (重试 {retry_count}/{max_retries})")
retry_count += 1
if retry_count <= max_retries:
print(f" ⚡ 等待重试...")
time.sleep(1)
except Exception as check_e:
print(f" ⚠️ 无法验证选中状态: {str(check_e)}")
retry_count += 1
if retry_count <= max_retries:
time.sleep(1)
if not is_really_selected:
try:
print(f" ⚡ 尝试使用JavaScript点击")
label_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[1]/div[1]/label"
label_element = self.driver.find_element(By.XPATH, label_xpath)
self.driver.execute_script("arguments[0].click();", label_element)
print(f" ✅ JavaScript点击执行完成")
time.sleep(1)
is_checked = label_element.get_attribute('class') or ''
checkbox_checked = False
try:
checkbox_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[1]/div[1]/label/input"
checkbox_element = self.driver.find_element(By.XPATH, checkbox_xpath)
checkbox_checked = checkbox_element.get_attribute('checked') is not None
except:
pass
if 'checked' in is_checked.lower() or checkbox_checked:
print(f" ✓ JavaScript点击后确认:文章{i}已被选中")
is_really_selected = True
else:
print(f" ℹ️ 对于额外文章,信任JavaScript点击操作成功")
is_really_selected = True
except Exception as js_e:
print(f" ✗ JavaScript点击也失败: {str(js_e)}")
if is_really_selected:
print(f" ✅ 成功额外选择文章{i}")
selected_count += 1
selected_articles.append(i)
article_title = None
try:
article_container_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]"
article_container = self.driver.find_element(By.XPATH, article_container_xpath)
title_elements = article_container.find_elements(By.CSS_SELECTOR, "h2 a .anchor-text")
if title_elements:
article_title = title_elements[0].text.strip()
else:
span_elements = article_container.find_elements(By.CSS_SELECTOR, "h2 a span span span")
if span_elements:
article_title = span_elements[0].text.strip()
else:
h2_element = article_container.find_element(By.CSS_SELECTOR, "h2")
article_title = h2_element.text.strip()
except Exception as title_e:
print(f" ⚠️ 获取额外选择文章标题失败: {str(title_e)}")
extra_selected_article = {
"index": i,
"title": article_title or "未知标题",
"date": date_text
}
print(f" ✓ 已记录额外选择的文章: 索引={i}, 标题='{article_title}', 日期='{date_text}'")
extra_info_file = os.path.join(self.journal_download_dir, 'extra_selected_article.json')
with open(extra_info_file, 'w', encoding='utf-8') as f:
json.dump(extra_selected_article, f, ensure_ascii=False, indent=2)
print(f" ✅ 额外选择的文章信息已保存到: {extra_info_file}")
break
else:
print(f" ⚠️ 警告:额外文章选择失败")
continue
except Exception as e:
print(f" ⚠️ 选择文章{i}作为额外文章时出错: {str(e)}")
continue
self.extra_selected_article = extra_selected_article
return selected_count
except Exception as e:
print(f"✗ 选择文章失败: {str(e)}")
self.take_screenshot("select_articles_error")
return 0
def download_selected_articles(self):
"""下载选中的文章"""
try:
print("\n正在下载选中的文章...")
try:
button_exists = self.driver.find_element(By.XPATH, "//*[@id='srp-ddm']/form/button")
print(f"✓ 下载按钮存在: 可见={button_exists.is_displayed()}, 启用={button_exists.is_enabled()}")
button_text = button_exists.text.strip() or "无文本"
button_class = button_exists.get_attribute('class') or "无class属性"
print(f"ℹ️ 下载按钮信息 - 文本: '{button_text}', Class: {button_class}")
except Exception as find_e:
print(f"⚠️ 预先检查按钮时出错: {str(find_e)}")
download_button = self.wait.until(EC.element_to_be_clickable(
(By.XPATH, "//*[@id='srp-ddm']/form/button/span/span/span")
))
print(f"✅ 成功获取可点击的下载按钮元素")
try:
before_url = self.driver.current_url
before_time = time.time()
print(f"ℹ️ 点击前状态 - URL: {before_url}, 时间戳: {before_time}")
print("⚡ 执行下载按钮点击操作...")
download_button.click()
print("✅ 下载按钮点击操作执行完成")
after_time = time.time()
click_delay = after_time - before_time
print(f"ℹ️ 点击执行耗时: {click_delay:.3f}秒")
time.sleep(1)
after_url = self.driver.current_url
print(f"ℹ️ 点击后URL: {after_url}")
window_handles = self.driver.window_handles
print(f"ℹ️ 当前窗口数量: {len(window_handles)}")
try:
confirm_dialog = self.driver.find_element(By.CLASS_NAME, 'download-confirmation')
print(f"✓ 检测到下载确认对话框")
                except Exception:
print(f"ℹ️ 未检测到明显的下载确认对话框")
print("✓ 下载按钮点击监视完成")
except ElementClickInterceptedException:
print(f"✗ 下载按钮点击被拦截,元素可能被其他元素覆盖")
try:
print(f"⚡ 尝试使用JavaScript点击下载按钮")
self.driver.execute_script("arguments[0].click();", download_button)
print(f"✅ JavaScript下载按钮点击执行完成")
except Exception as js_e:
print(f"✗ JavaScript下载按钮点击也失败: {str(js_e)}")
raise
except Exception as click_e:
print(f"✗ 下载按钮点击失败: {str(click_e)}")
print(f"ℹ️ 错误类型: {type(click_e).__name__}")
print(f"ℹ️ 错误详情: {traceback.format_exc()}")
raise
time.sleep(3)
print("✓ 下载已开始,正在监控下载进度...")
success = self.monitor_zip_download(timeout=300)
if success:
print("✓ 下载完成")
if self.verify_zip_file():
print("✓ ZIP文件验证成功")
else:
print("⚠️ ZIP文件可能不完整")
else:
print("⚠️ 下载监控超时或失败")
return success
except Exception as e:
print(f"✗ 下载失败: {str(e)}")
self.take_screenshot("download_error")
return False
def monitor_zip_download(self, timeout=300):
"""监控ZIP文件下载进度"""
start_time = time.time()
initial_files = set(os.listdir(self.download_dir))
initial_zip_files = {f for f in initial_files if f.endswith('.zip')}
temp_files = []
new_zip_files = set()
size_stability_count = {}
STABILITY_THRESHOLD = 3
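        # Completion heuristic: a newly created .zip whose size stays unchanged for STABILITY_THRESHOLD
        # consecutive polls (2-second interval) is treated as fully downloaded.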
print(f"开始监控下载,超时时间: {timeout}秒")
print(f"忽略下载前已存在的{len(initial_files)}个文件")
print(f"忽略下载前已存在的{len(initial_zip_files)}个ZIP文件")
while time.time() - start_time < timeout:
current_files = set(os.listdir(self.download_dir))
newly_appeared_files = current_files - initial_files
if newly_appeared_files:
for file in newly_appeared_files:
file_path = os.path.join(self.download_dir, file)
if file.endswith('.crdownload') or file.endswith('.part') or file.startswith('~'):
if file not in temp_files:
temp_files.append(file)
print(f"检测到新临时文件: {file}")
elif file.endswith('.zip'):
if file not in new_zip_files:
new_zip_files.add(file)
print(f"检测到新ZIP文件: {file}")
size_stability_count[file] = 0
for temp_file in temp_files[:]:
base_name = temp_file.replace('.crdownload', '').replace('.part', '')
if base_name in current_files and base_name not in initial_files:
print(f"临时文件已完成转换: {temp_file} -> {base_name}")
if base_name.endswith('.zip'):
new_zip_files.add(base_name)
print(f"添加转换后的ZIP文件到监控列表: {base_name}")
size_stability_count[base_name] = 0
temp_files.remove(temp_file)
for file in new_zip_files:
file_path = os.path.join(self.download_dir, file)
try:
if not os.path.exists(file_path):
continue
current_size = os.path.getsize(file_path)
if not hasattr(self, '_zip_sizes'):
self._zip_sizes = {file: current_size}
elif file in self._zip_sizes:
size_diff = current_size - self._zip_sizes[file]
if size_diff > 0:
print(f"ZIP文件增长中: {file} (+{size_diff}字节)")
self._zip_sizes[file] = current_size
size_stability_count[file] = 0
else:
size_stability_count[file] += 1
print(f"ZIP文件大小稳定: {file} (计数: {size_stability_count[file]}/{STABILITY_THRESHOLD})")
if size_stability_count[file] >= STABILITY_THRESHOLD:
print(f"ZIP文件下载完成: {file} ({current_size}字节) - 连续{STABILITY_THRESHOLD}次大小稳定")
return True
                    else:
                        # First observation of this ZIP; record its size without discarding other entries
                        self._zip_sizes[file] = current_size
                        size_stability_count[file] = 0
except Exception as e:
print(f"监控新ZIP文件{file}时出错: {str(e)}")
time.sleep(2)
            elapsed = time.time() - start_time
            remaining = timeout - elapsed
            # `remaining` is a float, so compare on whole seconds; with the 2-second polling
            # interval this prints a status update roughly every 10 seconds.
            if int(remaining) % 10 <= 1 or remaining < 10:
                print(f"剩余监控时间: {int(remaining)}秒")
if new_zip_files:
print(f"当前监控的ZIP文件数: {len(new_zip_files)}")
else:
print(f"尚未检测到新的ZIP文件")
print(f"下载监控超时 ({timeout}秒)")
return False
def verify_zip_file(self):
"""验证下载的ZIP文件完整性"""
try:
import zipfile
zip_files = [f for f in os.listdir(self.download_dir) if f.endswith('.zip')]
if not zip_files:
return False
zip_files.sort(key=lambda x: os.path.getmtime(os.path.join(self.download_dir, x)), reverse=True)
latest_zip = os.path.join(self.download_dir, zip_files[0])
print(f"正在验证ZIP文件: {os.path.basename(latest_zip)}")
            with zipfile.ZipFile(latest_zip, 'r') as zip_ref:
                # testzip() returns the name of the first corrupt member, or None if all members are OK
                bad_member = zip_ref.testzip()
                if bad_member is not None:
                    print(f"ZIP文件中检测到损坏的成员: {bad_member}")
                    return False
                info_list = zip_ref.infolist()
                print(f"ZIP文件包含 {len(info_list)} 个文件")
                return len(info_list) > 0
except Exception as e:
print(f"ZIP文件验证失败: {str(e)}")
return False
def save_journal_summary(self):
"""保存期刊爬取总结信息"""
try:
summary_file = os.path.join(self.journal_download_dir, 'journal_summary.json')
summary = {
"journal_name": self.journal_name,
"download_dir": self.journal_download_dir,
"crawl_time": datetime.now().isoformat(),
"status": "completed",
"keywords": self.keywords,
"year": self.year
}
with open(summary_file, 'w', encoding='utf-8') as f:
json.dump(summary, f, ensure_ascii=False, indent=2)
print(f"✓ 期刊总结信息已保存: {summary_file}")
return True
except Exception as e:
print(f"✗ 保存期刊总结信息失败: {e}")
return False
def run(self):
"""运行爬虫的主流程"""
try:
if not self.setup_driver():
return False
try:
if not self.navigate_to_search_page():
return False
if self.use_profile:
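                    # add_cookie() only takes effect once the browser is already on the target domain,
                    # so cookies are loaded after the first navigation and the page is refreshed to
                    # pick up the restored session.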
self.load_cookies()
self.driver.refresh()
time.sleep(3)
if not self.setup_search_criteria():
return False
if not self.sort_by_date():
print("⚠️ 排序失败,继续后续操作")
results_count = self.get_results_count()
if results_count == 0:
print("⚠️ 未找到搜索结果")
return False
selected_count = self.select_articles_by_date(max_results=24)
if selected_count == 0:
print("⚠️ 未选择任何文章")
return False
download_success = self.download_selected_articles()
if not download_success:
print("⚠️ 下载未成功完成")
return False
self.save_journal_summary()
return True
finally:
if self.use_profile:
self.save_cookies()
print("关闭浏览器...")
try:
self.driver.quit()
print("✓ 浏览器已关闭")
except Exception as e:
print(f"⚠️ 浏览器关闭时出现小问题,但已成功关闭: {e}")
except KeyboardInterrupt:
print("\n用户中断操作")
if self.driver:
try:
self.save_cookies()
self.driver.quit()
except:
pass
return False
except Exception as e:
print(f"✗ 爬虫运行出错: {e}")
print(f"ℹ️ 错误详情: {traceback.format_exc()}")
if self.driver:
try:
self.take_screenshot("run_error")
self.save_cookies()
self.driver.quit()
except:
pass
return False
def load_journal_config(config_file):
"""从JSON文件加载期刊配置(包括期刊名称、关键词、年份等参数)"""
try:
print(f"正在尝试加载配置文件: {config_file}")
print(f"配置文件完整路径: {os.path.abspath(config_file)}")
with open(config_file, 'r', encoding='utf-8') as f:
content = f.read()
print(f"配置文件内容: {content[:200]}...") # 打印前200个字符
        # Check whether the content is valid JSON
try:
config = json.loads(content)
print(f"JSON解析成功,配置类型: {type(config)}")
except json.JSONDecodeError as je:
print(f"JSON解析失败: {je}")
print(f"文件内容可能不是有效的JSON格式")
return None
print(f"加载到的配置: {config}")
        # Verify the config is a dictionary
if not isinstance(config, dict):
print(f"✗ 配置格式错误: 期望字典类型,实际得到 {type(config)}")
return None
        # Verify the config contains the required fields
required_fields = ['journals']
for field in required_fields:
if field not in config:
print(f"✗ 配置文件缺少必要字段: {field}")
print(f"配置中实际包含的字段: {list(config.keys())}")
return None
print(f"✓ 从 {config_file} 加载配置成功")
print(f"✓ 期刊数量: {len(config['journals'])}")
print(f"✓ 使用关键词: {config.get('keywords', '默认关键词')}")
print(f"✓ 搜索年份: {config.get('year', datetime.now().year)}")
return config
except Exception as e:
print(f"✗ 加载期刊配置文件失败: {e}")
print(f"错误详情: {traceback.format_exc()}")
return None
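# For reference, a minimal configuration file accepted by load_journal_config() might look like the
# following (only "journals" is required; "keywords" and "year" are optional and fall back to the
# crawler defaults; the values shown are illustrative):
#   {
#     "journals": ["Medical Image Analysis"],
#     "keywords": "(ultrasound OR ultrasonic) AND (AI OR \"deep learning\")",
#     "year": 2024
#   }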
def ask_continue_next_journal(current, total):
"""询问用户是否继续爬取下一个期刊"""
try:
print("\n" + "="*50)
response = input(f"已完成 {current}/{total} 个期刊,是否继续爬取下一个?(y/n): ").strip().lower()
return response in ['y', 'yes', '是', '继续']
except KeyboardInterrupt:
print("\n用户中断输入")
return False
def main():
"""主函数"""
try:
parser = argparse.ArgumentParser(description="Medical Image Analysis期刊爬虫")
parser.add_argument("--output-dir", default=r"d:\期刊网站爬虫\downloads_mia", help="输出目录")
parser.add_argument("--config", required=True, help="期刊配置JSON文件路径(包含期刊名称等参数)")
parser.add_argument("--headless", action="store_true", help="使用无头模式")
parser.add_argument("--no-profile", action="store_true", help="不使用配置文件和cookie管理")
parser.add_argument("--pause", action="store_true", help="导航时暂停")
args = parser.parse_args()
print("=========================================")
print("Medical Image Analysis期刊爬虫")
print("=========================================")
print("配置信息:")
print(f"- 配置文件: {args.config}")
print(f"- 输出根目录: {args.output_dir}")
print(f"- 无头模式: {'开启' if args.headless else '关闭'}")
print(f"- 配置文件管理: {'关闭' if args.no_profile else '开启'}")
print(f"- 导航暂停: {'开启' if args.pause else '关闭'}")
print("=========================================")
        # Load the configuration (including the journal names) from the JSON file
config = load_journal_config(args.config)
if not config:
print("✗ 配置加载失败,程序退出")
return
journals = config['journals']
total_journals = len(journals)
successful_journals = 0
        # Iterate over all journals
for i, journal_name in enumerate(journals, 1):
print(f"\n{'='*60}")
print(f"开始处理第 {i}/{total_journals} 个期刊: {journal_name}")
print(f"{'='*60}")
            # Create the crawler instance (passing in the parameters from the JSON config)
crawler = MIACrawler(
journal_name=journal_name,
download_dir=args.output_dir,
headless=args.headless,
use_profile=not args.no_profile,
pause_on_navigation=args.pause,
                keywords=config.get('keywords'),  # keywords from the JSON config
                year=config.get('year')  # year from the JSON config
)
            # Run the crawler
success = crawler.run()
if success:
successful_journals += 1
print(f"\n✓ 期刊 '{journal_name}' 爬取成功完成!")
else:
print(f"\n✗ 期刊 '{journal_name}' 爬取未成功完成")
            # Ask whether to continue with the next journal (skipped after the last one)
if i < total_journals:
if not ask_continue_next_journal(i, total_journals):
print("用户选择停止爬取")
break
        # Print the summary
print(f"\n{'='*60}")
print(f"爬取总结: 成功 {successful_journals}/{total_journals} 个期刊")
print(f"结果保存位置: {args.output_dir}")
print(f"{'='*60}")
except KeyboardInterrupt:
print("\n程序被用户中断")
except Exception as e:
print(f"\n✗ 程序启动时出错: {e}")
if __name__ == "__main__":
main()