import time
import json
import os
import re
import zipfile
import argparse
import traceback
from datetime import datetime
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    NoSuchElementException,
)
import undetected_chromedriver as uc
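# Third-party dependencies (assumed PyPI install names):
#   pip install selenium undetected-chromedriver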
class MIACrawler:
    def __init__(self, download_dir=None, headless=False, use_profile=True, pause_on_navigation=False):
        """
        Initialize the Medical Image Analysis crawler.
        Args:
            download_dir: download directory
            headless: whether to run the browser in headless mode
            use_profile: whether to use the saved cookie profile
            pause_on_navigation: whether to pause for user input after navigation steps
        """
        # Default settings
        self.default_save_dir = download_dir or r"d:\期刊网站爬虫\downloads_mia"
        self.journal_url = "https://www.sciencedirect.com/search/entry"
        self.cookie_file = r"d:\期刊网站爬虫\cookies\cookies_sciencedirect_mia.json"
        self.headless = headless
        self.use_profile = use_profile
        self.pause_on_navigation = pause_on_navigation
        # Make sure the download directory exists
        os.makedirs(self.default_save_dir, exist_ok=True)
        # Instance variables
        self.driver = None
        self.wait = None
        self.download_dir = self.default_save_dir
    def pause_if_enabled(self):
        """Pause execution if pausing is enabled."""
        if self.pause_on_navigation:
            try:
                print("\n[PAUSE] Press Enter to continue...")
                input()
            except KeyboardInterrupt:
                print("\nOperation interrupted by user")
                raise
    def setup_driver(self):
        """Set up the Chrome driver."""
        try:
            print("Initializing Chrome browser driver...")
            # Create Chrome options and configure the download path
            chrome_options = uc.ChromeOptions()
            prefs = {
                "download.default_directory": self.download_dir,
                "download.prompt_for_download": False,
                "download.directory_upgrade": True,
                "safebrowsing.enabled": True
            }
            chrome_options.add_experimental_option("prefs", prefs)
            # Extra flags to improve stability
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-extensions")
            chrome_options.add_argument("--start-maximized")
            if self.headless:
                chrome_options.add_argument("--headless")
                chrome_options.add_argument("--window-size=1920,1080")
            # Prefer the locally installed Chrome to avoid network issues
            try:
                # First attempt with the default configuration
                self.driver = uc.Chrome(options=chrome_options, use_subprocess=True)
                print(f"Browser driver initialized! Download directory set to: {self.download_dir}")
            except Exception as e:
                print(f"Initial attempt failed, trying fallback configuration: {e}")
                # Retry with a fallback configuration
                try:
                    # Point at the local Chrome installation path
                    chrome_path = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
                    if os.path.exists(chrome_path):
                        chrome_options.binary_location = chrome_path
                        print(f"Using local Chrome path: {chrome_path}")
                    # Skip the automatic version check
                    self.driver = uc.Chrome(
                        options=chrome_options,
                        use_subprocess=True,
                        version_main=None  # Do not pin a Chrome version; use the local one
                    )
                    print(f"Fallback configuration initialized! Download directory set to: {self.download_dir}")
                except Exception as fallback_e:
                    print(f"Fallback configuration failed as well: {fallback_e}")
                    raise
            # Set up the explicit wait
            self.wait = WebDriverWait(self.driver, 30)
            return True
        except Exception as e:
            print(f"Failed to set up driver: {e}")
            return False
    def take_screenshot(self, filename):
        """Capture a screenshot of the current page."""
        try:
            # Make sure the screenshot directory exists
            screenshot_dir = os.path.join(os.path.dirname(self.default_save_dir), "screenshots_mia")
            os.makedirs(screenshot_dir, exist_ok=True)
            # Build the screenshot file name
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            screenshot_path = os.path.join(screenshot_dir, f"{filename}_{timestamp}.png")
            # Capture and save the screenshot
            self.driver.save_screenshot(screenshot_path)
            print(f"✓ Screenshot saved: {screenshot_path}")
            return screenshot_path
        except Exception as e:
            print(f"✗ Screenshot failed: {e}")
            return None
    def save_page_source(self, filename):
        """Save the current page source."""
        try:
            # Make sure the source directory exists
            source_dir = os.path.join(os.path.dirname(self.default_save_dir), "page_sources_mia")
            os.makedirs(source_dir, exist_ok=True)
            # Build the source file name
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            source_path = os.path.join(source_dir, f"{filename}_{timestamp}.html")
            # Save the page source
            with open(source_path, 'w', encoding='utf-8') as f:
                f.write(self.driver.page_source)
            print(f"✓ Page source saved: {source_path}")
            return source_path
        except Exception as e:
            print(f"✗ Failed to save the page source: {e}")
            return None
    def save_cookies(self):
        """Save the browser cookies."""
        try:
            # Make sure the directory exists
            os.makedirs(os.path.dirname(self.cookie_file), exist_ok=True)
            # Get the current cookies
            cookies = self.driver.get_cookies()
            print(f"Saving cookies, {len(cookies)} in total...")
            # Write them to the file
            with open(self.cookie_file, 'w', encoding='utf-8') as f:
                json.dump(cookies, f, ensure_ascii=False, indent=2)
            print(f"✓ Cookies saved to: {self.cookie_file}")
            return True
        except Exception as e:
            print(f"✗ Error while saving cookies: {e}")
            return False
    def load_cookies(self):
        """Load cookies into the browser."""
        if not os.path.exists(self.cookie_file):
            print(f"⚠️ Cookie file does not exist: {self.cookie_file}")
            return False
        try:
            with open(self.cookie_file, 'r', encoding='utf-8') as f:
                cookies = json.load(f)
            print(f"Loading cookies, {len(cookies)} in total...")
            for cookie in cookies:
                # Remove attributes that can cause problems
                if 'expirationDate' in cookie:
                    del cookie['expirationDate']
                if 'storeId' in cookie:
                    del cookie['storeId']
                if 'sameSite' in cookie and cookie['sameSite'] is None:
                    cookie['sameSite'] = 'Lax'
                try:
                    self.driver.add_cookie(cookie)
                except Exception as e:
                    print(f"⚠️ Failed to add cookie: {cookie.get('name')} - {e}")
            print("✓ Cookies loaded")
            return True
        except Exception as e:
            print(f"✗ Error while loading cookies: {e}")
            return False
    def navigate_to_search_page(self):
        """Navigate to the search page."""
        try:
            print(f"Visiting {self.journal_url}")
            self.driver.get(self.journal_url)
            time.sleep(5)  # Give the page some time to load
            # Check the page title to confirm the visit succeeded
            page_title = self.driver.title
            print(f"✓ Search page loaded")
            print(f"ℹ️ Page title: {page_title}")
            return True
        except Exception as e:
            print(f"✗ Failed to navigate to the search page: {e}")
            self.take_screenshot("navigate_error")
            return False
    def setup_search_criteria(self):
        """Set the search criteria."""
        try:
            print("\nSetting the search criteria...")
            # Search box: //*[@id="qs"]
            search_box = self.wait.until(EC.presence_of_element_located(
                (By.XPATH, "//*[@id='qs']")
            ))
            search_box.clear()
            search_query = '(ultrasound OR ultrasonic) AND (AI OR "artificial intelligence" OR "deep learning" OR "neural network")'
            search_box.send_keys(search_query)
            print(f"✓ Entered search keywords: {search_query}")
            # Journal field: //*[@id="pub"]
            pub_box = self.wait.until(EC.presence_of_element_located(
                (By.XPATH, "//*[@id='pub']")
            ))
            pub_box.clear()
            pub_query = "Medical Image Analysis"
            pub_box.send_keys(pub_query)
            print(f"✓ Entered journal: {pub_query}")
            # Year field: //*[@id="date"]
            year_box = self.wait.until(EC.presence_of_element_located(
                (By.XPATH, "//*[@id='date']")
            ))
            year_box.clear()
            current_year = datetime.now().year
            year_box.send_keys(str(current_year))
            print(f"✓ Entered year: {current_year}")
            # Click the search button
            search_button = self.wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//*[@id='search-advanced-form']/div/div/div[4]/div/div[2]/button/span/span")
            ))
            search_button.click()
            print("✓ Clicked the search button")
            # Wait for the search results to load
            print("ℹ️ Waiting for the search results to load...")
            # Use a fixed 8-second pause instead of an explicit wait here
            time.sleep(8)
            print("✓ 8-second wait complete")
            self.pause_if_enabled()  # Pause after the results have loaded
            return True
        except Exception as e:
            print(f"✗ Failed to set the search criteria: {str(e)}")
            self.save_page_source("search_criteria_error")
            self.take_screenshot("search_criteria_error")
            return False
    def sort_by_date(self):
        """Sort the search results by date."""
        try:
            print("\n⚡ Sorting search results by date - starting to monitor...")
            # Explicitly wait for the sort button element to appear
            print("ℹ️ Waiting explicitly for the sort button element...")
            self.wait.until(EC.presence_of_element_located(
                (By.XPATH, "//*[@id='srp-sorting-options']/div/a/span")
            ))
            print("✓ Sort button element is present")
            # Pre-check the element directly
            try:
                pre_check_element = self.driver.find_element(By.XPATH, "//*[@id='srp-sorting-options']/div/a/span")
                print(f"✓ Pre-check: sort button element exists")
                print(f"ℹ️ Pre-check element state - displayed: {pre_check_element.is_displayed()}, enabled: {pre_check_element.is_enabled()}")
                # Collect element details
                button_text = pre_check_element.text.strip() or "no text"
                button_class = pre_check_element.get_attribute('class') or "no class attribute"
                button_href = pre_check_element.get_attribute('href') or "no href attribute"
                print(f"ℹ️ Sort button details - text: '{button_text}', class: {button_class}, href: {button_href}")
            except Exception as pre_check_e:
                print(f"⚠️ Pre-check failed; the element may be temporarily invisible: {str(pre_check_e)}")
            # Use an explicit wait to get a clickable button
            print("ℹ️ Waiting explicitly for the sort button to become clickable...")
            start_wait_time = time.time()
            sort_button = self.wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//*[@id='srp-sorting-options']/div/a/span")
            ))
            wait_duration = time.time() - start_wait_time
            print(f"✅ Located clickable sort button, wait took: {wait_duration:.3f}s")
            # Check the element state once more before clicking
            print(f"ℹ️ Final pre-click check - displayed: {sort_button.is_displayed()}, enabled: {sort_button.is_enabled()}")
            # Record the page state before the click for reference
            before_url = self.driver.current_url
            print(f"ℹ️ URL before click: {before_url}")
            # Perform the click
            print("⚡ Clicking the sort button...")
            click_start_time = time.time()
            try:
                sort_button.click()
                click_duration = time.time() - click_start_time
                print(f"✅ Sort button click completed, took: {click_duration:.3f}s")
            except ElementClickInterceptedException:
                print(f"✗ Sort button click was intercepted; the element may be covered")
                # Fall back to a JavaScript click
                print(f"⚡ Trying a JavaScript click on the sort button")
                self.driver.execute_script("arguments[0].click();", sort_button)
                print(f"✅ JavaScript click on the sort button completed")
            except Exception as click_e:
                print(f"✗ Sort button click failed: {str(click_e)}")
                print(f"ℹ️ Error type: {type(click_e).__name__}")
                # Try to gather more diagnostic information
                try:
                    page_source = self.driver.page_source[:500]  # Only a fragment, for diagnostics
                    print(f"ℹ️ Page source fragment: {page_source}")
                except Exception:
                    print("ℹ️ Could not fetch page source for diagnostics")
                raise
            # Wait for the sort operation to finish using explicit conditions
            print("ℹ️ Waiting for the sort operation to complete...")
            try:
                # Capture the current URL before comparing it against the pre-click one
                after_url = self.driver.current_url
                if after_url == before_url:
                    # URL unchanged: wait for the content to update instead
                    # (e.g. the sort indicator disappearing or new results appearing)
                    print("ℹ️ URL unchanged, waiting for content update...")
                    # Wait for the sort options list to collapse, if it opened
                    try:
                        self.wait.until_not(EC.visibility_of_element_located(
                            (By.XPATH, "//*[@id='srp-sorting-options']//option")
                        ))
                        print("✓ Sort options list collapsed")
                    except Exception as dropdown_e:
                        print(f"ℹ️ Sort options list state check raised: {str(dropdown_e)}")
                else:
                    # URL changed: wait for the new page to load
                    print("ℹ️ URL changed, waiting for the page to load...")
                    self.wait.until(EC.presence_of_element_located(
                        (By.XPATH, "//div[contains(@class, 'result-list') or @id='search-results-list']")
                    ))
                    print("✓ New page content loaded")
                # Final verification - check that the sorted article list is visible
                articles = self.wait.until(EC.presence_of_all_elements_located(
                    (By.XPATH, "//article[contains(@class, 'result-item')] | //li[contains(@class, 'search-result')]")
                ))
                print(f"✓ Sort complete, found {len(articles)} articles")
            except Exception as sort_wait_e:
                print(f"⚠️ Explicit wait for sort completion failed, using fallback wait: {str(sort_wait_e)}")
                # Fall back to a short fixed wait
                time.sleep(3)
                # Check whether the sort likely succeeded (look for page changes)
                after_url = self.driver.current_url
                if after_url != before_url:
                    print(f"✓ URL change detected; the sort may have triggered a page refresh: {after_url}")
                else:
                    print(f"⚠️ URL unchanged; verify the sort result manually")
                # Check whether the sort dropdown is open
                try:
                    dropdown_elements = self.driver.find_elements(By.XPATH, "//*[@id='srp-sorting-options']//option")
                    if dropdown_elements:
                        print(f"✓ Detected sort options list with {len(dropdown_elements)} options")
                        # Print the first few options
                        for i, option in enumerate(dropdown_elements[:3]):
                            print(f"  ℹ️ Sort option {i+1}: {option.text}")
                except Exception as verify_e:
                    print(f"⚠️ Could not verify the sort dropdown: {str(verify_e)}")
            # Pause if enabled
            if self.pause_on_navigation:
                self.pause_if_enabled()  # Pause after sorting completes
                print("ℹ️ Pause executed")
            else:
                print("ℹ️ pause_on_navigation is False, skipping pause")
            # Extra wait after sorting to make sure results fully load
            print("ℹ️ Waiting an extra 5 seconds after sorting for results to fully load...")
            time.sleep(5)
            print("✓ Extra wait complete")
            print("✅ Sort button locate-and-click monitoring complete")
            return True
        except Exception as e:
            print(f"✗ Sort failed: {str(e)}")
            self.take_screenshot("sort_error")
            return False
    def get_results_count(self):
        """Get the number of search results."""
        try:
            # Result count element: //*[@id="srp-facets"]/div[1]/h1/span
            count_element = self.wait.until(EC.presence_of_element_located(
                (By.XPATH, "//*[@id='srp-facets']/div[1]/h1/span")
            ))
            count_text = count_element.text.strip()
            # Extract the number
            count = int(re.search(r'\d+', count_text).group())
            print(f"✓ Number of search results: {count}")
            return count
        except Exception as e:
            print(f"✗ Failed to get the result count: {str(e)}")
            return 0
    def select_articles_by_date(self, max_results=50):
        """Select the articles published this month."""
        try:
            print("\nSelecting articles published this month...")
            selected_count = 0
            current_month = datetime.now().strftime("%B")  # e.g. February
            if max_results > 24:
                max_results = 24
            # Indices of the selected articles
            selected_articles = []
            # Info about the extra article selected to pad the set
            extra_selected_article = None
            # Consecutive-miss counter, used to detect the end of the list
            no_element_count = 0
            # Walk through the articles in the result list
            for i in range(1, max_results + 1):
                try:
                    # XPath to the date information
                    date_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[2]/div[2]/span/span[2]"
                    date_element = self.driver.find_element(By.XPATH, date_xpath)
                    date_text = date_element.text.strip()
                    no_element_count = 0  # Reset the miss counter on a successful find
                    print(f"  Article {i} date: {date_text}")
                    # Check whether it was published this month
                    if current_month in date_text:
                        # Gather the article info
                        try:
                            # Initialize variables
                            article_link = None
                            article_title = None
                            journal_name = None
                            # Get the article container element
                            try:
                                article_container_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]"
                                print(f"  ℹ️ Trying to get the article container: {article_container_xpath}")
                                article_container = self.driver.find_element(By.XPATH, article_container_xpath)
                                print(f"  ✓ Got the article container")
                                # Get the article link within the container - use a more flexible selector
                                try:
                                    # First try a class-based selector
                                    link_element = article_container.find_element(By.CSS_SELECTOR, "h2 a")
                                    article_link = link_element.get_attribute('href')
                                    print(f"  ✓ Got the article link: {article_link}")
                                except Exception:
                                    try:
                                        # Fallback: use a more general XPath
                                        link_elements = article_container.find_elements(By.XPATH, ".//a")
                                        for link_elem in link_elements:
                                            href = link_elem.get_attribute('href')
                                            if href and '/science/article/' in href:
                                                article_link = href
                                                print(f"  ✓ Got the article link via fallback: {article_link}")
                                                break
                                    except Exception as link_e:
                                        print(f"  ⚠️ Failed to get the article link: {str(link_e)}")
                                # Get the article title within the container
                                try:
                                    # Per the observed HTML structure, extract from .anchor-text inside the a tag
                                    title_elements = article_container.find_elements(By.CSS_SELECTOR, "h2 a .anchor-text")
                                    if title_elements:
                                        article_title = title_elements[0].text.strip()
                                        print(f"  ✓ Got the article title: {article_title}")
                                    else:
                                        # Try the span text inside the a tag
                                        span_elements = article_container.find_elements(By.CSS_SELECTOR, "h2 a span span span")
                                        if span_elements:
                                            article_title = span_elements[0].text.strip()
                                            print(f"  ✓ Got the article title via fallback: {article_title}")
                                        else:
                                            # Last resort: take the h2 text directly
                                            h2_element = article_container.find_element(By.CSS_SELECTOR, "h2")
                                            article_title = h2_element.text.strip()
                                            print(f"  ✓ Got the article title via final fallback: {article_title}")
                                except Exception as title_e:
                                    print(f"  ⚠️ Failed to get the article title: {str(title_e)}")
                            except Exception as container_e:
                                print(f"  ⚠️ Failed to get the article container: {str(container_e)}")
                            # Get the journal the article belongs to
                            try:
                                journal_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[2]/div[2]/span/span[1]/a/span/span/span"
                                print(f"  ℹ️ Trying to get the journal info: {journal_xpath}")
                                journal_element = self.driver.find_element(By.XPATH, journal_xpath)
                                journal_name = journal_element.text.strip()
                                print(f"  ✓ Got the journal info: {journal_name}")
                            except Exception as journal_e:
                                print(f"  ⚠️ Failed to get the journal info: {str(journal_e)}")
                                print(f"  ⚠️ Journal XPath: {journal_xpath}")
                            # Only report once the essential info has been gathered
                            if article_title and journal_name and article_link:
                                print(f"  ℹ️ Article info complete: title='{article_title}', journal='{journal_name}', link='{article_link}'")
                            else:
                                missing_parts = []
                                if not article_title:
                                    missing_parts.append("title")
                                if not journal_name:
                                    missing_parts.append("journal")
                                if not article_link:
                                    missing_parts.append("link")
                                print(f"  ⚠️ Article info incomplete, missing: {', '.join(missing_parts)}")
                            # Save the article info to a JSON file
                            try:
                                # Check there is enough info to create a directory and save
                                if not article_title:
                                    print(f"  ⚠️ Cannot save article info: the title is missing")
                                else:
                                    # Debug info
                                    print(f"  ℹ️ Saving article info...")
                                    # Create the save directory
                                    base_dir = 'd:\\期刊网站爬虫\\downloads_mia'
                                    print(f"  ℹ️ Base directory: {base_dir}")
                                    # Strip characters that are illegal in file names
                                    safe_title = article_title.replace('<', '').replace('>', '').replace(':', '').replace('"', '').replace('/', '').replace('\\', '').replace('|', '').replace('?', '').replace('*', '')
                                    # Fall back to a default name if the cleaned title is empty
                                    if not safe_title.strip():
                                        safe_title = f"Article_{i}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
                                    print(f"  ℹ️ Cleaned title: {safe_title}")
                                    save_dir = os.path.join(base_dir, safe_title)
                                    print(f"  ℹ️ Save directory: {save_dir}")
                                    # Make sure the directory exists
                                    os.makedirs(save_dir, exist_ok=True)
                                    print(f"  ℹ️ Directory created/verified")
                                    # Build the article info dict, handling possible None values
                                    article_info = {
                                        "title": article_title or "unknown title",
                                        "journal": journal_name or "unknown journal",
                                        "link": article_link or "unknown link",
                                        "timestamp": datetime.now().isoformat(),
                                        "article_index": i
                                    }
                                    print(f"  ℹ️ Article info dict built")
                                    # Save it as a JSON file
                                    json_file_path = os.path.join(save_dir, 'article_info.json')
                                    print(f"  ℹ️ JSON file path: {json_file_path}")
                                    with open(json_file_path, 'w', encoding='utf-8') as f:
                                        json.dump(article_info, f, ensure_ascii=False, indent=2)
                                    print(f"  ✅ Article info saved to: {json_file_path}")
                                    # Verify the file exists
                                    if os.path.exists(json_file_path):
                                        print(f"  ✓ Verified: file created, size: {os.path.getsize(json_file_path)} bytes")
                                    else:
                                        print(f"  ⚠️ Verification failed: file does not exist")
                            except Exception as save_e:
                                print(f"  ⚠️ Failed to save the article info: {str(save_e)}")
                                # Print the full traceback
                                print(f"  ⚠️ Error details: {traceback.format_exc()}")
                        except Exception as info_e:
                            print(f"  ⚠️ Failed to gather the article info: {str(info_e)}")
                        # Click the selection checkbox
                        select_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[1]/div[1]/label/span[1]"
                        try:
                            select_element = self.driver.find_element(By.XPATH, select_xpath)
                            print(f"  ⚡ Ready to click the checkbox: element present, displayed={select_element.is_displayed()}")
                            # Record the element state before clicking
                            click_before_state = select_element.get_attribute('class') or 'no class attribute'
                            print(f"  ℹ️ State before click: {click_before_state}")
                            # Perform the click
                            select_element.click()
                            print(f"  ✅ Click completed")
                            # Record the element state after clicking
                            click_after_state = select_element.get_attribute('class') or 'no class attribute'
                            print(f"  ℹ️ State after click: {click_after_state}")
                            # Compare the states
                            if click_before_state != click_after_state:
                                print(f"  ✓ State change detected, the click likely succeeded")
                            else:
                                print(f"  ⚠️ No state change detected, verify the click manually")
                            # Verify the article is really selected (via the label element)
                            try:
                                label_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[1]/div[1]/label"
                                label_element = self.driver.find_element(By.XPATH, label_xpath)
                                is_checked = label_element.get_attribute('class') or ''
                                if 'checked' in is_checked.lower():
                                    print(f"  ✓ Confirmed: article {i} is selected")
                                else:
                                    print(f"  ⚠️ Warning: selected state not detected")
                            except Exception as check_e:
                                print(f"  ⚠️ Could not verify the selected state: {str(check_e)}")
                            selected_count += 1
                            selected_articles.append(i)  # Record the selected article index
                            print(f"  ✓ Selected this month's article: {i}")
                            time.sleep(2)  # Short gap to avoid acting too quickly
                        except Exception as click_e:
                            print(f"  ✗ Failed to click the checkbox: {str(click_e)}")
                            # Fall back to a JavaScript click
                            try:
                                print(f"  ⚡ Trying a JavaScript click")
                                self.driver.execute_script("arguments[0].click();", select_element)
                                print(f"  ✅ JavaScript click completed")
                                selected_count += 1
                                selected_articles.append(i)  # Record the selected article index
                                time.sleep(2)
                            except Exception as js_e:
                                print(f"  ✗ JavaScript click also failed: {str(js_e)}")
                except NoSuchElementException:
                    # Tolerate the first miss; break on the second consecutive miss
                    no_element_count += 1
                    print(f"  ⚠️ Element for article {i} not found, the end of the list may have been reached")
                    print(f"  ℹ️ Consecutive misses: {no_element_count}")
                    if no_element_count >= 2:
                        print(f"  ⚠️ Two consecutive misses, breaking out of the loop")
                        break
                    else:
                        print(f"  ℹ️ First miss, trying the next article")
                        continue
                except Exception as inner_e:
                    print(f"  ⚠️ Error while processing article {i}: {str(inner_e)}")
            print(f"✓ Selected {selected_count} articles published this month")
            # If only one article was selected, pick one more to pad the set
            if selected_count == 1:
                print(f"⚠️ Only 1 article selected, selecting one more to pad the set")
                # Look for an article that has not been selected yet
                for i in range(1, max_results + 1):
                    if i not in selected_articles:
                        try:
                            # Check that this article exists
                            date_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[2]/div[2]/span/span[2]"
                            date_element = self.driver.find_element(By.XPATH, date_xpath)
                            date_text = date_element.text.strip()
                            print(f"  Extra selection: checking article {i} date: {date_text}")
                            # Click the selection checkbox
                            select_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[1]/div[1]/label/span[1]"
                            select_element = self.driver.find_element(By.XPATH, select_xpath)
                            print(f"  ⚡ Ready for extra selection: article {i} checkbox displayed={select_element.is_displayed()}")
                            # Record the element state before clicking
                            click_before_state = select_element.get_attribute('class') or 'no class attribute'
                            print(f"  ℹ️ State before click: {click_before_state}")
                            # Small delay before clicking to let the page settle
                            time.sleep(1)
                            # Perform the click
                            select_element.click()
                            print(f"  ✅ Click completed")
                            # Wait after the click so the state has time to update
                            time.sleep(1)
                            # Record the element state after clicking
                            click_after_state = select_element.get_attribute('class') or 'no class attribute'
                            print(f"  ℹ️ State after click: {click_after_state}")
                            # Verify the article is really selected (via the label element)
                            is_really_selected = False
                            retry_count = 0
                            max_retries = 2
                            # Retry loop
                            while retry_count <= max_retries and not is_really_selected:
                                try:
                                    label_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[1]/div[1]/label"
                                    label_element = self.driver.find_element(By.XPATH, label_xpath)
                                    # Method 1: check the label's class attribute
                                    is_checked = label_element.get_attribute('class') or ''
                                    # Method 2: check the checkbox's checked attribute
                                    checkbox_checked = False
                                    try:
                                        # Try to get the checkbox element directly
                                        checkbox_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[1]/div[1]/label/input"
                                        checkbox_element = self.driver.find_element(By.XPATH, checkbox_xpath)
                                        checkbox_checked = checkbox_element.get_attribute('checked') is not None
                                    except Exception:
                                        pass
                                    # Method 3: trust the click itself; the logs show the class stays
                                    # 'checkbox-check' before and after, so a looser check is needed
                                    if 'checked' in is_checked.lower() or checkbox_checked or (retry_count == max_retries):
                                        print(f"  ✓ Confirmed: article {i} is selected")
                                        is_really_selected = True
                                    else:
                                        print(f"  ⚠️ Warning: selected state not detected (retry {retry_count}/{max_retries})")
                                        retry_count += 1
                                        if retry_count <= max_retries:
                                            print(f"  ⚡ Waiting before retry...")
                                            time.sleep(1)
                                except Exception as check_e:
                                    print(f"  ⚠️ Could not verify the selected state: {str(check_e)}")
                                    retry_count += 1
                                    if retry_count <= max_retries:
                                        time.sleep(1)
                            # If the standard click failed, fall back to a JavaScript click
                            if not is_really_selected:
                                try:
                                    print(f"  ⚡ Trying a JavaScript click")
                                    # Click the label element instead of the span; this may be more reliable
                                    label_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[1]/div[1]/label"
                                    label_element = self.driver.find_element(By.XPATH, label_xpath)
                                    self.driver.execute_script("arguments[0].click();", label_element)
                                    print(f"  ✅ JavaScript click completed")
                                    # Give the state time to update
                                    time.sleep(1)
                                    # Check the state again, with a looser standard
                                    try:
                                        is_checked = label_element.get_attribute('class') or ''
                                        checkbox_checked = False
                                        try:
                                            checkbox_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]/div/div[1]/div[1]/label/input"
                                            checkbox_element = self.driver.find_element(By.XPATH, checkbox_xpath)
                                            checkbox_checked = checkbox_element.get_attribute('checked') is not None
                                        except Exception:
                                            pass
                                        # We can be more lenient here, since this article only pads the set
                                        if 'checked' in is_checked.lower() or checkbox_checked:
                                            print(f"  ✓ Confirmed after JavaScript click: article {i} is selected")
                                            is_really_selected = True
                                        else:
                                            # Last resort: assume the click on the padding article succeeded
                                            print(f"  ℹ️ For the padding article, trusting that the JavaScript click succeeded")
                                            is_really_selected = True
                                    except Exception:
                                        # On error, also assume the padding article click succeeded
                                        print(f"  ℹ️ On error, assuming the padding article click succeeded")
                                        is_really_selected = True
                                except Exception as js_e:
                                    print(f"  ✗ JavaScript click also failed: {str(js_e)}")
                            if is_really_selected:
                                print(f"  ✅ Extra article {i} selected")
                                selected_count += 1
                                selected_articles.append(i)
                                # Gather this article's info
                                article_title = None
                                try:
                                    article_container_xpath = f"//*[@id='srp-results-list']/ol/li[{i}]"
                                    article_container = self.driver.find_element(By.XPATH, article_container_xpath)
                                    # Try to get the article title
                                    title_elements = article_container.find_elements(By.CSS_SELECTOR, "h2 a .anchor-text")
                                    if title_elements:
                                        article_title = title_elements[0].text.strip()
                                    else:
                                        # Try the span text inside the a tag
                                        span_elements = article_container.find_elements(By.CSS_SELECTOR, "h2 a span span span")
                                        if span_elements:
                                            article_title = span_elements[0].text.strip()
                                        else:
                                            # Last resort: take the h2 text directly
                                            h2_element = article_container.find_element(By.CSS_SELECTOR, "h2")
                                            article_title = h2_element.text.strip()
                                except Exception as title_e:
                                    print(f"  ⚠️ Failed to get the extra article's title: {str(title_e)}")
                                # Record the extra article's info
                                extra_selected_article = {
                                    "index": i,
                                    "title": article_title or "unknown title",
                                    "date": date_text
                                }
                                print(f"  ✓ Recorded the extra article: index={i}, title='{article_title}', date='{date_text}'")
                                # Save the extra article's info to a JSON file
                                try:
                                    base_dir = 'd:\\期刊网站爬虫\\downloads_mia'
                                    extra_info_file = os.path.join(base_dir, 'extra_selected_article.json')
                                    with open(extra_info_file, 'w', encoding='utf-8') as f:
                                        json.dump(extra_selected_article, f, ensure_ascii=False, indent=2)
                                    print(f"  ✅ Extra article info saved to: {extra_info_file}")
                                except Exception as save_e:
                                    print(f"  ⚠️ Failed to save the extra article info: {str(save_e)}")
                                break  # Stop after finding one
                            else:
                                print(f"  ⚠️ Warning: extra article selection failed")
                                continue  # Try the next article
                        except Exception as e:
                            print(f"  ⚠️ Error while selecting article {i} as the extra article: {str(e)}")
                            continue  # Try the next article
            # Store the extra article's info on the instance for later access
            self.extra_selected_article = extra_selected_article
            return selected_count
        except Exception as e:
            print(f"✗ Article selection failed: {str(e)}")
            self.take_screenshot("select_articles_error")
            return 0
    def download_selected_articles(self):
        """Download the selected articles."""
        try:
            print("\nDownloading the selected articles...")
            # Click the download button
            print("⚡ Monitoring the download button click")
            # First check that the button exists and is clickable
            try:
                # Look up the element directly to confirm it exists
                button_exists = self.driver.find_element(By.XPATH, "//*[@id='srp-ddm']/form/button")
                print(f"✓ Download button exists: displayed={button_exists.is_displayed()}, enabled={button_exists.is_enabled()}")
                # Collect the full button info
                button_text = button_exists.text.strip() or "no text"
                button_class = button_exists.get_attribute('class') or "no class attribute"
                print(f"ℹ️ Download button info - text: '{button_text}', class: {button_class}")
            except Exception as find_e:
                print(f"⚠️ Error during the button pre-check: {str(find_e)}")
            # Use an explicit wait to get a clickable button
            download_button = self.wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//*[@id='srp-ddm']/form/button/span/span/span")
            ))
            print(f"✅ Got a clickable download button element")
            # Record the page state before the click
            try:
                # URL and timestamp before the click
                before_url = self.driver.current_url
                before_time = time.time()
                print(f"ℹ️ State before click - URL: {before_url}, timestamp: {before_time}")
                # Perform the click
                print("⚡ Clicking the download button...")
                download_button.click()
                print("✅ Download button click completed")
                # Check the state change right after the click
                after_time = time.time()
                click_delay = after_time - before_time
                print(f"ℹ️ Click took: {click_delay:.3f}s")
                # Wait briefly, then check whether the page changed
                time.sleep(1)
                after_url = self.driver.current_url
                print(f"ℹ️ URL after click: {after_url}")
                # Check whether a new window opened
                window_handles = self.driver.window_handles
                print(f"ℹ️ Current window count: {len(window_handles)}")
                # Check for a download-related notification or dialog
                try:
                    # Look for a download confirmation dialog
                    confirm_dialog = self.driver.find_element(By.CLASS_NAME, 'download-confirmation')
                    print(f"✓ Download confirmation dialog detected")
                except Exception:
                    print(f"ℹ️ No obvious download confirmation dialog detected")
                print("✓ Download button click monitoring complete")
            except ElementClickInterceptedException:
                print(f"✗ Download button click was intercepted; the element may be covered by another element")
                # Fall back to a JavaScript click
                try:
                    print(f"⚡ Trying a JavaScript click on the download button")
                    self.driver.execute_script("arguments[0].click();", download_button)
                    print(f"✅ JavaScript download button click completed")
                except Exception as js_e:
                    print(f"✗ JavaScript download button click also failed: {str(js_e)}")
                    raise
            except Exception as click_e:
                print(f"✗ Download button click failed: {str(click_e)}")
                # Log detailed error info for debugging
                print(f"ℹ️ Error type: {type(click_e).__name__}")
                print(f"ℹ️ Error details: {traceback.format_exc()}")
                raise
            # Wait for the download to start
            time.sleep(3)  # A shorter initial wait; the monitoring below is more precise
            print("✓ Download started, monitoring progress...")
            # Monitor the ZIP download
            success = self.monitor_zip_download(timeout=300)  # 5-minute timeout
            if success:
                print("✓ Download complete")
                # Verify the downloaded ZIP file
                if self.verify_zip_file():
                    print("✓ ZIP file verified")
                else:
                    print("⚠️ The ZIP file may be incomplete")
            else:
                print("⚠️ Download monitoring timed out or failed")
            return success
        except Exception as e:
            print(f"✗ Download failed: {str(e)}")
            self.take_screenshot("download_error")
            return False
    def monitor_zip_download(self, timeout=300):
        """
        Monitor the progress of the ZIP file download.
        Args:
            timeout: maximum wait time in seconds
        Returns:
            bool: whether the download completed successfully
        """
        start_time = time.time()
        # Record every file that exists before the download starts
        # (these are ignored; only newly downloaded files are monitored)
        initial_files = set(os.listdir(self.download_dir))
        # ZIP files that already existed before the download started
        initial_zip_files = {f for f in initial_files if f.endswith('.zip')}
        temp_files = []
        # New ZIP files that appear during the download
        new_zip_files = set()
        # Per-file counter of consecutive size-stable checks
        size_stability_count = {}
        # A file counts as complete after its size is unchanged for 3 consecutive checks
        STABILITY_THRESHOLD = 3
        # Last observed size of each monitored ZIP file
        self._zip_sizes = {}
        print(f"Starting download monitoring, timeout: {timeout}s")
        print(f"Ignoring {len(initial_files)} files that existed before the download")
        print(f"Ignoring {len(initial_zip_files)} ZIP files that existed before the download")
        while time.time() - start_time < timeout:
            current_files = set(os.listdir(self.download_dir))
            # Files that appeared since monitoring started (excluding initial files)
            newly_appeared_files = current_files - initial_files
            # Check for new files
            if newly_appeared_files:
                for file in newly_appeared_files:
                    file_path = os.path.join(self.download_dir, file)
                    # Check for temporary download files
                    if file.endswith('.crdownload') or file.endswith('.part') or file.startswith('~'):
                        if file not in temp_files:
                            temp_files.append(file)
                            print(f"New temporary file detected: {file}")
                    # Check for new ZIP files
                    elif file.endswith('.zip'):
                        if file not in new_zip_files:
                            new_zip_files.add(file)
                            print(f"New ZIP file detected: {file}")
                            # Initialize the stability counter
                            size_stability_count[file] = 0
            # Check whether a temporary file has turned into a finished ZIP file
            for temp_file in temp_files[:]:
                # Strip the .crdownload/.part suffix and check whether the result exists
                base_name = temp_file.replace('.crdownload', '').replace('.part', '')
                if base_name in current_files and base_name not in initial_files:
                    print(f"Temporary file finished converting: {temp_file} -> {base_name}")
                    if base_name.endswith('.zip'):
                        new_zip_files.add(base_name)
                        print(f"Adding the converted ZIP file to the watch list: {base_name}")
                        # Initialize the stability counter
                        size_stability_count[base_name] = 0
                    # Remove it from the temporary file list
                    temp_files.remove(temp_file)
            # Only monitor newly downloaded ZIP files (initial files are excluded)
            for file in new_zip_files:
                file_path = os.path.join(self.download_dir, file)
                try:
                    # Make sure the file exists
                    if not os.path.exists(file_path):
                        continue
                    current_size = os.path.getsize(file_path)
                    # Compare against the previously recorded size
                    if file in self._zip_sizes:
                        size_diff = current_size - self._zip_sizes[file]
                        if size_diff > 0:
                            print(f"ZIP file growing: {file} (+{size_diff} bytes)")
                            self._zip_sizes[file] = current_size
                            # Reset the stability counter
                            size_stability_count[file] = 0
                        else:
                            # Size unchanged; bump the stability counter
                            size_stability_count[file] += 1
                            print(f"ZIP file size stable: {file} (count: {size_stability_count[file]}/{STABILITY_THRESHOLD})")
                            # If the size has been stable for enough checks, the download is done
                            if size_stability_count[file] >= STABILITY_THRESHOLD:
                                print(f"ZIP file download complete: {file} ({current_size} bytes) - size stable {STABILITY_THRESHOLD} checks in a row")
                                return True
                    else:
                        # First observation of this file: record its size only
                        self._zip_sizes[file] = current_size
                        size_stability_count[file] = 0
                except Exception as e:
                    print(f"Error while monitoring new ZIP file {file}: {str(e)}")
            # Check every 2 seconds
            time.sleep(2)
            # Show the remaining time
            elapsed = time.time() - start_time
            remaining = timeout - elapsed
            if int(remaining) % 10 == 0 or remaining < 10:  # Print roughly every 10 seconds, and during the last 10
                print(f"Remaining monitoring time: {int(remaining)}s")
                # Show the current monitoring status
                if new_zip_files:
                    print(f"ZIP files currently monitored: {len(new_zip_files)}")
                else:
                    print(f"No new ZIP files detected yet")
        print(f"Download monitoring timed out ({timeout}s)")
        return False
    def verify_zip_file(self):
        """
        Verify the integrity of the downloaded ZIP file.
        Returns:
            bool: whether the ZIP file is intact
        """
        try:
            # Find the most recently downloaded ZIP file
            zip_files = [f for f in os.listdir(self.download_dir) if f.endswith('.zip')]
            if not zip_files:
                return False
            # Sort by modification time and take the newest
            zip_files.sort(key=lambda x: os.path.getmtime(os.path.join(self.download_dir, x)), reverse=True)
            latest_zip = os.path.join(self.download_dir, zip_files[0])
            print(f"Verifying ZIP file: {os.path.basename(latest_zip)}")
            # Check that the ZIP file is valid
            with zipfile.ZipFile(latest_zip, 'r') as zip_ref:
                # Test every entry; testzip() returns the first bad entry's name, or None
                bad_entry = zip_ref.testzip()
                if bad_entry is not None:
                    print(f"Corrupt entry in ZIP file: {bad_entry}")
                    return False
                # Get the archive's contents
                info_list = zip_ref.infolist()
                print(f"ZIP file contains {len(info_list)} files")
                # Simple check: the ZIP file is not empty
                return len(info_list) > 0
        except Exception as e:
            print(f"ZIP file verification failed: {str(e)}")
            return False
    def run(self):
        """Run the crawler's main workflow."""
        try:
            # Set up the driver
            if not self.setup_driver():
                return False
            try:
                # Navigate to the search page
                if not self.navigate_to_search_page():
                    return False
                # Load cookies
                if self.use_profile:
                    self.load_cookies()
                    # Refresh the page to apply the cookies
                    self.driver.refresh()
                    time.sleep(3)
                # Set the search criteria
                if not self.setup_search_criteria():
                    return False
                # Sort by date
                if not self.sort_by_date():
                    print("⚠️ Sorting failed, continuing with the remaining steps")
                # Get the result count
                results_count = self.get_results_count()
                if results_count == 0:
                    print("⚠️ No search results found")
                    return False
                # Select this month's articles
                selected_count = self.select_articles_by_date(max_results=24)
                if selected_count == 0:
                    print("⚠️ No articles selected")
                    return False
                # Download the selected articles
                download_success = self.download_selected_articles()
                if not download_success:
                    print("⚠️ Download did not complete successfully")
                    return False
                return True
            finally:
                # Save cookies
                if self.use_profile:
                    self.save_cookies()
                # Close the browser
                print("Closing the browser...")
                try:
                    self.driver.quit()
                    print("✓ Browser closed")
                except Exception as e:
                    print(f"⚠️ Minor issue while closing the browser, but it was closed: {e}")
        except KeyboardInterrupt:
            print("\nOperation interrupted by user")
            if self.driver:
                try:
                    self.save_cookies()
                    self.driver.quit()
                except Exception:
                    pass
            return False
        except Exception as e:
            print(f"✗ Crawler run failed: {e}")
            print(f"ℹ️ Error details: {traceback.format_exc()}")
            if self.driver:
                try:
                    self.take_screenshot("run_error")
                    self.save_cookies()
                    self.driver.quit()
                except Exception:
                    pass
            return False
def main():
    """Entry point."""
    try:
        # Parse the command-line arguments
        parser = argparse.ArgumentParser(description="Medical Image Analysis journal crawler")
        parser.add_argument("--output-dir", default=r"d:\期刊网站爬虫\downloads_mia", help="output directory")
        parser.add_argument("--headless", action="store_true", help="run the browser in headless mode")
        parser.add_argument("--no-profile", action="store_true", help="disable the profile and cookie management")
        parser.add_argument("--pause", action="store_true", help="pause after navigation steps")
        args = parser.parse_args()
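        # Example invocation (the script filename here is hypothetical):
        #   python mia_crawler.py --headless --pause --output-dir "d:\期刊网站爬虫\downloads_mia"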
        # Print the welcome banner
        print("=========================================")
        print("Medical Image Analysis Journal Crawler")
        print("=========================================")
        print("This crawler visits the ScienceDirect search page and downloads Medical Image Analysis journal articles.")
        print("Features:")
        print("- Cookie management to keep the login session alive")
        print("- Improved browser stability and anti-detection")
        print("- Automatic search setup and sorting by date")
        print("- Automatic selection of articles published this month")
        print("- ZIP download with integrity verification")
        print("- Detailed logging and error handling")
        print("\nPress Ctrl+C at any time to interrupt")
        print("=========================================")
        # Create the crawler instance
        crawler = MIACrawler(
            download_dir=args.output_dir,
            headless=args.headless,
            use_profile=not args.no_profile,
            pause_on_navigation=args.pause
        )
        # Run the crawler
        success = crawler.run()
        if success:
            print("\n✓ Crawler finished successfully!")
        else:
            print("\n✗ Crawler did not finish successfully; check the error messages.")
    except KeyboardInterrupt:
        print("\nProgram interrupted by user")
    except Exception as e:
        print(f"\n✗ Error during program startup: {e}")
if __name__ == "__main__":
    main()

Why did this run successfully at first, but now fail with the error: module 'undetected_chromedriver' has no attribute 'ChromeOptions'?
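A likely cause, hedged since it cannot be confirmed from the code alone: this error usually means the name undetected_chromedriver no longer resolves to the real package, rather than a bug in the script itself. Typical culprits are a local file or folder named undetected_chromedriver in the working directory shadowing the installed package, or a broken/partial (re)install after an upgrade. A minimal diagnostic sketch, assuming only the standard import and the installed package:

import undetected_chromedriver as uc

# If this prints a path inside your own project instead of site-packages,
# a local module is shadowing the real package; rename or remove it.
print(uc.__file__)

# Hedged fallback: undetected_chromedriver's ChromeOptions subclasses
# Selenium's Chrome options, so Selenium's class may work as a stand-in
# if the attribute is missing from your installed version.
try:
    ChromeOptions = uc.ChromeOptions
except AttributeError:
    from selenium.webdriver.chrome.options import Options as ChromeOptions

If the shadowing check comes up clean, reinstalling the package usually resolves it, e.g. pip install --upgrade --force-reinstall undetected-chromedriver.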