IE shows "This is the initial start page for the WebDriver server."

This post records how to fix a problem where Selenium cannot launch Internet Explorer when the browser zoom is set to certain levels: setting the zoom back to 100% resolved it.

A friend ran into this problem. He was working as an outsourced contractor at Huawei and had no internet access there, so I searched it for him and am writing it down here.

Symptom:

      IE launches, but it stops at a page that says "This is the initial start page for the WebDriver server."
Error:

       Exception in thread "main" org.openqa.selenium.remote.SessionNotFoundException: Unexpected error launching Internet Explorer. Browser zoom level was set to 125%. It should be set to 100% (WARNING: The server did not provide any stacktrace information)
Solution:

    Set the IE browser zoom level to 100%. After that, the script ran successfully.
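
If you cannot change the machine's zoom setting, the IE driver can instead be told to skip the zoom-level check. Below is a minimal Java sketch, assuming the Selenium 2.x/3.x bindings that match the stack trace above; the IEDriverServer path and the test URL are placeholders.

    import org.openqa.selenium.WebDriver;
    import org.openqa.selenium.ie.InternetExplorerDriver;
    import org.openqa.selenium.remote.DesiredCapabilities;

    public class IEZoomDemo {
        public static void main(String[] args) {
            // Placeholder path to IEDriverServer.exe; adjust for your machine.
            System.setProperty("webdriver.ie.driver", "C:\\drivers\\IEDriverServer.exe");

            // "ignoreZoomSetting" tells the driver not to fail when the
            // browser zoom is something other than 100%.
            DesiredCapabilities caps = DesiredCapabilities.internetExplorer();
            caps.setCapability(InternetExplorerDriver.IGNORE_ZOOM_SETTING, true);

            WebDriver driver = new InternetExplorerDriver(caps);
            driver.get("https://www.example.com");
            System.out.println(driver.getTitle());
            driver.quit();
        }
    }

Note that ignoring the check only suppresses the exception: with native events enabled, a zoom level other than 100% can still make click coordinates land on the wrong element, so resetting the zoom to 100% remains the more reliable fix. On newer Selenium versions the same flag is exposed as InternetExplorerOptions.ignoreZoomSettings().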
