在这个脚本中,我采用模拟点击的方式,爬取某个网页上所有的下载资源。
编写这类脚本之前,需要先查看目标网页的元素结构:例如在某个"下载"按钮上右击鼠标,选择"检查",然后对照右侧开发者工具中显示的页面元素,有选择地填写脚本中的定位表达式(XPath / CSS 选择器)。
import time, os, re
import tkinter as tk
from tkinter import messagebox
from urllib.parse import quote
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.edge.options import Options as EdgeOptions
# Running total of download buttons clicked so far (one per file requested);
# incremented by click_all_download_buttons and reported by create_url.
down_load_file_count = 0
def click_all_download_buttons(driver):
    """Click every "下载" (download) button on the page currently loaded.

    Waits up to 10 seconds for buttons matching the XPath
    ``//span[text()="下载"]/parent::button``.  Each button is scrolled into
    view, waited on until clickable, and clicked; every successful click
    increments the module-level ``down_load_file_count`` and is followed by a
    one-second pause.  A failure on one button is printed and the loop
    continues with the next button.

    Args:
        driver: Selenium WebDriver whose current page should be processed.

    Returns:
        None.  If no download button appears within the wait, a message is
        printed and the function returns without clicking anything.
    """
    # The counter is a module-level name.  Without this declaration the
    # ``+=`` below makes it a local and raises UnboundLocalError right after
    # each click, which the broad handler then reports as a failed click.
    global down_load_file_count

    # Imported here because only this function needs the exception type.
    from selenium.common.exceptions import TimeoutException

    try:
        # Locate every button whose child <span> text is exactly "下载".
        download_buttons = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, '//span[text()="下载"]/parent::button')
            )
        )
    except TimeoutException:
        # A page without download buttons should not abort the whole run.
        print("当前页面未找到下载按钮")
        return

    for position, btn in enumerate(download_buttons, start=1):
        try:
            # Bring the button into the viewport before waiting on it.
            driver.execute_script("arguments[0].scrollIntoView();", btn)
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(btn)
            ).click()
            down_load_file_count += 1
            print(f"已点击第 {position} 个下载按钮")
            time.sleep(1)  # give the download a moment to start
        except Exception as e:
            # Best effort: report the failure and move on to the next button.
            print(f"点击第 {position} 个按钮失败: {str(e)}")
def process_all_pages(driver):
    """Download from the current page, then keep following "next page".

    For each page, calls ``click_all_download_buttons`` and then waits up to
    10 seconds for an enabled pagination "next" button.  When that button is
    found it is clicked and the function pauses 5 seconds before handling the
    new page.  Any exception while locating or clicking the button is printed
    and ends the walk.

    Args:
        driver: Selenium WebDriver positioned on the first page to process.

    Returns:
        None.
    """
    while True:
        click_all_download_buttons(driver)
        print("page ok!!!")
        try:
            # The selector skips the "next" item once it is aria-disabled.
            next_locator = (
                By.CSS_SELECTOR,
                'li.whale_5-pagination-next:not([aria-disabled="true"]) button',
            )
            pager_wait = WebDriverWait(driver, 10)
            next_page_button = pager_wait.until(
                EC.element_to_be_clickable(next_locator)
            )
            next_page_button.click()
            time.sleep(5)
            print("已切换到新页面:", driver.current_url)
        except Exception as err:
            # Reached when no enabled "next" button exists, or on any other
            # error raised while paging.
            print("已到达最后一页或出现错误:", str(err))
            return
def replace_url_params(url, version=None, err_type=None):
    """Replace the ``version`` and ``errType`` values embedded in *url*.

    The URL is expected to carry the values in URL-encoded JSON form, i.e.
    ``%22version%22:%22<value>%22`` and ``%22errType%22:%22<value>%22``.
    Every such occurrence has its ``<value>`` swapped for the given argument.
    The URL is printed before and after each substitution that is attempted.

    Args:
        url: The URL to rewrite.
        version: New version value; falsy values leave the URL untouched.
        err_type: New errType value; falsy values leave the URL untouched.

    Returns:
        The rewritten URL.  A parameter that is not present in *url* (or
        whose current value contains ``%``) is left unchanged.
    """
    for key, value in (("version", version), ("errType", err_type)):
        if not value:
            continue
        print(url)
        pattern = rf'(%22{key}%22:%22)([^%]+?)(%22)'
        # A replacement *function* inserts the value literally.  A template
        # string would interpret backslashes in the value as group references
        # or escapes (e.g. ``\1``, ``\n``), corrupting the URL or raising.
        url = re.sub(
            pattern,
            lambda m, v=value: f"{m.group(1)}{v}{m.group(3)}",
            url,
        )
        print(url)
    return url
def create_url():
    """Build the target URL(s) from the form fields and download their files.

    Reads the main URL, version, error type and base download directory from
    the module-level Tk variables ``main_url_var``, ``version_var``,
    ``err_type_var`` and ``main_download_var``.  A ``yy_mm_dd`` sub-folder for
    today is created under the base directory if missing and used as Chrome's
    default download directory.  The main URL is rewritten with
    ``replace_url_params``; when the error type is ``'all'`` one URL is built
    for each of ``native``, ``java`` and ``anr``.  Each URL is then opened in
    Chrome and handed to ``process_all_pages``.

    If the main URL field is empty a warning dialog is shown and nothing else
    happens.  The browser is always closed when processing ends, including
    when an exception is raised.

    Returns:
        None.
    """
    main_url = main_url_var.get().strip()
    version = version_var.get()
    err_type = err_type_var.get()
    download_base_path = main_download_var.get()

    # Bail out before touching the disk or launching a browser: an empty URL
    # would only make driver.get() fail after Chrome has started.
    if not main_url:
        messagebox.showwarning("URL Generator", "Please enter the Main URL first.")
        return

    # Downloads go into a per-day folder named yy_mm_dd.
    today = datetime.now().strftime("%y_%m_%d")
    download_path = os.path.join(download_base_path, today)
    if not os.path.exists(download_path):
        os.makedirs(download_path)
        print(f"已创建目录: {download_path}")

    # 'all' expands to one URL per known error type.
    err_types = ['native', 'java', 'anr'] if err_type == 'all' else [err_type]
    final_urls = [
        replace_url_params(main_url, version=version, err_type=et)
        for et in err_types
    ]

    options = webdriver.ChromeOptions()
    # Reuse an existing Chrome profile so the site login (QR-code scan)
    # does not have to be repeated.  Adjust this path to your own Chrome
    # "User Data" directory.
    options.add_argument("user-data-dir=C:\\Users\\qzp\\AppData\\Local\\Google\\Chrome\\User Data")
    # Change "Default" to use a different profile inside that directory.
    options.add_argument("profile-directory=Default")
    # Send downloads to today's folder.
    prefs = {"download.default_directory": download_path}
    options.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome(options=options)
    try:
        print("正在打开页面...")
        for url in final_urls:
            driver.get(url)
            print("开始处理页面...\n\n\n\n\n\n")
            time.sleep(5)  # wait for the page to load
            process_all_pages(driver)
            print("目前已经下载了",down_load_file_count,"个文件")
    finally:
        driver.quit()
if __name__ == "__main__":
    # Build the single-window form.  The StringVar objects are read by
    # create_url as module-level globals, so their names must not change.
    app = tk.Tk()
    app.title("URL Generator")

    # Row 0: address of the page to process.
    main_url_label = tk.Label(app, text="Main URL")
    main_url_label.grid(row=0, column=0)
    main_url_var = tk.StringVar(value="")
    main_url_entry = tk.Entry(app, textvariable=main_url_var, width=80)
    main_url_entry.grid(row=0, column=1)

    # Row 1: version value substituted into the URL.
    version_label = tk.Label(app, text="Version")
    version_label.grid(row=1, column=0)
    version_var = tk.StringVar(value="4.1.276.10")
    version_entry = tk.Entry(app, textvariable=version_var)
    version_entry.grid(row=1, column=1)

    # Row 2: error type; "all" makes create_url run every concrete type.
    err_type_label = tk.Label(app, text="Error Type")
    err_type_label.grid(row=2, column=0)
    err_type_var = tk.StringVar(value="native")
    options = ["all", "native", "java", "anr"]
    err_type_menu = tk.OptionMenu(app, err_type_var, *options)
    err_type_menu.grid(row=2, column=1)

    # Row 3: base directory under which the dated download folder is made.
    download_label = tk.Label(app, text="DownLoad File")
    download_label.grid(row=3, column=0)
    main_download_var = tk.StringVar(value="E:\\workspace\\python\\爬虫\\")
    download_entry = tk.Entry(app, textvariable=main_download_var, width=80)
    download_entry.grid(row=3, column=1)

    # Row 4: button that starts the run via create_url.
    run_button = tk.Button(app, text="Generate URL", command=create_url)
    run_button.grid(row=4, columnspan=2)

    app.mainloop()