Python Web Page Resource Crawler: Design and Implementation (Complete Code Included)
In an age when information is constantly pulled from the web, a tool that can quickly save a page together with all of its associated resources to the local disk is genuinely useful. This article walks through a Python-based web page resource crawler, WebPageCrawler, and uses real code to examine its core features, technical implementation, and design ideas, as a reference for building similar tools.
Tool Overview: Features and Positioning
WebPageCrawler is a web page resource crawler with a graphical user interface (GUI). Its core function is to download a target page's HTML, CSS, JavaScript, images, fonts, and other resources to the local disk and automatically rewrite the resource references, so that the local copy renders the same as the online version.
Compared with a plain "save page" feature, it stands out in several ways:
- A visual interface that requires no command-line knowledge
- Automatic detection and download of multiple resource types (HTML/CSS/JS/images/fonts)
- Handling of resources nested inside CSS (such as background images and font files)
- Smart rewriting of resource references to local relative paths
- Real-time feedback on download status and statistics
Technology Stack and Core Libraries
The tool relies on several Python libraries, each with a clear role (the third-party ones can be installed as shown after the list):
- GUI framework: tkinter and ttk build the interactive interface
- HTTP requests: requests fetches the page and its resources
- HTML parsing: BeautifulSoup extracts resource links from the HTML
- URL handling: urllib.parse resolves and joins URLs
- CSS parsing: cssutils handles resource references inside CSS
- File operations: os and pathlib manage directories and files
- Multithreading: threading runs downloads in the background so the GUI does not freeze
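tkinter, urllib.parse, os, pathlib, and threading ship with CPython; requests, beautifulsoup4, and cssutils are third-party packages. Assuming a standard pip setup, they can be installed with:
pip install requests beautifulsoup4 cssutils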
Core Modules Explained (with Code Examples)
1. Graphical User Interface (GUI) Design
The interface is built in the setup_ui method, using a grid layout to arrange the widgets in order:
def setup_ui(self):
# 主框架
main_frame = ttk.Frame(self.root, padding="10")
main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
# URL输入区域
url_frame = ttk.Frame(main_frame)
url_frame.grid(row=0, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=(0, 10))
ttk.Label(url_frame, text="目标URL:").grid(row=0, column=0, sticky=tk.W)
self.url_entry = ttk.Entry(url_frame, width=70)
self.url_entry.grid(row=0, column=1, padx=(10, 0), sticky=(tk.W, tk.E))
self.url_entry.insert(0, "https://") # 默认填充协议
# 按钮区域
button_frame = ttk.Frame(main_frame)
button_frame.grid(row=1, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=(0, 10))
self.crawl_button = ttk.Button(button_frame, text="开始抓取", command=self.start_crawling)
self.crawl_button.grid(row=0, column=0, padx=(0, 10))
self.open_folder_button = ttk.Button(button_frame, text="打开目录",
command=self.open_download_folder, state="disabled")
self.open_folder_button.grid(row=0, column=1, padx=(0, 10))
# 资源列表(Treeview)
list_frame = ttk.LabelFrame(main_frame, text="下载的资源列表", padding="5")
list_frame.grid(row=2, column=0, columnspan=2, sticky=(tk.W, tk.E, tk.N, tk.S), pady=(0, 10))
columns = ('序号', '类型', 'URL', '本地路径', '状态')
self.tree = ttk.Treeview(list_frame, columns=columns, show='headings', height=15)
for col in columns:
self.tree.heading(col, text=col)
self.tree.column('URL', width=250) # 加宽URL列
self.tree.column('本地路径', width=200)
# 滚动条
scrollbar = ttk.Scrollbar(list_frame, orient=tk.VERTICAL, command=self.tree.yview)
self.tree.configure(yscrollcommand=scrollbar.set)
self.tree.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S))
# 统计与状态
self.stats_var = tk.StringVar()
self.stats_var.set("总计: 0 个资源 (成功: 0, 失败: 0) | CSS资源: 0")
ttk.Label(main_frame, textvariable=self.stats_var).grid(row=3, column=0, sticky=tk.W)
self.status_var = tk.StringVar()
self.status_var.set("就绪")
ttk.Label(main_frame, textvariable=self.status_var, relief=tk.SUNKEN).grid(row=4, column=0, columnspan=2, sticky=(tk.W, tk.E))
# 配置自适应布局
self.root.columnconfigure(0, weight=1)
self.root.rowconfigure(0, weight=1)
main_frame.columnconfigure(0, weight=1)
main_frame.rowconfigure(2, weight=1)
The UI follows a modular design: the input area, button area, and list area are kept separate, and columnconfigure/rowconfigure assign grid weights so the widgets resize sensibly when the window size changes.
2. Core Crawling Workflow
Crawling is triggered by start_crawling, and the core logic lives in crawl_webpage, which follows a four-step "prepare, download, parse, rewrite" flow:
def crawl_webpage(self, url):
try:
# 初始化资源存储容器
self.resources = []
self.url_map = {} # 原始URL到本地路径的映射
self.downloaded_urls = set() # 避免重复下载
self.clear_treeview()
# 创建基础目录(以域名命名)
domain = urlparse(url).netloc
base_dir = os.path.join(self.download_folder, domain)
os.makedirs(base_dir, exist_ok=True)
# 下载主页面HTML
html_content, main_file_path = self.download_html(url, base_dir)
if not html_content:
self.update_status("主页下载失败")
return
# 解析HTML提取资源链接
soup = BeautifulSoup(html_content, 'html.parser')
resources_to_download = []
# 提取CSS链接
for link in soup.find_all('link', rel='stylesheet'):
if href := link.get('href'):
resources_to_download.append(('css', href))
# 提取JS链接
for script in soup.find_all('script', src=True):
if src := script.get('src'):
resources_to_download.append(('js', src))
# 提取图片链接
for img in soup.find_all('img', src=True):
if src := img.get('src'):
resources_to_download.append(('image', src))
# 下载所有资源
total = len(resources_to_download)
for i, (resource_type, resource_url) in enumerate(resources_to_download):
self.status_var.set(f"正在下载资源 ({i + 1}/{total}): {resource_url}")
local_path = self.download_resource(resource_url, url, base_dir, resource_type)
# 记录URL映射(用于后续路径修正)
absolute_url = self.get_absolute_url(resource_url, url)
if local_path:
relative_path = os.path.relpath(local_path, os.path.dirname(main_file_path))
self.url_map[absolute_url] = relative_path.replace('\\', '/') # 统一正斜杠
# 更新UI
self.resources.append({
'type': resource_type, 'url': resource_url,
'local_path': local_path, 'status': '成功' if local_path else '失败'
})
self.root.after(0, self.update_treeview, len(self.resources) - 1)
# 修正HTML中的资源路径
self.fix_html_resource_paths(main_file_path, url)
self.update_status("抓取完成!")
except Exception as e:
self.update_status(f"错误: {str(e)}")
finally:
self.root.after(0, self.crawling_finished)
Highlights of the core workflow:
- Resources are handled by type, which simplifies storage and later management
- url_map records the mapping from each original URL to its local path, preparing the ground for path rewriting
- root.after(0, ...) marshals UI updates from the worker thread back to the main loop, so the interface never freezes (a minimal standalone sketch of this pattern follows the list)
- Exception handling keeps the tool stable, and errors are reported to the user through the status bar
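Below is a minimal, self-contained sketch of that background-thread pattern, kept separate from the tool itself; the window, status_var, and worker names are illustrative, not taken from WebPageCrawler:
import threading
import time
import tkinter as tk
from tkinter import ttk

root = tk.Tk()
status_var = tk.StringVar(value="idle")
ttk.Label(root, textvariable=status_var).pack(padx=20, pady=20)

def worker():
    # Simulated long-running task running off the GUI thread
    for i in range(5):
        time.sleep(1)
        # Never touch tkinter widgets directly from a worker thread;
        # schedule the update on the main loop instead
        root.after(0, status_var.set, f"step {i + 1}/5 done")

threading.Thread(target=worker, daemon=True).start()
root.mainloop()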
3. Resource Download and Path Handling
The download_resource method implements the download logic for each resource type, including URL resolution, path generation, and saving the file:
def download_resource(self, resource_url, base_url, base_dir, resource_type):
try:
# 构建完整URL(处理相对路径)
absolute_url = self.get_absolute_url(resource_url, base_url)
# 检查是否已下载
if absolute_url in self.downloaded_urls:
return self.url_map.get(absolute_url)
self.downloaded_urls.add(absolute_url)
# 发送请求(模拟浏览器User-Agent)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
response = requests.get(absolute_url, headers=headers, timeout=30)
# 解析文件名与路径
parsed_url = urlparse(absolute_url)
filename = os.path.basename(parsed_url.path) or f"unknown_{resource_type}"
# 自动补全扩展名
if '.' not in filename:
ext_map = {'css': '.css', 'js': '.js', 'image': '.jpg', 'icon': '.ico', 'font': '.woff2'}
filename += ext_map.get(resource_type, '.dat')
# 按类型分目录存储
sub_dirs = {
'css': 'css', 'js': 'js', 'image': 'images',
'icon': 'icons', 'font': 'webfonts'
}
sub_dir = os.path.join(base_dir, sub_dirs.get(resource_type, ''))
os.makedirs(sub_dir, exist_ok=True)
filepath = os.path.join(sub_dir, filename)
# 确保文件名唯一(避免覆盖)
filepath = self.ensure_unique_filename(filepath)
# 保存文件
with open(filepath, 'wb') as f:
f.write(response.content)
# 处理CSS中的嵌套资源
if resource_type == 'css':
self.process_css_resources(filepath, absolute_url, base_dir)
return filepath
except Exception as e:
print(f"下载失败 {resource_url}: {e}")
return None
def ensure_unique_filename(self, filepath):
"""如果文件已存在,添加计数器确保文件名唯一"""
if not os.path.exists(filepath):
return filepath
directory, filename = os.path.split(filepath)
name, ext = os.path.splitext(filename)
counter = 1
while True:
new_filename = f"{name}_{counter}{ext}"
new_filepath = os.path.join(directory, new_filename)
if not os.path.exists(new_filepath):
return new_filepath
counter += 1
Key points of the resource handling logic:
- URL normalization: get_absolute_url converts relative URLs into absolute ones so that links resolve correctly (a short urljoin sketch follows the list)
- Categorized storage: each resource type is saved to its own subdirectory (for example, images go into the images folder), keeping the layout clear
- Unique filenames: ensure_unique_filename prevents same-named files from overwriting each other by appending a counter (e.g. image.jpg → image_1.jpg)
- Nested resources: CSS files receive extra treatment, recursively downloading the background images, fonts, and other resources they reference
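The sketch below illustrates how relative references resolve against a base URL with urllib.parse.urljoin; the URLs are hypothetical examples, and the protocol-relative case is the one get_absolute_url handles with an explicit "https:" prefix:
from urllib.parse import urljoin

base = "https://example.com/blog/post.html"   # hypothetical page URL
print(urljoin(base, "style.css"))             # https://example.com/blog/style.css
print(urljoin(base, "/static/app.js"))        # https://example.com/static/app.js
print(urljoin(base, "../images/logo.png"))    # https://example.com/images/logo.png
print("https:" + "//cdn.example.com/lib.js")  # protocol-relative URL, prefixed separately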
4. Nested CSS Resources and Path Rewriting
The tool's real strength lies in handling resources nested inside CSS (such as background: url(images/bg.png)) and rewriting every resource reference:
def process_css_resources(self, css_file_path, css_url, base_dir):
"""处理CSS文件中引用的资源(如背景图片、字体)"""
try:
with open(css_file_path, 'r', encoding='utf-8') as f:
css_content = f.read()
# 正则匹配CSS中的url()引用
url_pattern = r'url\s*\(\s*[\'"]?(.*?)[\'"]?\s*\)'
matches = re.findall(url_pattern, css_content, re.IGNORECASE)
for match in matches:
# 跳过data:协议的内联资源
if match.startswith('data:'):
continue
# 下载CSS中引用的资源
resource_url = match
absolute_url = self.get_absolute_url(resource_url, css_url)
resource_type = self.determine_resource_type(resource_url)
local_path = self.download_resource(resource_url, css_url, base_dir, resource_type)
# 修正CSS中的资源路径
if local_path:
# 计算相对于CSS文件的相对路径
relative_path = os.path.relpath(local_path, os.path.dirname(css_file_path))
# Precompute the forward-slash form: a backslash inside an f-string
# expression is a SyntaxError before Python 3.12
local_ref = relative_path.replace('\\', '/')
css_content = css_content.replace(
f'url({match})',
f'url({local_ref})'
)
self.css_resource_count += 1
# 保存修改后的CSS
with open(css_file_path, 'w', encoding='utf-8') as f:
f.write(css_content)
except Exception as e:
print(f"处理CSS资源失败: {e}")
def fix_html_resource_paths(self, html_file_path, base_url):
"""修正HTML中的资源引用路径为本地路径"""
with open(html_file_path, 'r', encoding='utf-8') as f:
html_content = f.read()
soup = BeautifulSoup(html_content, 'html.parser')
# 修正CSS链接
for link in soup.find_all('link', rel='stylesheet'):
if href := link.get('href'):
absolute_url = self.get_absolute_url(href, base_url)
if absolute_url in self.url_map:
link['href'] = self.url_map[absolute_url]
# 修正图片链接
for img in soup.find_all('img', src=True):
if src := img.get('src'):
absolute_url = self.get_absolute_url(src, base_url)
if absolute_url in self.url_map:
img['src'] = self.url_map[absolute_url]
# 保存修正后的HTML
with open(html_file_path, 'w', encoding='utf-8') as f:
f.write(str(soup))
How the path rewriting works:
- CSS resource extraction: a regular expression matches url() references, whether or not they are quoted
- Relative path computation: the reference is recomputed relative to the file that uses it (HTML or CSS), based on where the resource was stored locally (a short sketch follows the list)
- Bulk replacement: BeautifulSoup rewrites the href and src attributes in the HTML, while plain string replacement rewrites the url() references inside the CSS
- Cross-platform compatibility: backslashes \ in paths are converted to forward slashes /, so the saved page opens correctly on both Windows and Linux
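The following sketch shows how a local relative reference is derived with os.path.relpath and normalized to forward slashes; the paths are hypothetical examples that mirror the tool's directory layout:
import os

html_file = os.path.join("downloads", "example.com", "index.html")
image_file = os.path.join("downloads", "example.com", "images", "logo.png")

# Path of the image relative to the directory containing the HTML file
relative = os.path.relpath(image_file, os.path.dirname(html_file))
print(relative.replace("\\", "/"))  # images/logo.png on both Windows and Linux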
5. Complete Source Code and Final Result
The full WebPageCrawler source code is listed below:
import tkinter as tk
from tkinter import ttk, messagebox, filedialog
import requests
import os
import urllib.parse
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import threading
import webbrowser
from pathlib import Path
import time
import re
import cssutils
import logging
# 禁用cssutils的日志输出
cssutils.log.setLevel(logging.CRITICAL)
class WebPageCrawler:
def __init__(self, root):
self.root = root
self.root.title("网页资源抓取工具 - 增强版")
self.root.geometry("900x650")
# 存储下载的资源信息
self.resources = []
self.download_folder = ""
self.url_map = {} # 存储原始URL到本地路径的映射
self.downloaded_urls = set() # 记录已经下载的URL,避免重复下载
self.css_resource_count = 0 # 统计CSS中引入的资源数量
self.setup_ui()
def setup_ui(self):
# 主框架
main_frame = ttk.Frame(self.root, padding="10")
main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
# URL输入区域
url_frame = ttk.Frame(main_frame)
url_frame.grid(row=0, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=(0, 10))
ttk.Label(url_frame, text="目标URL:").grid(row=0, column=0, sticky=tk.W)
self.url_entry = ttk.Entry(url_frame, width=70)
self.url_entry.grid(row=0, column=1, padx=(10, 0), sticky=(tk.W, tk.E))
self.url_entry.insert(0, "https://")
# 按钮区域
button_frame = ttk.Frame(main_frame)
button_frame.grid(row=1, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=(0, 10))
self.crawl_button = ttk.Button(button_frame, text="开始抓取", command=self.start_crawling)
self.crawl_button.grid(row=0, column=0, padx=(0, 10))
self.open_folder_button = ttk.Button(button_frame, text="打开目录",
command=self.open_download_folder,
state="disabled")
self.open_folder_button.grid(row=0, column=1, padx=(0, 10))
self.clear_button = ttk.Button(button_frame, text="清空列表", command=self.clear_list)
self.clear_button.grid(row=0, column=2, padx=(0, 10))
self.view_html_button = ttk.Button(button_frame, text="查看HTML",
command=self.view_html,
state="disabled")
self.view_html_button.grid(row=0, column=3)
# 进度条
self.progress = ttk.Progressbar(button_frame, mode='indeterminate')
self.progress.grid(row=0, column=4, padx=(20, 0), sticky=(tk.W, tk.E))
# 资源列表
list_frame = ttk.LabelFrame(main_frame, text="下载的资源列表", padding="5")
list_frame.grid(row=2, column=0, columnspan=2, sticky=(tk.W, tk.E, tk.N, tk.S), pady=(0, 10))
# Treeview 显示资源列表
columns = ('序号', '类型', 'URL', '本地路径', '状态')
self.tree = ttk.Treeview(list_frame, columns=columns, show='headings', height=15)
# 设置列标题
for col in columns:
self.tree.heading(col, text=col)
self.tree.column(col, width=100)
self.tree.column('URL', width=250)
self.tree.column('本地路径', width=200)
# 滚动条
scrollbar = ttk.Scrollbar(list_frame, orient=tk.VERTICAL, command=self.tree.yview)
self.tree.configure(yscrollcommand=scrollbar.set)
self.tree.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S))
# 统计信息
stats_frame = ttk.Frame(main_frame)
stats_frame.grid(row=3, column=0, columnspan=2, sticky=(tk.W, tk.E), pady=(0, 10))
self.stats_var = tk.StringVar()
self.stats_var.set("总计: 0 个资源 (成功: 0, 失败: 0) | CSS资源: 0")
stats_label = ttk.Label(stats_frame, textvariable=self.stats_var)
stats_label.grid(row=0, column=0, sticky=tk.W)
# 状态栏
self.status_var = tk.StringVar()
self.status_var.set("就绪")
status_label = ttk.Label(main_frame, textvariable=self.status_var, relief=tk.SUNKEN)
status_label.grid(row=4, column=0, columnspan=2, sticky=(tk.W, tk.E))
# 配置网格权重
self.root.columnconfigure(0, weight=1)
self.root.rowconfigure(0, weight=1)
main_frame.columnconfigure(0, weight=1)
main_frame.rowconfigure(2, weight=1)
list_frame.columnconfigure(0, weight=1)
list_frame.rowconfigure(0, weight=1)
url_frame.columnconfigure(1, weight=1)
button_frame.columnconfigure(4, weight=1)
def start_crawling(self):
url = self.url_entry.get().strip()
if not url or url == "https://":
messagebox.showerror("错误", "请输入有效的URL")
return
# 选择保存目录
folder = filedialog.askdirectory(title="选择保存目录")
if not folder:
return
self.download_folder = folder
self.open_folder_button.config(state="normal")
# 禁用按钮,开始进度条
self.crawl_button.config(state="disabled")
self.progress.start()
self.status_var.set("正在抓取网页...")
# 在新线程中执行爬取任务
thread = threading.Thread(target=self.crawl_webpage, args=(url,))
thread.daemon = True
thread.start()
def crawl_webpage(self, url):
try:
self.resources = []
self.url_map = {}
self.downloaded_urls = set()
self.css_resource_count = 0
self.clear_treeview()
self.update_stats()
# 创建基础目录
domain = urlparse(url).netloc
base_dir = os.path.join(self.download_folder, domain)
os.makedirs(base_dir, exist_ok=True)
# 下载主页面
html_content, main_file_path = self.download_html(url, base_dir)
if not html_content:
self.update_status("主页下载失败")
return
# 解析HTML并获取资源链接
soup = BeautifulSoup(html_content, 'html.parser')
# 获取所有资源链接
resources_to_download = []
# CSS文件
for link in soup.find_all('link', rel='stylesheet'):
href = link.get('href')
if href:
resources_to_download.append(('css', href))
# JavaScript文件
for script in soup.find_all('script', src=True):
src = script.get('src')
if src:
resources_to_download.append(('js', src))
# 图片
for img in soup.find_all('img', src=True):
src = img.get('src')
if src:
resources_to_download.append(('image', src))
# 网站图标 (favicon)
for link in soup.find_all('link', rel=lambda x: x and ('icon' in x.lower() or 'shortcut' in x.lower())):
href = link.get('href')
if href:
resources_to_download.append(('icon', href))
# 如果没有找到明确的图标链接,尝试下载默认的favicon.ico
if not any(res[0] == 'icon' for res in resources_to_download):
favicon_url = urljoin(url, '/favicon.ico')
resources_to_download.append(('icon', favicon_url))
# 下载所有资源
total = len(resources_to_download)
for i, (resource_type, resource_url) in enumerate(resources_to_download):
self.status_var.set(f"正在下载资源 ({i + 1}/{total}): {resource_url}")
local_path = self.download_resource(resource_url, url, base_dir, resource_type)
# 记录URL映射
absolute_url = self.get_absolute_url(resource_url, url)
if local_path:
# 计算相对于HTML文件的路径
relative_path = os.path.relpath(local_path, os.path.dirname(main_file_path))
self.url_map[absolute_url] = relative_path.replace('\\', '/') # 统一使用正斜杠
self.resources.append({
'type': resource_type,
'url': resource_url,
'local_path': local_path,
'status': '成功' if local_path else '失败'
})
# 更新UI
self.root.after(0, self.update_treeview, len(self.resources) - 1)
self.root.after(0, self.update_stats)
# 修改HTML中的资源路径
self.status_var.set("正在修正HTML中的资源路径...")
self.fix_html_resource_paths(main_file_path, url)
success_count = len([r for r in self.resources if r['status'] == '成功'])
self.update_status(f"抓取完成!共下载 {success_count} 个资源,HTML文件已修正")
self.view_html_button.config(state="normal")
except Exception as e:
self.update_status(f"抓取过程中发生错误: {str(e)}")
finally:
self.root.after(0, self.crawling_finished)
def download_html(self, url, base_dir):
"""下载HTML页面"""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
response = requests.get(url, headers=headers, timeout=30)
response.encoding = response.apparent_encoding
# 保存HTML文件
filename = "index.html"
filepath = os.path.join(base_dir, filename)
with open(filepath, 'w', encoding='utf-8') as f:
f.write(response.text)
# 添加到资源列表
self.resources.append({
'type': 'html',
'url': url,
'local_path': filepath,
'status': '成功'
})
self.root.after(0, self.update_treeview, 0)
self.root.after(0, self.update_stats)
return response.text, filepath
except Exception as e:
print(f"下载HTML失败: {e}")
return None, None
def download_resource(self, resource_url, base_url, base_dir, resource_type):
"""下载单个资源"""
try:
# 构建完整URL
absolute_url = self.get_absolute_url(resource_url, base_url)
# 检查是否已经下载过
if absolute_url in self.downloaded_urls:
return self.url_map.get(absolute_url)
self.downloaded_urls.add(absolute_url)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
# 对于图标文件,如果下载失败,尝试使用默认路径
if resource_type == 'icon':
try:
response = requests.get(absolute_url, headers=headers, timeout=10)
except:
# 如果特定图标下载失败,尝试默认的favicon.ico
favicon_url = urljoin(base_url, '/favicon.ico')
if favicon_url != absolute_url:
try:
response = requests.get(favicon_url, headers=headers, timeout=10)
absolute_url = favicon_url
except:
return None
else:
response = requests.get(absolute_url, headers=headers, timeout=30)
# 解析URL获取文件路径
parsed_url = urlparse(absolute_url)
path = parsed_url.path
if not path or path == '/':
filename = 'index.html'
else:
filename = os.path.basename(path)
if not filename:
filename = 'index'
# 如果没有扩展名,根据资源类型添加
if '.' not in filename:
if resource_type == 'css':
filename += '.css'
elif resource_type == 'js':
filename += '.js'
elif resource_type == 'image':
# 尝试从Content-Type推断图片类型
content_type = response.headers.get('Content-Type', '')
if 'jpeg' in content_type or 'jpg' in content_type:
filename += '.jpg'
elif 'png' in content_type:
filename += '.png'
elif 'gif' in content_type:
filename += '.gif'
elif 'svg' in content_type:
filename += '.svg'
else:
filename += '.jpg' # 默认
elif resource_type == 'icon':
filename = 'favicon.ico'
elif resource_type == 'font':
# 尝试从Content-Type推断字体类型
content_type = response.headers.get('Content-Type', '')
if 'woff2' in content_type:
filename += '.woff2'
elif 'woff' in content_type:
filename += '.woff'
elif 'truetype' in content_type or 'ttf' in content_type:
filename += '.ttf'
elif 'opentype' in content_type or 'otf' in content_type:
filename += '.otf'
elif 'eot' in content_type:
filename += '.eot'
else:
filename += '.woff' # 默认
# 根据资源类型创建子目录
if resource_type == 'css':
sub_dir = os.path.join(base_dir, 'css')
os.makedirs(sub_dir, exist_ok=True)
filepath = os.path.join(sub_dir, filename)
elif resource_type == 'js':
sub_dir = os.path.join(base_dir, 'js')
os.makedirs(sub_dir, exist_ok=True)
filepath = os.path.join(sub_dir, filename)
elif resource_type == 'image':
sub_dir = os.path.join(base_dir, 'images')
os.makedirs(sub_dir, exist_ok=True)
filepath = os.path.join(sub_dir, filename)
elif resource_type == 'icon':
sub_dir = os.path.join(base_dir, 'icons')
os.makedirs(sub_dir, exist_ok=True)
filepath = os.path.join(sub_dir, filename)
elif resource_type == 'font':
sub_dir = os.path.join(base_dir, 'webfonts')
os.makedirs(sub_dir, exist_ok=True)
filepath = os.path.join(sub_dir, filename)
else:
filepath = os.path.join(base_dir, filename)
# 确保文件名唯一
filepath = self.ensure_unique_filename(filepath)
# 保存文件
with open(filepath, 'wb') as f:
f.write(response.content)
# 如果是CSS文件,处理其中的资源引用
if resource_type == 'css':
self.process_css_resources(filepath, absolute_url, base_dir)
return filepath
except Exception as e:
print(f"下载资源失败 {resource_url}: {e}")
return None
def process_css_resources(self, css_file_path, css_url, base_dir):
"""处理CSS文件中的资源引用"""
try:
with open(css_file_path, 'r', encoding='utf-8') as f:
css_content = f.read()
# 使用正则表达式查找CSS中的URL引用
url_pattern = r'url\s*\(\s*[\'"]?(.*?)[\'"]?\s*\)'
matches = re.findall(url_pattern, css_content, re.IGNORECASE)
# 处理每个找到的URL
for match in matches:
# 跳过data URL和内联资源
if match.startswith('data:'):
continue
# 处理CSS中的URL
resource_url = match
absolute_resource_url = self.get_absolute_url(resource_url, css_url)
# 确定资源类型
resource_type = self.determine_resource_type(resource_url)
# 下载资源
local_path = self.download_resource(resource_url, css_url, base_dir, resource_type)
# 记录URL映射
if local_path:
# 计算相对于CSS文件的路径
if resource_type == 'font':
# 字体文件使用相对路径 ../webfonts/
relative_path = f"../webfonts/{os.path.basename(local_path)}"
elif resource_type == 'image':
# 图片文件使用相对路径 ../images/
relative_path = f"../images/{os.path.basename(local_path)}"
else:
# 其他资源使用相对于CSS文件的路径
relative_path = os.path.relpath(local_path, os.path.dirname(css_file_path))
self.url_map[absolute_resource_url] = relative_path.replace('\\', '/')
# 替换CSS中的URL
# Precompute the forward-slash form: a backslash inside an f-string
# expression is a SyntaxError before Python 3.12
local_ref = relative_path.replace('\\', '/')
css_content = css_content.replace(
f'url({match})',
f'url({local_ref})'
)
# 添加到资源列表
self.css_resource_count += 1
self.resources.append({
'type': f'css-{resource_type}',
'url': resource_url,
'local_path': local_path,
'status': '成功'
})
# 更新UI
self.root.after(0, self.update_treeview, len(self.resources) - 1)
self.root.after(0, self.update_stats)
# 保存修改后的CSS文件
with open(css_file_path, 'w', encoding='utf-8') as f:
f.write(css_content)
except Exception as e:
print(f"处理CSS资源失败 {css_file_path}: {e}")
def determine_resource_type(self, url):
"""根据URL确定资源类型"""
url_lower = url.lower()
# 字体文件
if any(ext in url_lower for ext in ['.woff', '.woff2', '.ttf', '.eot', '.otf']):
return 'font'
# 图片文件
elif any(ext in url_lower for ext in ['.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp']):
return 'image'
# 图标文件
elif '.ico' in url_lower:
return 'icon'
# CSS文件
elif '.css' in url_lower:
return 'css'
# 默认为图片
else:
return 'image'
def get_absolute_url(self, url, base_url):
"""将相对URL转换为绝对URL"""
if url.startswith('//'):
return 'https:' + url
elif url.startswith(('http://', 'https://')):
return url
else:
return urljoin(base_url, url)
def ensure_unique_filename(self, filepath):
"""确保文件名唯一,避免覆盖"""
if not os.path.exists(filepath):
return filepath
directory = os.path.dirname(filepath)
filename = os.path.basename(filepath)
name, ext = os.path.splitext(filename)
counter = 1
while True:
new_filename = f"{name}_{counter}{ext}"
new_filepath = os.path.join(directory, new_filename)
if not os.path.exists(new_filepath):
return new_filepath
counter += 1
def fix_html_resource_paths(self, html_file_path, base_url):
"""修正HTML文件中的资源路径"""
try:
with open(html_file_path, 'r', encoding='utf-8') as f:
html_content = f.read()
soup = BeautifulSoup(html_content, 'html.parser')
# 修正CSS链接
for link in soup.find_all('link', rel='stylesheet'):
href = link.get('href')
if href:
absolute_url = self.get_absolute_url(href, base_url)
if absolute_url in self.url_map:
link['href'] = self.url_map[absolute_url]
# 修正JavaScript链接
for script in soup.find_all('script', src=True):
src = script.get('src')
if src:
absolute_url = self.get_absolute_url(src, base_url)
if absolute_url in self.url_map:
script['src'] = self.url_map[absolute_url]
# 修正图片链接
for img in soup.find_all('img', src=True):
src = img.get('src')
if src:
absolute_url = self.get_absolute_url(src, base_url)
if absolute_url in self.url_map:
img['src'] = self.url_map[absolute_url]
# 修正图标链接
for link in soup.find_all('link', rel=lambda x: x and ('icon' in x.lower() or 'shortcut' in x.lower())):
href = link.get('href')
if href:
absolute_url = self.get_absolute_url(href, base_url)
if absolute_url in self.url_map:
link['href'] = self.url_map[absolute_url]
# 保存修正后的HTML
with open(html_file_path, 'w', encoding='utf-8') as f:
f.write(str(soup))
print("HTML资源路径修正完成")
except Exception as e:
print(f"修正HTML资源路径失败: {e}")
def update_treeview(self, index):
"""更新Treeview显示"""
if index < len(self.resources):
resource = self.resources[index]
self.tree.insert('', 'end', values=(
index + 1,
resource['type'],
resource['url'][:100] + '...' if len(resource['url']) > 100 else resource['url'],
resource['local_path'] or '下载失败',
resource['status']
))
def clear_treeview(self):
"""清空Treeview"""
for item in self.tree.get_children():
self.tree.delete(item)
def update_status(self, message):
"""更新状态栏"""
self.root.after(0, lambda: self.status_var.set(message))
def update_stats(self):
"""更新统计信息"""
total = len(self.resources)
success = len([r for r in self.resources if r['status'] == '成功'])
failed = total - success
self.stats_var.set(
f"总计: {total} 个资源 (成功: {success}, 失败: {failed}) | CSS资源: {self.css_resource_count}")
def crawling_finished(self):
"""爬取完成后的清理工作"""
self.crawl_button.config(state="normal")
self.progress.stop()
def open_download_folder(self):
"""打开下载目录"""
if self.download_folder and os.path.exists(self.download_folder):
webbrowser.open(self.download_folder)
else:
messagebox.showwarning("警告", "目录不存在或未选择目录")
def view_html(self):
"""查看HTML文件"""
html_files = []
for resource in self.resources:
if resource['type'] == 'html' and resource['status'] == '成功':
html_files.append(resource['local_path'])
if html_files:
webbrowser.open(html_files[0])
else:
messagebox.showwarning("警告", "未找到HTML文件")
def clear_list(self):
"""清空资源列表"""
self.resources = []
self.url_map = {}
self.downloaded_urls = set()
self.css_resource_count = 0
self.clear_treeview()
self.update_stats()
self.status_var.set("已清空列表")
self.view_html_button.config(state="disabled")
def main():
root = tk.Tk()
app = WebPageCrawler(root)
root.mainloop()
if __name__ == "__main__":
main()
[Screenshot: the crawler UI after a completed run]
[Screenshot: the directory structure of the page saved locally]
[Screenshot: the original page as rendered online]
[Screenshot: the saved page as rendered locally]
Summary and Future Directions
Through its modular design, WebPageCrawler achieves complete crawling and localization of a page's resources; its core value lies in solving the two pain points of resource dependencies and path rewriting. The code makes consistent use of object-oriented design, encapsulating the UI, network requests, and resource handling in separate methods, which keeps it easy to maintain and extend.
Directions worth improving in the future:
- Add resource filtering so users can choose to download only certain resource types
- Support incremental updates, re-downloading only resources whose file hashes have changed (see the sketch after this list)
- Add proxy pool support to get around the anti-crawling limits of some sites
- Improve the experience for large files, for example with resumable downloads
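A rough sketch of the hash-comparison idea behind incremental updates, using SHA-256 over file content; this is not part of the current tool, and the function names are illustrative:
import hashlib

def file_sha256(path):
    """Compute the SHA-256 digest of a file already on disk."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()

def needs_redownload(local_path, new_content: bytes) -> bool:
    """Return True if the freshly fetched content differs from the local copy."""
    try:
        return file_sha256(local_path) != hashlib.sha256(new_content).hexdigest()
    except FileNotFoundError:
        return True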
The tool demonstrates Python's strength in combining web crawling with GUI applications: with sensible use of libraries such as requests and BeautifulSoup, a fully featured utility can be built quickly. For scenarios that require saving pages offline or analyzing a page's resources, a tool like this can noticeably improve productivity.