"""
https://www.biquge345.com小说下载器 - 异步并发小说爬虫与EPUB生成工具
pip install aiohttp beautifulsoup4 tqdm ebooklib -i https://pypi.tuna.tsinghua.edu.cn/simple/
功能:
1. 从笔趣阁网站异步下载小说章节内容
2. 自动解析小说信息(书名、作者、简介、封面)
3. 重新编号章节(统一为第1章、第2章...)
4. 支持自定义并发数和重试机制
5. 自动生成EPUB格式电子书
6. 生成下载日志和统计信息
使用方法:
1. 运行脚本:python biquge345_crawler.py
2. 输入小说目录页面的URL
3. 设置并发数(推荐100,可根据网络情况调整)
4. 设置重试次数(默认3次)
5. 程序会自动下载并生成EPUB文件
输出文件:
- saved_pages/:临时保存的章节HTML文件
- epub_books/:生成的EPUB电子书
- download_logs/:下载日志和统计信息
注意:请遵守网站robots.txt,合理控制请求频率,避免对目标网站造成压力。
"""
import aiohttp
import asyncio
import os
import re
import random
import time
import shutil
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm
from typing import List, Dict
from ebooklib import epub
import datetime
class NovelDownloader:
def __init__(self, config=None):
# 默认配置
self.config = {
'concurrency': 8,
'min_interval': 200,
'max_interval': 400,
'max_retries': 3,
'retry_min_interval': 2000,
'retry_max_interval': 4000
}
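        # All *_interval values are in milliseconds; random_delay() converts them to seconds for asyncio.sleep().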
# 更新用户配置
if config:
self.config.update(config)
self.base_url = "https://www.biquge345.com"
self.book_url = ""
self.save_dir = "saved_pages"
self.epub_dir = "epub_books"
self.log_dir = "download_logs"
self.semaphore = asyncio.Semaphore(self.config['concurrency'])
self.headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "zh-CN,zh;q=0.9",
"priority": "u=0, i",
"sec-ch-ua": "\"Google Chrome\";v=\"141\", \"Not?A_Brand\";v=\"8\", \"Chromium\";v=\"141\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36"
}
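        # Cookies copied from one browser session; they expire over time and may need to be refreshed.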
self.cookies = {
"PHPSESSID": "2cojheldp34dkpqq5k954fib25",
"_ga": "GA1.1.1857964030.1762124026",
"__gads": "ID=630f667a0be008b7:T=1762124412:RT=1762124412:S=ALNI_Maqx_HllndQs7fHW1H1pX_DjQeT2Q",
"__gpi": "UID=000011ace8aaf1d2:T=1762124412:RT=1762124412:S=ALNI_Maz4nCTf0jFGvRpctpnE96rlu023A",
"__eoi": "ID=2913f0c44ad20310:T=1762124412:RT=1762124412:S=AA-AfjbVVCoKHD27Yo7cQCc95tTW",
"_ga_G02LF2E3VC": "GS2.1.s1762124026$o1$g1$t1762125126$j56$l0$h0"
}
# 创建目录
for directory in [self.save_dir, self.epub_dir, self.log_dir]:
if not os.path.exists(directory):
os.makedirs(directory)
# 小说信息
self.novel_info = {
'title': '',
'author': '',
'description': '',
'cover_url': ''
}
async def random_delay(self, min_ms, max_ms):
"""随机延迟"""
delay = random.uniform(min_ms / 1000, max_ms / 1000)
await asyncio.sleep(delay)
async def fetch_html_with_retry(self, session, url, referer=None):
"""带重试机制的异步获取HTML内容"""
headers = self.headers.copy()
if referer:
headers["referer"] = referer
else:
headers["referer"] = self.book_url
for attempt in range(self.config['max_retries'] + 1):
async with self.semaphore:
try:
# 请求前延迟
await self.random_delay(self.config['min_interval'], self.config['max_interval'])
async with session.get(url, headers=headers, cookies=self.cookies) as response:
if response.status == 200:
return await response.text()
else:
if attempt < self.config['max_retries']:
retry_delay = random.uniform(
self.config['retry_min_interval'] / 1000,
self.config['retry_max_interval'] / 1000
)
await asyncio.sleep(retry_delay)
continue
else:
print(f"请求失败: {url}, 状态码: {response.status}")
return None
except Exception as e:
if attempt < self.config['max_retries']:
retry_delay = random.uniform(
self.config['retry_min_interval'] / 1000,
self.config['retry_max_interval'] / 1000
)
await asyncio.sleep(retry_delay)
continue
else:
print(f"请求错误: {url}, 错误: {e}")
return None
return None
def parse_novel_info(self, html_content):
"""解析小说基本信息"""
soup = BeautifulSoup(html_content, 'html.parser')
# 提取书名
title_tag = soup.find('h1')
if title_tag:
self.novel_info['title'] = title_tag.get_text().strip()
# 提取作者 - 修正提取逻辑
# 方法1: 查找包含"作者:"文本的span标签
author_spans = soup.find_all('span', class_='x1')
for span in author_spans:
if '作者:' in span.get_text():
author_link = span.find('a')
if author_link:
self.novel_info['author'] = author_link.get_text().strip()
else:
# 如果没有a标签,直接从span文本中提取
author_text = span.get_text().strip()
self.novel_info['author'] = author_text.replace('作者:', '').strip()
break
# 方法2: 如果方法1没找到,尝试其他选择器
if not self.novel_info['author']:
author_div = soup.find('div', class_='xinxi')
if author_div:
author_text = author_div.get_text()
author_match = re.search(r'作者[::]?\s*([^\s]+)', author_text)
if author_match:
self.novel_info['author'] = author_match.group(1).strip()
# 方法3: 从meta标签中提取
if not self.novel_info['author']:
author_meta = soup.find('meta', {'name': 'author'})
if author_meta and author_meta.get('content'):
self.novel_info['author'] = author_meta['content'].strip()
# 提取简介
desc_tag = soup.find('div', class_='x3')
if desc_tag:
self.novel_info['description'] = desc_tag.get_text().strip()
# 提取封面
cover_tag = soup.find('img', class_='zhutu')
if cover_tag and cover_tag.get('src'):
self.novel_info['cover_url'] = urljoin(self.base_url, cover_tag['src'])
print(f"书名: {self.novel_info['title']}")
print(f"作者: {self.novel_info['author']}")
if self.novel_info['description']:
print(f"简介: {self.novel_info['description'][:100]}...")
def parse_chapter_links(self, html_content):
"""解析章节链接并重新编号"""
soup = BeautifulSoup(html_content, 'html.parser')
chapter_links = []
# 从章节列表区域提取链接
chapter_list = soup.find('ul', class_='info')
if chapter_list:
links = chapter_list.find_all('a', href=True)
chapter_count = 1 # 从第1章开始计数
for link in links:
href = link['href']
original_title = link.get('title', link.get_text().strip())
if href.startswith('/chapter/'):
full_url = urljoin(self.base_url, href)
# 重新编号章节,忽略原始章节号
new_title = f"第{chapter_count}章 {self.extract_chapter_content(original_title)}"
chapter_links.append({
'url': full_url,
'original_title': original_title,
'new_title': new_title,
'chapter_number': chapter_count,
'filename': f"第{chapter_count:03d}章_{self.sanitize_filename(self.extract_chapter_content(original_title))}.html"
})
chapter_count += 1
return chapter_links
def extract_chapter_content(self, title):
"""提取章节标题内容,移除卷和章节号信息"""
# 移除"第X章"、"第X卷"等前缀
patterns = [
r'^第[零一二三四五六七八九十百千\d]+章\s*',
r'^第[零一二三四五六七八九十百千\d]+卷\s*',
r'^[上下]?卷?\s*',
            r'^第\d+章\s*'
]
cleaned_title = title
for pattern in patterns:
cleaned_title = re.sub(pattern, '', cleaned_title)
# 如果清理后为空,返回原始标题
return cleaned_title.strip() if cleaned_title.strip() else title
def sanitize_filename(self, filename):
"""清理文件名,移除非法字符"""
return re.sub(r'[<>:"/\\|?*]', '_', filename)
def extract_chapter_content_from_html(self, html_content):
"""从章节HTML中提取正文内容"""
soup = BeautifulSoup(html_content, 'html.parser')
# 查找正文内容区域
content_div = soup.find('div', id='txt')
if not content_div:
content_div = soup.find('div', class_='txt')
if content_div:
# 移除广告和无关元素
for element in content_div.find_all(['script', 'style', 'div', 'a']):
element.decompose()
# 获取纯文本内容
content = content_div.get_text(separator='\n')
# 清理内容
content = re.sub(r'一秒记住.*?无弹窗!', '', content)
content = re.sub(r'\n\s*\n', '\n\n', content) # 合并多个空行
content = content.strip()
return content
return ""
async def download_chapter(self, session, chapter_info, pbar):
"""下载单个章节"""
url = chapter_info['url']
filename = chapter_info['filename']
original_title = chapter_info['original_title']
new_title = chapter_info['new_title']
html_content = await self.fetch_html_with_retry(session, url)
if html_content:
# 提取正文内容
chapter_content = self.extract_chapter_content_from_html(html_content)
# 保存原始HTML
file_path = os.path.join(self.save_dir, filename)
with open(file_path, 'w', encoding='utf-8') as f:
f.write(html_content)
# 保存提取的内容
content_file_path = os.path.join(self.save_dir, f"content_{filename}")
with open(content_file_path, 'w', encoding='utf-8') as f:
f.write(chapter_content)
pbar.set_description(f"✓ {new_title[:20]:<20}")
pbar.update(1)
return {
'success': True,
'chapter': new_title,
'content': chapter_content,
'original_title': original_title
}
else:
pbar.set_description(f"✗ {new_title[:20]:<20}")
pbar.update(1)
return {
'success': False,
'chapter': new_title,
'url': url,
'content': ''
}
    def create_epub(self, chapter_results):
"""创建EPUB电子书"""
if not self.novel_info['title']:
print("无法创建EPUB:缺少书名信息")
return
# 创建EPUB书籍
book = epub.EpubBook()
# 设置元数据
book.set_identifier(f"novel_{int(time.time())}")
book.set_title(self.novel_info['title'])
book.set_language('zh')
book.add_author(self.novel_info['author'])
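        # The parsed cover_url is currently not embedded in the EPUB. A minimal sketch, assuming the cover
        # image bytes were fetched beforehand into a (hypothetical) `cover_bytes` variable:
        #
        #   if self.novel_info['cover_url'] and cover_bytes:
        #       book.set_cover("cover.jpg", cover_bytes)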
# 添加简介
intro_chapter = epub.EpubHtml(title='简介', file_name='intro.xhtml', lang='zh')
intro_content = f"""
<html>
<head>
<title>简介</title>
<style>
body {{ font-family: Arial, sans-serif; line-height: 1.6; margin: 20px; }}
h1 {{ color: #333; border-bottom: 2px solid #333; }}
.description {{ margin: 20px 0; }}
</style>
</head>
<body>
<h1>{self.novel_info['title']}</h1>
<h2>作者:{self.novel_info['author']}</h2>
<div class="description">
{self.novel_info['description'].replace(chr(10), '<br/>')}
</div>
</body>
</html>
"""
intro_chapter.content = intro_content
book.add_item(intro_chapter)
# 添加目录章节
toc_chapter = epub.EpubHtml(title='小说目录', file_name='toc.xhtml', lang='zh')
toc_content = """
<html>
<head>
<title>小说目录</title>
<style>
body {{ font-family: Arial, sans-serif; line-height: 1.6; margin: 20px; }}
h1 {{ color: #333; border-bottom: 2px solid #333; }}
ul {{ list-style-type: none; padding: 0; }}
li {{ margin: 5px 0; }}
a {{ text-decoration: none; color: #0066cc; }}
a:hover {{ text-decoration: underline; }}
</style>
</head>
<body>
<h1>小说目录</h1>
<ul>
"""
# 添加章节
chapters = []
nav_points = []
# 添加简介到目录
toc_content += '<li><a href="intro.xhtml">1. 简介</a></li>'
toc_content += '<li><a href="toc.xhtml">2. 小说目录</a></li>'
chapter_index = 3
for result in chapter_results:
if result['success'] and result['content']:
chapter_file_name = f'chapter_{result["chapter"].replace(" ", "_")}.xhtml'
chapter = epub.EpubHtml(
title=result['chapter'],
file_name=chapter_file_name,
lang='zh'
)
chapter_content = f"""
<html>
<head>
<title>{result['chapter']}</title>
<style>
body {{ font-family: Arial, sans-serif; line-height: 1.8; margin: 20px; }}
h1 {{ color: #333; text-align: center; border-bottom: 1px solid #ccc; }}
.content {{ text-indent: 2em; margin: 10px 0; }}
</style>
</head>
<body>
<h1>{result['chapter']}</h1>
<div class="content">
{result['content'].replace(chr(10), '<br/>')}
</div>
</body>
</html>
"""
chapter.content = chapter_content
book.add_item(chapter)
chapters.append(chapter)
# 添加到目录
toc_content += f'<li><a href="{chapter_file_name}">{chapter_index}. {result["chapter"]}</a></li>'
chapter_index += 1
toc_content += """
</ul>
</body>
</html>
"""
toc_chapter.content = toc_content
book.add_item(toc_chapter)
# 设置书籍结构
book.toc = [
epub.Link('intro.xhtml', '简介', 'intro'),
epub.Link('toc.xhtml', '小说目录', 'toc')
]
# 添加章节到目录
for chapter in chapters:
book.toc.append(epub.Link(chapter.file_name, chapter.title, chapter.file_name))
# 添加导航文件
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())
# 定义书籍脊柱
book.spine = ['nav', intro_chapter, toc_chapter] + chapters
# 保存EPUB文件
epub_filename = f"{self.novel_info['title']}({self.novel_info['author']}).epub"
epub_filename = self.sanitize_filename(epub_filename)
epub_path = os.path.join(self.epub_dir, epub_filename)
epub.write_epub(epub_path, book, {})
print(f"✓ EPUB电子书已创建: {epub_path}")
return epub_path
def write_download_log(self, chapter_results, epub_path):
"""写入下载日志"""
log_filename = f"{self.novel_info['title']}_下载日志.txt"
log_filename = self.sanitize_filename(log_filename)
log_path = os.path.join(self.log_dir, log_filename)
with open(log_path, 'w', encoding='utf-8') as f:
f.write(f"小说下载日志\n")
f.write("=" * 50 + "\n")
f.write(f"书名: {self.novel_info['title']}\n")
f.write(f"作者: {self.novel_info['author']}\n")
f.write(f"下载时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"EPUB文件: {epub_path}\n")
f.write("=" * 50 + "\n\n")
# 成功章节
success_chapters = [r for r in chapter_results if r['success']]
f.write(f"成功下载章节: {len(success_chapters)} 章\n")
for i, result in enumerate(success_chapters, 1):
f.write(f"{i}. {result['chapter']} (原: {result['original_title']})\n")
# 失败章节
failed_chapters = [r for r in chapter_results if not r['success']]
if failed_chapters:
f.write(f"\n失败章节: {len(failed_chapters)} 章\n")
for i, result in enumerate(failed_chapters, 1):
f.write(f"{i}. {result['chapter']} - URL: {result['url']}\n")
f.write(f"\n配置信息:\n")
f.write(f"并发数: {self.config['concurrency']}\n")
f.write(f"最大重试次数: {self.config['max_retries']}\n")
f.write(f"请求间隔: {self.config['min_interval']}-{self.config['max_interval']}ms\n")
print(f"✓ 下载日志已保存: {log_path}")
return log_path
async def download_all_chapters(self):
"""下载所有章节并创建EPUB"""
print("开始下载小说章节...")
print(f"并发数: {self.config['concurrency']}")
print(f"重试次数: {self.config['max_retries']}")
print(f"请求间隔: {self.config['min_interval']}-{self.config['max_interval']}ms")
print(f"重试间隔: {self.config['retry_min_interval']}-{self.config['retry_max_interval']}ms")
print("-" * 50)
# 创建aiohttp会话
timeout = aiohttp.ClientTimeout(total=60)
connector = aiohttp.TCPConnector(limit=self.config['concurrency'])
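        # The connector's connection limit mirrors the semaphore so both caps stay consistent.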
async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
# 1. 首先获取目录页面
print("正在获取章节目录...")
book_html = await self.fetch_html_with_retry(session, self.book_url)
if not book_html:
print("获取目录页面失败!")
return
# 解析小说信息
self.parse_novel_info(book_html)
# 保存目录页面
book_file_path = os.path.join(self.save_dir, "book_page.html")
with open(book_file_path, 'w', encoding='utf-8') as f:
f.write(book_html)
print("✓ 目录页面已保存")
# 2. 解析章节链接并重新编号
chapter_links = self.parse_chapter_links(book_html)
if not chapter_links:
print("未找到章节链接!")
return
print(f"找到 {len(chapter_links)} 个章节,已重新编号")
# 显示前几个章节的重新编号情况
print("\n前5个章节的重新编号情况:")
for i, chapter in enumerate(chapter_links[:5]):
print(f" {chapter['new_title']} (原: {chapter['original_title']})")
if len(chapter_links) > 5:
print(" ...")
print("\n开始下载章节...")
# 3. 异步下载所有章节(带进度条)
chapter_results = []
with tqdm(total=len(chapter_links), desc="下载进度", unit="章") as pbar:
tasks = []
for chapter_info in chapter_links:
task = self.download_chapter(session, chapter_info, pbar)
tasks.append(task)
results = await asyncio.gather(*tasks)
chapter_results = results
# 统计结果
success_count = sum(1 for r in chapter_results if r['success'])
failed_count = len(chapter_results) - success_count
print(f"\n下载完成!")
print(f"成功: {success_count} 个章节")
print(f"失败: {failed_count} 个章节")
# 显示失败章节
if failed_count > 0:
print("\n失败的章节:")
for result in chapter_results:
if not result['success']:
print(f" {result['chapter']}")
# 4. 创建EPUB电子书
if success_count > 0:
print("\n正在创建EPUB电子书...")
                epub_path = self.create_epub(chapter_results)
# 5. 写入下载日志
log_path = self.write_download_log(chapter_results, epub_path)
# 6. 如果没有失败章节,删除下载的网页文件
if failed_count == 0:
print("\n所有章节下载成功,清理临时文件...")
if os.path.exists(self.save_dir):
shutil.rmtree(self.save_dir)
print("✓ 临时文件已清理")
else:
print(f"\n有 {failed_count} 个章节下载失败,保留临时文件以供重新下载")
print(f"\n最终结果:")
print(f"EPUB电子书: {epub_path}")
print(f"下载日志: {log_path}")
else:
print("没有成功下载任何章节,无法创建EPUB电子书")
async def main():
# 获取小说目录地址
while True:
book_url = input("请输入小说目录地址: ").strip()
if book_url:
if book_url.startswith('http'):
break
else:
print("请输入有效的URL地址!")
else:
print("URL不能为空!")
# 获取用户输入的配置
config = {}
# 并发数
while True:
try:
            concurrency_input = input("请输入并发数 (1-100,默认8): ").strip()
if not concurrency_input:
config['concurrency'] = 8
break
concurrency = int(concurrency_input)
if 1 <= concurrency <= 100:
config['concurrency'] = concurrency
break
else:
print("并发数必须在1-100之间!")
except ValueError:
print("请输入有效的数字!")
# 重试次数
while True:
try:
retries_input = input("请输入最大重试次数 (默认3): ").strip()
if not retries_input:
config['max_retries'] = 3
break
retries = int(retries_input)
if retries >= 0:
config['max_retries'] = retries
break
else:
print("重试次数必须大于等于0!")
except ValueError:
print("请输入有效的数字!")
# 使用默认的时间间隔配置
config.update({
'min_interval': 200,
'max_interval': 400,
'retry_min_interval': 2000,
'retry_max_interval': 4000
})
print(f"\n配置信息:")
print(f"小说地址: {book_url}")
print(f"并发数: {config['concurrency']}")
print(f"最大重试次数: {config['max_retries']}")
print(f"请求间隔: {config['min_interval']}-{config['max_interval']}ms")
print(f"重试间隔: {config['retry_min_interval']}-{config['retry_max_interval']}ms")
print("-" * 50)
# 创建下载器并开始下载
downloader = NovelDownloader(config=config)
downloader.book_url = book_url
await downloader.download_all_chapters()
if __name__ == "__main__":
# 在Windows上设置事件循环策略
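    # Note: on Python 3.8+ the proactor loop is already the Windows default; if aiohttp reports
    # "Event loop is closed" on exit, switching to asyncio.WindowsSelectorEventLoopPolicy() is a common workaround.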
if os.name == 'nt':
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
asyncio.run(main())
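
# A minimal non-interactive usage sketch (hard-coding what main() normally collects via input());
# the URL below is a placeholder, not a real table-of-contents page:
#
#   async def run_once():
#       downloader = NovelDownloader(config={'concurrency': 4, 'max_retries': 3})
#       downloader.book_url = "https://www.biquge345.com/book/xxxx/"  # hypothetical TOC URL
#       await downloader.download_all_chapters()
#
#   asyncio.run(run_once())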