import requests
from bs4 import BeautifulSoup
import re
import time
import os
import random
from fake_useragent import UserAgent
from urllib.parse import urlparse
import logging
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler("novel_downloader.log"),
logging.StreamHandler()
]
)
# Create the user-agent generator
ua = UserAgent()
# Navigation elements to filter out (Biquge-specific entries included)
NAV_ELEMENTS = [
"首页", "关灯", "字体:", "大", "中", "小",
"上一页", "返回目录", "下一页", "加入书签", "推荐票",
"返回书页", "目录", "设置", "书架", "加入书架",
"上一章", "下一章", "书签", "投票", "举报",
"本章未完,请点击下一页继续阅读", "本章结束,请点击下一章继续阅读",
"返回", "电脑版", "手机版", "APP版", "客户端", "介绍", "足迹", "超大", "进书架", "本章未完 点击下一页继续阅读",
"顶部", "底部", "页脚", "页眉", "章节列表", "加书签", "我的书架", "阅读历史", "本章已完",
"请收藏本站", "请记住本书首发域名", "天才一秒记住本站地址", "顶点小说", "笔趣阁", "更新最快",
"推荐阅读", "上一节", "下一节", "目录页", "返回顶部", "加入书签", "投票推荐", "章节报错","热门推荐", "新书推荐", "最新网址", "蚂蚁文学", "本章结束"
"加入书签,方便阅读", # 笔趣阁专用
"本章未完,点击下一页继续阅读", # 笔趣阁专用
"热门推荐", "新书推荐", "推荐阅读", "完本小说", "排行榜单", "小说推荐",
"猜你喜欢", "编辑推荐", "小说排行", "精品推荐", "最新小说"
]
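# Each entry above is removed by substring match (re.compile(re.escape(text))) against the
# content container's text nodes, so a short entry such as "目录" also matches longer
# navigation strings that contain it.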
# Tags whose line breaks should be preserved
PRESERVE_LINEBREAK_TAGS = ["br", "p", "div", "pre", "blockquote"]
# Common content-container selectors (Biquge-specific selectors included)
CONTENT_SELECTORS = [
    ('div', 'Readarea ReadAjax_content'),  # Biquge-specific
    ('div', 'content'),  # generic content area
    ('div', 'novel-content'),  # novel-specific
    ('div', 'size16 color5 pt-read-text'),
    ('div', 'pt-read-text'),
    ('div', 'chapter-content'),
    ('div', 'novelcontent'),
    ('article', None),
    ('div', 'chapter_body'),
    ('div', 'read-content'),
    ('div', 'txt_cont'),
    ('div', 'content-body'),
    ('div', 'read-box'),
    ('div', 'chapter-content-inner'),
    ('div', 'chapter-text'),
    ('div', 'content-main'),
    ('div', 'chapter-container'),
    ('div', 'chapter'),
    ('div', 'main-content'),
    ('div', 'entry-content'),
    ('div', 'article-content'),
]
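# The selectors above are tried in order and the first match wins, so site-specific
# containers (e.g. Biquge's "Readarea ReadAjax_content") must stay ahead of the generic ones.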
# Next-page link selectors
NEXT_PAGE_SELECTORS = [
    # Check the "bottem" navigation areas first (class names as they appear on the target sites)
    ('div', 'bottem1', 'a', re.compile(r'下一[页章]|继续阅读')),
    ('div', 'bottem2', 'a', re.compile(r'下一[页章]|继续阅读')),
    # Biquge-specific selector
    ('a', {'id': 'pt_next'}),  # prefer the link with id="pt_next"
    # Generic selectors
('a', re.compile(r'下一页|下一章|下一节|继续阅读'), None),
('button', re.compile(r'下一页|下一章|下一节|继续阅读'), 'parent_a'),
('a', None, re.compile(r'next-page|next|next-chapter')),
('a', None, 'next'),
('a', None, 'bt_next'),
('a', None, 'btn-next'),
('a', None, 'nextChapter'),
('a', None, 'next_chapter'),
('a', None, 'chapter-next'),
('a', None, 'btn-next-page'),
('a', {'id': 'nextChapterBtn'}),
('a', {'id': 'next_chapter'}),
('a', {'id': 'next_page'}),
('a', {'id': 'btn_next'}),
('a', {'id': 'nextChapter'}),
('a', {'class': 'btn-info'}),
('a', {'class': 'next-page'}),
('a', {'class': 'next-chapter'}),
('a', {'class': 'next-button'}),
]
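# Tuple shapes above, as interpreted in extract_content_and_next_link():
#   4 elements: (parent_tag, parent_class, child_tag, link_text_pattern) - search for the link inside a container
#   3 elements: (tag, text_pattern_or_attrs_or_None, class_name_or_None) - mixed text/attribute/class matching
#   2 elements: (tag, attrs_dict | class_name | text_pattern) - simple matching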
def normalize_url(url, base_url):
    """Normalize a URL, resolving relative and protocol-relative paths against base_url."""
    parsed_base = urlparse(base_url)
    parsed_url = urlparse(url)
    # Absolute URLs are returned unchanged
    if parsed_url.scheme and parsed_url.netloc:
        return url
    # Protocol-relative URLs: //host/path
    if url.startswith('//'):
        return f"{parsed_base.scheme}:{url}"
    # Root-relative paths starting with /
    if url.startswith('/'):
        return f"{parsed_base.scheme}://{parsed_base.netloc}{url}"
    # Document-relative paths: resolve against the directory of the base URL
    base_path = parsed_base.path.rsplit('/', 1)[0] if '.' in parsed_base.path else parsed_base.path
    return f"{parsed_base.scheme}://{parsed_base.netloc}{base_path}/{url}"
def extract_content_and_next_link(html, base_url):
    """Extract the chapter text from the HTML and locate the next-page link, with improved line-break handling."""
    soup = BeautifulSoup(html, 'html.parser')
    content_div = None
    # Try the known content-container selectors in order
    for selector in CONTENT_SELECTORS:
        tag, class_name = selector
        if class_name:
            content_div = soup.find(tag, class_=class_name)
        else:
            content_div = soup.find(tag)
        if content_div:
            logging.info(f"使用选择器: {selector} 找到内容容器")
            break
    # Fallback: use <body> as the content container
    if not content_div:
        logging.warning("使用body作为内容容器")
        content_div = soup.find('body')
    if not content_div:
        logging.error("无法找到内容容器")
        return "", None
    # Remove recommendation/ad sections (must run after content_div has been located)
    for ad_section in content_div.find_all(class_=re.compile(r"recommend|hot|rank|footer_link|banner")):
        ad_section.decompose()
    # Remove ad blocks with specific IDs
    for ad_id in ["hm_t_125039", "banner", "footer_link"]:
        ad_block = content_div.find(id=ad_id)
        if ad_block:
            ad_block.decompose()
    # 1. Remove navigation elements
    for nav_text in NAV_ELEMENTS:
        for element in content_div.find_all(string=re.compile(re.escape(nav_text))):
            element.extract()
    for ad_class in ["lm", "footer_link", "footer_cont", "reader_mark"]:
        for element in content_div.find_all(class_=ad_class):
            element.decompose()
    # 2. Remove script/style tags
for script in content_div.find_all(['script', 'style']):
script.extract()
    # 3. Normalize non-breaking spaces
for nbsp in content_div.find_all(string=re.compile(r'\u00a0')):
nbsp.replace_with(nbsp.replace('\u00a0', ' '))
    # 4. Preserve line breaks for block-level tags
for tag in PRESERVE_LINEBREAK_TAGS:
for element in content_div.find_all(tag):
element.append('\n')
    # 5. Extract the raw text
raw_text = content_div.get_text(separator='\n', strip=False)
    # 6. Text cleanup
    cleaned_text = re.sub(r'\n{3,}', '\n\n', raw_text)  # collapse runs of blank lines
    cleaned_text = re.sub(r'(\S)\n(\S)', r'\1 \2', cleaned_text)  # join mid-sentence line breaks
    cleaned_text = re.sub(r'\n\s+\n', '\n\n', cleaned_text)  # drop whitespace-only lines
    cleaned_text = re.sub(r'^\s+', '', cleaned_text, flags=re.MULTILINE)  # strip leading whitespace per line
    # Strip site promo text that survives element-level filtering
cleaned_text = re.sub(r'热门推荐.+?推荐阅读', '', cleaned_text, flags=re.DOTALL)
cleaned_text = re.sub(r'新书推荐.+?加入书签', '', cleaned_text, flags=re.DOTALL)
cleaned_text = re.sub(r'本章未完.+?继续阅读', '', cleaned_text)
    # 7. Prepend the chapter title
title_tag = soup.find('title')
chapter_title = title_tag.text.split('-')[0] if title_tag else "未知章节"
cleaned_text = f"## {chapter_title} ##\n\n" + cleaned_text.strip()
    # 8. Preserve verse/poem formatting
    cleaned_text = re.sub(r'(\n {4,}.+?\n)\n+', r'\1\n', cleaned_text)
    cleaned_text = re.sub(r'\n{2,}', '\n\n', cleaned_text)  # collapse multiple blank lines
    # Locate the next-page link
next_link = None
    # 1. Look for a next-page URL declared in a JavaScript variable
script_tags = soup.find_all('script')
for script in script_tags:
if script.string:
            # Match the common variable names used for the next-page URL
match = re.search(
r'var\s+(?:nexturl|nextUrl|next_page_url|next_page|nextChapterUrl)\s*=\s*["\'](.*?)["\'];',
script.string
)
if match:
next_link = match.group(1)
if '<' in next_link:
link_soup = BeautifulSoup(next_link, 'html.parser')
a_tag = link_soup.find('a')
if a_tag and 'href' in a_tag.attrs:
next_link = a_tag['href']
next_link = normalize_url(next_link, base_url)
logging.info(f"通过JavaScript变量找到下一页: {next_link}")
return cleaned_text + "\n\n", next_link
    # 2. Selector-based matching
for selector in NEXT_PAGE_SELECTORS:
try:
            # 4-element selectors: (parent_tag, parent_class, child_tag, text_pattern)
if len(selector) == 4:
parent_tag, parent_class, child_tag, text_pattern = selector
parent_element = soup.find(parent_tag, class_=parent_class)
if parent_element:
element = parent_element.find(child_tag, string=text_pattern)
if element and 'href' in element.attrs:
next_link = element['href']
next_link = normalize_url(next_link, base_url)
logging.info(f"通过选择器 {selector} 找到下一页: {next_link}")
return cleaned_text + "\n\n", next_link
            # 3-element selectors
elif len(selector) == 3:
tag, *params = selector
element = None
if isinstance(params[0], dict):
                    # Attribute match
element = soup.find(tag, attrs=params[0])
elif hasattr(params[0], 'match') and isinstance(params[1], str):
                    # Text-and-class match
text_pattern, class_name = params
element = soup.find(tag, string=text_pattern, class_=class_name)
elif hasattr(params[0], 'match'):
                    # Text match
element = soup.find(tag, string=params[0])
                elif params[1] is not None:
                    # Class-name match (accepts a string or a compiled pattern)
                    element = soup.find(tag, class_=params[1])
if element and 'href' in element.attrs:
next_link = element['href']
next_link = normalize_url(next_link, base_url)
logging.info(f"通过选择器 {selector} 找到下一页: {next_link}")
return cleaned_text + "\n\n", next_link
            # 2-element selectors
elif len(selector) == 2:
tag, param = selector
element = None
if isinstance(param, dict):
element = soup.find(tag, attrs=param)
elif isinstance(param, str):
element = soup.find(tag, class_=param)
elif hasattr(param, 'match'):
element = soup.find(tag, string=param)
if element and 'href' in element.attrs:
next_link = element['href']
next_link = normalize_url(next_link, base_url)
logging.info(f"通过选择器 {selector} 找到下一页: {next_link}")
return cleaned_text + "\n\n", next_link
except Exception as e:
logging.warning(f"选择器 {selector} 匹配出错: {str(e)}")
continue
logging.warning("未找到下一页链接")
return cleaned_text + "\n\n", None
def get_random_headers():
"""生成随机的请求头"""
return {
'User-Agent': ua.random,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Cache-Control': 'max-age=0',
'DNT': '1',
'Referer': 'https://www.dbxsd.com/'
}
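# Note: the Referer above is only a default; download_novel() overwrites it with the
# target site's base URL before each request.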
def detect_encoding(response):
"""智能检测页面编码"""
if 'content-type' in response.headers:
content_type = response.headers['content-type'].lower()
if 'charset=' in content_type:
return content_type.split('charset=')[-1]
soup = BeautifulSoup(response.content, 'html.parser', from_encoding='iso-8859-1')
meta_tag = soup.find('meta', charset=True)
if meta_tag:
return meta_tag['charset']
meta_tag = soup.find('meta', {'http-equiv': re.compile(r'content-type', re.I)})
if meta_tag and 'content' in meta_tag.attrs:
content = meta_tag['content'].lower()
if 'charset=' in content:
return content.split('charset=')[-1]
return 'utf-8'
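# Detection order: charset from the Content-Type header, then <meta charset=...>,
# then <meta http-equiv="content-type" content="...charset=...">, falling back to UTF-8.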
def download_novel(start_url, output_file="novel.txt", max_retries=5):
"""下载整本小说"""
parsed_url = urlparse(start_url)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
session = requests.Session()
session.headers.update({
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'DNT': '1',
'Upgrade-Insecure-Requests': '1'
})
current_url = start_url
chapter_count = 0
retry_count = 0
total_bytes = 0
visited_urls = set()
output_dir = os.path.dirname(output_file)
if output_dir and not os.path.exists(output_dir):
os.makedirs(output_dir)
logging.info(f"开始下载小说,起始URL: {start_url}")
logging.info(f"基础URL: {base_url}")
with open(output_file, 'w', encoding='utf-8') as f:
while current_url and retry_count < max_retries:
if current_url in visited_urls:
logging.warning(f"检测到重复URL: {current_url},跳过")
break
visited_urls.add(current_url)
chapter_count += 1
try:
logging.info(f"正在下载第 {chapter_count} 章: {current_url}")
delay = random.uniform(1.5, 4.0)
time.sleep(delay)
headers = get_random_headers()
headers['Referer'] = base_url
response = session.get(current_url, headers=headers, timeout=15)
if response.status_code != 200:
logging.error(f"错误: 无法获取页面,状态码: {response.status_code}")
retry_count += 1
continue
detected_encoding = detect_encoding(response)
response.encoding = detected_encoding
logging.info(f"检测到编码: {detected_encoding}")
content, next_link = extract_content_and_next_link(response.text, current_url)
if content and len(content.strip()) > 20:
bytes_written = f.write(content)
total_bytes += bytes_written
f.flush()
logging.info(f"成功写入第 {chapter_count} 章内容 ({len(content)} 字符)")
retry_count = 0
else:
logging.warning(f"未提取到有效内容,可能页面结构变化")
debug_file = f"debug_ch{chapter_count}.html"
with open(debug_file, 'w', encoding='utf-8') as debug_f:
debug_f.write(response.text)
logging.info(f"已保存调试文件: {debug_file}")
retry_count += 1
continue
if next_link and next_link != current_url:
current_url = next_link
else:
current_url = None
logging.info("已到达最后一章")
except requests.exceptions.RequestException as e:
logging.error(f"网络请求出错: {str(e)}")
retry_count += 1
time.sleep(5)
except Exception as e:
logging.error(f"处理章节时出错: {str(e)}")
retry_count += 1
time.sleep(5)
logging.info(f"已保存到: {output_file}")
logging.info(f"总章节数: {chapter_count}")
logging.info(f"总字数: {total_bytes} 字节")
if chapter_count > 0:
logging.info(f"小说下载完成!")
else:
logging.error("下载失败,未获取到任何章节内容")
return chapter_count, total_bytes
def main():
"""主函数,处理用户输入"""
print("=" * 50)
print("笔趣阁小说下载器 - 自动翻页版")
print("=" * 50)
start_url = input("请输入小说起始URL: ").strip()
if not start_url:
print("错误: 起始URL不能为空!")
return
output_file = input("请输入输出文件名(默认: novel.txt): ").strip()
if not output_file:
output_file = "novel.txt"
start_time = time.time()
chapter_count, total_bytes = download_novel(start_url, output_file)
print("\n" + "=" * 50)
if chapter_count > 0:
print(f"下载完成! 共 {chapter_count} 章,{total_bytes} 字节")
else:
print("下载失败,请检查日志文件了解详情")
print(f"耗时: {time.time() - start_time:.2f} 秒")
print(f"文件已保存至: {os.path.abspath(output_file)}")
print("=" * 50)
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("\n用户中断,程序退出")
except Exception as e:
print(f"程序发生错误: {str(e)}")
logging.exception("程序发生未处理异常")
D:\tools\python\python.exe D:\历史项目留存2\诺褀2025\python加工浦发模型模拟\py搭建\pythonProject1\爬虫抖音视频\111.py
==================================================
笔趣阁小说下载器 - 自动翻页版
==================================================
请输入小说起始URL: https://www.mayiwsk.com/127_127460/53169900.html
请输入输出文件名(默认: novel.txt): 01.txt
2025-09-01 11:04:38,714 - INFO - 开始下载小说,起始URL: https://www.mayiwsk.com/127_127460/53169900.html
2025-09-01 11:04:38,714 - INFO - 基础URL: https://www.mayiwsk.com
2025-09-01 11:04:38,716 - INFO - 正在下载第 1 章: https://www.mayiwsk.com/127_127460/53169900.html
2025-09-01 11:04:42,793 - INFO - 检测到编码: utf-8
2025-09-01 11:04:42,802 - ERROR - 处理章节时出错: 'NoneType' object has no attribute 'find_all'
==================================================
下载完成! 共 1 章,0 字节
耗时: 9.09 秒
文件已保存至: D:\历史项目留存2\诺褀2025\python加工浦发模型模拟\py搭建\pythonProject1\爬虫抖音视频\01.txt
==================================================
2025-09-01 11:04:47,803 - WARNING - 检测到重复URL: https://www.mayiwsk.com/127_127460/53169900.html,跳过
2025-09-01 11:04:47,803 - INFO - 已保存到: 01.txt
2025-09-01 11:04:47,803 - INFO - 总章节数: 1
2025-09-01 11:04:47,803 - INFO - 总字数: 0 字节
2025-09-01 11:04:47,803 - INFO - 小说下载完成!