import requests
from bs4 import BeautifulSoup
import re
import time
import os
import random
from fake_useragent import UserAgent
from urllib.parse import urlparse, urljoin
import logging
# Configure logging to both a file and the console
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler("novel_downloader.log"),
logging.StreamHandler()
]
)
# Create the random User-Agent generator
ua = UserAgent()
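# Note: UserAgent() can raise if fake_useragent fails to load its browser data
# (e.g. with no network access). A fallback sketch (an assumption, not part of
# the original script) would wrap it like:
#   try:
#       ua = UserAgent()
#   except Exception:
#       ua = type("FixedUA", (), {"random": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})()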
# Navigation/boilerplate strings to strip from extracted chapter text (site-specific, so kept in Chinese)
NAV_ELEMENTS = [
"首页", "关灯", "字体:", "大", "中", "小",
"上一页", "返回目录", "下一页", "加入书签", "推荐票",
"返回书页", "目录", "设置", "书架", "加入书架",
"上一章", "下一章", "书签", "投票", "举报",
"本章未完,请点击下一页继续阅读", "本章结束,请点击下一章继续阅读",
"返回", "电脑版", "手机版", "APP版", "客户端", "介绍", "足迹", "超大", "进书架", "本章未完 点击下一页继续阅读",
"顶部", "底部", "页脚", "页眉", "章节列表", "加书签", "我的书架", "阅读历史", "本章已完 m.3qdu.com",
"请收藏本站", "请记住本书首发域名", "天才一秒记住本站地址", "顶点小说", "笔趣阁", "更新最快", "</p>"
]
# Block-level tags whose line breaks should be preserved
PRESERVE_LINEBREAK_TAGS = ["br", "p", "div", "pre", "blockquote"]
def extract_content_and_next_link(html, base_url):
"""从HTML中提取内容并找到下一页链接,优化换行处理"""
soup = BeautifulSoup(html, 'html.parser')
    # Candidate content-container selectors (tag, class)
    content_div = None
    selectors = [
        ('div', 'size16 color5 pt-read-text'),
        ('div', 'pt-read-text'),
        ('div', 'novel-content'),
        ('div', 'chapter-content'),
        ('div', 'novelcontent'),
        ('div', 'content'),
        ('article', None),
        ('div', 'chapter_body'),
        ('div', 'read-content'),
        ('div', 'txt_cont'),
        ('div', 'content-body'),
        ('div', 'read-box'),
        ('div', 'chapter-content-inner'),
        ('div', 'chapter-text'),
    ]
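    # Try each selector in order and use the first element that matches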
for tag, class_name in selectors:
if class_name:
content_div = soup.find(tag, class_=class_name)
else:
content_div = soup.find(tag)
if content_div:
break
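    # Fall back to <body> when none of the known containers match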
if not content_div:
logging.warning("使用body作为内容容器")
content_div = soup.find('body')
if not content_div:
logging.error("无法找到内容容器")
return "", None
    # ==== Line-break cleanup ====
    # 1. Remove navigation/boilerplate elements
for nav_text in NAV_ELEMENTS:
for element in content_div.find_all(string=re.compile(re.escape(nav_text))):
if element.strip() == nav_text:
element.extract()
    # 2. Replace non-breaking spaces with regular spaces
for nbsp in content_div.find_all(string=re.compile(r'\u00a0')):
nbsp.replace_with(nbsp.replace('\u00a0', ' '))
    # 3. Append a newline to block-level tags so their boundaries survive get_text()
for tag in PRESERVE_LINEBREAK_TAGS:
for element in content_div.find_all(tag):
element.append('\n')
    # 4. Get the raw text, preserving line breaks
raw_text = content_div.get_text(separator='\n', strip=False)
    # 5. Key cleanup: collapse redundant newlines
    #    - 3+ consecutive newlines become a single blank line
    #    - single newlines between words are joined with a space
    cleaned_text = re.sub(r'\n{3,}', '\n\n', raw_text)            # collapse runs of newlines
    cleaned_text = re.sub(r'(\S)\n(\S)', r'\1 \2', cleaned_text)  # join lines broken mid-sentence
    cleaned_text = re.sub(r'\n\s+\n', '\n\n', cleaned_text)       # drop whitespace-only lines
    cleaned_text = re.sub(r'^\s+', '', cleaned_text, flags=re.MULTILINE)  # strip leading whitespace on each line
    # 6. Prepend the chapter title taken from the page <title>
    title_tag = soup.find('title')
    chapter_title = title_tag.text.split('-')[0].strip() if title_tag else "未知章节"
cleaned_text = f"## {chapter_title} ##\n\n" + cleaned_text.strip()
    # 7. Preserve poetry-style formatting where indentation is detected
cleaned_text = re.sub(r'(\n {4,}.+?\n)\n+', r'\1\n', cleaned_text)
    # 8. Final pass
    cleaned_text = re.sub(r'\n{2,}', '\n\n', cleaned_text)  # collapse remaining blank lines
# =====================
    # Find the next-page link.
    # Each selector is (tag, text pattern, class pattern, id value); only one of
    # the last three fields is set per entry.
    next_link = None
    next_page_selectors = [
        ('a', re.compile(r'下一页|下一章|下一节|继续阅读'), None, None),
        ('a', None, re.compile(r'next-page|next|next-chapter'), None),
        ('a', None, 'next', None),
        ('a', None, 'bt_next', None),
        ('a', None, 'btn-next', None),
        ('a', None, 'nextChapter', None),
        ('a', None, 'next_chapter', None),
        ('a', None, 'chapter-next', None),
        ('a', None, None, 'nextChapterBtn'),
        ('a', None, None, 'next_chapter'),
        ('a', None, None, 'next_page'),
        ('a', None, None, 'btn_next'),
    ]
    for tag, text_pattern, class_pattern, id_value in next_page_selectors:
        next_page_tag = None
        if text_pattern is not None:
            next_page_tag = soup.find(tag, string=text_pattern)
        elif class_pattern is not None:
            next_page_tag = soup.find(tag, class_=class_pattern)
        elif id_value is not None:
            next_page_tag = soup.find(tag, id=id_value)
        if next_page_tag and 'href' in next_page_tag.attrs:
            next_link = next_page_tag['href']
            if not next_link.startswith('http'):
                next_link = urljoin(base_url, next_link)
            logging.info(f"找到下一页链接: {next_link}")
            break
return cleaned_text + "\n\n", next_link
def get_random_headers():
"""生成随机的请求头"""
return {
'User-Agent': ua.random,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Cache-Control': 'max-age=0',
'DNT': '1',
'Referer': 'https://www.dppss.com/'
}
def download_novel(start_url, output_file="novel.txt", max_retries=5):
"""下载整本小说"""
parsed_url = urlparse(start_url)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
current_url = start_url
chapter_count = 0
retry_count = 0
    total_chars = 0
output_dir = os.path.dirname(output_file)
if output_dir and not os.path.exists(output_dir):
os.makedirs(output_dir)
logging.info(f"开始下载小说,起始URL: {start_url}")
logging.info(f"基础URL: {base_url}")
session = requests.Session()
with open(output_file, 'w', encoding='utf-8') as f:
while current_url and retry_count < max_retries:
chapter_count += 1
logging.info(f"正在下载第 {chapter_count} 章: {current_url}")
try:
headers = get_random_headers()
headers['Referer'] = base_url
response = session.get(current_url, headers=headers, timeout=20)
if response.status_code != 200:
logging.error(f"错误: 无法获取页面,状态码: {response.status_code}")
retry_count += 1
time.sleep(3)
continue
                # Requests defaults to ISO-8859-1 when no charset is declared; switch to the detected encoding
                if response.encoding == 'ISO-8859-1':
                    response.encoding = response.apparent_encoding or 'utf-8'
content, next_link = extract_content_and_next_link(response.text, base_url)
                if content and len(content.strip()) > 20:
                    chars_written = f.write(content)  # write() returns the number of characters written
                    total_chars += chars_written
                    f.flush()
                    logging.info(f"成功写入第 {chapter_count} 章内容 ({chars_written} 字符)")
retry_count = 0
else:
logging.warning(f"未提取到有效内容,可能页面结构变化")
debug_file = f"debug_ch{chapter_count}.html"
with open(debug_file, 'w', encoding='utf-8') as debug_f:
debug_f.write(response.text)
logging.info(f"已保存调试文件: {debug_file}")
retry_count += 1
time.sleep(5)
continue
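                # Advance to the next page; stop when no link is found or it points back to the current URL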
if next_link and next_link != current_url:
current_url = next_link
else:
current_url = None
logging.info("已到达最后一章")
delay = random.uniform(1.0, 3.0)
logging.info(f"等待 {delay:.2f} 秒后继续...")
time.sleep(delay)
except requests.exceptions.RequestException as e:
logging.error(f"网络请求出错: {str(e)}")
retry_count += 1
time.sleep(5)
except Exception as e:
logging.error(f"处理章节时出错: {str(e)}")
retry_count += 1
time.sleep(5)
logging.info(f"已保存到: {output_file}")
logging.info(f"总字数: {total_bytes} 字节")
if chapter_count > 0:
logging.info(f"小说下载完成! 共 {chapter_count} 章")
else:
logging.error("下载失败,未获取到任何章节内容")
    return chapter_count, total_chars
def main():
"""主函数,处理用户输入"""
print("=" * 50)
print("小说下载器 - 优化换行版")
print("=" * 50)
while True:
start_url = input("请输入小说起始URL: ").strip()
if start_url:
break
print("错误: 起始URL不能为空,请重新输入!")
output_file = input("请输入输出文件名(默认: novel.txt): ").strip()
if not output_file:
output_file = "novel.txt"
start_time = time.time()
    chapter_count, total_chars = download_novel(start_url, output_file)
if chapter_count > 0:
print(f"\n下载完成! 共 {chapter_count} 章,{total_bytes} 字节")
print(f"耗时: {time.time() - start_time:.2f} 秒")
print(f"文件已保存至: {os.path.abspath(output_file)}")
else:
print("\n下载失败,请检查日志文件了解详情")
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("\n用户中断,程序退出")
except Exception as e:
print(f"程序发生错误: {str(e)}")