import base64
import urllib.parse
import json
import os
import time
import asyncio
import aiohttp
import random
from tqdm import tqdm
from ebooklib import epub
import datetime
import re
class BQGNovelDownloader:
    """Async downloader that scrapes a BQG mirror site and packages chapters into an EPUB."""

    def __init__(self, config=None):
        """Initialize configuration, HTTP headers, working directories and novel state.

        Args:
            config: optional dict overriding any default setting
                (concurrency, min_interval, max_interval, max_retries,
                retry_min_interval, retry_max_interval). Intervals are in ms.
        """
        # Default configuration (request pacing intervals in milliseconds).
        self.config = {
            'concurrency': 8,
            'min_interval': 400,
            'max_interval': 800,
            'max_retries': 3,
            'retry_min_interval': 4000,
            'retry_max_interval': 8000
        }
        # Merge user-supplied overrides on top of the defaults.
        if config:
            self.config.update(config)
        # Browser-like headers so the site serves responses normally.
        self.headers = {
            "Accept": "*/*",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Referer": "https://bqg123.net/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36",
            "sec-ch-ua": "\"Google Chrome\";v=\"141\", \"Not?A_Brand\";v=\"8\", \"Chromium\";v=\"141\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\""
        }
        # Fixed book-info URL and query params (kept in this shape for easy editing).
        self.book_url = "https://bv-jp.booktt.cc/v3/load_book_info/27257389/972315.js"
        self.params = {
            "ws": "9191920",
            "tk": "0404"
        }
        # The aiohttp session is created lazily by create_session().
        self.session = None
        self.semaphore = asyncio.Semaphore(self.config['concurrency'])
        # Working directories; exist_ok avoids the racy exists()-then-makedirs pattern.
        self.save_dir = "saved_pages"
        self.epub_dir = "epub_books"
        self.log_dir = "download_logs"
        for directory in (self.save_dir, self.epub_dir, self.log_dir):
            os.makedirs(directory, exist_ok=True)
        # Novel metadata, filled in by get_book_info().
        self.novel_info = {
            'title': '',
            'author': '',
            'description': '',
            'cover_url': ''
        }
async def create_session(self):
"""创建aiohttp会话"""
if self.session is None:
timeout = aiohttp.ClientTimeout(total=60)
connector = aiohttp.TCPConnector(limit=self.config['concurrency'])
self.session = aiohttp.ClientSession(timeout=timeout, connector=connector, headers=self.headers)
async def close_session(self):
"""关闭会话"""
if self.session:
await self.session.close()
self.session = None
async def random_delay(self, min_ms, max_ms):
"""随机延迟"""
delay = random.uniform(min_ms / 1000, max_ms / 1000)
await asyncio.sleep(delay)
async def fetch_with_retry(self, url, params=None):
"""带重试机制的异步请求"""
for attempt in range(self.config['max_retries'] + 1):
try:
# 请求前延迟
await self.random_delay(self.config['min_interval'], self.config['max_interval'])
async with self.session.get(url, params=params, headers=self.headers) as response:
if response.status == 200:
return await response.text()
elif attempt < self.config['max_retries']:
# 重试延迟
retry_delay = random.uniform(
self.config['retry_min_interval'] / 1000,
self.config['retry_max_interval'] / 1000
)
await asyncio.sleep(retry_delay)
continue
except Exception as e:
if attempt < self.config['max_retries']:
retry_delay = random.uniform(
self.config['retry_min_interval'] / 1000,
self.config['retry_max_interval'] / 1000
)
await asyncio.sleep(retry_delay)
continue
return None
def decode_encoded_str(self, encoded_str):
"""解密编码字符串"""
try:
url_decoded = urllib.parse.unquote(encoded_str)
base64_decoded = base64.b64decode(url_decoded)
decoded_str = base64_decoded.decode('utf-8')
final_decoded_str = urllib.parse.unquote(decoded_str)
return json.loads(final_decoded_str)
except Exception:
return None
    async def get_book_info(self):
        """Fetch the book-info blob, print a summary, and let the user pick a data source.

        Returns:
            dict with 'book_info' (decoded blob), 'chapter_list_url' and
            'time_param' for the chapter-list request, or None on failure.
        """
        response_text = await self.fetch_with_retry(self.book_url, self.params)
        if not response_text:
            return None
        # The payload is embedded as a JS assignment:
        #   window['book_info_str'] = "<encoded>";
        # Scan line by line and take the text between the first and last quote.
        book_info_str = None
        lines = response_text.split('\n')
        for line in lines:
            if 'window[\'book_info_str\']' in line:
                start_idx = line.find('"') + 1
                end_idx = line.rfind('"')
                if start_idx > 0 and end_idx > start_idx:
                    book_info_str = line[start_idx:end_idx]
                    break
        if not book_info_str:
            return None
        book_info = self.decode_encoded_str(book_info_str)
        if not book_info:
            return None
        # Cache metadata for later EPUB/log generation.
        self.novel_info['title'] = book_info.get('book_name', '未知')
        self.novel_info['author'] = book_info.get('author', '未知')
        self.novel_info['description'] = book_info.get('intro', '')
        # Print a summary of the book and its available data sources.
        print(f"小说名称: {self.novel_info['title']}")
        print(f"作者: {self.novel_info['author']}")
        if self.novel_info['description']:
            print(f"简介: {self.novel_info['description'][:100]}...")
        print(f"数据源数量: {book_info.get('source_count', 1)}")
        # Primary source (index 0).
        print(f"\n数据源 0:")
        print(f" 章节数: {book_info.get('chapter_count_source', '未知')}")
        # Alternate sources, numbered from 1.
        other_sources = book_info.get('other_source', [])
        for i, source in enumerate(other_sources, 1):
            print(f"数据源 {i}:")
            print(f" 章节数: {source.get('chapter_count_getok', '未知')}")
        # Interactive source selection; blank input keeps the primary source.
        total_sources = 1 + len(other_sources)
        selected_source = 0
        if total_sources > 1:
            choice = input(f"\n请选择数据源 (0-{total_sources - 1}, 默认0): ").strip()
            if choice:
                # NOTE(review): non-numeric input raises ValueError and an
                # out-of-range index raises IndexError below — no validation here.
                selected_source = int(choice)
        # Build the chapter-list request parameters.
        if selected_source == 0:
            chapter_list_url = book_info.get('url_chapter_list_kv', '')
            time_param = book_info.get('time_chapter_list_kv', '')
        else:
            # NOTE(review): alternate sources reuse the primary source's
            # url_chapter_list_kv and only swap the time parameter —
            # presumably intentional for this site's API, but confirm.
            chapter_list_url = book_info.get('url_chapter_list_kv', '')
            time_param = other_sources[selected_source - 1].get('time_chapter_list_kv',
                                                                other_sources[selected_source - 1].get('time_update',
                                                                                                       ''))
        return {
            'book_info': book_info,
            'chapter_list_url': chapter_list_url,
            'time_param': time_param
        }
async def get_chapter_list(self, chapter_list_url, time_param):
"""获取章节列表"""
final_url = f"https://bv-jp.booktt.cc/load_chapter_list/{chapter_list_url}.js"
params = {"t": str(time_param), "tk": "0404"}
response_text = await self.fetch_with_retry(final_url, params)
if not response_text:
return None
# 解密章节列表
start_marker = 'var chapter_list_data_str="'
start_idx = response_text.find(start_marker)
if start_idx == -1:
return None
start_idx += len(start_marker)
end_idx = response_text.find('"', start_idx)
if end_idx == -1:
return None
encoded_str = response_text[start_idx:end_idx]
return self.decode_encoded_str(encoded_str)
def save_decrypted_content(self, chapter_name, chapter_data):
"""保存解密内容到TXT文件"""
# 清理文件名
safe_name = re.sub(r'[<>:"/\\|?*]', '_', chapter_name)
file_path = os.path.join(self.save_dir, f"{safe_name}.txt")
with open(file_path, 'w', encoding='utf-8') as f:
f.write(f"章节名称: {chapter_name}\n")
f.write("=" * 50 + "\n")
f.write("完整解密数据:\n")
f.write(json.dumps(chapter_data, ensure_ascii=False, indent=2))
f.write("\n" + "=" * 50 + "\n")
# 提取正文内容
if 'chapter_kv' in chapter_data and 'content' in chapter_data['chapter_kv']:
content_text = chapter_data['chapter_kv']['content']
if "通知:新站book4.cc" in content_text:
content_text = content_text.split("通知:新站book4.cc")[0]
content_text = content_text.strip()
f.write("\n正文内容:\n")
f.write("=" * 50 + "\n")
f.write(content_text)
def write_realtime_log(self, chapter_results):
"""实时写入下载日志"""
log_filename = f"{self.novel_info['title']}_下载日志.txt"
log_filename = re.sub(r'[<>:"/\\|?*]', '_', log_filename)
log_path = os.path.join(self.log_dir, log_filename)
with open(log_path, 'w', encoding='utf-8') as f:
f.write(f"小说下载日志\n")
f.write("=" * 50 + "\n")
f.write(f"书名: {self.novel_info['title']}\n")
f.write(f"作者: {self.novel_info['author']}\n")
f.write(f"下载时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"EPUB文件: epub_books\\{self.novel_info['title']}({self.novel_info['author']}).epub\n")
f.write("=" * 50 + "\n\n")
# 成功章节
success_chapters = [r for r in chapter_results if r['success']]
f.write(f"成功下载章节: {len(success_chapters)} 章\n")
for i, result in enumerate(success_chapters, 1):
f.write(f"{i}. {result['chapter']} (原: {result['original_title']})\n")
# 失败章节
failed_chapters = [r for r in chapter_results if not r['success']]
if failed_chapters:
f.write(f"\n失败章节: {len(failed_chapters)} 章\n")
for i, result in enumerate(failed_chapters, 1):
f.write(f"{i}. {result['chapter']} - URL: {result['url']}\n")
f.write(f"\n配置信息:\n")
f.write(f"并发数: {self.config['concurrency']}\n")
f.write(f"最大重试次数: {self.config['max_retries']}\n")
f.write(f"请求间隔: {self.config['min_interval']}-{self.config['max_interval']}ms\n")
async def download_chapter(self, chapter_info, pbar, chapter_results):
"""下载单个章节"""
async with self.semaphore:
chapter_name = chapter_info['name']
url_kv = chapter_info['url_kv']
chapter_len = chapter_info['len']
# 获取章节内容
url = f"https://dmit.xsjs.cc/load_chapter/{url_kv}.js"
params = {"t": str(chapter_len), "tk": "0404"}
response_text = await self.fetch_with_retry(url, params)
if not response_text:
chapter_results.append({
'success': False,
'chapter': chapter_name,
'url': url,
'original_title': chapter_name
})
# 实时更新日志
self.write_realtime_log(chapter_results)
return None
# 解密章节内容
start_marker = 'var chapter_data_str="'
start_idx = response_text.find(start_marker)
if start_idx == -1:
chapter_results.append({
'success': False,
'chapter': chapter_name,
'url': url,
'original_title': chapter_name
})
self.write_realtime_log(chapter_results)
return None
start_idx += len(start_marker)
end_idx = response_text.find('"', start_idx)
if end_idx == -1:
chapter_results.append({
'success': False,
'chapter': chapter_name,
'url': url,
'original_title': chapter_name
})
self.write_realtime_log(chapter_results)
return None
encoded_str = response_text[start_idx:end_idx]
chapter_data = self.decode_encoded_str(encoded_str)
if not chapter_data:
chapter_results.append({
'success': False,
'chapter': chapter_name,
'url': url,
'original_title': chapter_name
})
self.write_realtime_log(chapter_results)
return None
# 保存完整解密内容到TXT文件
self.save_decrypted_content(chapter_name, chapter_data)
# 提取正文内容
if 'chapter_kv' in chapter_data and 'content' in chapter_data['chapter_kv']:
content_text = chapter_data['chapter_kv']['content']
# 清理内容
if "通知:新站book4.cc" in content_text:
content_text = content_text.split("通知:新站book4.cc")[0]
content_text = content_text.strip()
else:
chapter_results.append({
'success': False,
'chapter': chapter_name,
'url': url,
'original_title': chapter_name
})
self.write_realtime_log(chapter_results)
return None
chapter_results.append({
'success': True,
'chapter': chapter_name,
'original_title': chapter_name,
'content': content_text
})
# 实时更新日志
self.write_realtime_log(chapter_results)
return {
'name': chapter_name,
'content': content_text
}
    async def download_all_chapters(self, chapter_list):
        """Download every chapter concurrently with a live progress bar.

        Args:
            chapter_list: list of chapter dicts ('name', 'url_kv', 'len').

        Returns:
            (chapters_data, chapter_results): successful chapter payloads in
            completion order, and the full per-chapter outcome list.
        """
        chapters_data = []
        chapter_results = []  # shared outcome list fed to the realtime log
        print("开始下载章节...")
        with tqdm(total=len(chapter_list), desc="下载进度", unit="章") as pbar:
            tasks = []
            for chapter_info in chapter_list:
                task = asyncio.create_task(self.download_chapter(chapter_info, pbar, chapter_results))
                tasks.append(task)
            # as_completed lets the progress bar advance as each task finishes,
            # so chapters_data ends up in completion order, not list order.
            for task in asyncio.as_completed(tasks):
                result = await task
                if result is not None:
                    chapters_data.append(result)
                pbar.update(1)
                if result and result.get('name'):
                    pbar.set_description(f"✓ {result['name'][:20]:<20}")
        return chapters_data, chapter_results
    def create_epub(self, chapters_data):
        """Assemble the downloaded chapters into an EPUB file.

        Args:
            chapters_data: list of {'name', 'content'} dicts in reading order.

        Returns:
            Filesystem path of the written .epub file.
        """
        book = epub.EpubBook()
        # Basic metadata; the identifier is just a timestamp-based unique id.
        book.set_identifier(f'bqg_novel_{int(time.time())}')
        book.set_title(self.novel_info['title'])
        book.add_author(self.novel_info['author'])
        book.set_language('zh')
        # Intro page with title, author and description.
        intro_chapter = epub.EpubHtml(title='简介', file_name='intro.xhtml', lang='zh')
        intro_content = f"""
<html>
<head>
<title>简介</title>
<style>
body {{ font-family: Arial, sans-serif; line-height: 1.6; margin: 20px; }}
h1 {{ color: #333; border-bottom: 2px solid #333; }}
.description {{ margin: 20px 0; }}
</style>
</head>
<body>
<h1>{self.novel_info['title']}</h1>
<h2>作者:{self.novel_info['author']}</h2>
<div class="description">
{self.novel_info['description'].replace(chr(10), '<br/>')}
</div>
</body>
</html>
"""
        intro_chapter.content = intro_content
        book.add_item(intro_chapter)
        # Shared stylesheet applied to every chapter.
        style = '''
body {
font-family: "Microsoft YaHei", sans-serif;
font-size: 16px;
line-height: 1.6;
margin: 20px;
text-align: justify;
}
h1 {
text-align: center;
margin-bottom: 40px;
font-size: 24px;
border-bottom: 1px solid #ccc;
padding-bottom: 10px;
}
p {
margin-bottom: 15px;
text-indent: 2em;
}
'''
        nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css",
                                media_type="text/css", content=style)
        book.add_item(nav_css)
        # One XHTML document per chapter; each text line becomes a <p>.
        chapters = []
        for i, chapter_data in enumerate(chapters_data):
            content_paragraphs = chapter_data['content'].split('\n')
            formatted_content = ''.join(
                f'<p>{paragraph.strip()}</p>' for paragraph in content_paragraphs if paragraph.strip())
            chapter_html = f'''
<!DOCTYPE html>
<html>
<head>
<title>{chapter_data['name']}</title>
<link rel="stylesheet" type="text/css" href="../style/nav.css" />
</head>
<body>
<h1>{chapter_data['name']}</h1>
<div>{formatted_content}</div>
</body>
</html>
'''
            epub_chapter = epub.EpubHtml(
                title=chapter_data['name'],
                file_name=f'chapter_{i + 1:04d}.xhtml',
                lang='zh'
            )
            epub_chapter.content = chapter_html
            book.add_item(epub_chapter)
            chapters.append(epub_chapter)
        # Reading order and table of contents: intro first, then chapters.
        book.spine = ['nav', intro_chapter] + chapters
        book.toc = [epub.Link('intro.xhtml', '简介', 'intro')] + chapters
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        # Sanitize the output filename for Windows and write the book.
        epub_filename = f"{self.novel_info['title']}({self.novel_info['author']}).epub"
        epub_filename = re.sub(r'[<>:"/\\|?*]', '_', epub_filename)
        output_path = os.path.join(self.epub_dir, epub_filename)
        epub.write_epub(output_path, book, {})
        return output_path
async def download_novel(self):
"""下载整本小说"""
await self.create_session()
print("获取小说信息...")
book_data = await self.get_book_info()
if not book_data:
print("获取小说信息失败")
return False
print("获取章节列表...")
chapter_list_data = await self.get_chapter_list(
book_data['chapter_list_url'],
book_data['time_param']
)
if not chapter_list_data:
print("获取章节列表失败")
return False
total_chapters = len(chapter_list_data['chapter_list'])
print(f"找到 {total_chapters} 个章节")
# 下载所有章节
chapters_data, chapter_results = await self.download_all_chapters(chapter_list_data['chapter_list'])
successful_chapters = len(chapters_data)
failed_chapters = total_chapters - successful_chapters
print(f"\n下载完成!")
print(f"成功: {successful_chapters} 个章节")
print(f"失败: {failed_chapters} 个章节")
if successful_chapters == 0:
print("没有成功下载任何章节")
return False
# 生成EPUB
print("\n正在创建EPUB电子书...")
epub_path = self.create_epub(chapters_data)
print(f"✓ EPUB电子书已创建: {epub_path}")
# 最终日志
log_filename = f"{self.novel_info['title']}_下载日志.txt"
log_filename = re.sub(r'[<>:"/\\|?*]', '_', log_filename)
log_path = os.path.join(self.log_dir, log_filename)
print(f"✓ 下载日志已保存: {log_path}")
# 清理临时文件
if failed_chapters == 0:
print("\n所有章节下载成功,清理临时文件...")
if os.path.exists(self.save_dir):
import shutil
shutil.rmtree(self.save_dir)
print("✓ 临时文件已清理")
print(f"\n最终结果:")
print(f"EPUB电子书: {epub_path}")
print(f"下载日志: {log_path}")
await self.close_session()
return True
async def main():
    """Interactive entry point: collect settings from stdin, then run the downloader."""
    print("=" * 50)
    print(" BQG小说下载器")
    print("=" * 50)
    config = {}
    # Concurrency: integer in [1, 100]; blank input keeps the default of 8.
    while True:
        try:
            raw = input("请输入并发数 (1-100,推荐3): ").strip()
            if not raw:
                config['concurrency'] = 8
                break
            value = int(raw)
            if 1 <= value <= 100:
                config['concurrency'] = value
                break
            print("并发数必须在1-100之间!")
        except ValueError:
            print("请输入有效的数字!")
    # Max retries: any non-negative integer; blank input keeps the default of 3.
    while True:
        try:
            raw = input("请输入最大重试次数 (默认3): ").strip()
            if not raw:
                config['max_retries'] = 3
                break
            value = int(raw)
            if value >= 0:
                config['max_retries'] = value
                break
            print("重试次数必须大于等于0!")
        except ValueError:
            print("请输入有效的数字!")
    # Fixed pacing intervals (milliseconds) — not user-configurable.
    config.update({
        'min_interval': 400,
        'max_interval': 800,
        'retry_min_interval': 4000,
        'retry_max_interval': 8000
    })
    # Echo the effective configuration before starting.
    print(f"\n配置信息:")
    print(f"并发数: {config['concurrency']}")
    print(f"最大重试次数: {config['max_retries']}")
    print(f"请求间隔: {config['min_interval']}-{config['max_interval']}ms")
    print(f"重试间隔: {config['retry_min_interval']}-{config['retry_max_interval']}ms")
    print("-" * 50)
    downloader = BQGNovelDownloader(config=config)
    if await downloader.download_novel():
        print("\n✓ 下载完成!")
    else:
        print("\n✗ 下载失败!")
# Script entry point: launch the interactive CLI under a fresh asyncio event loop.
if __name__ == "__main__":
    asyncio.run(main())
# NOTE(review): stray pasted prose (not Python) — preserved as comments so the file parses:
# 以这个网站爬虫为示例,告诉我怎么改成直接JS引擎调用来实现爬虫
# 最新发布