Target URL: https://tianyabook.com/top/allvisit/
Target: the all-time visit ranking (排行总榜) of the Tianya online book library (天涯在线书库)
Requirements:
① Exception handling: the code must contain thorough try…except handling (e.g. for network timeouts and elements that cannot be found) so that a single failing page does not abort the crawl while iterating over many pages.
② Field cleaning: apply string processing to the scraped text. For example, split "Python 编程(含视频教学)" into two separate fields: the title "Python 编程" and the note "含视频教学" (see the sketch after this list).
③ Data volume: implement automatic pagination and collect at least 500 records.
④ Data storage: write the cleaned data to a CSV or Excel file with clear column headers and no garbled characters.
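
A minimal sketch of the field-cleaning step in requirement ② (the full crawler below implements the same idea in extract_title_notes); the regex is an assumption that accepts both half-width and full-width parentheses:

import re

def split_title(full_title):
    """Split "Python 编程(含视频教学)" into ("Python 编程", "含视频教学")."""
    m = re.match(r"^(.*?)\s*[((](.*?)[))]\s*$", full_title.strip())
    if m:
        return m.group(1).strip(), m.group(2).strip()
    return full_title.strip(), ""

print(split_title("Python 编程(含视频教学)"))  # -> ('Python 编程', '含视频教学')
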
Possible extensions (bonus):
Extract the main title and the sub-label from the full title
Compute the share of each novel genre (bar or pie chart), with the text rendered in Chinese (see the chart sketch below)
Detect whether a given book has released new chapters (incremental crawling)
Search for a book in the run output after the crawl finishes
Download a book and save it locally as a TXT file
Tooling: PyCharm Community Edition 2022.3.2
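
A minimal sketch of the genre chart with Chinese labels (the full program below does the same in plot_distribution); it assumes a Chinese-capable font such as SimHei is installed, and the counts are invented purely for illustration:

import matplotlib.pyplot as plt
from collections import Counter

genre_counts = Counter({"玄幻": 120, "都市": 80, "历史": 40})  # hypothetical counts
plt.rcParams['font.sans-serif'] = ['SimHei']  # Chinese-capable font so labels do not render as boxes
plt.rcParams['axes.unicode_minus'] = False
plt.pie(list(genre_counts.values()), labels=list(genre_counts.keys()), autopct='%1.1f%%')
plt.title("小说类型占比分布")
plt.show()
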
The full code is as follows:
"""
Tianya book library overall-ranking crawler | revised version (fixes the "'continue' not properly in loop" error and the failing full-text download)
Features: targeted scraping + custom page range + real TXT download + genre statistics + incremental-update detection
"""
import time
import csv
import os
import re
import requests
from selenium import webdriver
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from collections import Counter
# ================== Configuration ==================
BASE_URL = "https://www.tianyabook.com/top/allvisit/"  # base URL of the ranking list
OUTPUT_FILE = "books_clean.csv"
DOWNLOAD_DIR = "downloaded_books"
MAX_ALLOWED_PAGE = 3190
HEADLESS_MODE = False
CHARSET = 'gbk'  # the site is served with GBK encoding
TIMEOUT = 15
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
# Create the output directories
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
os.makedirs(os.path.dirname(OUTPUT_FILE) if '/' in OUTPUT_FILE else '.', exist_ok=True)
# ================== Helper functions ==================
def extract_title_notes(full_title):
    """Split the main title from the note, e.g. "Python 编程(含视频教学)" -> ("Python 编程", "含视频教学")."""
    match = re.match(r"^(.*?)\s*[((](.*?)[))]\s*$", full_title.strip())
    if match:
        return match.group(1).strip(), match.group(2).strip()
    return full_title.strip(), ""
def clean_text(text):
"""清理多余空白"""
return re.sub(r'\s+', ' ', text).strip() if text else ""
def read_existing_titles():
    """Read books saved by a previous run as {书名: 更新时间}, used for incremental-update detection."""
    if not os.path.exists(OUTPUT_FILE):
        return {}
    titles = {}
    try:
        with open(OUTPUT_FILE, mode='r', encoding='utf-8-sig') as f:
            reader = csv.DictReader(f)
            for row in reader:
                titles[row.get("书名", "").strip()] = row.get("更新时间", "").strip()
    except Exception as e:
        print(f"⚠️ 读取历史数据失败:{e}")
    return titles
def plot_distribution(types_counter):
"""绘制类型饼图"""
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False
labels = list(types_counter.keys())
sizes = list(types_counter.values())
plt.figure(figsize=(9, 6))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140)
plt.title("小说类型占比分布", fontsize=16, fontweight='bold')
plt.axis('equal')
plt.tight_layout()
plt.show()
def get_real_download_url(detail_url):
"""
方法1:从书籍详情页直接提取 .txtarticle.php?id=xxx 链接
"""
try:
headers = {"User-Agent": USER_AGENT}
response = requests.get(detail_url, headers=headers, timeout=TIMEOUT)
response.raise_for_status()
response.encoding = CHARSET
        soup = BeautifulSoup(response.text, "html.parser")
        # Find the "下载本书" link, i.e. an <a> whose href points at the txtarticle.php endpoint
download_btn = soup.find("a", href=re.compile(r"/modules/article/txtarticle\.php\?id=\d+"))
if download_btn and 'href' in download_btn.attrs:
link = download_btn['href']
if link.startswith("/"):
return f"https://www.tianyabook.com{link}"
return link
except Exception as e:
print(f"❌ 通过详情页获取下载链接失败:{e}")
return None
def fetch_novel_content_via_id(book_id):
"""
方法2:直接构造并请求 TXT 接口
"""
url = f"https://www.tianyabook.com/modules/article/txtarticle.php?id={book_id}"
try:
headers = {"User-Agent": USER_AGENT}
response = requests.get(url, headers=headers, timeout=TIMEOUT)
response.raise_for_status()
        # The endpoint returns GBK-encoded bytes
content = response.content.decode('gbk', errors='replace')
return content.strip()
except Exception as e:
print(f"❌ 请求 TXT 接口失败:{e}")
return None
def extract_book_id_from_url(url):
"""
从 https://www.tianyabook.com/shu/3801.html 中提取 3801
"""
match = re.search(r'/shu/(\d+)\.html', url)
return match.group(1) if match else None
def download_as_txt(title, content):
"""保存为本地 TXT 文件"""
filename = re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', title)
filepath = os.path.join(DOWNLOAD_DIR, f"{filename}.txt")
with open(filepath, "w", encoding="utf-8") as f:
f.write(f"《{title}》\n\n")
f.write(content)
print(f"✅ 书籍已保存:{filepath}")
def get_page_range():
"""获取用户输入的页码范围"""
print("📌 欢迎使用天涯书库爬虫系统")
print(f"ℹ️ 总榜共约 {MAX_ALLOWED_PAGE} 页,每页约 20 条数据。")
while True:
try:
start = input("🔢 请输入起始页码(如 1): ").strip()
end = input("🔢 请输入结束页码(如 5): ").strip()
start_page = int(start)
end_page = int(end)
if start_page < 1:
print("❌ 起始页必须大于等于 1,请重新输入。")
continue
if end_page < start_page:
print("❌ 结束页不能小于起始页,请重新输入。")
continue
if end_page > MAX_ALLOWED_PAGE:
confirm = input(f"⚠️ 注意:当前网站最多约 {MAX_ALLOWED_PAGE} 页,你输入了 {end_page},可能无效!是否继续?(y/n): ")
if confirm.lower() != 'y':
continue
return start_page, end_page
except ValueError:
print("❌ 输入无效,请输入正整数。")
except Exception as e:
print(f"❌ 发生错误:{e}")
# ================== Main program ==================
if __name__ == "__main__":
print("🚀 开始爬取天涯书库热门书籍(自定义页码范围)…")
    # Page range chosen by the user
start_page, end_page = get_page_range()
    # Books saved by a previous run (书名 -> 更新时间), used for incremental-update detection
existing_titles = read_existing_titles()
new_updates = []
    # Configure Edge browser options
edge_options = EdgeOptions()
if HEADLESS_MODE:
edge_options.add_argument("--headless")
edge_options.add_argument("--disable-gpu")
edge_options.add_argument("--no-sandbox")
edge_options.add_argument("--disable-dev-shm-usage")
edge_options.add_argument(f"user-agent={USER_AGENT}")
driver = None
try:
        service = EdgeService("msedgedriver.exe")  # the driver must sit in the same directory as the script
driver = webdriver.Edge(service=service, options=edge_options)
wait = WebDriverWait(driver, TIMEOUT)
books = []
        seen_urls = set()  # guard against collecting duplicates
for page_idx in range(start_page, end_page + 1):
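            # Build the page URL: page 1 is the bare list URL, later pages append "<page>.html"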
if page_idx == 1:
current_url = BASE_URL
else:
current_url = f"{BASE_URL}{page_idx}.html"
print(f"📘 正在爬取第 {page_idx} 页:{current_url}")
try:
driver.get(current_url)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "table.table")))
time.sleep(1.5)
html = driver.page_source
                soup = BeautifulSoup(html, "html.parser")
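                # The ranking list is rendered as <table class="table">; the first <tr> is the header row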
table = soup.find("table", class_="table")
rows = table.find_all("tr")[1:] if table else []
if not rows:
print("📄 当前页无有效书籍数据或已被反爬拦截。")
continue
page_books = 0
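                # Columns assumed by the code: 0 = title link, 2 = author, 3 = popularity index, 4 = last update, 5 = serialization status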
for row in rows:
cols = row.find_all("td")
if len(cols) < 3:
continue
title_tag = cols[0].find("a")
if not title_tag:
continue
href = title_tag.get("href")
full_title = clean_text(title_tag.get_text())
author = clean_text(cols[2].get_text())
if not href or href in seen_urls:
continue
seen_urls.add(href)
main_title, notes = extract_title_notes(full_title)
book_data = {
"书名": main_title,
"备注": notes,
"作者": author,
"详情链接": href,
"类型": "未知",
"指数": clean_text(cols[3].get_text()),
"更新时间": clean_text(cols[4].get_text()) if len(cols) > 4 else "",
"连载状态": clean_text(cols[5].get_text()) if len(cols) > 5 else ""
}
books.append(book_data)
page_books += 1
print(f"✅ 第 {page_idx} 页成功采集 {page_books} 条书籍。")
except TimeoutException:
print(f"⏰ 第 {page_idx} 页加载超时,跳过…")
continue
except Exception as e:
print(f"❌ 第 {page_idx} 页发生错误:{e}")
continue
time.sleep(1.2)
print(f"🎉 共采集到 {len(books)} 条真实书籍数据!")
        # === Fill in the genre for each book ===
print("🔍 正在补充书籍类型信息(可能较慢)...")
type_counter = Counter()
for idx, book in enumerate(books):
print(f"⏳ 处理第 {idx+1}/{len(books)} 本: 《{book['书名']}》")
try:
driver.get(book["详情链接"])
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "breadcrumb")))
                detail_soup = BeautifulSoup(driver.page_source, "html.parser")
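                # Prefer the og:novel:category meta tag; fall back to the second breadcrumb link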
meta_genre = detail_soup.find("meta", property="og:novel:category")
if meta_genre:
genre = clean_text(meta_genre["content"])
else:
breadcrumb = detail_soup.find("ol", class_="breadcrumb")
links = breadcrumb.find_all("a") if breadcrumb else []
genre = links[1].get_text() if len(links) > 1 else "未知"
book["类型"] = genre
type_counter[genre] += 1
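                # Incremental check: the book was saved before and its update time has changed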
if book["书名"] in existing_titles:
new_updates.append(book)
except Exception as e:
print(f" ❌ 类型获取失败:{e}")
book["类型"] = "未知"
type_counter["未知"] += 1
        # === Write the cleaned data to CSV ===
keys = ["书名", "备注", "作者", "类型", "指数", "更新时间", "连载状态", "详情链接"]
with open(OUTPUT_FILE, mode="w", encoding="utf-8-sig", newline="", errors="ignore") as f:
writer = csv.DictWriter(f, fieldnames=keys)
writer.writeheader()
writer.writerows(books)
print(f"💾 数据已保存至:{OUTPUT_FILE}")
        # === Incremental-update report ===
if new_updates:
print(f"🔔 检测到 {len(new_updates)} 本书有更新:")
for b in new_updates[:10]:
print(f" • 《{b['书名']}》({b['类型']})")
        # === Genre statistics chart ===
if input("📊 是否生成小说类型饼图?(y/n): ").strip().lower() == 'y':
plot_distribution(type_counter)
        # === Search & download ===
query = input("📥 输入要搜索的书名关键词(回车跳过): ").strip()
if query:
matches = [b for b in books if query in b["书名"]]
if matches:
print(f"🔎 找到 {len(matches)} 本匹配书籍:")
for b in matches:
print(f" • 《{b['书名']}》| 类型:{b['类型']} | 作者:{b['作者']} | 链接:{b['详情链接']}")
target = matches[0]
if input("👉 是否下载该书TXT?(y/n): ").strip().lower() == 'y':
                downloaded = False  # whether any download path has succeeded
try:
                    # Path 1: try to extract the real download link from the detail page
dl_url = get_real_download_url(target["详情链接"])
if dl_url:
try:
print(f"🔗 正在从 {dl_url} 下载正文...")
response = requests.get(dl_url, headers={"User-Agent": USER_AGENT}, timeout=TIMEOUT)
response.raise_for_status()
response.encoding = 'gbk'
novel_text = response.text.strip()
if novel_text and len(novel_text) > 10:
download_as_txt(target["书名"], novel_text)
downloaded = True
else:
print("⚠️ 下载内容为空或过短,尝试备选方案...")
except Exception as e:
print(f"❌ 下载失败(方式1):{e}")
                    # Path 2: if not downloaded yet, build the TXT endpoint from the book id
if not downloaded:
book_id = extract_book_id_from_url(target["详情链接"])
if book_id:
print(f"🔁 尝试使用 ID 构造接口:id={book_id}")
novel_text = fetch_novel_content_via_id(book_id)
if novel_text:
download_as_txt(target["书名"], novel_text)
downloaded = True
else:
print("⚠️ 接口返回为空,尝试最终回退方案...")
                    # Last resort: save only the chapter list as placeholder content
if not downloaded:
print("⚠️ 所有方式失败,使用章节列表作为替代内容...")
try:
driver.get(target["详情链接"])
wait.until(EC.presence_of_element_located((By.ID, "list-chapterAll")))
chapter_soup = BeautifulSoup(driver.page_source, "html.parser")
chapters = chapter_soup.find("dl", id="list-chapterAll")
content = "\n".join([a.get_text() for a in chapters.find_all("a")]) \
if chapters else "正文未能获取"
download_as_txt(target["书名"], content)
except Exception as e:
print(f"❌ 回退方案也失败:{e}")
except Exception as e:
print(f"❌ 下载过程发生异常:{e}")
else:
print("❗ 未找到相关书籍。")
except Exception as e:
print(f"❌ 浏览器启动失败或发生异常:{e}")
finally:
if driver:
input("👉 按回车键关闭浏览器…")
driver.quit()
print("✅ 爬取任务结束。")