1. Fetch the page content:
   - Send an HTTP request with the requests library and retrieve the page HTML.
2. Parse the book info:
   - Parse the page with BeautifulSoup and extract the book title.
3. Fetch the chapter content:
   - Supports chapters split across multiple pages; the "下一页" (next page) link is followed automatically until the chapter ends.
4. Save the chapter content:
   - A folder named after the book is created, and each chapter is saved as a text file named after the chapter title.
5. Crawl the next chapter automatically:
   - The "下一章" (next chapter) link is checked and followed until there is no next chapter.

This program is provided for practice purposes only.
import os

import requests
from bs4 import BeautifulSoup

# Target page URL (first chapter of the novel)
url = "http://www.wxkushu.net/files/article/xiaoshuo/328/328229/84971906.html"


# Fetch the page content
def get_page_content(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Request failed, status code: {response.status_code}")
    return response.text


# Parse the book info (book title)
def parse_book_info(html):
    soup = BeautifulSoup(html, "html.parser")
    # The book title sits in the page's <h1> tag
    book_title = soup.find("h1").text.strip()
    return book_title


# Fetch chapter content (supports chapters split across multiple pages)
def get_chapter_content(chapter_url):
    content = ""
    current_url = chapter_url
    while True:
        html = get_page_content(current_url)
        soup = BeautifulSoup(html, "html.parser")
        content_div = soup.find("div", id="content")
        if content_div:
            content += content_div.text.strip() + "\n"
        # Follow the "下一页" (next page) link within the chapter, if any
        next_page = soup.find("a", text="下一页")
        if next_page and "href" in next_page.attrs:
            current_url = "http://www.wxkushu.net" + next_page["href"]
        else:
            break
    return content.strip()


# Save chapter content to <book title>/<chapter title>.txt
def save_chapter(book_title, chapter_title, content):
    # Create the book folder if it does not exist yet
    if not os.path.exists(book_title):
        os.makedirs(book_title)
    file_path = os.path.join(book_title, f"{chapter_title}.txt")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)
    print(f"Chapter saved: {file_path}")


# Main entry point
def main():
    try:
        # Fetch the starting page and parse the book info
        html = get_page_content(url)
        book_title = parse_book_info(html)
        print(f"Book title: {book_title}")

        # Initialize the current chapter
        current_url = url
        chapter_number = 1
        while current_url:
            print(f"Downloading chapter {chapter_number}")
            # Placeholder title "第 N 章" ("Chapter N"); the real heading is not extracted here
            chapter_title = f"第 {chapter_number} 章"
            content = get_chapter_content(current_url)
            save_chapter(book_title, chapter_title, content)

            # Look for a "下一章" (next chapter) link on the chapter's first page (re-fetched here)
            html = get_page_content(current_url)
            soup = BeautifulSoup(html, "html.parser")
            next_chapter = soup.find("a", text="下一章")
            if next_chapter and "href" in next_chapter.attrs:
                current_url = "http://www.wxkushu.net" + next_chapter["href"]
                chapter_number += 1
            else:
                break

        print("All chapters downloaded!")
    except Exception as e:
        print(f"Error: {str(e)}")


# Run the program
if __name__ == "__main__":
    main()
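The step list says each chapter file is named after its chapter title, while main() uses the sequential placeholder "第 N 章". Below is a minimal sketch of how the real heading could be read from each chapter page instead, assuming the heading sits in that page's <h1> tag (the same tag parse_book_info reads for the book title; this selector is an assumption, not verified against the site's markup). get_chapter_title is a hypothetical helper, not part of the original script.

import re

# Minimal sketch: pull the chapter heading from a chapter page's <h1> tag
# (assumed location) and fall back to the sequential placeholder if absent.
def get_chapter_title(html, fallback):
    soup = BeautifulSoup(html, "html.parser")
    h1 = soup.find("h1")
    title = h1.text.strip() if h1 else fallback
    # Replace characters that are not allowed in file names
    return re.sub(r'[\\/:*?"<>|]', "_", title) or fallback

In main(), the placeholder line could then become chapter_title = get_chapter_title(get_page_content(current_url), f"第 {chapter_number} 章"), or the HTML already fetched later in the loop could be reused to avoid the extra request.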