1. Fetch the page content:
   - Send an HTTP request with the requests library and retrieve the page HTML.
2. Parse the book info:
   - Parse the page with BeautifulSoup and extract the book title.
3. Fetch the chapter content:
   - Supports chapters split across multiple pages; the "下一页" (next page) link is followed automatically until the chapter ends.
4. Save the chapter content:
   - A folder named after the book is created, and each chapter is saved as a text file named after the chapter title.
5. Crawl the next chapter automatically:
   - The "下一章" (next chapter) link is checked and followed until there is no next chapter.

This program is provided for practice purposes only.
import os

import requests
from bs4 import BeautifulSoup

# Target page URL (first chapter of the novel)
url = "http://www.wxkushu.net/files/article/xiaoshuo/328/328229/84971906.html"


# Fetch the page content
def get_page_content(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Request failed, status code: {response.status_code}")
    return response.text


# Parse the book info (book title)
def parse_book_info(html):
    soup = BeautifulSoup(html, "html.parser")
    # The book title sits in the page's <h1> tag
    book_title = soup.find("h1").text.strip()
    return book_title


# Fetch chapter content (supports chapters split across multiple pages)
def get_chapter_content(chapter_url):
    content = ""
    current_url = chapter_url
    while True:
        html = get_page_content(current_url)
        soup = BeautifulSoup(html, "html.parser")
        content_div = soup.find("div", id="content")
        if content_div:
            content += content_div.text.strip() + "\n"
        # Follow the "下一页" (next page) link within the chapter, if any
        next_page = soup.find("a", text="下一页")
        if next_page and "href" in next_page.attrs:
            current_url = "http://www.wxkushu.net" + next_page["href"]
        else:
            break
    return content.strip()


# Save chapter content to <book title>/<chapter title>.txt
def save_chapter(book_title, chapter_title, content):
    # Create the book folder if it does not exist yet
    if not os.path.exists(book_title):
        os.makedirs(book_title)
    file_path = os.path.join(book_title, f"{chapter_title}.txt")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)
    print(f"Chapter saved: {file_path}")


# Main entry point
def main():
    try:
        # Fetch the starting page and parse the book info
        html = get_page_content(url)
        book_title = parse_book_info(html)
        print(f"Book title: {book_title}")

        # Initialize the current chapter
        current_url = url
        chapter_number = 1
        while current_url:
            print(f"Downloading chapter {chapter_number}")
            # Placeholder title "第 N 章" ("Chapter N"); the real heading is not extracted here
            chapter_title = f"第 {chapter_number} 章"
            content = get_chapter_content(current_url)
            save_chapter(book_title, chapter_title, content)

            # Look for a "下一章" (next chapter) link on the chapter's first page (re-fetched here)
            html = get_page_content(current_url)
            soup = BeautifulSoup(html, "html.parser")
            next_chapter = soup.find("a", text="下一章")
            if next_chapter and "href" in next_chapter.attrs:
                current_url = "http://www.wxkushu.net" + next_chapter["href"]
                chapter_number += 1
            else:
                break

        print("All chapters downloaded!")
    except Exception as e:
        print(f"Error: {str(e)}")


# Run the program
if __name__ == "__main__":
    main()
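The step list says each chapter file is named after its chapter title, while main() uses the sequential placeholder "第 N 章". Below is a minimal sketch of how the real heading could be read from each chapter page instead, assuming the heading sits in that page's <h1> tag (the same tag parse_book_info reads for the book title; this selector is an assumption, not verified against the site's markup). get_chapter_title is a hypothetical helper, not part of the original script.

import re

# Minimal sketch: pull the chapter heading from a chapter page's <h1> tag
# (assumed location) and fall back to the sequential placeholder if absent.
def get_chapter_title(html, fallback):
    soup = BeautifulSoup(html, "html.parser")
    h1 = soup.find("h1")
    title = h1.text.strip() if h1 else fallback
    # Replace characters that are not allowed in file names
    return re.sub(r'[\\/:*?"<>|]', "_", title) or fallback

In main(), the placeholder line could then become chapter_title = get_chapter_title(get_page_content(current_url), f"第 {chapter_number} 章"), or the HTML already fetched later in the loop could be reused to avoid the extra request.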