python爬虫自动下载小说_python下载小说代码 selector.xpath-CSDN博客

本文链接：https://blog.csdn.net/qq_37002981/article/details/138311664
文章介绍了如何使用Python中的requests、parsel库以及正则表达式来爬取并解析一个特定小说网站（如biquge.su）上的《斗罗大陆》章节信息，利用ThreadPoolExecutor进行多线程下载和保存章节内容。
摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >
import requests
# 正则表达式模块
import re
import parsel
import json
import os
from concurrent.futures import ThreadPoolExecutor

def doGet(url, timeout=15):
    """Send a GET request to *url* with a desktop-browser User-Agent.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server (connect / between reads)
            before giving up. ``requests`` applies no timeout by default, so
            without one a stalled connection blocks the caller indefinitely.

    Returns:
        The ``requests.Response`` object for the request.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
    }
    return requests.get(url=url, headers=headers, timeout=timeout)

def getBookInfo(bookname, host):
    """Search *host* for *bookname* and return the books in the result table.

    Args:
        bookname: Search keyword (book title).
        host: Site root including the scheme, without a trailing slash.

    Returns:
        A list of dicts, one per result row, with the keys ``book_name``,
        ``book_href``, ``book_new_chapter``, ``book_author`` and
        ``book_state``. The list is empty when the page has no result table.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
    }

    # Hand the keyword to requests via ``params`` so it is percent-encoded;
    # splicing it into the URL by hand breaks on names containing '&', '#'...
    response = requests.get(
        url=f'{host}/search.php',
        params={'searchkey': bookname},
        headers=headers,
    )
    selector = parsel.Selector(response.text)

    result_tables = selector.xpath('//*[@id="wrapper"]/table')
    if not result_tables:
        # No result table on the page: nothing to report instead of IndexError.
        return []

    books_info = []
    # The first row is the table header, so data starts at the second row.
    for tr in result_tables[0].css('table tr')[1:]:
        cells = tr.css('td')
        if len(cells) < 5:
            # Rows that lack the columns read below cannot be parsed; skip.
            continue
        links = cells[0].css('a')
        book_href = links[0].attrib.get('href') if links else None
        if not book_href:
            # Without a link the book cannot be downloaded later on.
            continue
        books_info.append({
            'book_name': links[0].css('::text').get(),
            'book_href': book_href,
            'book_new_chapter': cells[1].css('a').css('::text').get(),
            'book_author': cells[2].css('::text').get(),
            'book_state': cells[4].css('::text').get(),
        })
    return books_info

def getAllChapterLink(url):
    """Return the href of every chapter link on a book's index page.

    Args:
        url: Absolute URL of the book's chapter-list page.

    Returns:
        A list of href strings, in document order, taken from the anchors
        under the element with id ``list``. Empty when none are found.
    """
    print(f'开始执行:{url}')
    # Use the shared helper so this request carries the browser User-Agent
    # too, instead of the default python-requests one.
    response = doGet(url)
    selector = parsel.Selector(response.text)
    # Selecting @href directly skips anchors that have no href attribute
    # rather than raising KeyError on them.
    return selector.xpath('//*[@id="list"]/dl/dd/a/@href').getall()
        
def getChapterInfo(url):
    """Fetch one chapter page and return its title and body text.

    Args:
        url: Absolute URL of the chapter page.

    Returns:
        A ``(booktitle, booktext)`` tuple. ``booktitle`` is ``None`` when the
        heading is not found. ``booktext`` is the content of the
        ``div#booktext`` element with div markup removed and ``<br>`` turned
        into ``'\\r\\n'``; it is ``''`` when that element is missing.
    """
    # Use the shared helper so the request carries the browser User-Agent.
    response = doGet(url)
    selector = parsel.Selector(response.text)
    booktitle = selector.xpath('//*[@id="bgdiv"]/div[2]/div[1]/h1/text()').get()
    booktext = selector.xpath('//div[@id="booktext"]').get()
    if booktext is None:
        # Container not on the page; re.sub would raise TypeError on None.
        return booktitle, ''

    # Peel off the outer <div id="booktext"> wrapper first. Otherwise the
    # nested-div pattern below can start matching at the wrapper's opening
    # tag and swallow real chapter text up to the first closing </div>.
    booktext = re.sub(r'^\s*<div[^>]*>', '', booktext, count=1)
    booktext = re.sub(r'</div>\s*$', '', booktext, count=1)
    # Drop nested single-line div blocks, then any leftover div tags.
    booktext = re.sub(r'<div.*?</div>', '', booktext)
    booktext = re.sub(r'<div.*?>', '', booktext)
    booktext = re.sub(r'</div>', '', booktext)
    return booktitle, booktext.replace('<br>', '\r\n')

def getStoryAndDownload(host, book_href, book_dir, executor):
    """Queue a download task for every chapter of one book.

    Args:
        host: Site root that chapter and book hrefs are appended to.
        book_href: Href of the book's chapter-list page, relative to *host*.
        book_dir: Existing directory the chapter files are written into.
        executor: Executor (e.g. ``ThreadPoolExecutor``) that runs the
            ``saveChapter`` tasks; this function only submits, it does not
            wait for them.
    """
    # The book name is not passed to this function, so log the target
    # directory instead of relying on a global that may not exist.
    print(f'book_dir={book_dir},book_href={book_href}')
    allChapterLink = getAllChapterLink(host + book_href)
    # f-string: concatenating str + int raised TypeError here.
    print(f'allChapterLink.length={len(allChapterLink)}')
    for index, chapterLink in enumerate(allChapterLink):
        executor.submit(saveChapter, host=host, chapterLink=chapterLink, index=index, book_dir=book_dir)

def saveChapter(host, chapterLink, index, book_dir):
    """Download one chapter and write it to ``<book_dir>/<index>-<title>.txt``.

    Args:
        host: Site root the chapter href is appended to.
        chapterLink: Href of the chapter page, relative to *host*.
        index: Position of the chapter in the book; used as file-name prefix.
        book_dir: Existing directory the file is written into.
    """
    booktitle, booktext = getChapterInfo(host + chapterLink)
    # The title is scraped from the page: it may be None, and it may contain
    # characters that are path separators or are not allowed in file names.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', booktitle or '').strip()
    chapter_path = f'{book_dir}/{index}-{safe_title}.txt'
    print(f'booktitle={booktitle}开始写入')
    with open(chapter_path, 'w', encoding='utf-8') as file:
        file.write(booktext)
    print(f'{chapter_path}已写入完成')

def saveBooksInfo(book, host):
    """Download every chapter of *book* into a new directory beside this file.

    The directory is named after the book. If it already exists the book is
    treated as already handled and nothing is downloaded. Otherwise all
    chapters are fetched on a 16-worker thread pool and this function returns
    once every chapter task has finished.

    Args:
        book: Dict with at least the keys ``book_name`` and ``book_href``.
        host: Site root that hrefs are appended to.
    """
    book_name = book['book_name']
    book_href = book['book_href']
    parentPath = os.path.dirname(os.path.abspath(__file__))
    # os.path.join rather than a hard-coded "\\", which only acts as a
    # separator on Windows.
    book_dir = os.path.join(parentPath, book_name)
    try:
        # Create-and-catch avoids the check-then-create race when several
        # books are processed concurrently.
        os.mkdir(book_dir)
    except FileExistsError:
        print(f'已存在的书名：{book_name}')
        return

    allChapterLink = getAllChapterLink(host + book_href)
    # The with-block shuts the pool down once all tasks are done, so worker
    # threads are not left behind for every book.
    with ThreadPoolExecutor(max_workers=16) as executor:
        futures = [
            executor.submit(saveChapter, host=host, chapterLink=chapterLink, index=index, book_dir=book_dir)
            for index, chapterLink in enumerate(allChapterLink)
        ]
    # Exceptions raised inside a worker are stored on the future; report them
    # so a failed chapter does not go unnoticed.
    for index, future in enumerate(futures):
        error = future.exception()
        if error is not None:
            print(f'章节下载失败：index={index}, error={error!r}')

if __name__=='__main__':
    # Script entry point: search the site for one title and download every
    # matching book that does not yet have a directory next to this file.
    host = 'https://m.biquge.su'
    books_info = getBookInfo('斗罗大陆', host=host)
    print('books_info=', books_info)
    parentPath = os.path.dirname(os.path.abspath(__file__))
    # One pool shared by all books; the with-block waits for the queued
    # chapter downloads and then shuts the pool down cleanly.
    with ThreadPoolExecutor(max_workers=16) as executor:
        for book in books_info:
            book_name = book['book_name']
            book_href = book['book_href']
            # os.path.join rather than a hard-coded "\\", which only acts as
            # a separator on Windows.
            book_dir = os.path.join(parentPath, book_name)
            if os.path.exists(book_dir):
                # An existing directory marks the book as already handled.
                print(f'已存在的书名：{book_name}')
                continue
            os.mkdir(book_dir)

            allChapterLink = getAllChapterLink(host + book_href)
            for index, chapterLink in enumerate(allChapterLink):
                executor.submit(saveChapter, host=host, chapterLink=chapterLink, index=index, book_dir=book_dir)