import requests
import re
import parsel
import json
import os
from concurrent.futures import ThreadPoolExecutor
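# Simple novel downloader for m.biquge.su: search for a book by name, scrape its
# chapter list, and download every chapter's text in parallel with a thread pool.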
def doGet(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
    }
    return requests.get(url=url, headers=headers)
def getBookInfo(bookname, host):
    """Search the site for a book name and return basic info for every result row."""
    searchUrl = f'{host}/search.php?searchkey={bookname}'
    response = doGet(searchUrl)
    selector = parsel.Selector(response.text)
    bookname_table = selector.xpath('//*[@id="wrapper"]/table')[0]
    bookname_table_trs = bookname_table.css('table tr')
    books_info = []
    for index, tr in enumerate(bookname_table_trs):
        # The first row is the table header, skip it.
        if index == 0:
            continue
        book_href = tr.css('td')[0].css('a')[0].attrib['href']
        book_name = tr.css('td')[0].css('a')[0].css('::text').get()
        book_new_chapter = tr.css('td')[1].css('a').css('::text').get()
        book_author = tr.css('td')[2].css('::text').get()
        book_state = tr.css('td')[4].css('::text').get()
        book_info = {
            'book_name': book_name,
            'book_href': book_href,
            'book_new_chapter': book_new_chapter,
            'book_author': book_author,
            'book_state': book_state,
        }
        books_info.append(book_info)
    return books_info
def getAllChapterLink(url):
    """Fetch a book's index page and return the href of every chapter link."""
    print(f'Fetching chapter list: {url}')
    response = doGet(url)
    selector = parsel.Selector(response.text)
    chapter_list = selector.xpath('//*[@id="list"]/dl/dd/a')
    chapter_href_list = []
    for chapter in chapter_list:
        chapter_href_list.append(chapter.attrib['href'])
    return chapter_href_list
def getChapterInfo(url):
    """Fetch a chapter page and return its title and plain-text body."""
    response = doGet(url)
    selector = parsel.Selector(response.text)
    booktitle = selector.xpath('//*[@id="bgdiv"]/div[2]/div[1]/h1/text()').get()
    booktext = selector.xpath('//div[@id="booktext"]').get()
    # Strip nested <div> wrappers, then turn <br> into line breaks.
    booktext = re.sub(r'<div.*?</div>', '', booktext)
    booktext = re.sub(r'<div.*?>', '', booktext)
    booktext = re.sub(r'</div>', '', booktext)
    return booktitle, booktext.replace('<br>', '\r\n')
def getStoryAndDownload(host, book_href, book_dir, executor):
    """Collect all chapter links of a book and submit one download task per chapter."""
    print(f'book_href={book_href}, book_dir={book_dir}')
    allChapterLink = getAllChapterLink(host + book_href)
    print(f'allChapterLink.length={len(allChapterLink)}')
    for index, chapterLink in enumerate(allChapterLink):
        executor.submit(saveChapter, host=host, chapterLink=chapterLink, index=index, book_dir=book_dir)
def saveChapter(host, chapterLink, index, book_dir):
    """Download a single chapter and write it to its own text file."""
    booktitle, booktext = getChapterInfo(host + chapterLink)
    print(f'Writing chapter: {booktitle}')
    with open(f'{book_dir}/{index}-{booktitle}.txt', 'w', encoding='utf-8') as file:
        file.write(booktext)
    print(f'{book_dir}/{index}-{booktitle}.txt written')
def saveBooksInfo(book, host):
    """Create a directory for a book and download all of its chapters in parallel."""
    book_name = book['book_name']
    book_href = book['book_href']
    parentPath = os.path.dirname(os.path.abspath(__file__))
    book_dir = os.path.join(parentPath, book_name)
    if not os.path.exists(book_dir):
        os.mkdir(book_dir)
    else:
        print(f'Book directory already exists, skipping: {book_name}')
        return
    allChapterLink = getAllChapterLink(host + book_href)
    # The context manager waits for all chapter downloads to finish before returning.
    with ThreadPoolExecutor(max_workers=16) as executor:
        for index, chapterLink in enumerate(allChapterLink):
            executor.submit(saveChapter, host=host, chapterLink=chapterLink, index=index, book_dir=book_dir)
if __name__ == '__main__':
    host = 'https://m.biquge.su'
    books_info = getBookInfo('斗罗大陆', host=host)
    print('books_info=', books_info)
    # One shared thread pool downloads the chapters of every matched book.
    with ThreadPoolExecutor(max_workers=16) as executor:
        for book in books_info:
            book_name = book['book_name']
            book_href = book['book_href']
            parentPath = os.path.dirname(os.path.abspath(__file__))
            book_dir = os.path.join(parentPath, book_name)
            if not os.path.exists(book_dir):
                os.mkdir(book_dir)
            else:
                print(f'Book directory already exists, skipping: {book_name}')
                continue
            allChapterLink = getAllChapterLink(host + book_href)
            for index, chapterLink in enumerate(allChapterLink):
                executor.submit(saveChapter, host=host, chapterLink=chapterLink, index=index, book_dir=book_dir)