Over the past couple of years, copyright disputes have left most sites either blocking downloads or removing their hosted novels outright. After spending ages failing to find a decent download site, I decided to write my own scraper.
It's single-threaded and therefore a bit slow; for personal use it could be made multi-threaded (a sketch follows the first script below).
import requests
from bs4 import BeautifulSoup
def get_response(html_url):
    # Fetch a page as text; apparent_encoding guesses the page's real
    # charset so GBK-encoded pages don't come back garbled.
    headers = {
        "Referer": "https://m.3yt.org/",
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    }
    response = requests.get(url=html_url, headers=headers)
    response.encoding = response.apparent_encoding
    return response.text
# Collect every table-of-contents page from the page-select dropdown
def get_page_list(novel_name, novel_url):
    response = get_response(novel_url)
    soup = BeautifulSoup(response, 'html.parser')
    divs = soup.find('div', class_='pagelist')
    for op in divs.find_all('option'):
        page_link = op['value']  # relative URL of one TOC page
        get_title_list(novel_name, 'https://m.3yt.org' + page_link)
# Collect every chapter link on one table-of-contents page
def get_title_list(novel_name, novel_url):
    response = get_response(novel_url)
    soup = BeautifulSoup(response, 'html.parser')
    read_ul = soup.find('ul', class_='read')
    for li in read_ul.find_all('li'):
        chapter_link = li.a['href']  # chapter URL
        chapter_name = li.a.text.strip()  # chapter title
        get_one_novel(novel_name, chapter_name, 'https://m.3yt.org' + chapter_link)
# Download a single chapter
def get_one_novel(novel_name, chapter_name, novel_url):
    content_str = ""
    response = get_response(novel_url)
    soup = BeautifulSoup(response, 'html.parser')
    div = soup.find('div', class_='content')
    for p in div.find_all('p'):
        chapter_row = p.text.strip()
        content_str = content_str + '\n' + chapter_row
    save(novel_name, chapter_name, content_str)
    print(chapter_name + " saved")
# Append one chapter to the output file
def save(novel_name, title, content):
    filename = f'{novel_name}.txt'
    # Remember the .txt suffix; mode 'a' appends, so chapters accumulate
    # in download order, and encoding sets the output charset
    with open(filename, mode='a', encoding='utf-8') as f:
        f.write(title)
        f.write('\n')
        f.write(content)
        f.write('\n')
name = "我的谍战日记"
novel_url = "https://m.3yt.org/ml/317857/"
get_page_list(name, novel_url)
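
As mentioned at the top, this version is single-threaded. A minimal multi-threading sketch, assuming the chapter list is first collected into a list of (chapter_name, chapter_url) pairs (for instance by a variant of get_title_list that appends to a list instead of downloading); executor.map returns results in submission order, so the output file still ends up in reading order even though the fetches run concurrently:

from concurrent.futures import ThreadPoolExecutor

def fetch_chapter(args):
    # Fetch and parse one chapter, returning the text instead of saving,
    # so writes can stay sequential while fetches run in parallel
    chapter_name, chapter_url = args
    soup = BeautifulSoup(get_response(chapter_url), 'html.parser')
    div = soup.find('div', class_='content')
    text = '\n'.join(p.text.strip() for p in div.find_all('p'))
    return chapter_name, text

def download_concurrently(novel_name, chapters, workers=8):
    # chapters: list of (chapter_name, chapter_url) pairs from the TOC pages
    with ThreadPoolExecutor(max_workers=workers) as executor:
        for chapter_name, content in executor.map(fetch_chapter, chapters):
            save(novel_name, chapter_name, content)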
--------------
Once the download finished, I discovered that some chapters span multiple pages, which the script above never follows, and the overall flow didn't feel great anyway. So I reworked it to start from chapter one (or from any specified chapter) and download by simulating the reader's page turns. Details below:
import requests
from bs4 import BeautifulSoup
def get_response(html_url):
    # Same fetch helper as in the first script
    headers = {
        "Referer": "https://m.3yt.org/",
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    }
    response = requests.get(url=html_url, headers=headers)
    response.encoding = response.apparent_encoding
    return response.text
# Download one chapter page, then keep following the pager's "next" link.
# A while loop rather than recursion: a full novel runs to thousands of
# pages, which would exceed Python's default recursion limit.
def get_one_novel(novel_name, novel_url):
    while novel_url:
        response = get_response(novel_url)
        soup = BeautifulSoup(response, 'html.parser')
        title = soup.find('title').text.strip()
        div = soup.find('div', class_='content')
        content_str = ""
        for p in div.find_all('p'):
            content_str = content_str + '\n' + p.text.strip()
        save(novel_name, title, content_str)
        print(title + " saved")
        # The pager's third link is the "next page" button; stop when the
        # pager has fewer links, i.e. the end of the novel
        div_next = soup.find('div', class_='pager')
        a_tags = div_next.find_all('a') if div_next else []
        if len(a_tags) >= 3:
            novel_url = 'https://m.3yt.org' + a_tags[2].get('href')
        else:
            novel_url = None
# Append one chapter page to the output file
def save(novel_name, title, content):
    filename = f'{novel_name}.txt'
    with open(filename, mode='a', encoding='utf-8') as f:
        f.write(title)
        f.write('\n')
        f.write(content)
        f.write('\n')
name = "我的谍战日记"
# URL of the chapter to start from (chapter one, or any later chapter)
novel_url = "https://m.3yt.org/ml/317857/133956324.html"
get_one_novel(name, novel_url)
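
One fragility shared by both scripts: requests.get is called without a timeout, so a single stalled connection hangs a crawl that may span thousands of pages. A hedged hardening sketch for get_response, using only standard requests features (timeout, raise_for_status) plus a simple retry loop; the retry count and pause are arbitrary choices, not anything the site requires:

import time

def get_response(html_url, retries=3, timeout=10):
    headers = {
        "Referer": "https://m.3yt.org/",
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    }
    for attempt in range(retries):
        try:
            response = requests.get(url=html_url, headers=headers, timeout=timeout)
            response.raise_for_status()  # treat 4xx/5xx responses as failures
            response.encoding = response.apparent_encoding
            return response.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # give up after the last retry
            time.sleep(2)  # brief pause before retrying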