# Simple scraper: fetch the (freely visible) chapters of a web novel and save them to a file.
import requests
import parsel
# Analyse the page; start by scraping the first chapter.
# Always send a User-Agent header with every request: without one,
# frequent requests are likely to be throttled or blocked by the site.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
# Catalogue endpoint (returns JSON) for one specific book; the CSRF token
# and bookId are hard-coded for that book.
url = 'https://read.qidian.com/ajax/book/category?_csrfToken=v9DxpKiCAUhmnK3w3iU7Mahacm8F99gkGfvpcv60&bookId=1015209014'
def get_text(url):
    """Fetch a single chapter page, print its text and append it to xx.txt.

    Parameters
    ----------
    url : str
        Full URL of one chapter page.
    """
    response = requests.get(url, headers=headers)
    sel = parsel.Selector(response.text)
    # Chapter title via CSS selector. `.get()` returns None when the
    # selector matches nothing (e.g. the page layout changed); fall back
    # to an empty string so the '\n'.join below cannot raise TypeError.
    chapter_name = sel.css('span.content-wrap::text').get() or ''
    # Body paragraphs of the chapter.
    paragraphs = sel.css('div.read-content.j_readContent > p::text').getall()
    # Build the output string once instead of joining twice
    # (the original called '\n'.join for print and again for write).
    content = '\n'.join([chapter_name] + paragraphs)
    print(content)
    # Append mode so successive chapters accumulate in one file.
    # TODO: derive the file name from the book title instead of 'xx.txt'.
    with open('xx.txt', mode='a', encoding='utf-8') as f:
        f.write(content)
# Fetch the book's chapter catalogue (JSON) once.
response = requests.get(url, headers=headers)
response.encoding = response.apparent_encoding
# Parse the JSON body a single time. The original called response.json()
# inside both nested loops, re-parsing the whole payload on every
# iteration -- accidental O(n^2) work.
volumes = response.json()['data']['vs']
chapter_names = []
chapter_urls = []
# Walk the volumes ('vs') and their chapters ('cs'). The key names below
# must match the actual JSON layout of the catalogue endpoint.
for vol_index, volume in enumerate(volumes):
    for chapter in volume['cs']:
        chapter_names.append(volume['vN'] + ' ' + chapter['cN'])
        if vol_index == 0:  # only the first volume holds the free chapters
            chapter_urls.append('https://read.qidian.com/chapter/' + chapter['cU'])
for chapter_url in chapter_urls:
    get_text(chapter_url)  # download and save each free chapter