# Scrape novel information from a Chinese web-novel site using requests and XPath.
import xlwt
import requests
from lxml import etree
import time
def _first_text(node, path, default=''):
    """Return the first match of ``node.xpath(path)``, or ``default`` if none."""
    matches = node.xpath(path)
    return matches[0] if matches else default


def getOnePage(url, timeout=10):
    """Download one listing page and lazily yield one dict per novel on it.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the scraper forever.

    Yields:
        dict with the keys 'title', 'author', 'style', 'complete' and
        'introduce'. 'style' is the text of the second and third links of
        the first paragraph concatenated. A field whose node is absent is
        yielded as an empty string instead of raising IndexError.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    html = requests.get(url, timeout=timeout)
    selector = etree.HTML(html.text)
    # Each <li> under the <ul class="all-img-list cf"> node is one novel entry.
    infos = selector.xpath('//ul[@class="all-img-list cf"]/li')
    for info in infos:
        style_1 = _first_text(info, 'div[2]/p[1]/a[2]/text()')
        style_2 = _first_text(info, 'div[2]/p[1]/a[3]/text()')
        yield {
            # Title
            'title': _first_text(info, 'div[2]/h2/a/text()'),
            # Author
            'author': _first_text(info, 'div[2]/p[1]/a[1]/text()'),
            # Style: the two style link texts joined together
            'style': style_1 + style_2,
            # Completion status
            'complete': _first_text(info, 'div[2]/p[1]/span/text()'),
            # Introduction, with surrounding whitespace removed
            'introduce': _first_text(info, 'div[2]/p[2]/text()').strip(),
        }
# Column captions written to the first row of the sheet.
header = ['标题', '作者', '类型', '完成度', '介绍']
# Keys of the dicts yielded by getOnePage, in the same column order as header.
fields = ['title', 'author', 'style', 'complete', 'introduce']
# Create the Workbook object.
book = xlwt.Workbook(encoding='utf-8')
# Add a sheet named "novels".
sheet = book.add_sheet('novels')
# Write the header row.
for col, caption in enumerate(header):
    sheet.write(0, col, caption)
# Build the URLs of the first 10 listing pages.
urls = ['https://www.qidian.com/all/page{}/'.format(page) for page in range(1, 11)]
# Row 0 holds the header, so novel data starts at row 1.
row = 1
# Scrape every page and store each novel as one row of the sheet.
try:
    for url in urls:
        for novel in getOnePage(url):
            print(novel)
            time.sleep(0.1)
            for col, key in enumerate(fields):
                sheet.write(row, col, novel[key])
            row += 1
finally:
    # Save in a finally block so rows collected before a failure part-way
    # through are still written to novel.xls rather than lost.
    book.save('novel.xls')