爬取起点中文网小说信息并存储到Excel-CSDN博客

本文链接：https://blog.csdn.net/m0_60255954/article/details/127876921

使用requests、xpath获取某中文网小说信息

import xlwt
import requests
from lxml import etree
import time
def _first_text(node, path):
    """Return the first result of ``node.xpath(path)``, or '' if there is none."""
    found = node.xpath(path)
    return found[0] if found else ''


def getOnePage(url, timeout=10):
    """Download one listing page and yield one dict per novel entry on it.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the scrape indefinitely.

    Yields:
        dict: Keys ``title``, ``author``, ``style``, ``complete`` and
        ``introduce``. A field whose node is missing from an entry is
        yielded as an empty string instead of raising ``IndexError``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status. Because this is a
            generator, the error surfaces on the first iteration.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into zero rows.
    response.raise_for_status()
    selector = etree.HTML(response.text)
    # Each <li> under the listing <ul> is treated as one novel entry.
    for info in selector.xpath('//ul[@class="all-img-list cf"]/li'):
        # The style is the text of the 2nd and 3rd links concatenated.
        style_1 = _first_text(info, 'div[2]/p[1]/a[2]/text()')
        style_2 = _first_text(info, 'div[2]/p[1]/a[3]/text()')
        yield {
            # Title
            'title': _first_text(info, 'div[2]/h2/a/text()'),
            # Author
            'author': _first_text(info, 'div[2]/p[1]/a[1]/text()'),
            # Style
            'style': style_1 + style_2,
            # Completion status
            'complete': _first_text(info, 'div[2]/p[1]/span/text()'),
            # Introduction, with surrounding whitespace removed
            'introduce': _first_text(info, 'div[2]/p[2]/text()').strip(),
        }

# Column titles written to the first row of the sheet.
header = ['标题', '作者', '类型', '完成度', '介绍']
# Keys of the dicts yielded by getOnePage, in the same order as the columns.
fields = ('title', 'author', 'style', 'complete', 'introduce')
# Create the workbook and add a sheet named "novels".
book = xlwt.Workbook(encoding='utf-8')
sheet = book.add_sheet('novels')
# Write the header row.
for col, title in enumerate(header):
    sheet.write(0, col, title)
# URLs of listing pages 1 through 10.
urls = ['https://www.qidian.com/all/page{}/'.format(page) for page in range(1, 11)]
# Next free row in the sheet; row 0 holds the header.
i = 1
# Scrape every page and store each novel as one row of the sheet.
try:
    for url in urls:
        for novel in getOnePage(url):
            print(novel)
            # Short pause after every row; it also spaces out page requests.
            time.sleep(0.1)
            for col, key in enumerate(fields):
                sheet.write(i, col, novel[key])
            i += 1
finally:
    # Write the workbook to novel.xls even if a page failed, so the rows
    # collected so far are not lost.
    book.save('novel.xls')

爬虫笔记04