Scraping novels from Qidian (起点中文网) with a Python requests crawler

1. Page analysis. Go to https://www.qidian.com/, click "All" (全部), and page through the listings. You will notice a pattern in the URL:

url = https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=0 (1, 2, 3, ...)

Only the page parameter changes, so we can build the URL like this:

url = 'https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=' + str(start)


if __name__ == '__main__':
    for i in range(1, 6):
        gethtml(start=i)
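
In practice Qidian may return a blocked or stripped-down page for requests that carry no browser User-Agent. A minimal sketch of sending one with every request (the HEADERS dict and fetch helper are illustrative additions, not part of the original code):

import requests

HEADERS = {
    # any common desktop browser UA string works here
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}

def fetch(url):
    resp = requests.get(url, headers=HEADERS)
    resp.raise_for_status()  # fail fast on 4xx/5xx instead of parsing an error page
    return resp.text

You can then call etree.HTML(fetch(url)) anywhere the code below uses requests.get(url).text.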

2. Fetch the novel list page source.

import requests
from lxml import etree
import os

def gethtml(start):
    url = 'https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=' + str(start)
    html = requests.get(url)
    page = etree.HTML(html.text)
    # Novel titles and the (protocol-relative) links to their detail pages
    titlelist = page.xpath('//div[@class="book-mid-info"]/h4/a/text()')
    titlelinklist = page.xpath('//div[@class="book-mid-info"]/h4/a/@href')
    for title, titlelink in zip(titlelist, titlelinklist):
        if not os.path.exists(title):  # if no folder named after this novel exists yet
            os.mkdir(title)            # create one to hold its chapters
        get_son_html(title, titlelink)
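
Qidian's markup changes from time to time, so it is worth sanity-checking the XPath selectors before crawling every book. A small debugging sketch reusing the same selectors (check_selectors is a hypothetical helper, not part of the crawler):

def check_selectors(start=1):
    url = 'https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=' + str(start)
    page = etree.HTML(requests.get(url).text)
    titles = page.xpath('//div[@class="book-mid-info"]/h4/a/text()')
    links = page.xpath('//div[@class="book-mid-info"]/h4/a/@href')
    print(len(titles), 'titles matched on page', start)
    for t, l in zip(titles[:3], links[:3]):
        print(t, '->', l)  # links should be protocol-relative, e.g. //book.qidian.com/info/...

If the count prints 0, the class names in the XPath expressions need to be updated against the current page source.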

 

3. Fetch each novel's page source (its chapter list).

def get_son_html(title, titlelink):
    # titlelink is protocol-relative (starts with //), so prepend the scheme
    html = requests.get('https:' + titlelink)
    page = etree.HTML(html.text)
    # Chapter titles and chapter links from the catalogue list
    son_titlelist = page.xpath('//ul[@class="cf"]/li/a/text()')
    son_linklist = page.xpath('//ul[@class="cf"]/li/a/@href')
    for son_title, son_link in zip(son_titlelist, son_linklist):
        save(son_title, son_link, title)
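
The chapter links returned by the catalogue XPath are protocol-relative (they start with //), which is why 'https:' is prepended before requesting them. To stay robust if the site switches to absolute or relative paths, urljoin covers all three cases; a small sketch (normalize_link is an illustrative helper, not in the original code):

from urllib.parse import urljoin

def normalize_link(link, base='https://book.qidian.com/'):
    # urljoin leaves absolute URLs untouched, adds a scheme to //host/... links,
    # and resolves relative paths against the base
    return urljoin(base, link)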

4. Fetch each chapter page's source and save its content.

def save(son_title, son_link, title):
    html = requests.get('https:' + son_link)
    page = etree.HTML(html.text)
    # Join all <p> paragraphs of the chapter body with newlines
    content = '\n'.join(page.xpath('//div[@class="read-content j_readContent"]/p/text()'))
    # Store each chapter inside the novel's folder as <chapter title>.txt
    filename = os.path.join(title, son_title + '.txt')
    print('Saving chapter...', filename)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)
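
Chapter titles sometimes contain characters that are illegal in file names (for example ? * : " on Windows), which makes open() fail. A small sketch of cleaning the title before building the path (safe_name is an assumed helper, not part of the original code):

import re

def safe_name(name):
    # replace characters most filesystems reject in file names
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

# then inside save():
# filename = os.path.join(title, safe_name(son_title) + '.txt')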

5. Complete code.

import requests
from lxml import etree
import os

def gethtml(start):
    url = 'https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=' + str(start)
    html = requests.get(url)
    page = etree.HTML(html.text)
    titlelist = page.xpath('//div[@class="book-mid-info"]/h4/a/text()')
    titlelinklist = page.xpath('//div[@class="book-mid-info"]/h4/a/@href')
    for title, titlelink in zip(titlelist, titlelinklist):
        if not os.path.exists(title):  # if no folder named after this novel exists yet
            os.mkdir(title)            # create one to hold its chapters
        get_son_html(title, titlelink)

def get_son_html(title, titlelink):
    html = requests.get('https:' + titlelink)
    page = etree.HTML(html.text)
    son_titlelist = page.xpath('//ul[@class="cf"]/li/a/text()')
    son_linklist = page.xpath('//ul[@class="cf"]/li/a/@href')
    for son_title, son_link in zip(son_titlelist, son_linklist):
        save(son_title, son_link, title)

def save(son_title, son_link, title):
    html = requests.get('https:' + son_link)
    page = etree.HTML(html.text)
    content = '\n'.join(page.xpath('//div[@class="read-content j_readContent"]/p/text()'))
    filename = os.path.join(title, son_title + '.txt')  # save each chapter as <novel folder>/<chapter>.txt
    print('Saving chapter...', filename)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)

if __name__ == '__main__':
    for i in range(1, 6):  # crawl 5 listing pages
        gethtml(start=i)
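
Crawling every chapter of every book across five listing pages fires a lot of requests in a row. To be polite to the site and reduce the chance of being blocked, a short pause between listing pages is a reasonable addition; a minimal sketch:

import time

if __name__ == '__main__':
    for i in range(1, 6):  # crawl 5 listing pages
        gethtml(start=i)
        time.sleep(1)  # wait a second between listing pages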

 
