Python 3.7 crawler: scrape a novel and write it to a txt file --- 趣笔阁

This post walks through a Python program that scrapes a web novel. It uses the requests and BeautifulSoup libraries to fetch pages from the novel site and save the content to a local file, covering both the novel's basic information and every chapter.


Copyright notice: this is an original post by the author; do not repost without permission.
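Before the full script, here is a minimal sketch of the fetch-and-parse pattern it builds on: request a page with requests, hand the HTML to BeautifulSoup, and pull out elements with CSS selectors. The URL and selector below are placeholders for illustration, not the actual 趣笔阁 paths used later.

import requests
from bs4 import BeautifulSoup

# Hypothetical example: fetch a page and print its <h2> headings via a CSS selector.
url = 'https://example.com/'  # placeholder URL, not the novel site
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
resp.encoding = resp.apparent_encoding  # let requests guess the charset
soup = BeautifulSoup(resp.text, 'html.parser')
for h in soup.select('h2'):  # same select() API the script below relies on
    print(h.text.strip())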

import re

import requests
from bs4 import BeautifulSoup

# Request headers
req_header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'UM_distinctid=165c0eb51a95a7-0c5f1e30c8c4cd-323f5c0f-144000-165c0eb51aa7c8; bcolor=; font=; size=; fontcolor=; width=; CNZZDATA1260821856=941700315-1536539302-https%253A%252F%252Fwww.baidu.com%252F%7C1536648034',
    'Host': 'www.biqugex.com',
    'If-Modified-Since': 'Mon, 25 Jun 2018 00:09:02 GMT',
    'If-None-Match': '"1529885342"',
    'Referer': 'https://www.biqugex.com/s.php?ie=gbk&s=9157106854577873494&q=%C6%DF%BD%E7%CE%E4%C9%F1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

# Novel download function
# id: novel ID
# Keys of the txt dict:
# title: novel title
# first_page: first chapter page
# txt_section: chapter URL
# section_name: chapter title
# section_text: chapter body
# section_ct: number of chapter pages
req_url_base = 'https://www.biqugex.com'  # site home page


def get_txt(txt_id):
    txt = {}
    txt['title'] = ''
    txt['id'] = str(txt_id)
    io = None  # file handle, defined up front so the finally block can close it safely

    try:
        print("Enter the ID of the novel to download")
        txt['id'] = input()  # note: this overrides the txt_id argument
        req_url = req_url_base + '/book_' + txt['id'] + '/'  # novel index URL
        # print('Novel URL: ' + req_url)
        res = requests.get(req_url, headers=req_header)  # fetch the novel's index page (headers=, not params=)
        res.encoding = res.apparent_encoding  # let requests guess the page encoding to avoid mojibake
        soups = BeautifulSoup(res.text, "html.parser")  # parse the index page
        txt['title'] = soups.select('.book .info h2')[0].text  # novel title
        txt['author'] = soups.select('.book .info .small span')  # detail fields
        txt['intro'] = soups.select('.book .info .intro')  # novel synopsis
        # txt['author'] indices: 0 author, 1 category, 2 status (ongoing/finished),
        # 3 word count, 4 last update time, 5 latest chapter
        first_page = soups.select('.listmain dl dd a')  # chapter link elements
        # collect every chapter's relative URL (avoid shadowing the built-in name `list`)
        chapter_links = [a['href'] for a in first_page]
        io = open('{0:0>8}-{1}.txt'.format(txt['id'], txt['title']), "ab+")
        io.write(txt['title'].encode('UTF-8'))  # write novel title
        io.write('\r\n'.encode('UTF-8'))
        io.write(txt['author'][0].text.encode('UTF-8'))  # write author
        io.write('\r\n'.encode('UTF-8'))
        io.write(txt['author'][1].text.encode('UTF-8'))  # write category
        io.write('\r\n'.encode('UTF-8'))
        io.write(txt['author'][2].text.encode('UTF-8'))  # write status (ongoing/finished)
        io.write('\r\n'.encode('UTF-8'))
        io.write(txt['author'][3].text.encode('UTF-8'))  # write word count
        io.write('\r\n'.encode('UTF-8'))
        io.write(txt['author'][4].text.encode('UTF-8'))  # write last update time
        io.write('\r\n'.encode('UTF-8'))
        io.write(txt['author'][5].text.encode('UTF-8'))  # write latest chapter
        io.write('\r\n'.encode('UTF-8'))
        io.write(txt['intro'][0].text.encode('UTF-8'))  # write synopsis
        io.write('\r\n'.encode('UTF-8'))
        # download every chapter and append it to the txt file
        # (the first 6 links are the "latest chapters" block at the top of the list, so skip them)
        for i, val in enumerate(chapter_links[6:]):
            r = requests.get(req_url_base + val, headers=req_header)
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, "html.parser")  # parse the chapter page
            section_name = soup.select('#wrapper .book.reader .content h1')[0]  # chapter title
            section_text = soup.select('#wrapper .content .showtxt')[0]  # chapter body
            # strip embedded <script> tags
            for ss in section_text.select("script"):
                ss.decompose()
            section_text = re.sub(r'\s+', '\r\n\t', section_text.text).strip('\r\n')  # normalize whitespace
            # write the chapter to the file
            io.write('\n'.encode('UTF-8'))
            io.write(section_name.text.encode('UTF-8'))  # write chapter title
            io.write('\n'.encode('UTF-8'))
            io.write(section_text.encode('UTF-8'))  # write chapter body
            io.write('\n'.encode('UTF-8'))
            print('Downloaded', section_name.text)  # progress message
    finally:
        print('{0:0>8}-{1}.txt'.format(txt['id'], txt['title']), 'download finished')
        if io is not None:
            io.close()


get_txt(99999)
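The book ID the script asks for is the number in the novel's index URL (https://www.biqugex.com/book_<id>/). One possible refinement, sketched below under the assumption that the req_header dictionary defined above is in scope (fetch_soup is a hypothetical helper, not part of the original script), is to reuse a single requests.Session for all chapter requests and pause briefly between them so the site is not hammered:

import time

import requests
from bs4 import BeautifulSoup

# Sketch only: shared session with the same headers, plus a polite delay per request.
session = requests.Session()
session.headers.update(req_header)  # assumes the req_header dict defined above

def fetch_soup(url, delay=0.5):
    resp = session.get(url)
    resp.encoding = resp.apparent_encoding
    time.sleep(delay)  # wait a little between chapter downloads
    return BeautifulSoup(resp.text, 'html.parser')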


Updated: 2018-09-17 15:15:48
