简介
Python爬虫爬取小说
目标网址:http://www.jingcaiyuedu.com/book/91703.html
开发环境:win10 python3.6
import requests
import re

# Download the novel's index page (the paste target site).
url = 'http://www.jingcaiyuedu.com/book/91703.html'
# Plain GET; the site serves the chapter list without authentication.
response = requests.get(url)
# Force the declared page encoding so .text decodes correctly.
response.encoding = 'utf-8'
# Raw HTML of the novel's landing page.
html = response.text

# Novel title, taken from the Open Graph meta tag.
title = re.findall(r'<meta property="og:title" content="(.*?)"/>', html, re.S)[0]

# The page contains several <dl class="panel-body panel-chapterlist"> blocks;
# index [1] is the full chapter list (index [0] is the "latest chapters" teaser
# — assumption based on the original code's choice; confirm against the page).
dl = re.findall(r'<dl class="panel-body panel-chapterlist">.*?</dl>', html, re.S)[1]
# Each entry is a (relative_url, chapter_title) pair.
chapter_info_list = re.findall(r'href="(.*?)">(.*?)<', dl)

# Save every chapter into "<title>.txt"; `with` guarantees the file is
# closed even if a request fails mid-run (the original leaked the handle).
with open('%s.txt' % title, 'w', encoding='utf-8') as fb:
    for chapter_url, chapter_title in chapter_info_list:
        # Chapter links are site-relative; make them absolute.
        chapter_url = "http://www.jingcaiyuedu.com%s" % chapter_url
        print(chapter_url, chapter_title)
        # Download one chapter page.
        chapter_response = requests.get(chapter_url)
        # BUG FIX: was 'uft-8' (typo), which broke decoding of every chapter.
        chapter_response.encoding = 'utf-8'
        chapter_html = chapter_response.text
        # Chapter text lives in <div class="panel-body" id="htmlContent">...</div>.
        chapter_content = re.findall(
            r'<div class="panel-body" id="htmlContent">.*?</div>',
            chapter_html, re.S)[0]
        # Strip markup / whitespace noise. (The original had the same
        # replace(' ', '') twice; one copy was redundant and is dropped.)
        chapter_content = chapter_content.replace(' ', '')
        chapter_content = chapter_content.replace('<br>', '\n')
        # BUG FIX: '<br/>' was replaced with '' which glued lines together;
        # map it to '\n' like '<br>'.
        chapter_content = chapter_content.replace('<br/>', '\n')
        chapter_content = chapter_content.replace('<p>', '')
        chapter_content = chapter_content.replace('</p>', '')
        chapter_content = chapter_content.replace('</div>', '')
        # The spaces were already stripped above, so the opening tag to
        # remove has none either.
        chapter_content = chapter_content.replace('<divclass="panel-body"id="htmlContent">', '')
        # Persist: title on its own line, then the chapter body.
        fb.write(chapter_title)
        fb.write('\n')
        fb.write(chapter_content)
        fb.write('\n\n')
        print(chapter_url)