简介
Python爬虫爬取小说
目标网址:http://www.jingcaiyuedu.com/book/91703.html
开发环境:win10 python3.6
import requests
import re

# Download the novel's index page (the paste target site).
url = 'http://www.jingcaiyuedu.com/book/91703.html'
# Plain GET; the site serves the chapter list without authentication.
response = requests.get(url)
# Force the declared page encoding so .text decodes correctly.
response.encoding = 'utf-8'
# Raw HTML of the novel's landing page.
html = response.text

# Novel title, taken from the Open Graph meta tag.
title = re.findall(r'<meta property="og:title" content="(.*?)"/>', html, re.S)[0]

# The page contains several <dl class="panel-body panel-chapterlist"> blocks;
# index [1] is the full chapter list (index [0] is the "latest chapters" teaser
# — assumption based on the original code's choice; confirm against the page).
dl = re.findall(r'<dl class="panel-body panel-chapterlist">.*?</dl>', html, re.S)[1]
# Each entry is a (relative_url, chapter_title) pair.
chapter_info_list = re.findall(r'href="(.*?)">(.*?)<', dl)

# Save every chapter into "<title>.txt"; `with` guarantees the file is
# closed even if a request fails mid-run (the original leaked the handle).
with open('%s.txt' % title, 'w', encoding='utf-8') as fb:
    for chapter_url, chapter_title in chapter_info_list:
        # Chapter links are site-relative; make them absolute.
        chapter_url = "http://www.jingcaiyuedu.com%s" % chapter_url
        print(chapter_url, chapter_title)
        # Download one chapter page.
        chapter_response = requests.get(chapter_url)
        # BUG FIX: was 'uft-8' (typo), which broke decoding of every chapter.
        chapter_response.encoding = 'utf-8'
        chapter_html = chapter_response.text
        # Chapter text lives in <div class="panel-body" id="htmlContent">...</div>.
        chapter_content = re.findall(
            r'<div class="panel-body" id="htmlContent">.*?</div>',
            chapter_html, re.S)[0]
        # Strip markup / whitespace noise. (The original had the same
        # replace(' ', '') twice; one copy was redundant and is dropped.)
        chapter_content = chapter_content.replace(' ', '')
        chapter_content = chapter_content.replace('<br>', '\n')
        # BUG FIX: '<br/>' was replaced with '' which glued lines together;
        # map it to '\n' like '<br>'.
        chapter_content = chapter_content.replace('<br/>', '\n')
        chapter_content = chapter_content.replace('<p>', '')
        chapter_content = chapter_content.replace('</p>', '')
        chapter_content = chapter_content.replace('</div>', '')
        # The spaces were already stripped above, so the opening tag to
        # remove has none either.
        chapter_content = chapter_content.replace('<divclass="panel-body"id="htmlContent">', '')
        # Persist: title on its own line, then the chapter body.
        fb.write(chapter_title)
        fb.write('\n')
        fb.write(chapter_content)
        fb.write('\n\n')
        print(chapter_url)