# 爬取‘笔趣阁’小说网站小说
# 给定小说的地址页面,进行所有章节爬取
# 还有一个问题目前无法解决
# 问题1:Cookie中的数据会定时变化,当变化后,页面即无法访问
# 解决:可以调用第三方的Selenium和PhantomJS,进行无头访问
# 网上很多这两个第三方库的使用方法,就不测试举例了。
# 导入第三方库
import requests
import urllib
import re
from lxml import etree
from bs4 import BeautifulSoup
import time
import chardet
# 获取指定网页内容
def get_novel_html(url, num_retries):
    """Fetch the page at ``url`` and return its decoded HTML text.

    Retries up to ``num_retries`` times (1 s apart) on a 5xx server
    response.  Returns ``None`` when the connection fails or the retry
    budget is exhausted.

    :param url: page URL to fetch
    :param num_retries: remaining retry attempts for server errors
    :return: decoded HTML text, or None on failure
    """
    # Request headers copied verbatim from the browser's F12 panel.
    # NOTE(review): the hard-coded Cookie expires periodically (see the
    # header comment at the top of this file) — requests will fail once
    # it does; Selenium/headless browsing is the suggested workaround.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': '__cdnuid=fb7722fac4ed168b158dec4ed746875a; __guid=129197578.917650827861537500.1534748484770.319; __cdn_clearance=1534755942.64|0|S7lG0rT5P4URQJauAPI%2F7NXBSho%3D; monitor_count=19',
        'Host': 'www.biquge.com.tw',
        'If-Modified-Since': 'Mon, 20 Aug 2018 01:31:37 GMT',
        'If-None-Match': "80625b892538d41:0",
        'Referer': 'http://www.biquge.com.tw/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    try:
        novel_html = requests.get(url, headers=headers)
    except requests.exceptions.ConnectionError as e:
        # Connection-level failure: report and give up on this URL.
        print('请求错误')
        print('错误详情是:', e)
        return None
    # 5xx means a server-side error: wait a second and retry, consuming
    # one unit of the retry budget per attempt.
    if 500 <= novel_html.status_code <= 600:
        if num_retries > 0:
            print('服务器错误,正在重试。。')
            time.sleep(1)
            # BUG FIX: the original called itself recursively but dropped
            # the result, so a successful retry still returned None.
            return get_novel_html(url, num_retries - 1)
        return None
    # Detect the page's real encoding so the text is not mojibake.
    charset = chardet.detect(novel_html.content)
    novel_html.encoding = charset['encoding']
    return novel_html.text
# 解析目标网页,获取小说章节的urls
def get_chapter_urls(html):
    """Parse the novel's index page and collect every chapter link.

    :param html: HTML text of the table-of-contents page
    :return: (chapter_titles, chapter_urls) — two parallel lists
    """
    chapter_urls = []
    chapter_titles = []
    # Site-relative hrefs are joined onto this base URL.
    url_http = 'http://www.biquge.com.tw'
    soup = BeautifulSoup(html, 'lxml')
    # Each chapter sits in a <dd> inside the <dl> of div#list.
    data_list = soup.find('div', {'id': 'list'}).find('dl').find_all('dd')
    # Iterate the tags directly (no index arithmetic) and look up the
    # <a> only once per entry instead of twice as before.
    for dd in data_list:
        link = dd.find('a')
        chapter_urls.append(url_http + link.get('href'))
        chapter_titles.append(link.get_text())
    return chapter_titles, chapter_urls
# 保存数据
def save_novel(titles, urls, novel_name='修真聊天群'):
    """Download every chapter and write them to 小说下载/<novel_name>.txt.

    :param titles: chapter titles, parallel to ``urls``
    :param urls: chapter page URLs
    :param novel_name: output file name (generalized from the previous
        hard-coded value; the default keeps backward compatibility)
    """
    import os
    # Create the output directory up front so open() cannot fail with
    # FileNotFoundError on a fresh checkout.
    os.makedirs('小说下载', exist_ok=True)
    with open('小说下载/{}.{}'.format(novel_name, 'txt'), 'w', encoding='utf-8') as f:
        # zip() walks the two parallel lists together — no index needed.
        for title, url in zip(titles, urls):
            novel_chapter_html = get_novel_html(url, 3)
            content = get_chapter_content(novel_chapter_html)
            item = {
                'title': title,
                'url': url,
                'content': content
            }
            print(item)
            f.write('{title}\n,{url}\n\n,{content}\n\n'.format(**item))
            # Be polite to the server between chapter requests.
            time.sleep(0.5)
    # The with-statement closes the file; the redundant f.close() that
    # sat inside the block was removed.
#获取小说内容
def get_chapter_content(html):
soup = BeautifulSoup(html, 'lxml')
content = soup.find('div', {'id': 'content'}).get_text()
return content
if __name__ == '__main__':
    # Table-of-contents URL of the novel (replace the placeholder below).
    url = '输入小说目录地址'
    # Retry budget passed to get_novel_html for server errors.
    num_retries = 3
    # Fetch the index page, collect all chapter links, then download
    # and save every chapter.
    index_html = get_novel_html(url, num_retries)
    chapter_titles, chapter_links = get_chapter_urls(index_html)
    save_novel(chapter_titles, chapter_links)
# NOTE(review): the two lines below are stray text pasted from the blog
# page this script was copied from; left as bare statements they are a
# SyntaxError, so they are commented out here.
# 爬取笔趣阁小说
# 最新推荐文章于 2024-08-14 08:38:25 发布