Python 3.7.0
Scrapy 1.6.0
Windows 10
Task: crawl a single novel (the novel's index page URL is the one used as start_urls in the spider below).
- Set up items
import scrapy


class Novel1Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()         # novel title
    chapter_name = scrapy.Field()  # chapter title
    content = scrapy.Field()       # list of chapter body lines
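For context, a scrapy.Item behaves much like a dict restricted to its declared fields; a quick sketch of how the fields above get filled (the values here are only examples):

from novel1.items import Novel1Item

item = Novel1Item(title='元尊')
item['chapter_name'] = '第一章 蟒雀吞龙'
print(dict(item))  # {'title': '元尊', 'chapter_name': '第一章 蟒雀吞龙'}
# item['author'] = '...'  # would raise KeyError: 'author' is not a declared field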
- Set up pipelines
import codecs
import json


class Novel1Pipeline(object):
    def __init__(self):
        print('starting')
        # one JSON object per line (JSON Lines), UTF-8 so the Chinese text stays readable
        self.file = codecs.open('text_novel1.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        json_text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(json_text)
        return item

    def close_spider(self, spider):
        # called once when the spider finishes (close_item is not a pipeline hook)
        print('end')
        self.file.close()
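As an aside, if the pipeline only exists to write a JSON-lines file, Scrapy's built-in feed export can produce the same output without custom code; a minimal sketch using the feed settings available in Scrapy 1.6 (the file name is just an example):

# settings.py -- alternative to the custom pipeline
FEED_FORMAT = 'jsonlines'       # one JSON object per line
FEED_URI = 'text_novel1.jl'     # output file
FEED_EXPORT_ENCODING = 'utf-8'  # keep Chinese text unescaped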
- Write the spider
# -*- coding: utf-8 -*-
import scrapy
from novel1.items import Novel1Item


class Novel1Spider(scrapy.Spider):
    name = 'novel_1'
    download_delay = 0.5
    allowed_domains = ['biqukan.com']
    start_urls = ['http://www.biqukan.com/0_790/']

    def parse(self, response):
        # chapter links: every <dd> that follows the second <dt> inside div.listmain
        dd_list = response.xpath('//div[@class="listmain"]/dl/dt[2]/following-sibling::dd/a')
        title = response.css('h2::text').get()
        for dd in dd_list:
            item = Novel1Item(title=title)
            item['chapter_name'] = dd.xpath('./text()').get(default='No name')
            url = response.urljoin(dd.xpath('./@href').get())
            req = scrapy.Request(url=url, callback=self.parse_content)
            # scrapy.Request yields whatever its callback returns.
            # Request has a meta parameter: a dict used to pass any object
            # along to the callback, e.g. meta={'key': item}
            # (request.meta['item'] below could also be written that way).
            # Here meta temporarily holds the index-page data (title, chapter name)
            # so it can be stored together with the chapter body and yielded
            # as one item in parse_content.
            req.meta['item'] = item
            yield req

    def parse_content(self, response):
        item = response.meta['item']
        raw_content = response.css('div.showtxt::text').getall()
        # strip non-breaking spaces and drop the last two lines (site boilerplate)
        item['content'] = [str(c).replace('\xa0', '').strip() for c in raw_content][:-2]
        yield item
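The meta hand-off above works on any Scrapy version; from Scrapy 1.7 onward the same thing can be done with cb_kwargs, which passes the item to the callback as a named argument instead of going through the shared meta dict. A sketch of the two callbacks rewritten that way (selectors unchanged, only the hand-off differs):

    def parse(self, response):
        title = response.css('h2::text').get()
        for dd in response.xpath('//div[@class="listmain"]/dl/dt[2]/following-sibling::dd/a'):
            item = Novel1Item(title=title,
                              chapter_name=dd.xpath('./text()').get(default='No name'))
            url = response.urljoin(dd.xpath('./@href').get())
            # cb_kwargs (Scrapy 1.7+) injects `item` as a keyword argument of parse_content
            yield scrapy.Request(url=url, callback=self.parse_content, cb_kwargs={'item': item})

    def parse_content(self, response, item):
        raw_content = response.css('div.showtxt::text').getall()
        item['content'] = [c.replace('\xa0', '').strip() for c in raw_content][:-2]
        yield item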
- Configure settings.py
import random

BOT_NAME = 'novel1'

SPIDER_MODULES = ['novel1.spiders']
NEWSPIDER_MODULE = 'novel1.spiders'

USER_AGENT_LIST = [
    'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
    'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
    'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)',
]
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = random.choice(USER_AGENT_LIST)

ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'novel1.pipelines.Novel1Pipeline': 300,
}
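Note that random.choice here runs only once, when settings.py is imported, so the whole crawl still sends a single User-Agent. To rotate it per request, a small downloader middleware is the usual approach; a sketch reusing USER_AGENT_LIST (the class name, module path, and priority 400 are my own choices):

# novel1/middlewares.py
import random

from novel1.settings import USER_AGENT_LIST


class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # pick a fresh user agent for every outgoing request
        request.headers['User-Agent'] = random.choice(USER_AGENT_LIST)

# enable it in settings.py:
# DOWNLOADER_MIDDLEWARES = {
#     'novel1.middlewares.RandomUserAgentMiddleware': 400,
# }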
- Check the output (Scrapy is an asynchronous crawling framework, so the lines come back out of order)
{"title": "元尊", "chapter_name": "第一章 蟒雀吞龙", "content": ["灯火通明的....## manually truncated"]}
{"title": "元尊", "chapter_name": "第二章 20点准时更新,敬请期待!", "content": ["20点准时更新,敬请期待!"]}
{"title": "元尊", "chapter_name": "第一章 15点准时更新,敬请期待!", "content": ["15点准时更新,敬请期待!"]}