A Scrapy-Redis Distributed Crawling Example

This article presents an approach to efficient web crawling that combines the Scrapy framework with a Redis database. Redis handles request-queue management and request de-duplication, which keeps the crawler stable and efficient even when several workers share the same job. The sections below show in detail how to configure the Scrapy–Redis integration: the scheduler, the duplicate filter, and the data-storage flow.
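Before looking at the code, it helps to know which Redis keys scrapy-redis uses for a spider. The sketch below is not part of the original project; it assumes the default scrapy-redis key names (`<spider name>:requests` for the shared request queue and `<spider name>:dupefilter` for the de-duplication fingerprints) and a Redis instance on localhost, and simply inspects those keys while a crawl is running.

```python
# A minimal inspection sketch (assumption: default scrapy-redis key names, local Redis).
import redis

r = redis.StrictRedis(host="127.0.0.1", port=6379)

# Pending requests live in a sorted set named "<spider name>:requests"
print("queued requests:", r.zcard("shengshi_s:requests"))

# Request fingerprints used for de-duplication live in the set "<spider name>:dupefilter"
print("seen fingerprints:", r.scard("shengshi_s:dupefilter"))
```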
### Implementation code

```python
# -*- coding: utf-8 -*-
import scrapy
from ..items import ShengshiItem
from scrapy_redis.spiders import RedisSpider

class ShengshiSSpider(RedisSpider):
    name = 'shengshi_s'
    # allowed_domains = ['sheng-shi.com']
    redis_key = "ShengshiSSpider:start_urls"

    # The tuple/list of start URLs a plain spider would begin crawling from; it stays
    # commented out here because RedisSpider reads its start URL from the redis_key list above.
    # start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html']

    # Each callback below handles one level of the page hierarchy, following the row
    # classes provincetr -> citytr -> countytr -> towntr -> villagetr.
    def parse(self, response):
        first_city = response.xpath('//tr[@class="provincetr"]/td/a')
        for city in first_city:
            f = city.xpath('./text()').extract_first()
            second_href = city.xpath('./@href').extract_first()
            if second_href is not None:
                new_url = response.urljoin(second_href)
                yield scrapy.Request(new_url, callback=self.city2, meta={'f': f}, dont_filter=True)

    def city2(self, response):
        second_citys = response.xpath('//tr[@class="citytr"]/td[2]/a')
        for second_city in second_citys:
            se = second_city.xpath('./text()').extract_first()
            f = response.meta['f']
            third_href = second_city.xpath('./@href').extract_first()
            if third_href is not None:
                new_url = response.urljoin(third_href)
                yield scrapy.Request(new_url, callback=self.city3, meta={'f': f, 'se': se}, dont_filter=True)

    def city3(self, response):
        third_citys = response.xpath('//tr[@class="countytr"]/td[2]/a')
        for third_city in third_citys:
            th = third_city.xpath('./text()').extract_first()
            # print(th)
            f = response.meta['f']
            se = response.meta['se']
            fourth_href = third_city.xpath('./@href').extract_first()
            # print(fourth_href)
            if fourth_href is not None:
                new_url = response.urljoin(fourth_href)
                yield scrapy.Request(new_url, callback=self.city4, meta={'f': f, 'se': se, 'th': th}, dont_filter=True)

    def city4(self, response):
        fourth_citys = response.xpath('//tr[@class="towntr"]/td[2]/a')
        for fourth_city in fourth_citys:
            fo = fourth_city.xpath('./text()').extract_first()
            f = response.meta['f']
            se = response.meta['se']
            th = response.meta['th']
            fifth_href = fourth_city.xpath('./@href').extract_first()
            if fifth_href is not None:
                new_url = response.urljoin(fifth_href)
                yield scrapy.Request(new_url, callback=self.city5, meta={'f': f, 'se': se, 'th': th, 'fo': fo},
                                     dont_filter=True)

    def city5(self, response):
        fifth_citys = response.xpath('//tr[@class="villagetr"]/td[3]/text()').extract()
        for fifth_city in fifth_citys:
            item = ShengshiItem()
            f = response.meta['f']
            se = response.meta['se']
            th = response.meta['th']
            fo = response.meta['fo']
            item['first_city'] = f
            item['second_city'] = se
            item['third_city'] = th
            item['fourth_city'] = fo
            item['fifth_city'] = fifth_city
            yield item
```
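With a RedisSpider, the crawl does not start until a URL is pushed onto the `redis_key` list, so after launching `scrapy crawl shengshi_s` on each worker machine you seed the queue from Redis. A minimal sketch, assuming a local Redis instance and the start URL that is commented out in the spider above:

```python
# Seed the start URL so every idle worker listening on this key can pick it up
import redis

r = redis.StrictRedis(host="127.0.0.1", port=6379)
r.lpush("ShengshiSSpider:start_urls",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html")
```

The same can be done from redis-cli with `lpush ShengshiSSpider:start_urls <url>`.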

## settings.py

```python
# Duplicate-filter component: request de-duplication is done in the Redis database
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Use the scrapy_redis scheduler, which distributes requests through Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Whether to flush the scheduler queue and dupefilter records before starting: True = flush, False = keep
SCHEDULER_FLUSH_ON_START = False

# How long (in seconds) to wait when the scheduler queue is empty before closing the spider
SCHEDULER_IDLE_BEFORE_CLOSE = 10

# Whether to keep the scheduler queue and dupefilter records when the spider closes: True = keep, False = clear
SCHEDULER_PERSIST = True

# Store scraped items in Redis (disabled here; the MongoDB pipeline below is used instead)
# ITEM_PIPELINES = {
#    'scrapy_redis.pipelines.RedisPipeline': 300
# }

# Redis server address
REDIS_HOST = '127.0.0.1'

# Redis server port
REDIS_PORT = 6379

# Encoding used when writing to Redis
# REDIS_ENCODING = "UTF-8"
```
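If the commented-out `scrapy_redis.pipelines.RedisPipeline` were enabled instead of (or alongside) the MongoDB pipeline below, items would be serialized to JSON and pushed onto the list `<spider name>:items` by default. A sketch of draining that list, assuming those defaults and a local Redis instance:

```python
# Drain serialized items from Redis (assumption: RedisPipeline enabled with its default key)
import json

import redis

r = redis.StrictRedis(host="127.0.0.1", port=6379)
while True:
    raw = r.lpop("shengshi_s:items")
    if raw is None:
        break  # the list is empty
    item = json.loads(raw)
    print(item.get("first_city"), item.get("fifth_city"))
```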

## Item fields

```python
import scrapy


class ShengshiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    first_city = scrapy.Field()
    second_city = scrapy.Field()
    third_city = scrapy.Field()
    fourth_city = scrapy.Field()
    fifth_city = scrapy.Field()
```

## Pipelines: storing the data in MongoDB

```python
import pymongo


class ShengshiPipeline(object):
    def __init__(self):
        # Connect to a local MongoDB instance and select the "shengshi2" database
        self.client = pymongo.MongoClient('localhost')
        self.db = self.client['shengshi2']

    def process_item(self, item, spider):
        # insert_one replaces Collection.insert, which is deprecated in recent pymongo versions
        self.db['liandong'].insert_one(dict(item))
        return item
```
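For the MongoDB pipeline above to run, it still has to be registered in settings.py. A sketch of that entry; the module path `shengshi.pipelines` is an assumption based on the project name implied by `ShengshiItem`, so adjust it to your project layout:

```python
# settings.py: activate the MongoDB pipeline (module path is assumed, adjust as needed)
ITEM_PIPELINES = {
    'shengshi.pipelines.ShengshiPipeline': 300,
}
```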
