scrapy 信号扩展中间件(signals EXTENSIONS)使用

最新推荐文章于 2025-11-22 22:13:12 发布

原创 · 最新推荐文章于 2025-11-22 22:13:12 发布 · 782 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#中间件 #scrapy

爬虫专栏收录该内容

17 篇文章

订阅专栏

一. signals EXTENSIONS

# -*- coding: utf-8 -*-
# @Time    : 2021/12/31 10:58
# @Author  : Cocktail_py
# @File    : extensions.py

import time
from scrapy import signals
from datetime import datetime


class SignalsExtension(object):
    """Scrapy extension that attaches one handler to each crawl signal.

    Most handlers are intentionally empty hooks to be filled in. The
    extension records when the spider was opened and closed, and asks the
    engine to close the spider when a spider callback or an item pipeline
    reports an error.

    Attributes set at runtime:
        crawler: the crawler passed to ``__init__``.
        start_time: local time the spider opened, as a ``TIME_FORMAT`` string.
        finish_time: local time the spider closed, as a ``TIME_FORMAT`` string.
        elapsed_seconds: whole seconds between the two, or ``None`` when
            ``start_time`` was never recorded.
    """

    # Layout of the ``start_time`` / ``finish_time`` strings.
    TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from *crawler*."""
        return cls(crawler)

    def __init__(self, crawler):
        """Keep *crawler* and connect every handler to its signal."""
        # Kept so the error handlers can reach the engine later.
        self.crawler = crawler

        connect = crawler.signals.connect
        subscriptions = (
            (self.engine_started, signals.engine_started),
            (self.engine_stopped, signals.engine_stopped),
            (self.spider_opened, signals.spider_opened),
            (self.spider_idle, signals.spider_idle),
            (self.spider_closed, signals.spider_closed),
            (self.spider_error, signals.spider_error),
            (self.request_scheduled, signals.request_scheduled),
            (self.response_received, signals.response_received),
            (self.response_downloaded, signals.response_downloaded),
            (self.item_scraped, signals.item_scraped),
            (self.item_dropped, signals.item_dropped),
            (self.item_error, signals.item_error),
        )
        for handler, signal in subscriptions:
            connect(handler, signal=signal)

    def _close_spider(self, spider, reason):
        """Ask the engine to close *spider*, recording *reason*."""
        self.crawler.engine.close_spider(spider, reason)

    def engine_started(self):
        """Hook for the ``engine_started`` signal (no-op)."""

    def engine_stopped(self):
        """Hook for the ``engine_stopped`` signal (no-op)."""

    def spider_opened(self, spider):
        """Record the local time at which the spider started running."""
        self.start_time = datetime.now().strftime(self.TIME_FORMAT)

    def spider_idle(self, spider):
        """Hook for the ``spider_idle`` signal (no-op)."""

    def spider_closed(self, spider, reason):
        """Record the finish time and the run duration in whole seconds."""
        finished_at = time.time()
        self.finish_time = time.strftime(
            self.TIME_FORMAT, time.localtime(finished_at))

        started = getattr(self, 'start_time', None)
        if started is None:
            # spider_opened never ran, so there is nothing to measure against.
            self.elapsed_seconds = None
            return

        start_timestamp = int(
            time.mktime(time.strptime(started, self.TIME_FORMAT)))
        # int(finished_at) equals formatting and re-parsing the finish time,
        # without the string round trip.
        self.elapsed_seconds = int(finished_at) - start_timestamp

    def spider_error(self, failure, response, spider):
        """Close the spider when a spider callback raised an exception."""
        # exit() is a helper for the interactive interpreter, and raising
        # SystemExit from inside a signal handler does not shut the crawl
        # down cleanly; closing through the engine does.
        self._close_spider(spider, 'spider_error')

    def request_scheduled(self, request, spider):
        """Hook for the ``request_scheduled`` signal (no-op)."""

    def response_downloaded(self, response, request, spider):
        """Hook for the ``response_downloaded`` signal (no-op)."""

    def response_received(self, response, request, spider):
        """Hook for the ``response_received`` signal (no-op)."""

    def item_scraped(self, item, response, spider):
        """Hook for the ``item_scraped`` signal (no-op)."""

    def item_dropped(self, item, spider, exception):
        """Hook for the ``item_dropped`` signal (no-op)."""

    def item_error(self, item, response, spider, failure):
        """Close the spider when a pipeline's ``process_item`` raised."""
        # Same reasoning as spider_error: close via the engine rather than
        # calling exit(), which here also reported success (status 0).
        self._close_spider(spider, 'item_error')

    @classmethod
    def from_settings(cls, settings):
        """Unused alternative constructor; returns ``None``.

        This is never called - but would be called if from_crawler()
        didn't exist. from_crawler() can access the settings via
        crawler.settings but also has access to everything that
        crawler object provides like signals, stats and the ability
        to schedule new requests with crawler.engine.download()
        """
        return None

二. scrapy spider 使用信号扩展中间件

# -*- coding: utf-8 -*-
# @Time    : 2021/12/31 10:58
# @Author  : Cocktail_py

import logging
import scrapy

# Root logger setup: INFO and above, each record prefixed with its
# timestamp and level name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)


class BaiduSpider(scrapy.Spider):
    """Example spider whose per-spider settings register SignalsExtension."""
    name = 'baidu.com'

    # Left empty here. NOTE(review): presumably this means no domain
    # filtering is applied; confirm against the Scrapy version in use.
    allowed_domains = []

    # Settings applied to this spider only.
    custom_settings = {
        'EXTENSIONS': {
            # NOTE(review): a value of None presumably disables these two
            # built-in extensions; confirm against the Scrapy settings docs.
            'scrapy.extensions.corestats.CoreStats': None,
            'scrapy.extensions.telnet.TelnetConsole': None,
            # Register the signals extension. `scrapy_project_name` looks
            # like a placeholder for the real project package; verify.
            'scrapy_project_name.extensions.SignalsExtension': 1,
        },
        # No downloader middlewares are configured for this spider.
        "DOWNLOADER_MIDDLEWARES": {
 
        },
        # No item pipelines are configured for this spider.
        "ITEM_PIPELINES": {

        },
        "DOWNLOAD_DELAY": 0,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
        "COOKIES_ENABLED": True,
        'CONCURRENT_REQUESTS': 5,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
    }