Using a Scrapy signals extension middleware (signals EXTENSIONS)

1. signals EXTENSIONS

Scrapy's signals API lets an extension hook into lifecycle events of the engine, spiders, requests, responses and items. The extension below connects one handler to each of the common signals:
# -*- coding: utf-8 -*-
# @Time    : 2021/12/31 10:58
# @Author  : Cocktail_py
# @File    : extensions.py

import time
from scrapy import signals
from datetime import datetime


class SignalsExtension(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.crawler = crawler  # keep a reference so handlers can reach the engine
        cs = crawler.signals.connect
        cs(self.engine_started, signal=signals.engine_started)
        cs(self.engine_stopped, signal=signals.engine_stopped)
        cs(self.spider_opened, signal=signals.spider_opened)
        cs(self.spider_idle, signal=signals.spider_idle)
        cs(self.spider_closed, signal=signals.spider_closed)
        cs(self.spider_error, signal=signals.spider_error)
        cs(self.request_scheduled, signal=signals.request_scheduled)
        cs(self.response_received, signal=signals.response_received)
        cs(self.response_downloaded, signal=signals.response_downloaded)
        cs(self.item_scraped, signal=signals.item_scraped)
        cs(self.item_dropped, signal=signals.item_dropped)
        cs(self.item_error, signal=signals.item_error)

    def engine_started(self):
        pass

    def engine_stopped(self):
        pass

    def spider_opened(self, spider):
        """Called when the spider starts running."""
        # remember the start time; spider_closed uses it to compute the run time
        self.start_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def spider_idle(self, spider):
        pass

    def spider_closed(self, spider, reason):
        """Called when the spider finishes running."""
        # record the finish time, then convert both times to Unix timestamps
        finish_time = time.time()
        finish_time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(finish_time))

        start_timestamp = int(time.mktime(time.strptime(self.start_time, "%Y-%m-%d %H:%M:%S")))
        finish_timestamp = int(time.mktime(time.strptime(finish_time_str, "%Y-%m-%d %H:%M:%S")))
        spider.logger.info('spider closed (%s), ran %d seconds', reason, finish_timestamp - start_timestamp)

    def spider_error(self, failure, response, spider):
        """Handle exceptions raised in spider callbacks."""
        # exit(1) here would not stop the crawl cleanly; ask the engine to
        # close the spider instead, recording the reason
        self.crawler.engine.close_spider(spider, 'spider_error')

    def request_scheduled(self, request, spider):
        pass

    def response_downloaded(self, response, request, spider):
        pass

    def response_received(self, response, request, spider):
        pass

    def item_scraped(self, item, response, spider):
        pass

    def item_dropped(self, item, response, exception, spider):
        pass

    def item_error(self, item, response, spider, failure):
        """Handle exceptions raised in a pipeline's process_item()."""
        # as in spider_error, close the spider cleanly instead of calling exit()
        self.crawler.engine.close_spider(spider, 'item_error')

    @classmethod
    def from_settings(cls, settings):
        # This is never called - but would be called if from_crawler()
        # didn't exist. from_crawler() can access the settings via
        # crawler.settings, but also has access to everything the crawler
        # object provides, such as signals, stats and the ability to
        # schedule new requests with crawler.engine.download().
        pass
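
Besides registering the extension per spider (shown in section 2 below), it can be enabled project-wide. A minimal sketch of the settings.py entry, assuming the project module is named scrapy_project_name:

# settings.py
EXTENSIONS = {
    'scrapy_project_name.extensions.SignalsExtension': 1,
}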

2. Using the signals extension in a Scrapy spider
# -*- coding: utf-8 -*-
# @Time    : 2021/12/31 10:58
# @Author  : Cocktail_py

import logging
import scrapy

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO)


class BaiduSpider(scrapy.Spider):
    name = 'baidu.com'

    allowed_domains = []

    custom_settings = {
        'EXTENSIONS': {
            'scrapy.extensions.corestats.CoreStats': None,
            'scrapy.extensions.telnet.TelnetConsole': None,
            # enable the signals extension
            'scrapy_project_name.extensions.SignalsExtension': 1,
        },
        "DOWNLOADER_MIDDLEWARES": {
 
        },
        "ITEM_PIPELINES": {

        },
        "DOWNLOAD_DELAY": 0,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
        "COOKIES_ENABLED": True,
        'CONCURRENT_REQUESTS': 5,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
    }
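
As written, the spider issues no requests, so it opens and then goes idle and closes almost immediately. For the signal handlers to see real traffic, give it at least one start URL and a callback; a minimal sketch to add inside BaiduSpider (the URL and the yielded dict are illustrative only):

    start_urls = ['https://www.baidu.com']

    def parse(self, response):
        # yielding a dict counts as an item, so item_scraped will fire
        yield {'url': response.url, 'status': response.status}

Then run it from the project root (the project module is assumed to be scrapy_project_name, matching the extension path above):

scrapy crawl baidu.com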

References:
https://blog.youkuaiyun.com/qq_41020281/article/details/82779919
https://scrapy-chs.readthedocs.io/zh_CN/latest/topics/signals.html
