一. Signals extension (extensions.py)
# -*- coding: utf-8 -*-
# @Time : 2021/12/31 10:58
# @Author : Cocktail_py
# @File : extensions.py
import time
from scrapy import signals
from datetime import datetime
class SignalsExtension(object):
    """Scrapy extension that subscribes a handler to every core signal.

    Register it through the ``EXTENSIONS`` setting; Scrapy builds it via
    :meth:`from_crawler`.  Most handlers are stubs to be filled in.  The
    ``spider_opened`` / ``spider_closed`` pair tracks the run's wall-clock
    duration, and the two error handlers abort the process so failures are
    visible to whatever launched the crawl.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Standard Scrapy entry point: build the extension from a crawler."""
        return cls(crawler)

    def __init__(self, crawler):
        # One handler per core Scrapy signal.
        connect = crawler.signals.connect
        connect(self.engine_started, signal=signals.engine_started)
        connect(self.engine_stopped, signal=signals.engine_stopped)
        connect(self.spider_opened, signal=signals.spider_opened)
        connect(self.spider_idle, signal=signals.spider_idle)
        connect(self.spider_closed, signal=signals.spider_closed)
        connect(self.spider_error, signal=signals.spider_error)
        connect(self.request_scheduled, signal=signals.request_scheduled)
        connect(self.response_received, signal=signals.response_received)
        connect(self.response_downloaded, signal=signals.response_downloaded)
        connect(self.item_scraped, signal=signals.item_scraped)
        connect(self.item_dropped, signal=signals.item_dropped)
        connect(self.item_error, signal=signals.item_error)

    def engine_started(self):
        pass

    def engine_stopped(self):
        pass

    def spider_opened(self, spider):
        """Spider started: remember the start time for spider_closed."""
        self.start_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def spider_idle(self, spider):
        pass

    def spider_closed(self, spider, reason):
        """Spider finished: return the run duration in whole seconds.

        BUG FIX: the original computed the start/finish timestamps and then
        silently discarded them; the elapsed time is now returned so callers
        (or a subclass) can log or report it.
        """
        start_timestamp = int(time.mktime(
            time.strptime(self.start_time, "%Y-%m-%d %H:%M:%S")))
        finish_timestamp = int(time.time())
        return finish_timestamp - start_timestamp

    def spider_error(self, failure, response, spider):
        """Handle exceptions raised in spider callbacks: abort the process."""
        # NOTE(review): exit() inside a signal handler raises SystemExit;
        # confirm this actually stops the crawl in your Scrapy version.
        exit(1)

    def request_scheduled(self, request, spider):
        pass

    def response_downloaded(self, response, request, spider):
        pass

    def response_received(self, response, request, spider):
        pass

    def item_scraped(self, item, response, spider):
        pass

    def item_dropped(self, item, spider, exception):
        pass

    def item_error(self, item, response, spider, failure):
        """Handle exceptions from pipelines' process_item: abort the process.

        BUG FIX: the original called exit(0), reporting *success* on an item
        error; use a non-zero status, consistent with spider_error.
        """
        exit(1)

    @classmethod
    def from_settings(cls, settings):
        # Never called while from_crawler() exists.  from_crawler() can read
        # the settings via crawler.settings and additionally has access to
        # everything the crawler object provides (signals, stats, and the
        # ability to schedule requests with crawler.engine.download()).
        pass
二. Scrapy spider 使用信号扩展中间件
# -*- coding: utf-8 -*-
# @Time : 2021/12/31 10:58
# @Author : Cocktail_py
import logging
import scrapy
# Configure the root logger once at import time: timestamped INFO-level
# output in the form "<time> - <LEVEL> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO)
class BaiduSpider(scrapy.Spider):
    """Minimal spider that registers the custom signals extension."""

    name = 'baidu.com'
    allowed_domains = []

    # Per-spider settings: turn off the stock stats/telnet extensions and
    # plug in the SignalsExtension defined in extensions.py.
    custom_settings = {
        'EXTENSIONS': {
            # Setting a component to None disables it.
            'scrapy.extensions.corestats.CoreStats': None,
            'scrapy.extensions.telnet.TelnetConsole': None,
            # Register the signals extension middleware.
            'scrapy_project_name.extensions.SignalsExtension': 1,
        },
        "DOWNLOADER_MIDDLEWARES": {},
        "ITEM_PIPELINES": {},
        # Throttling / politeness knobs.
        "DOWNLOAD_DELAY": 0,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
        "COOKIES_ENABLED": True,
        'CONCURRENT_REQUESTS': 5,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
    }
参考:
https://blog.youkuaiyun.com/qq_41020281/article/details/82779919
https://scrapy-chs.readthedocs.io/zh_CN/latest/topics/signals.html