scrapy-redis deduplication in practice
I don't fully understand all of it yet, so I'm saving it here for reference.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://www.chouti.com/']

    def start_requests(self):
        url = "http://dig.chouti.com/"
        yield Request(url=url, callback=self.parse)

    def parse(self, response):
        print('response', response)
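
A hedged side note on why start_requests is overridden here: as far as I know, Scrapy's stock start_requests yields its requests with dont_filter=True, so the start URLs would bypass any dedup filter; building the Request by hand (dont_filter defaults to False) lets even the start URL run through the dupefilter configured below. Illustrative variant of the method (the second yield is only there to show the difference):

    def start_requests(self):
        url = "http://dig.chouti.com/"
        # goes through the dupefilter (dont_filter defaults to False)
        yield Request(url=url, callback=self.parse)
        # a request like this would always be scheduled, skipping the dupefilter:
        # yield Request(url=url, callback=self.parse, dont_filter=True)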

A custom dupefilter component that filters out duplicate request URLs and stores the visited fingerprints in Redis:

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import time
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint
import redis
from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis import defaults


class DupeFilter(BaseDupeFilter):
    def __init__(self):
        self.conn = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')

    def request_seen(self, request):
        # compute the request fingerprint and try to add it to a redis set;
        # sadd returns 1 if the fingerprint is new and 0 if it was already there
        fd = request_fingerprint(request)
        result = self.conn.sadd('visited_urls', fd)
        if result == 1:
            return False
        return True


class RedisDupeFilter(RFPDupeFilter):
    """
    Override the redis key used to store fingerprints; the original source
    uses a timestamp as part of the key by default.
    """

    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.
        """
        server = get_redis_from_settings(settings)
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        # key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        # the timestamp changes on every run, which makes the key awkward to reuse later,
        # so I simply hard-code it:
        key = defaults.DUPEFILTER_KEY % {'timestamp': 'woshinidie'}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)
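
The dedup logic above leans on a property of Redis SADD: it returns 1 when the member is new and 0 when it already exists. A quick standalone sketch of the idea (using the same assumed test Redis instance as above):

import redis

conn = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')
print(conn.sadd('visited_urls', 'some-fingerprint'))  # 1 -> first time seen, so request_seen returns False
print(conn.sadd('visited_urls', 'some-fingerprint'))  # 0 -> already seen, so request_seen returns True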

Settings file

# redis dedup configuration
REDIS_HOST = '192.168.1.13'  # host
REDIS_PORT = 3306  # port
REDIS_PARAMS = {'password': 'woshinidaye'}  # redis connection parameters; default: REDIS_PARAMS = {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'  # python class used to connect to redis; default: redis.StrictRedis
REDIS_ENCODING = "utf-8"  # redis encoding; default: 'utf-8'

# REDIS_URL = 'redis://user:pass@hostname:9001'  # connection URL (takes precedence over the settings above, as the source shows)
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
# the stock implementation uses a timestamp in the key by default
# DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# my customized class that changes the redis key (built on top of the original source)
DUPEFILTER_CLASS = 'redisdepth.xxx.RedisDupeFilter'
# fully custom dedup class
# DUPEFILTER_CLASS = 'redisdepth.xxx.DupeFilter'
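
For reference, this is how DUPEFILTER_KEY expands: the stock RFPDupeFilter fills the placeholder with a timestamp, while the customized RedisDupeFilter above pins it to a fixed string. A small sketch:

import time

DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
print(DUPEFILTER_KEY % {'timestamp': int(time.time())})  # stock behaviour: a new key every run, e.g. 'dupefilter:1589000000'
print(DUPEFILTER_KEY % {'timestamp': 'woshinidie'})      # customized class above: always 'dupefilter:woshinidie'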

Scrapy-redis queues
There are three kinds: FIFO queue, LIFO queue, and priority queue.
1. FIFO queue

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import redis


class FifoQueue(object):
    def __init__(self):
        self.server = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')

    def push(self, request):
        """Push a request"""
        self.server.lpush('User', request)

    def pop(self):
        """Pop a request"""
        data = self.server.rpop('User')
        return data


q = FifoQueue()
q.push(11)
q.push(22)
q.push(33)
print(q.pop())
# first in, first out: items are pushed onto the left of the list and popped from the right

2. LIFO queue

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import redis


class LifoQueue(object):

    def __init__(self):
        self.server = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')

    def push(self, request):
        """Push a request"""
        self.server.lpush('User', request)

    def pop(self, timeout=0):
        """Pop a request"""
        data = self.server.lpop('User')
        return data
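
For symmetry with the FIFO demo above, a quick usage sketch (assuming the 'User' list starts out empty on the same Redis instance):

q = LifoQueue()
q.push(11)
q.push(22)
q.push(33)
print(q.pop())  # 33: push and pop both work on the left end of the list, so the newest item comes out first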

3. Priority queue

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import redis


class PriorityQueue(object):
    """Per-spider priority queue abstraction using redis' sorted set"""

    def __init__(self):
        self.server = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')

    def push(self, request, score):
        """Push a request"""
        # in scrapy_redis the score comes from the request itself (score = -request.priority);
        # for this standalone demo we use the score argument passed in directly
        # We don't use zadd method as the order of arguments change depending on
        # whether the class is Redis or StrictRedis, and the option of using
        # kwargs only accepts strings, not bytes.
        self.server.execute_command('ZADD', 'xxxx', score, request)

    def pop(self, timeout=0):
        """
        Pop a request
        timeout not support in this queue class
        """
        # use atomic range/remove using multi/exec
        pipe = self.server.pipeline()
        pipe.multi()
        pipe.zrange('xxxx', 0, 0).zremrangebyrank('xxxx', 0, 0)
        results, count = pipe.execute()
        if results:
            return results[0]


q = PriorityQueue()

q.push('ZH', -99)
q.push('SB', -66)
q.push('JJ', -33)
# popping from the lowest score upward gives breadth-first behaviour; from the highest downward it would be depth-first
print(q.pop())  # pops the member with the lowest score first
print(q.pop())
print(q.pop())
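
Underneath, pop relies on ZRANGE returning members in ascending score order, which is also why scrapy_redis negates request.priority when building the score (higher priority -> more negative score -> popped earlier). A throwaway sketch against the same assumed Redis instance:

import redis

conn = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')
conn.delete('demo_zset')
conn.execute_command('ZADD', 'demo_zset', -99, 'ZH', -66, 'SB', -33, 'JJ')
print(conn.zrange('demo_zset', 0, 0))  # [b'ZH'] -> the member with the lowest score (-99) comes out first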

Scheduler source analysis (I wrote this in Notepad++ and pasted it here directly)

1. Find from scrapy_redis.scheduler import Scheduler
   - Scheduler.from_crawler is executed
   - Scheduler.from_settings is executed
     - settings read:
       SCHEDULER_PERSIST            # whether to keep the scheduler queue and dedup records on close; True = keep, False = flush
       SCHEDULER_FLUSH_ON_START     # whether to flush the scheduler queue and dedup records on start; True = flush, False = keep
       SCHEDULER_IDLE_BEFORE_CLOSE  # when fetching from the scheduler, how long to wait at most if it is empty (before giving up with no data)
     - settings read:
       SCHEDULER_QUEUE_KEY          # redis key under which the scheduler stores pending requests
       SCHEDULER_QUEUE_CLASS        # choose among FIFO, LIFO and priority; the priority queue is the default: PriorityQueue (sorted set), FifoQueue (list), LifoQueue (list)
       SCHEDULER_DUPEFILTER_KEY     # redis key under which the dedup records are stored
       DUPEFILTER_CLASS             # either the built-in class or your own
                                    # built-in, e.g.: DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
                                    # custom, e.g.:   DUPEFILTER_CLASS = 'redisdepth.xxx.DupeFilter'  (the custom one takes precedence: the source checks it first before falling back)
       SCHEDULER_SERIALIZER         # serializer for the data saved into redis, pickle by default
     - settings read: redis-server
       # the source is around line 17 of connection.py
       REDIS_HOST = '192.168.1.13'  # host
       REDIS_PORT = 3306  # port
       REDIS_PARAMS = {'password': 'woshinidaye'}  # redis connection parameters; default: REDIS_PARAMS = {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
       # REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'  # python class used to connect to redis; default: redis.StrictRedis
       REDIS_ENCODING = "utf-8"  # redis encoding; default: 'utf-8'
       # REDIS_URL = 'redis://user:pass@hostname:9001'  # connection URL (takes precedence over the settings above, as the source shows)
2. The spider starts executing the start URLs
   - Scheduler.enqueue_request is called
     def enqueue_request(self, request):
         # should this request be filtered? and has it been seen already?
         # (request_seen is the key dedup method: if unseen, the fingerprint gets recorded)
         if not request.dont_filter and self.df.request_seen(request):
             self.df.log(request, self.spider)
             # already visited, do not schedule it again
             return False
         if self.stats:
             self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
         # not visited yet, so push the request into the scheduler queue
         self.queue.push(request)
         return True
3. The downloader asks the scheduler for the next task and downloads it
   - Scheduler.next_request is called
     def next_request(self):
         block_pop_timeout = self.idle_before_close
         # take a request out of the queue
         request = self.queue.pop(block_pop_timeout)
         if request and self.stats:
             # this is what gets downloaded next
             self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
         return request
Required settings

# redis dedup configuration
REDIS_HOST = '192.168.1.13'  # host
REDIS_PORT = 3306  # port
REDIS_PARAMS = {'password': 'woshinidaye'}  # redis connection parameters; default: REDIS_PARAMS = {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'  # python class used to connect to redis; default: redis.StrictRedis
REDIS_ENCODING = "utf-8"  # redis encoding; default: 'utf-8'

# REDIS_URL = 'redis://user:pass@hostname:9001'  # connection URL (takes precedence over the settings above, as the source shows)
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
# the stock implementation uses a timestamp in the key by default
# DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# my customized class that changes the redis key (built on top of the original source)
DUPEFILTER_CLASS = 'redisdepth.xxx.RedisDupeFilter'
# fully custom dedup class
# DUPEFILTER_CLASS = 'redisdepth.xxx.DupeFilter'


# ############# scheduler configuration ###########################
# from scrapy_redis.scheduler import Scheduler

SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DEPTH_PRIORITY = 1  # breadth-first
# DEPTH_PRIORITY = -1  # depth-first
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'  # PriorityQueue (sorted set) is the default; the others are FifoQueue (list) and LifoQueue (list)
# breadth-first
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# depth-first
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'

SCHEDULER_QUEUE_KEY = '%(spider)s:requests'  # redis key under which the scheduler stores pending requests
SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"  # serializer for the data saved into redis, pickle by default
SCHEDULER_PERSIST = True  # whether to keep the scheduler queue and dedup records on close; True = keep, False = flush
SCHEDULER_FLUSH_ON_START = True  # whether to flush the scheduler queue and dedup records on start; True = flush, False = keep
SCHEDULER_IDLE_BEFORE_CLOSE = 10  # when fetching from the scheduler, how long to wait at most if it is empty (before giving up with no data)
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'  # redis key under which the dedup records are stored
SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # class that implements the dedup rules
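
The %(spider)s placeholders above are filled in with the spider's name at runtime, so every spider gets its own request queue and dedup set in Redis. A small sketch of the resulting keys for the chouti spider:

SCHEDULER_QUEUE_KEY = '%(spider)s:requests'
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'

print(SCHEDULER_QUEUE_KEY % {'spider': 'chouti'})       # chouti:requests
print(SCHEDULER_DUPEFILTER_KEY % {'spider': 'chouti'})  # chouti:dupefilter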

Summary:
Depth-first: go level by level into the deepest branch and process it completely before moving back up to the higher levels.
Breadth-first: start at the first level and only move on to the next level once the current level has been fully processed.
First in, first out = breadth-first: FifoQueue
Last in, first out = depth-first: LifoQueue
Priority queue (see the sketch below for how this works):
DEPTH_PRIORITY = 1   # breadth-first
DEPTH_PRIORITY = -1  # depth-first
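
To the best of my understanding, DEPTH_PRIORITY takes effect through Scrapy's DepthMiddleware, which shifts each request's priority by its depth; combined with the priority queue this produces breadth-first or depth-first crawling. A simplified sketch (the helper function is made up for illustration):

# roughly what DepthMiddleware does for every newly generated request
def adjust_priority(request_priority, depth, depth_priority):
    return request_priority - depth * depth_priority

print(adjust_priority(0, 3, 1))   # -3: DEPTH_PRIORITY = 1 gives deeper requests lower priority -> shallow pages first (breadth-first)
print(adjust_priority(0, 3, -1))  # 3: DEPTH_PRIORITY = -1 gives deeper requests higher priority -> deepest pages first (depth-first)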

How the scheduler, the queue and the DupeFilter relate to each other:
Scheduler: decides which request is handed out next
Queue: stores the pending requests
DupeFilter: keeps the record of what has already been visited
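
A minimal sketch of how the three cooperate, simplified from the enqueue_request/next_request code quoted above (not the real class, just its shape):

class MiniScheduler(object):
    def __init__(self, queue, dupefilter):
        self.queue = queue    # stores pending requests in redis
        self.df = dupefilter  # remembers fingerprints of requests already seen

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            return False          # duplicate: drop it
        self.queue.push(request)  # new request: hand it to the queue
        return True

    def next_request(self):
        return self.queue.pop()   # the downloader gets the next pending request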

A few extra notes
A walkthrough of the settings file

# -*- coding: utf-8 -*-
# Scrapy settings for redisdepth project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# bot name
BOT_NAME = 'redisdepth'
# paths of the spider modules
SPIDER_MODULES = ['redisdepth.spiders']
NEWSPIDER_MODULE = 'redisdepth.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# client User-Agent request header
#USER_AGENT = 'redisdepth (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
# the crawler "gentleman's agreement": whether to obey the site's robots.txt rules
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# maximum number of concurrent requests (a coarse, crawler-wide limit)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# download delay in seconds
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# concurrent requests per domain; the download delay is also applied per domain
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
# concurrent requests per IP; if set, CONCURRENT_REQUESTS_PER_DOMAIN is ignored and the download delay is applied per IP
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# whether cookies are enabled; cookies are handled through a cookiejar
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# the Telnet console lets you inspect and control the running crawler
# connect with `telnet ip port`, then issue commands
# TELNETCONSOLE_ENABLED = True
# TELNETCONSOLE_HOST = '127.0.0.1'
# TELNETCONSOLE_PORT = [6023,]
#TELNETCONSOLE_ENABLED = False
# default request headers
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# spider middlewares
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# # 'redisdepth.middlewares.RedisdepthSpiderMiddleware': 543,
# 'redisdepth.sd.Sd1': 666,
# 'redisdepth.sd.Sd2': 667,
#
# }
# downloader middlewares
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# # 'redisdepth.middlewares.RedisdepthDownloaderMiddleware': 543,
# # 'redisdepth.md.Md1': 666,
# # 'redisdepth.md.Md2': 667
# }
# custom extensions, invoked via signals
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
'redisdepth.ext.MyExtension': 666,
}
# item pipelines that process the scraped items
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'redisdepth.pipelines.RedisdepthPipeline': 300,
#}
"""
自动限速算法
from scrapy.contrib.throttle import AutoThrottle
自动限速设置
1. 获取最小延迟 DOWNLOAD_DELAY
2. 获取最大延迟 AUTOTHROTTLE_MAX_DELAY
3. 设置初始下载延迟 AUTOTHROTTLE_START_DELAY
4. 当请求下载完成后,获取其"连接"时间 latency,即:请求连接到接受到响应头之间的时间
5. 用于计算的... AUTOTHROTTLE_TARGET_CONCURRENCY
target_delay = latency / self.target_concurrency
new_delay = (slot.delay + target_delay) / 2.0 # 表示上一次的延迟时间
new_delay = max(target_delay, new_delay)
new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
slot.delay = new_delay
"""
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# enable AutoThrottle
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
# initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# maximum download delay
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# target average concurrency per remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# whether to show throttling stats for every response
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
"""
启用缓存
目的用于将已经发送的请求或相应缓存下来,以便以后使用
from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
from scrapy.extensions.httpcache import DummyPolicy
from scrapy.extensions.httpcache import FilesystemCacheStorage
"""
# 是否启用缓存策略
#HTTPCACHE_ENABLED = True
# 缓存策略:所有请求均缓存,下次在请求直接访问原来的缓存即可
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
# 缓存策略:根据Http响应头:Cache-Control、Last-Modified 等进行缓存的策略
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"
# 缓存超时时间
#HTTPCACHE_EXPIRATION_SECS = 0
# 缓存保存路径
#HTTPCACHE_DIR = 'httpcache'
# 缓存忽略的http状态码
#HTTPCACHE_IGNORE_HTTP_CODES = []
# 缓存存储的插件
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
