scrapy by itself does not support distributed crawling, but combined with the scrapy-redis component it does.
How the distributed crawl works: all spider nodes share the requests stored in redis; each node pulls requests from that shared queue and crawls them until the queue is empty.
This example uses three machines: a Windows 10 box and an Ubuntu box act as spider nodes, and a second Ubuntu box hosts the redis database that stores the scraped data, the requests, and the dedup fingerprints.
Goal: crawl the job information on every page of Tencent's recruitment listings with a simple distributed setup, in order to understand how to build a distributed crawler with scrapy and the scrapy-redis component. It can be built by modifying an existing CrawlSpider-based spider.
Key points: the changes added to settings.py, which make the crawl store its data, requests, and dedup fingerprints in redis (scrapy alone is not distributed; scrapy-redis adds that on top of redis); the spider file, which does the actual distributed crawling (note that every spider node must be able to reach the server's redis, e.g. with redis-cli); and reading the scraped data back out of redis into MySQL and MongoDB. You can of course also write the distributed spider from scratch; the essential parts are saving data to redis and the spider file, and they work the same way as the modifications shown here.
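A quick way to verify that connectivity requirement from a spider node is sketched below. This is not part of the original project; it simply pings the redis server using the host and port that appear later in settings.py.

```
# Minimal connectivity check (a sketch, not part of the original project):
# run this on each spider node to confirm it can reach the redis server.
import redis

r = redis.Redis(host='192.168.2.237', port=6379, db=0)
print r.ping()   # True if the node can reach the redis server
```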
Problems encountered: one spider node crawled no data at all, and the other node did not shut down after it finished crawling.
Solutions: testing showed the concurrency was set so high that a single node could finish the whole crawl by itself; setting CONCURRENT_REQUESTS in settings.py to a reasonable value fixes that. To close the spiders once the crawl is finished, a timeout approach is used (see "scrapy_redis 解决空跑问题,自动关闭爬虫"); it only requires changes to the project's settings.py plus one extra file in the same directory.
The target fields to crawl, items.py
```
# -*- coding: utf-8 -*-
import scrapy


class TencentItem(scrapy.Item):
    # job title
    job_Title = scrapy.Field()
    # detail link
    job_Link = scrapy.Field()
    # job category
    job_Type = scrapy.Field()
    # number of openings
    job_Number = scrapy.Field()
    # work location
    job_Location = scrapy.Field()
    # publish date
    job_PublicDate = scrapy.Field()
    # UTC time
    unix_Time = scrapy.Field()
    # spider name
    spider_Name = scrapy.Field()
```
The spider file, tc.py
```
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
# from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from scrapy.spiders import Rule
from tencent.items import TencentItem


# class TencentShezhaoSpider(CrawlSpider):
class TencentXiaozhaoSpider(RedisCrawlSpider):
    name = 'tencent_shezhao'
    # allowed_domains = ['tencent.com']
    # start_urls = ['https://hr.tencent.com/position.php?&start=0#a']
    # the start URLs are read from this redis key instead of start_urls
    redis_key = 'TencentXiaozhaoSpider:start_urls'

    def __init__(self, *args, **kwargs):
        # allowed_domains can be passed on the command line: -a domain=tencent.com
        domain = kwargs.pop('domain', '')
        self.allowed_domains = filter(None, domain.split(','))
        super(TencentXiaozhaoSpider, self).__init__(*args, **kwargs)

    # link extractor whose regular expression matches the pagination links
    page_link = LinkExtractor(allow='start=\d+')

    rules = (
        Rule(page_link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # print '=' * 60
        # print response.url
        # print '=' * 60
        job_list = response.xpath('//tr[@class="odd"] | //tr[@class="even"]')
        for each in job_list:
            # create a fresh item for every row instead of reusing one instance
            item = TencentItem()
            item['job_Title'] = each.xpath('./td[1]/a/text()')[0].extract()
            item['job_Link'] = each.xpath('./td[1]/a/@href')[0].extract()
            # extract_first('') returns a string and avoids an IndexError
            # when the cell is empty
            item['job_Type'] = each.xpath('./td[2]/text()').extract_first('')
            item['job_Number'] = each.xpath('./td[3]/text()')[0].extract()
            item['job_Location'] = each.xpath('./td[4]/text()')[0].extract()
            item['job_PublicDate'] = each.xpath('./td[5]/text()')[0].extract()
            yield item
```
Contents of the pipeline file, pipelines.py
```
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from datetime import datetime


class TencentPipeline(object):
    def process_item(self, item, spider):
        # self.filename = open('tencent.json', 'a')
        # jsontext = json.dumps(dict(item), ensure_ascii=False).encode('utf-8') + '\n'
        # self.filename.write(jsontext)
        # self.filename.close()
        # stamp every item with the crawl time and the spider that produced it;
        # scrapy_redis.pipelines.RedisPipeline (priority 400) then pushes it to redis
        item['unix_Time'] = datetime.utcnow()
        item['spider_Name'] = spider.name
        return item
```
Contents of the settings file, settings.py
```
# -*- coding: utf-8 -*-

# Scrapy settings for tencent project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'tencent'

SPIDER_MODULES = ['tencent.spiders']
NEWSPIDER_MODULE = 'tencent.spiders'

# Use the scrapy_redis dupefilter instead of scrapy's default one
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Use the scrapy_redis scheduler instead of scrapy's default one
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Observed in testing: the redis database held no requests, only the dedup
# fingerprint set and the item list; the request queue was missing.
# Observed in testing (2): enabling the first queue class below raised
# NameError: Module 'scrapy_redis.scheduler' doesn't define any object named 'SpiderPriorityQueue'
# Default scrapy request queue (ordered by priority)
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.scheduler.SpiderPriorityQueue'
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Plain queue
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# Queue, first in first out
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.scheduler.SpiderQueue'
# Stack, last in first out
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.scheduler.SpiderStack'

# Allow pausing: request records in redis are not lost (i.e. resumable crawls)
SCHEDULER_PERSIST = True

ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

REDIS_HOST = '192.168.2.237'
REDIS_PORT = 6379

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tencent (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 3

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'tencent.middlewares.TencentSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'tencent.middlewares.TencentDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'tencent.pipelines.TencentPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
```
Contents of process_item_for_mysql.py, which saves the data to MySQL
```
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import redis
import MySQLdb
import json


def process_item():
    # redis server that the spiders write items to
    rediscli = redis.Redis(host='192.168.2.189', port=6379, db=0)
    # MySQL connection; note MySQLdb expects charset='utf8', not 'utf-8'
    mysqlcli = MySQLdb.connect(host='192.168.2.189', port=3306, user='root',
                               passwd='mysql', db='tencent', charset='utf8')
    offset = 0
    while True:
        # blocking pop from the items list written by scrapy_redis' RedisPipeline
        # (the key is '<spider name>:items', i.e. 'tencent_shezhao:items' here)
        source, data = rediscli.blpop(['tencent_shezhao:items'])
        item = json.loads(data)
        try:
            cursor = mysqlcli.cursor()
            cursor.execute(
                'INSERT INTO tencent (job_Title, job_Link, job_Type, job_Number, job_Location, job_PublicDate, unix_Time, spider_Name) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)',
                [item['job_Title'], item['job_Link'], item['job_Type'],
                 item['job_Number'], item['job_Location'], item['job_PublicDate'],
                 item['unix_Time'], item['spider_Name']])
            # commit on the connection, not the cursor
            mysqlcli.commit()
            cursor.close()
            offset += 1
            print offset
        except MySQLdb.Error, e:
            print 'mysql.error:%d,%s' % (e.args[0], e.args[1])


if __name__ == '__main__':
    process_item()
```
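The INSERT above assumes a tencent table already exists in the tencent database. A possible table definition is sketched below; the column names follow the item fields, but the types and lengths are my assumption, not from the original post.

```
# Hypothetical sketch: create the table that process_item_for_mysql.py inserts into.
# Column names follow the item fields; the types/lengths are assumptions.
import MySQLdb

conn = MySQLdb.connect(host='192.168.2.189', port=3306, user='root',
                       passwd='mysql', db='tencent', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS tencent (
        id INT AUTO_INCREMENT PRIMARY KEY,
        job_Title VARCHAR(200),
        job_Link VARCHAR(500),
        job_Type VARCHAR(100),
        job_Number VARCHAR(20),
        job_Location VARCHAR(100),
        job_PublicDate VARCHAR(50),
        unix_Time VARCHAR(50),
        spider_Name VARCHAR(50)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()
```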
Contents of process_item_for_mongodb.py, which saves the data to MongoDB
```
# -*- coding:utf-8 -*-
import pymongo
import redis
import json


def main():
    # connect to the redis database
    rediscli = redis.Redis(host='192.168.2.237', port=6379, db=0)
    # connect to the mongodb database
    mongocli = pymongo.MongoClient(host='localhost', port=27017)
    # database "tencent" in mongodb
    db = mongocli['tencent']
    # collection "shezhao"
    sheet = db['shezhao']
    while True:
        # blocking pop of one item from redis
        source, data = rediscli.blpop('tencent_shezhao:items')
        # deserialize the item
        item = json.loads(data)
        # insert it into the collection
        sheet.insert(item)
        try:
            # use the item's actual field names (job_Title / job_Link)
            print u"Processing: %(job_Title)s <%(job_Link)s>" % item
        except KeyError:
            print u"Error processing: %r" % item


if __name__ == '__main__':
    main()
```
Testing
Start the spider nodes: the two spiders on Windows and Ubuntu; the same code can simply be copied to the other spider node. (On each node the spider is started the usual scrapy way, e.g. scrapy crawl tencent_shezhao, optionally with -a domain=tencent.com, which the spider's __init__ turns into allowed_domains.)
On the redis server (the other Ubuntu machine, running redis-server to store the crawled data), open redis-cli and issue the crawl command.
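The "crawl command" is just pushing the start URL onto the redis_key defined in tc.py. In redis-cli that is lpush TencentXiaozhaoSpider:start_urls https://hr.tencent.com/position.php?&start=0#a; the same thing from Python, as a small sketch using the redis client already used in the export scripts:

```
# Push the start URL into the key that tc.py reads
# (redis_key = 'TencentXiaozhaoSpider:start_urls');
# equivalent to "lpush TencentXiaozhaoSpider:start_urls <url>" in redis-cli.
import redis

r = redis.Redis(host='192.168.2.237', port=6379, db=0)
r.lpush('TencentXiaozhaoSpider:start_urls',
        'https://hr.tencent.com/position.php?&start=0#a')
```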
The crawl starts.
Crawl results, as shown below, viewed with the Redis Desktop Manager GUI.
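If you prefer the command line to Redis Desktop Manager, a rough sketch of the same check with redis-py is below; the key names assume the scrapy-redis defaults ('<spider name>:dupefilter' and '<spider name>:items') for the spider named tencent_shezhao.

```
# Rough sketch: inspect what the crawl left in redis without a GUI.
# Key names assume the scrapy-redis defaults for a spider named 'tencent_shezhao'.
import redis

r = redis.Redis(host='192.168.2.237', port=6379, db=0)
print r.keys('tencent_shezhao:*')            # e.g. the dupefilter set and the items list
print r.scard('tencent_shezhao:dupefilter')  # number of request fingerprints
print r.llen('tencent_shezhao:items')        # number of scraped items
```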
Run python process_item_for_mysql.py and python process_item_for_mongodb.py to save the data into the MySQL and MongoDB databases.
To save into MongoDB, first start the MongoDB server with sudo mongod,
then run python process_item_for_mongodb.py to write the data into MongoDB.
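To confirm the import, a quick document count can be done with pymongo (a sketch; the database and collection names are the ones used in process_item_for_mongodb.py):

```
# Quick sanity check: count the documents written to tencent.shezhao.
import pymongo

cli = pymongo.MongoClient('localhost', 27017)
print cli['tencent']['shezhao'].count()
```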
The data saved in MySQL.
Notes:
To have the spiders close themselves once the crawl is finished, this example uses the timeout approach, which only needs two additions: 1. add the following code to the project settings file settings.py:
```
# Close the spider once the crawl is finished, to keep it from idling ("empty running") forever
MYEXT_ENABLED = True   # enable the extension
# Allowed idle count: the idle signal fires roughly every 5 seconds, so
# IDLE_NUMBER = 100 means the spider closes after about 100 * 5 = 500 seconds of idling
IDLE_NUMBER = 100
# Register the extension in EXTENSIONS to activate it
EXTENSIONS = {
    'tencent.extensions.RedisSpiderSmartIdleClosedExensions': 500,
}
```

2. Add a file extensions.py in the same directory as settings.py, with the following code:

```
# -*- coding: utf-8 -*-

# Define here the models for your scraped Extensions
from scrapy import signals
from scrapy.exceptions import NotConfigured


class RedisSpiderSmartIdleClosedExensions(object):

    def __init__(self, idle_number, crawler):
        self.crawler = crawler
        self.idle_number = idle_number
        self.idle_list = []
        self.idle_count = 0

    @classmethod
    def from_crawler(cls, crawler):
        # first check if the extension should be enabled and raise
        # NotConfigured otherwise
        if not crawler.settings.getbool('MYEXT_ENABLED'):
            raise NotConfigured

        if not 'redis_key' in crawler.spidercls.__dict__.keys():
            raise NotConfigured('Only supports RedisSpider')

        # get the number of items from settings
        idle_number = crawler.settings.getint('IDLE_NUMBER', 360)

        # instantiate the extension object
        ext = cls(idle_number, crawler)

        # connect the extension object to signals
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)

        return ext

    def spider_opened(self, spider):
        spider.logger.info("opened spider {}, Allow waiting time:{} second".format(spider.name, self.idle_number * 5))

    def spider_closed(self, spider):
        spider.logger.info("closed spider {}, Waiting time exceeded {} second".format(spider.name, self.idle_number * 5))

    def spider_idle(self, spider):
        # called once at startup and then roughly every 5 seconds while the spider is idle;
        # once spider.redis_key has been missing long enough (about half an hour with the
        # default IDLE_NUMBER of 360), close the spider
        # check whether redis_key still exists
        if not spider.server.exists(spider.redis_key):
            self.idle_count += 1
        else:
            self.idle_count = 0

        if self.idle_count > self.idle_number:
            # close the spider
            self.crawler.engine.close_spider(spider, 'Waiting time exceeded')
```
Observations:
One of the spider nodes kept reporting (error) LOADING Redis is loading the dataset in memory; this was fixed by flushing redis on the server: [root@masterA RedisTest]# redis-cli flushall.
One spider node took more than 500 seconds to close, while the other closed in under 100 seconds; the roughly 500 seconds is consistent with IDLE_NUMBER = 100 and an idle signal about every 5 seconds (100 * 5 = 500 s).