Distributed crawler steps:
1. Write the crawler project with Scrapy first.
2. Set up a Redis server.
3. Convert the single-machine version into a distributed one:
a. Modify settings (see the settings sketch after this list).
b. Change the spider class to inherit from scrapy_redis:
from scrapy_redis.spiders import RedisSpider        # Spider version
from scrapy_redis.spiders import RedisCrawlSpider   # CrawlSpider version
c. Set a redis_key.
d. Start all slaves (every crawler terminal).
e. Push the start data (start URLs) into the master's Redis:
lpush <redis_key> <url>
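A minimal sketch of the settings.py additions for step a, using the standard scrapy_redis components (the Redis address below is just the example master used in the redis-cli commands further down):

settings.py (scrapy_redis additions):
# Schedule requests through Redis so all slaves share one request queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Deduplicate request fingerprints in Redis instead of in local memory
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Keep the queue and the dupefilter in Redis after the spider closes
SCHEDULER_PERSIST = True

# Optional: also store scraped items in the Redis list "<spider name>:items"
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}

# Location of the master's Redis server (example address)
REDIS_HOST = '192.168.6.6'
REDIS_PORT = 6379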
myspider_redis.py version:
# coding: utf-8
from scrapy_redis.spiders import RedisSpider


class MySpider(RedisSpider):
    """Spider that reads urls from redis queue (myspider:start_urls)."""
    name = 'myspider_redis'
    # key of the request queue in Redis
    redis_key = 'myspider:start_urls'
    allowed_domains = ['hao123.com']

    def parse(self, response):
        return {
            'name': response.css('title::text').extract_first(),
            'url': response.url,
        }
manage.py:
from scrapy import cmdline
import os

# run the spider script directly from the spiders directory
os.chdir('example/spiders')
cmdline.execute('scrapy runspider mycrawler_redis.py'.split())
redis-cli -h 192.168.6.6
lpush myspider:start_urls http://www.baidu.com
llen myspider:start_urls
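The same push can also be done from Python with the redis client library; a minimal sketch (192.168.6.6 is the example master address from the redis-cli command above):

# Connect to the master's Redis and seed the request queue
import redis

r = redis.Redis(host='192.168.6.6', port=6379)

# Every idle slave listening on this key will pick the URL up
r.lpush('myspider:start_urls', 'http://www.baidu.com')

# Check how many start URLs are still queued
print(r.llen('myspider:start_urls'))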
mycrawler_redis.py version:
# coding: utf-8
from scrapy.spiders import Rule                    # defines how extracted links are followed
from scrapy.linkextractors import LinkExtractor    # extracts links from responses
from scrapy_redis.spiders import RedisCrawlSpider


class MyCrawler(RedisCrawlSpider):
    """Spider that reads urls from redis queue (mycrawler:start_urls)."""
    name = 'mycrawler_redis'
    redis_key = 'mycrawler:start_urls'
    allowed_domains = ['itxdl.cn']

    rules = (
        # follow all links
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        return {
            'name': response.css('title::text').extract_first(),
            'url': response.url,
        }
lpush mycrawler:start_urls http://www.baidu.com
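If the optional RedisPipeline from the settings sketch above is enabled, scraped items end up as JSON strings in the Redis list "<spider name>:items" (the scrapy_redis default key format). A minimal sketch of a consumer that drains that list on the master, assuming that pipeline is on and the spider name is mycrawler_redis:

import json
import redis

r = redis.Redis(host='192.168.6.6', port=6379)

# blpop blocks until an item arrives or the timeout (seconds) expires
while True:
    result = r.blpop('mycrawler_redis:items', timeout=30)
    if result is None:
        break  # no new items for a while; stop draining
    _, raw = result
    item = json.loads(raw)
    print(item['name'], item['url'])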