I originally tried configuring this in settings.py of my scrapy-redis project, but ran into problems there. The following approach works instead.
First, make sure Splash is installed and running:
(1) Install the scrapy-splash library:
- pip install scrapy-splash
(2) Then start the Splash Docker container:
- docker run -p 8050:8050 scrapinghub/splash
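Once the container is up, you can verify that Splash answers before wiring it into the spider. A quick check with the requests package (the url is just the sample product page used below):

import requests

# Ask Splash to render a page; HTTP 200 with HTML in the body means it is up.
resp = requests.get('http://localhost:8050/render.html',
                    params={'url': 'https://item.jd.com/2600240.html', 'wait': 2})
print(resp.status_code, len(resp.text))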
On top of an existing scrapy-redis project, you only need to override the request-generating method in the spider. The basic idea is to forward each url to Splash, which renders the page (including JavaScript) and returns the result.
(1) In a plain Scrapy spider:
Note: with plain Scrapy (no scrapy-redis) you can use the configuration-based approach instead; see http://blog.youkuaiyun.com/u013378306/article/details/54409215
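For reference, that configuration-based approach boils down to registering the scrapy-splash middlewares in settings.py. A minimal sketch, using the values documented in the scrapy-splash README:

# settings.py -- scrapy-splash wiring for plain Scrapy (no scrapy-redis)
SPLASH_URL = 'http://localhost:8050'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'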
# -*- coding: utf-8 -*-
import json

from scrapy import Request
from scrapy.spiders import Spider
from scrapy.selector import Selector


class WeiXinSpider(Spider):
    name = 'test'
    start_urls = [
        'https://item.jd.com/2600240.html'
    ]

    # Address of the Splash render.html endpoint
    splash_url = 'http://localhost:8050/render.html'

    # Override the parent method so every start url is rendered by Splash
    # before the response reaches parse(). The target url goes in a JSON
    # body POSTed to the render.html endpoint.
    def make_requests_from_url(self, url):
        body = json.dumps({
            'url': url,       # the page Splash should render
            'wait': 5,        # seconds to wait for JavaScript to finish
            'images': 0,      # skip image loading to speed rendering up
            'allowed_content_types': 'text/html; charset=utf-8',
        })
        headers = {'Content-Type': 'application/json'}
        return Request(self.splash_url, method='POST', body=body,
                       headers=headers, dont_filter=True)

    def parse(self, response):
        print("############" + response.url)
        with open("jdeeeeeeeeee.html", "wb") as fo:
            fo.write(response.body)  # save the rendered HTML to a file
        # Example of following the extracted links through Splash as well
        # (SplashRequest needs the scrapy-splash setup; see the sketch below):
        # site = Selector(response)
        # links = site.xpath('//a/@href')
        # for link in links:
        #     linkstr = link.extract()
        #     print("*****" + linkstr)
        #     yield SplashRequest(linkstr, callback=self.parse)
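With the scrapy-splash settings above in place, the commented-out link-following code can use SplashRequest instead of hand-building the Request. A sketch using the endpoint and args documented by scrapy-splash:

from scrapy_splash import SplashRequest

def parse(self, response):
    for href in response.xpath('//a/@href').extract():
        # The scrapy-splash middlewares route this through the Splash server
        yield SplashRequest(response.urljoin(href), callback=self.parse,
                            endpoint='render.html', args={'wait': 5})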
(2) In scrapy-redis, the same override applies:
# encoding: utf-8
import json

from scrapy.http import Request
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from scrapy_redis.spiders import RedisCrawlSpider


class MyCrawler(RedisCrawlSpider):
    """Spider that reads urls from a redis queue (mycrawler:start_urls)."""
    name = 'mycrawler_redis'
    redis_key = 'mycrawler:start_urls'
    # start_urls = ['https://zhidao.baidu.com/question/2205192714330042628.html?fr=iks&word=scrapy&ie=gbk']

    rules = (
        # Follow all question links found in "related-link" anchors
        Rule(LinkExtractor(allow=('/question/.*',),
                           restrict_xpaths=('//a[@class="related-link"]',)),
             callback='parse_page', follow=True),
    )

    # Address of the Splash render.html endpoint
    splash_url = 'http://localhost:8050/render.html'

    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list.
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(MyCrawler, self).__init__(*args, **kwargs)

    # Same override as above: every url popped from redis is POSTed to
    # Splash, which renders it and returns the resulting HTML.
    def make_requests_from_url(self, url):
        body = json.dumps({
            'url': url,       # the page Splash should render
            'wait': 5,        # seconds to wait for JavaScript to finish
            'images': 0,      # skip image loading to speed rendering up
            'allowed_content_types': 'text/html; charset=utf-8',
        })
        headers = {'Content-Type': 'application/json'}
        return Request(self.splash_url, method='POST', body=body,
                       headers=headers, dont_filter=True)

    def parse_page(self, response):
        print("#####" + response.url)
        return {
            'name': response.css('title::text').extract_first(),
            'url': response.url,
        }
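To run the scrapy-redis version, start the spider (e.g. scrapy crawl mycrawler_redis -a domain=zhidao.baidu.com, matching the domain keyword handled in __init__), then push a seed url onto the list named by redis_key. A sketch assuming a local Redis and the redis-py package; the seed is the url commented out above:

import redis

r = redis.StrictRedis(host='localhost', port=6379)
# RedisCrawlSpider blocks on this list until a url is pushed.
r.lpush('mycrawler:start_urls',
        'https://zhidao.baidu.com/question/2205192714330042628.html')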