Some pages load their data via AJAX, so the crawler cannot fetch the data directly from the initial request. In that case we can use Selenium to open the page in a real browser and hand the rendered HTML back to Scrapy.
Create a new request module that defines a marker request class:
from scrapy import Request


class SeleniumRequest(Request):
    # Subclasses Request and is functionally identical; it only acts as a
    # marker so the middleware can tell which requests need Selenium
    pass
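
For context, a spider yields SeleniumRequest instead of a plain Request for pages that need browser rendering. A minimal sketch; the spider name, start URL, and CSS selector below are hypothetical placeholders:

from scrapy import Spider
from 爬虫开发.zhipin.zhipin.spiders.request import SeleniumRequest


class ZhipinSpider(Spider):
    name = 'zhipin'

    def start_requests(self):
        # Yield the marker class; the middleware routes it to Selenium
        yield SeleniumRequest(url='https://www.zhipin.com/', callback=self.parse)  # hypothetical URL

    def parse(self, response):
        # response is the HtmlResponse built from Selenium's page_source
        for name in response.css('span.job-name::text').getall():  # hypothetical selector
            yield {'job_name': name}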
Handle the request in the downloader middleware's process_request; remember that the middleware must be activated in settings.py.
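Activation is the standard DOWNLOADER_MIDDLEWARES entry in settings.py; the module path and priority below are assumptions matching this example project:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'zhipin.middlewares.ZhipinDownloaderMiddleware': 543,
}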
from selenium.webdriver import Chrome
from scrapy import signals
from scrapy.http.response.html import HtmlResponse

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter

from 爬虫开发.zhipin.zhipin.spiders.request import SeleniumRequest


class ZhipinDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        # Register signal handlers so the browser's lifetime matches the spider's
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def process_request(self, request, spider):
        # Every request passes through here; decide whether it needs Selenium
        if isinstance(request, SeleniumRequest):
            # Let Selenium load the page and grab the rendered source
            self.web.get(request.url)
            page_source = self.web.page_source
            # Wrap the rendered HTML in a response object; returning a
            # Response from process_request short-circuits the normal download
            return HtmlResponse(url=request.url, status=200, body=page_source,
                                request=request, encoding='utf-8')
        else:
            # Returning None lets Scrapy download this request as usual
            return None

    def spider_opened(self, spider):
        # Start one shared browser when the spider opens
        self.web = Chrome()

    def spider_closed(self, spider):
        # quit() (rather than close()) shuts down the entire browser session
        self.web.quit()
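
One caveat: page_source is read immediately after get(), so on slow AJAX pages the data may not have rendered yet. A hedged sketch of a helper that waits for a known element before returning the HTML, using Selenium's standard WebDriverWait (the selector to wait on is a hypothetical placeholder that depends on the target page):

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def render_page(web: Chrome, url: str, css_selector: str, timeout: int = 10) -> str:
    # Load the page, block until the selector appears, then return the HTML
    web.get(url)
    WebDriverWait(web, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
    )
    return web.page_source

process_request could then call page_source = render_page(self.web, request.url, 'div.job-list') in place of the bare get()/page_source pair.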