In the normal flow, the spider sends a request to the engine, the engine hands it to the scheduler, and the scheduler returns it to the engine; on the way from the engine to the downloader the request passes through the downloader middlewares, and that is where we can intercept and process it.
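The middleware below only takes effect once it is registered in settings.py. A minimal sketch, assuming the Scrapy project module is called zhilian_project and the middleware lives in zhilian_project/middlewares.py (the dotted path and the priority value are assumptions; adjust them to your own project):

# settings.py -- enable the Selenium downloader middleware
# 'zhilian_project.middlewares' is an assumed module path; use your own
DOWNLOADER_MIDDLEWARES = {
    'zhilian_project.middlewares.SeleniumMiddleware': 543,
}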
Middleware code:
from scrapy.http import HtmlResponse
from selenium.webdriver.chrome.options import Options
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
Headed (non-headless) Selenium is used here with Chrome; to use another browser, swap the driver setup below for the corresponding one.
class SeleniumMiddleware(object):
    def __init__(self):
        self.options = Options()
        # replace this with the path to your own chromedriver
        self.browser = webdriver.Chrome(executable_path=r"C:\Users\Administrator\Desktop\爬虫\day06\ziliao\chromedriver",
                                        chrome_options=self.options)

    # process every request handed to the downloader
    def process_request(self, request, spider):
        # page == 2: the listing page is already open in the browser,
        # so just click through to the next page
        if int(request.meta['page']) == 2:
            # scroll the page to the bottom; if it scrolls too far,
            # subtract a few px from scrollHeight
            self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(5)
            # locate the pager and click its "next page" button
            div = self.browser.find_element_by_css_selector('.soupager')
            # page = self.browser.find_element_by_css_selector('button.btn:nth-child(7)')
            next_page = div.find_elements_by_tag_name('button')
            # for button in next_page:
            #     if button.text == '下一页':
            #         button.click()
            next_page[1].click()
            # time.sleep(5)
        else:
            # page == 0: the starting url only needs to be opened in the
            # browser; the resulting page is then returned as-is
            if int(request.meta['page']) == 0:
                try:
                    print('url is :::', request.url)
                    self.browser.get(request.url)
                except TimeoutException as e:
                    print('timeout')
                time.sleep(5)
            else:
                # detail requests (page == 3) are left to Scrapy's own
                # downloader and pass through untouched
                return None
        # returning an HtmlResponse here skips the downloader and hands the
        # rendered page straight to the spider's parse callback
        return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source,
                            encoding='utf-8', request=request)
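The middleware above drives a visible Chrome window. If you do not need to watch it, Chrome can also run headless; a minimal standalone sketch of the driver setup (the --headless/--disable-gpu flags and the placeholder driver path are my additions, not part of the original code):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
# run Chrome without opening a visible window
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# placeholder path -- point it at your own chromedriver
browser = webdriver.Chrome(executable_path=r"C:\path\to\chromedriver",
                           chrome_options=options)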
Below is the spider code:
import time
import lxml.html
import scrapy
from scrapy import Request
class JobDes(object):
    def __init__(self):
        self.detail_url = ''
        self.title = ''


def parse_lxml_zhilian(r):
    tree = lxml.html.fromstring(r)
    job_url = tree.xpath('./div/a/@href')[0]
    # job_name = tree.xpath('//a[@class="contentpile__content__wrapper__item__info__boxle"]/@title')
    print('job_url ::::', job_url)
    # print(job_name)
    return job_url
# rough count of detail urls currently waiting in the scheduler
count = 0


class ZhilianSpider(scrapy.Spider):
    name = 'zhilian'
    # allowed_domains = ['ts.zhaopin.com']
    # start_urls = ['http://ts.zhaopin.com/']

    def start_requests(self):
        url_str = 'https://sou.zhaopin.com/?pageSize=60&jl=489&kw=python&kt=3'
        yield Request(url_str, callback=self.parse, dont_filter=True, meta={'page': '0'})
    def parse(self, response):
        # listContent > div:nth-child(1)
        # print(response.body)
        # grab every job posting block on the current listing page
        rs = response.xpath(".//div[@class='contentpile__content__wrapper clearfix']").extract()
        # print('rs is :::::', rs)
        # Rough back-pressure on the scheduler: the selenium browser keeps
        # feeding listing pages to the spider faster than the detail pages can
        # be parsed, and letting too many urls pile up risks some of them being
        # lost, so when the pending count gets too high we pause for a while.
        global count
        count += 60
        for r in rs:
            job_url = parse_lxml_zhilian(r)
            yield Request(url=job_url, callback=self.parse_detail, meta={'page': '3'}, dont_filter=True)
        # check whether the pager still offers a next page (this selector is an
        # assumption, based on the '.soupager' pager used in the middleware)
        page_next = response.xpath(".//div[@class='soupager']//button").extract()
        if len(page_next) > 0:
            # wait while too much data is sitting in the scheduler
            while count > 300:
                time.sleep(0.5)
            # let the middleware click "next page" in selenium; this request
            # does not trigger a real download
            yield Request(url=response.url, callback=self.parse, meta={'page': '2'}, dont_filter=True)

    # the count is decremented by one each time a detail page is parsed
    def parse_detail(self, response):
        global count
        count -= 1
        print('*' * 100)
        print(response.url)
        # parsing of this third-level (detail) page is not finished yet
        # info = response.xpath('.//div[@class=new-info]').extract()
        position = response.xpath(".//div[@class='new-info']//h1/text()").extract()[0]
        salary = response.xpath(".//div[@class='l info-money']/strong/text()").extract()[0]
        company = response.xpath(".//div[@class='company l']/a/text()").extract()[0]
        print('*' * 100)
        print(position, salary, company)
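parse_detail above only prints the extracted fields. To actually keep them, the usual Scrapy pattern is to yield an item (a plain dict works) and let an item pipeline store it. A sketch of what parse_detail could look like instead, reusing the same three xpaths; the dict keys are illustrative names, not fields from the original code:

    def parse_detail(self, response):
        global count
        count -= 1
        position = response.xpath(".//div[@class='new-info']//h1/text()").extract_first()
        salary = response.xpath(".//div[@class='l info-money']/strong/text()").extract_first()
        company = response.xpath(".//div[@class='company l']/a/text()").extract_first()
        # yielding a dict sends the record through Scrapy's item pipelines
        yield {
            'url': response.url,
            'position': position,
            'salary': salary,
            'company': company,
        }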