Scraping Lagou Job Listings with Selenium

This post shows how to work around Lagou's anti-scraping measures with Selenium, fetching the Ajax-loaded job listings by driving a real browser. Where the requests library can only retrieve part of the data, Selenium can scrape the target pages completely.


Lagou serves its job listings as Ajax data. One option is to analyze the Ajax endpoint and request it directly, but a bare request sent with the requests library is recognized and blocked by Lagou (the sketch below shows what that looks like). Two workarounds:

1. Keep a requests Session so the Ajax call carries valid session cookies. This retrieves some of the data, but still not a large amount or the complete set.
2. Drive a real browser with Selenium, which can scrape the target URL completely.
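A minimal sketch of the detection, reusing the endpoint and form fields from Code 1 below; the exact block message is an assumption based on responses observed at the time and may differ:

import requests

# Post to the Ajax endpoint without visiting the listing page first.
# Instead of job data, Lagou is assumed to answer with an anti-crawler
# message such as {"status": false, "msg": "您操作太频繁,请稍后再访问", ...}.
url = 'https://www.lagou.com/jobs/positionAjax.json?'
data = {'first': 'true', 'pn': 1, 'kd': '爬虫'}
resp = requests.post(url, data=data,
                     headers={'User-Agent': 'Mozilla/5.0'}, timeout=5)
print(resp.json())  # a response without a 'content' key means we were detected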
Code 1: requests with a Session

import requests
from lxml import etree
import re
s = requests.Session()
headers = {
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB?labelWords=&fromSearch=true&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Cache-Control': 'max-age=0',
    'Accept-Language': 'zh-CN,zh;q=0.9'
    # 'Cookie': '...'  # not needed: the Session below picks up cookies automatically
}
def parse_newurl(newurl, cookies):
    """Fetch a job detail page and pull out the fields we care about."""
    res = requests.get(url=newurl, headers=headers, cookies=cookies).content.decode()
    text = etree.HTML(res)
    salary = text.xpath('//dd[@class = "job_request"]/h3/span[1]/text()')[0].strip()
    location = text.xpath('//dd[@class = "job_request"]/h3/span[2]/text()')[0]
    location = re.sub(r'[/"]', '', location).strip()
    work_year = text.xpath('//dd[@class = "job_request"]/h3/span[3]/text()')[0]
    work_year = re.sub(r'[/"]', '', work_year).strip()
    education = text.xpath('//dd[@class = "job_request"]/h3/span[4]/text()')[0]
    education = re.sub(r'[/"]', '', education).strip()
    title = text.xpath('//div[@class="job-name"]/@title')[0]
    youhuo = text.xpath('//dd[@class="job-advantage"]/p/text()')[0]  # perks
    zhize = text.xpath('//div[@class="job-detail"]/p[2]/text()')     # responsibilities
    # the requirements are spread over several <p> tags
    yaoqiu1 = text.xpath('//div[@class="job-detail"]/p[4]/text()')
    yaoqiu2 = text.xpath('//div[@class="job-detail"]/p[5]/text()')
    yaoqiu3 = text.xpath('//div[@class="job-detail"]/p[6]/text()')
    yaoqiu = yaoqiu1 + yaoqiu2 + yaoqiu3
    work_adrr = text.xpath('//*[@id="job_detail"]/dd[3]/div[1]/a[2]/text()')[0]
    item = {
        '城市': location,
        '地址': work_adrr,
        '职位': title,
        '工作经验': work_year,
        '教育': education,
        '薪资': salary,
        '工作福利': youhuo,
        '工作职责': zhize,
        '岗位要求': yaoqiu
    }
    print(item)

def parse_obj(obj, cookies):
    """Walk the Ajax JSON and visit every position's detail page."""
    try:
        # the job data lives under content -> positionResult -> result
        contents = obj.get('content').get('positionResult').get('result')
        for x in contents:
            positionid = x.get('positionId')  # positionId yields the detail-page URL
            newurl = 'https://www.lagou.com/jobs/%s.html?show=f61f2a8bac78431baca1391bc653c46f' % positionid
            print(newurl)
            parse_newurl(newurl, cookies)
    except Exception as e:
        print('failed to parse response (likely blocked):', e)
def parse_page(page_size):
    # visit the listing page first so the Session picks up valid cookies
    get_url = 'https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB?px=default&city=%E5%85%A8%E5%9B%BD#filterBox'
    s.get(url=get_url, headers=headers, timeout=5)
    cookies = s.cookies  # cookies issued for this session
    post_url = 'https://www.lagou.com/jobs/positionAjax.json?'
    for page in range(1, page_size + 1):  # Lagou's page numbers start at 1
        data = {
            'first': 'true' if page == 1 else 'false',  # 'first' is assumed true only on the first page
            'pn': page,
            'kd': '爬虫'
        }
        response = requests.post(url=post_url, data=data, headers=headers, cookies=cookies, timeout=4)
        obj = response.json()
        parse_obj(obj, cookies)

if __name__ == '__main__':
    page_size = int(input('How many pages to scrape: '))
    parse_page(page_size)
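For reference, parse_obj assumes the Ajax response is shaped roughly as content -> positionResult -> result, a list of positions each carrying a positionId. Those field names come from the code above, not from any documented API, so treat the structure as an observed assumption. A small helper like the following (hypothetical, not part of the original script) makes the blocked case explicit instead of letting the except clause hide it:

def looks_blocked(obj):
    # A genuine response carries content -> positionResult -> result;
    # the anti-crawler reply is assumed to lack the 'content' key entirely.
    try:
        return not obj['content']['positionResult']['result']
    except (KeyError, TypeError):
        return True

Calling looks_blocked(response.json()) inside parse_page would let the loop stop or re-fetch cookies instead of printing an error for every remaining page.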

Code 2: Selenium spider

from selenium import webdriver
import time
import re
from lxml import etree
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import json
# Spider that drives a real Chrome browser
class LagouSpider(object):
    def __init__(self):
        self.driver = webdriver.Chrome()
        self.url = 'https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB?px=default&city=%E5%85%A8%E5%9B%BD#filterBox'
        self.items = []
    def run(self):
        self.driver.get(self.url)
        try:
            while True:
                source = self.driver.page_source
                self.parse_url(source)
                time.sleep(5)
                self.driver.switch_to.window(self.driver.window_handles[0])  # back to the listing page
                next_btn = self.driver.find_element_by_class_name('pager_next')
                if 'pager_next_disabled' in next_btn.get_attribute('class'):
                    break  # last page reached
                next_btn.click()
                time.sleep(5)
        finally:
            # dump everything once at the end; writing inside the loop would
            # re-append the growing list and duplicate records
            with open('lagou4.txt', 'w', encoding='utf8') as fp:
                json.dump(self.items, fp, indent=4, ensure_ascii=False)
            self.driver.quit()
    def parse_url(self, source):
        text = etree.HTML(source)
        hrefs = text.xpath('//*[@id="s_position_list"]/ul/li')
        for href in hrefs:
            src = href.xpath('.//div[1]/div[1]/div[1]/a/@href')[0]  # detail-page URL of each position
            self.parse_newurl(src)
    def parse_newurl(self, src):
        self.driver.execute_script("window.open('%s')" % src)  # open the detail page in a new tab
        time.sleep(1)  # give the new tab a moment to register
        self.driver.switch_to.window(self.driver.window_handles[1])  # switch to the detail tab
        # wait until the salary field has rendered before reading the page source
        WebDriverWait(self.driver, timeout=50).until(
            EC.presence_of_element_located((By.XPATH, '//dd[@class = "job_request"]/h3/span[1]')))
        newtext = self.driver.page_source
        text = etree.HTML(newtext)
        salary = text.xpath('//dd[@class = "job_request"]/h3/span[1]/text()')[0].strip()
        location = text.xpath('//dd[@class = "job_request"]/h3/span[2]/text()')[0]
        location = re.sub(r'[/"]', '', location).strip()
        work_year = text.xpath('//dd[@class = "job_request"]/h3/span[3]/text()')[0]
        work_year = re.sub(r'[/"]', '', work_year).strip()
        education = text.xpath('//dd[@class = "job_request"]/h3/span[4]/text()')[0]
        education = re.sub(r'[/"]', '', education).strip()
        title = text.xpath('//div[@class="job-name"]/@title')[0]
        youhuo = text.xpath('//dd[@class="job-advantage"]/p/text()')[0]  # perks
        zhize = text.xpath('//div[@class="job-detail"]/p[2]/text()')     # responsibilities
        # the requirements are spread over several <p> tags
        yaoqiu1 = text.xpath('//div[@class="job-detail"]/p[4]/text()')
        yaoqiu2 = text.xpath('//div[@class="job-detail"]/p[5]/text()')
        yaoqiu3 = text.xpath('//div[@class="job-detail"]/p[6]/text()')
        yaoqiu = yaoqiu1 + yaoqiu2 + yaoqiu3
        work_adrr = text.xpath('//*[@id="job_detail"]/dd[3]/div[1]/a[2]/text()')[0]
        item = {
            '城市': location,
            '地址': work_adrr,
            '职位': title,
            '工作经验': work_year,
            '教育': education,
            '薪资': salary,
            '工作福利': youhuo,
            '工作职责': zhize,
            '岗位要求': yaoqiu
        }
        self.items.append(item)
        self.driver.close()  # close the detail tab
        self.driver.switch_to.window(self.driver.window_handles[0])  # back to the listing page
if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
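The fixed time.sleep calls are the brittle part of the Selenium spider. Here is a sketch of how the page turn could rely on explicit waits instead, reusing the class names and element id the spider above already targets (the helper itself is hypothetical, not part of the original script):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def click_next_page(driver, timeout=20):
    # Wait until the pager element is present, then advance;
    # return False when the last page has been reached.
    next_btn = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'pager_next')))
    if 'pager_next_disabled' in next_btn.get_attribute('class'):
        return False
    next_btn.click()
    # wait for the next batch of listings to render before scraping again
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.ID, 's_position_list')))
    return True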