Scraping Lagou

This article walks through scraping Python job postings from Lagou (lagou.com) with two approaches, requests and selenium, collecting details such as salary, city, and experience requirements, and shows how to parse the pages, handle the requests, and store the data.


Scraping with requests

import requests
import re
import time
import json
import random

from lxml import etree

User_Agent = [
	'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
	'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
	'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
	'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
	'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
	'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
	'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
	'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
	'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
	'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)'
	]
headers = {
	'User-Agent': random.choice(User_Agent),
	'Accept': 'application/json, text/javascript, */*; q=0.01',
	'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
}
data = {
  'first': 'true',
  'pn': 1,
  'kd': 'python'
}
# original list-page url
url = 'https://www.lagou.com/jobs/list_python/p-city_4?px=default#filterBox'
# url taken from the positionAjax request
url_post = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E5%A4%A9%E6%B4%A5&needAddtionalResult=false'
s = requests.Session()
s.get(url, headers=headers)
# grab the cookies set by the original url (required for the positionAjax request)
cookies = s.cookies
# Python postings in Tianjin currently span four pages
job_information = {}
for i in range(1,5):
	data['pn'] = i
	if i > 1:
		# when the page number is greater than 1 (pn > 1), first becomes 'false'
		# and an extra fixed-value field sid is added
		# change first to 'false'
		data['first'] = 'false'
		# add/set sid to f2c32c03327c4ce58ac492bfcfb49600
		data['sid'] = 'f2c32c03327c4ce58ac492bfcfb49600'
	req = requests.post(url_post,headers=headers,cookies=cookies,data=data)
	result = req.json()['content']['positionResult']['result']
	print(result)
	for res in result:
		positionId = res['positionId']
		positionUrl = 'https://www.lagou.com/jobs/%d.html' % positionId
		# positionUrl = 'https://www.lagou.com/jobs/5268012.html'
		response = requests.get(positionUrl,headers=headers,cookies=cookies)
		html = etree.HTML(response.content.decode('utf8'))
		title = html.xpath('//div[@class="position-content "]//div[@class="job-name"]/@title')
		print(title)
		sss = html.xpath('//div[@class="position-content "]//dd[@class="job_request"]//span/text()')
		print(sss)
		salary = sss[0].replace('/','').strip()
		city = sss[1].replace('/','').strip()
		experience = sss[2].replace('/','').strip()
		education = sss[3].replace('/','').strip()
		job_time = sss[4].replace('/','').strip()
		job_information['salary'] = salary
		job_information['city'] = city
		job_information['experience'] = experience
		job_information['education'] = education
		job_information['job_time'] = job_time
		# city = html.xpath('//div[@data-companyid="488941"]//dd[@class="job_request"]/h3/span/text()')[1].replace('/', '').split()[0]
		# experience = html.xpath('//div[@data-companyid="488941"]//dd[@class="job_request"]/h3/span/text()')[2].replace('/', '').split()[0]
		# education = html.xpath('//div[@data-companyid="488941"]//dd[@class="job_request"]/h3/span/text()')[3].replace('/', '').split()[0]
		# job_time = html.xpath('//div[@data-companyid="488941"]//dd[@class="job_request"]/h3/span/text()')[4].replace('/', '').split()[0]
		job_type = html.xpath('//div[@class="position-head"]//li[@class="labels"]/text()')[0]
		welfare = html.xpath('//dl[@id="job_detail"]//dd[@class="job-advantage"]//p/text()')[0]
		dutys = html.xpath('//dl[@id="job_detail"]//dd[@class="job_bt"]//p/text()')
		job_duty = ''
		for duty in dutys:
			job_duty += duty
			job_duty += '\n'
		job_information['job_type'] = job_type
		job_information['welfare'] = welfare
		job_information['job_duty'] = job_duty
		addrs = html.xpath('//dl[@id="job_detail"]//dd[@class="job-address clearfix"]/div[@class="work_addr"]/a/text()')
		addr = ''
		# the last element of addrs is the map link, so skip it; use j to avoid shadowing the page index i
		for j in range(len(addrs)-1):
			addr += addrs[j]
		job_information['addr'] = addr
		time.sleep(random.randint(5,10))
print(job_information)
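
Note that the loop above reuses a single job_information dict, so every position overwrites the previous one and the final print only shows the last record. Below is a minimal sketch, assuming each position's fields are packed into a fresh dict inside the for res in result: loop, of collecting all records and dumping them to a file at the end; the all_jobs list and the lagou_requests.json filename are illustrative names, not part of the original script (json is already imported at the top).

all_jobs = []

# inside `for res in result:` build a fresh dict per position and append it
# instead of reusing the shared job_information, e.g.:
#     all_jobs.append({'salary': salary, 'city': city, 'experience': experience,
#                      'education': education, 'job_time': job_time})

# after both loops finish, write everything out in one go
with open('lagou_requests.json', 'w', encoding='utf8') as f:
	json.dump(all_jobs, f, ensure_ascii=False, indent=2)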

Scraping with selenium

import time
import re
import random
import json

from lxml import etree
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class LaGou(object):

	def __init__(self):
		self.driver_path = r'D:\downloads\chromedriver_win32\chromedriver.exe'
		self.driver = webdriver.Chrome(executable_path=self.driver_path)
		# self.job_info_list = []

	def page_url(self):
		url = 'https://www.lagou.com/jobs/list_python/p-city_4?px=default#filterBox'
		self.driver.get(url)
		while True:
			page_source = self.driver.page_source
			html = etree.HTML(page_source)
			# self.job_information = {}
			# every page except the last lists 15 position urls
			links = html.xpath('//a[@class="position_link"]/@href')
			for link in links:
				self.job_information = {}
				# open the position detail page in a new tab
				self.driver.execute_script('window.open("%s")' % link)
				# switch to the newest tab (the one just opened) so that driver.close() below closes it rather than the list page
				self.driver.switch_to.window(self.driver.window_handles[1])
				source = self.driver.page_source
				self.job_info_onepage(source)
				self.driver.close()
				# switch back to the list page, otherwise opening the next tab raises an error
				self.driver.switch_to.window(self.driver.window_handles[0])
			# when class="pager_next pager_next_disabled" appears in the page source we have reached the last page, so stop
			if 'class="pager_next pager_next_disabled"' in page_source:
				break
			next_pageBtn = self.driver.find_element(By.XPATH,'//span[@class="pager_next "]')
			# a JS ad pops up on the list page and can block next_pageBtn.click(), so click via execute_script instead
			# JavaScript syntax: arguments[0] refers to the first argument passed in, i.e. next_pageBtn
			self.driver.execute_script('arguments[0].click();',next_pageBtn)
			time.sleep(random.randint(1,3))

	def job_info_onepage(self,source):
		page_html = etree.HTML(source)
		title = page_html.xpath('//div[@class="job-name"]/@title')
		sss = page_html.xpath('//dd[@class="job_request"]//span/text()')
		salary = sss[0].replace('/', '').strip()
		city = sss[1].replace('/', '').strip()
		experience = sss[2].replace('/', '').strip()
		education = sss[3].replace('/', '').strip()
		job_time = sss[4].replace('/', '').strip()
		self.job_information['salary'] = salary
		self.job_information['city'] = city
		self.job_information['experience'] = experience
		self.job_information['education'] = education
		self.job_information['job_time'] = job_time
		welfare = page_html.xpath('//dl[@id="job_detail"]//dd[@class="job-advantage"]//p/text()')[0]
		dutys = page_html.xpath('//dl[@id="job_detail"]//dd[@class="job_bt"]//p/text()')
		job_duty = ''
		for duty in dutys:
			job_duty += duty
		self.job_information['welfare'] = welfare
		self.job_information['job_duty'] = job_duty
		addrs = page_html.xpath('//dl[@id="job_detail"]//dd[@class="job-address clearfix"]/div[@class="work_addr"]/a/text()')
		addr = ''
		# the last element of addrs is always the map link, so drop it
		for i in range(len(addrs) - 1):
			addr += addrs[i]
		self.job_information['addr'] = addr
		time.sleep(random.randint(1, 4))
		json_job_info = json.dumps(self.job_information,ensure_ascii=False)
		with open('lagou_tianjin.json','a',encoding='utf8') as f:
			f.write(json_job_info)
			f.write('\n')

lagou = LaGou()
lagou.page_url()
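
Because job_info_onepage() appends one JSON object per line to lagou_tianjin.json, the output is in JSON Lines format rather than a single JSON document. A minimal sketch of loading it back into a list of dicts for further analysis (the jobs variable name is illustrative):

import json

jobs = []
with open('lagou_tianjin.json', encoding='utf8') as f:
	# each non-empty line is an independent JSON object
	for line in f:
		line = line.strip()
		if line:
			jobs.append(json.loads(line))
print(len(jobs), 'positions loaded')

Also note that page_url() never calls self.driver.quit(), so the browser window stays open after the crawl finishes; calling lagou.driver.quit() afterwards closes it.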