Scraping a Job Board with Python: A Lagou Job Crawler

This post walks through a Python script that scrapes job listings matching a given keyword from Lagou (lagou.com) and saves them to an Excel file. The number of pages to fetch is configurable, so the script can be used to collect job details across different cities.


```python
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 7 21:44:39 2020

@author: ASUS
"""


import requests
import time
import json
import xlwt

# Column layout of the output sheet. The names match the keys of Lagou's
# JSON job records (including Lagou's own misspellings such as 'linestaion'
# and 'skillLables'). positionId and positionName appear twice, preserving
# the original sheet layout.
COLUMNS = [
    'positionId', 'positionName', 'companyId', 'companyFullName', 'city',
    'companyLabelList', 'companyLogo', 'companyShortName', 'companySize',
    'createTime', 'district', 'education', 'financeStage', 'firstType',
    'formatCreateTime', 'industryField', 'jobNature', 'lastLogin',
    'latitude', 'linestaion', 'longitude', 'matchScore',
    'positionAdvantage', 'positionId', 'positionLables', 'positionName',
    'secondType', 'skillLables', 'stationname', 'subwayline', 'thirdType',
    'workYear',
]

workbook = xlwt.Workbook(encoding='utf-8')
mysheet = workbook.add_sheet('mysheet')

# Header row.
for col, name in enumerate(COLUMNS):
    mysheet.write(0, col, name)

def main(kd, pages, row):
    # Visit the search page first so the session picks up the cookies that
    # Lagou requires before it will serve the ajax endpoint.
    url1 = 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput='
    # Ajax endpoint that returns the job listings as JSON.
    url = "https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false"
    # Request headers.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'https://www.lagou.com/jobs/list_python?px=default&city=%E5%85%A8%E5%9B%BD',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Host': 'www.lagou.com'
    }
    # The POST body selects the keyword and which result page to fetch.
    for page in range(1, pages + 1):
        data = {
            'first': 'false',
            'pn': page,
            'kd': kd,
        }
        s = requests.Session()                       # fresh session per page
        s.get(url=url1, headers=headers, timeout=1)  # prime the cookies
        cookie = s.cookies
        respon = s.post(url=url, headers=headers, data=data, cookies=cookie, timeout=3)
        time.sleep(1)                                # throttle between requests
        result = json.loads(respon.text)
        info = result["content"]["positionResult"]["result"]
        print(len(info))                             # jobs returned on this page

        for j in info:
            for col, field in enumerate(COLUMNS):
                value = j[field]
                # Some fields (e.g. companyLabelList, skillLables) are lists
                # in the JSON, which xlwt cannot write; flatten them first.
                if isinstance(value, list):
                    value = ','.join(map(str, value))
                mysheet.write(row, col, value)
            row = row + 1
    workbook.save('py3.xls')

# Fetch the job JSON for the requested keyword and number of pages.
kd = input('Keyword: ')
pages = int(input('Number of pages to crawl: '))
main(kd, pages, 1)
```

A sample of the JSON returned by the ajax endpoint (truncated):

```
{"resubmitToken":null,"requestId":null,"msg":null,"success":true,"content":{"hrInfoMap":{"6187967":{"userId":11765418,"phone":null,"positionName":"招聘经理",........."pageSize":15},"code":0}
```

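To sanity-check the output, the saved sheet can be loaded back, for example with pandas (an assumption here, not something the script itself uses; reading legacy `.xls` files also requires the `xlrd` package):

```python
import pandas as pd

# pandas delegates .xls parsing to xlrd (pip install pandas xlrd).
df = pd.read_excel('py3.xls', sheet_name='mysheet')
print(df.shape)                      # (rows, 32)
print(df['companyFullName'].head())  # spot-check a few records
```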