Python requests 用接口爬取拉勾网职位信息

最新推荐文章于 2025-05-11 10:24:34 发布

翻译最新推荐文章于 2025-05-11 10:24:34 发布 · 671 阅读

python爬虫专栏收录该内容

3 篇文章

订阅专栏

本文介绍了一个使用Python和requests库实现的针对拉勾网的职位信息爬虫实例。该爬虫能够抓取指定城市和关键字的职位详情，包括公司名称、规模、行业等，并将结果保存为文本文件。

# -*- coding:utf-8 -*-
# 参考：http://docs.python-requests.org/zh_CN/latest/user/quickstart.html
# The endpoint returns nested JSON objects; response.json() (or json.loads(response.text)) decodes them into Python dicts and lists.
__author__ = 'binsen'

import sys,requests,json
# Python 2 only: make UTF-8 the interpreter-wide default codec so implicit
# str <-> unicode conversions of the Chinese text below do not fail with
# ASCII codec errors.  Neither the `reload` builtin nor
# `sys.setdefaultencoding` exists on Python 3 (whose default is already
# UTF-8), so the hack is skipped there instead of raising NameError.
if sys.version_info[0] == 2:
    reload(sys)  # site.py removes setdefaultencoding at startup; reloading restores it
    sys.setdefaultencoding('utf-8')

# Lagou job-listing scraper: pages through the position-search AJAX endpoint,
# formats every returned job as one text line and appends the numbered lines
# to a text file.  Written to run unchanged on Python 2 and Python 3.

API_URL = 'https://www.lagou.com/jobs/positionAjax.json'
OUTPUT_PATH = 'C:/Users/bin.sun/Desktop/lagou.txt'
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.post can block forever

# Request headers are identical for every page, so they are built once.
# 'Content-Length' is intentionally not set: the body length changes with the
# page number and keyword, and requests computes the correct value itself.
# The Cookie value is a placeholder that must be replaced with a real session
# cookie before running.
HEADERS = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~',
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Referer': ('https://www.lagou.com/jobs/list_%E8%BD%AF%E4%BB%B6%E6%B5%8B%E8%AF%95'
                '?city=%E4%B8%8A%E6%B5%B7'
                '&cl=false&fromSearch=true&labelWords=&suginput='),
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'),
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest',
}


def extract_positions(response_json):
    """Return the list of job dicts found in a decoded API reply.

    The jobs live at ``reply['content']['positionResult']['result']``.  When
    any level of that path is missing or empty (for example when the server
    answers with an error object instead of search results) an empty list is
    returned instead of raising ``KeyError``.

    :param response_json: dict decoded from the endpoint's JSON body.
    :returns: list of job dicts, possibly empty.
    """
    content = response_json.get('content') or {}
    position_result = content.get('positionResult') or {}
    return position_result.get('result') or []


def format_position(x):
    """Render one job dict as a single comma-separated summary line.

    :param x: job dict; must contain the keys ``positionName``,
        ``companyFullName``, ``companySize``, ``industryField``,
        ``workYear`` and ``salary``.
    :returns: the formatted line (no trailing newline).
    :raises KeyError: if one of the required keys is absent.
    """
    return "岗位:%s,公司名:%s,规模:%s,业务:%s,工作年限:%s,待遇:%s" % (
        x['positionName'], x['companyFullName'], x['companySize'],
        x['industryField'], x['workYear'], x['salary'])


def number_lines(entries):
    """Prefix each entry with a 1-based, zero-padded 3-digit index.

    :param entries: iterable of text lines without trailing newline.
    :returns: list of ``'NNN <entry>\\n'`` strings, in input order.
    """
    return ['%03d %s\n' % (index, entry)
            for index, entry in enumerate(entries, start=1)]


def fetch_positions(page, keyword='python', city='杭州'):
    """POST one search-result page request and return its job dicts.

    :param page: 1-based result page number (sent as ``pn``).
    :param keyword: search keyword (sent as ``kd``).
    :param city: city filter (sent as ``city``).
    :returns: list of job dicts for that page; empty when the reply carries
        no results.
    :raises requests.RequestException: on connection problems, timeout or a
        non-2xx HTTP status.
    :raises ValueError: if the response body is not valid JSON.
    """
    # NOTE(review): 'first' is sent as 'true' for every page, exactly as the
    # original script did -- confirm whether the API expects 'false' after
    # the first page.
    payload = {'px': 'default', 'city': city, 'needAddtionalResult': 'false',
               'first': 'true', 'pn': page, 'kd': keyword}
    # A proxy can be used by additionally passing proxies={...} here; an
    # endpoint protected by HTTP basic auth would need auth=... as well.
    response = requests.post(API_URL, data=payload, headers=HEADERS,
                             timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return extract_positions(response.json())


def save_lines(lines, path=OUTPUT_PATH):
    """Append ``lines`` to the text file at ``path``, encoded as UTF-8.

    The file is opened (and therefore created) even when ``lines`` is empty.

    :param lines: iterable of text lines, each already ending in a newline.
    :param path: destination file; defaults to ``OUTPUT_PATH``.
    """
    import io  # local import: io.open accepts `encoding` on Python 2 and 3

    with io.open(path, 'a', encoding='utf-8') as out:
        for line in lines:
            # io.open only accepts text; on Python 2 a plain str is bytes.
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            out.write(line)


def main(first_page=1, last_page=14, keyword='python', city='杭州'):
    """Scrape pages ``first_page``..``last_page`` and append them to the output file.

    Paging stops early when a request fails or a page yields no jobs; whatever
    was collected up to that point is still written, so a failure on a late
    page no longer discards the earlier pages.
    """
    collected = []
    for page in range(first_page, last_page + 1):
        try:
            positions = fetch_positions(page, keyword, city)
        except (requests.RequestException, ValueError) as exc:
            sys.stderr.write('page %s failed, stopping: %s\n' % (page, exc))
            break
        if not positions:
            sys.stderr.write('page %s returned no positions, stopping\n' % page)
            break
        print('正在爬取拉钩网第 %s 页の %s 职位的信息...' % (str(page), keyword))
        collected.extend(format_position(x) for x in positions)
    save_lines(number_lines(collected))


if __name__ == '__main__':
    main()