import requests
import time
import numpy as np
import pandas as pd
# Endpoint that every search request below is POSTed to.
base_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

# HTTP headers sent with every request.
# NOTE(review): the Cookie below is a captured browser session (it carries
# JSESSIONID and login=true) committed in source. Consider loading it from
# the environment or a local, untracked file instead.
headers = {
    # Written as two adjacent literals: a single-quoted string may not span
    # physical lines, and a header value must not contain a newline.
    "Cookie": (
        "user_trace_token=20180806162737-937aceec-9952-11e8-a341-5254005c3644; LGUID=20180806162737-937ad172-9952-11e8-a341-5254005c3644; JSESSIONID=ABAAABAABEEAAJAE50BCF139F0736172F1F0188EC151863; _gat=1; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=bzclk.baidu.com; PRE_SITE=http%3A%2F%2Fbzclk.baidu.com%2Fadrc.php%3Ft%3D06KL00c00f7Ghk60yUKm0FNkUsjkuPdu00000PW4pNb00000LCecjM.THL0oUhY1x60UWY4rj0knj03rNqbusK15yDLnWfkuWN-nj0sn103rHm0IHdDPbmzPjI7fHn3f1m3PDnsnH9anDFArH6LrHm3PHcYf6K95gTqFhdWpyfqn101n1csPHnsPausThqbpyfqnHm0uHdCIZwsT1CEQLILIz4_myIEIi4WUvYE5LNYUNq1ULNzmvRqUNqWu-qWTZwxmh7GuZNxTAn0mLFW5HDLP1Rv%26tpl%3Dtpl_10085_15730_11224%26l%3D1500117464%26attach%3Dlocation%253D%2526linkName%253D%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253D%2525E3%252580%252590%2525E6%25258B%252589%2525E5%25258B%2525BE%2525E7%2525BD%252591%2525E3%252580%252591%2525E5%2525AE%252598%2525E7%2525BD%252591-%2525E4%2525B8%252593%2525E6%2525B3%2525A8%2525E4%2525BA%252592%2525E8%252581%252594%2525E7%2525BD%252591%2525E8%252581%25258C%2525E4%2525B8%25259A%2525E6%25259C%2525BA%2526xp%253Did%28%252522m6c247d9c%252522%29%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FH2%25255B1%25255D%25252FA%25255B1%25255D%2526linkType%253D%2526checksum%253D220%26ie%3Dutf8%26f%3D8%26ch%3D2%26tn%3D98010089_dg%26wd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26oq%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rqlang%3Dcn%26oe%3Dutf8; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F%3Futm_source%3Dm_cf_cpt_baidu_pc; _putrc=347EB76F858577F7; login=true; unick=%E6%9D%8E%E5%87%AF%E6%97%8B; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=63; TG-TRACK-CODE=index_search; _gid=GA1.2.1110077189.1507624453; _ga=GA1.2.1827851052.1507624453; LGSID=20171011082529-afc7b124-ae1a-11e7-87db-525400f775ce; LGRID=20171011082545-b94d70d5-ae1a-11e7-87db-525400f775ce; "
        "Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1507444213,1507624453,1507625209,1507681531; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1507681548; SEARCH_ID=e420ce4ae5a7496ca8acf3e7a5490dfc; index_location_city=%E5%8C%97%E4%BA%AC"
    ),
    "Host": "www.lagou.com",
    'Origin': 'https://www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3408.400 QQBrowser/9.6.12028.400'
}

# Proxy mapping handed to requests.
# NOTE(review): only an 'http' entry is defined while base_url is https;
# requests picks a proxy by URL scheme, so confirm whether this proxy is
# actually meant to apply to the search requests.
proxies = {
    'http': 'http://139.0.28.18:8080',
}

# Column name -> list of collected values. Each key is also the field name
# read from every search result, and becomes a CSV column on export.
infos = {
    'city': [],
    'district': [],
    'companyFullName': [],
    'positionName': [],
    'salary': [],
    'workYear': [],
    'companySize': [],
    'education': [],
    'financeStage': [],
    'industryField': [],
    'jobNature': [],
    'positionAdvantage': [],
}
"""
"""
# Issue 30 search requests (pn = 1..30) and, for every posting returned,
# append one value to each column list in ``infos``.
for i in range(1, 31):
    print(i)
    # Random pause of up to 15 seconds before each request.
    time.sleep(np.random.rand() * 15)
    data = {
        'first': 'false',
        'pn': str(i),  # loop counter; presumably the result page number
        'kd': '数据分析师',  # presumably the search keyword
    }
    try:
        # The timeout keeps one stalled connection from hanging the run.
        response = requests.post(url=base_url, data=data, headers=headers,
                                 proxies=proxies, timeout=30)
        response.raise_for_status()
        html = response.json()
        results = html['content']['positionResult']['result']
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        # Network/HTTP failure, a non-JSON body, or JSON without the expected
        # content -> positionResult -> result nesting. Stop here so the rows
        # already collected are still exported below instead of being lost.
        print('page %d failed: %r -- stopping early' % (i, exc))
        break
    for result in results:
        # Exactly one append per column keeps all lists the same length,
        # which pd.DataFrame requires; a missing field is recorded as None.
        for field, values in infos.items():
            values.append(result.get(field))

# Export everything collected. The gb18030 encoding is kept from the
# original script.
ddata = pd.DataFrame(infos)
ddata.to_csv('lagou.csv', index=False, encoding='gb18030')