遇到了许多问题,最终在一次次尝试下终于成功。
一开始没有意识到拉勾网的cookie在变化,总是不能正常爬取。查资料发现session可以共享信息,这才爬取到了一点正常信息。
爬取到一半时出现list index out of range错误,而且每次出错的网页都不相同,猜测是拉勾网后台数据在变化,返回了一个空的列表。使用if else判断后能够继续爬取,但同时也丢失了几条数据(大佬指点指点)。
import requests
from lxml import etree
import re
# One module-wide session so cookies are shared between all requests.
s = requests.Session()
# Browser-like User-Agent sent with the plain GET requests.
headers_1 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
}
def main(pages=1):
    """Crawl Lagou "python" job listings and print each posting's details.

    Args:
        pages: Number of result pages to fetch, starting at page 1.
            Defaults to 1, which matches the previous behaviour of
            stopping after the first page.
    """
    # The search page is requested first so the shared session holds the
    # cookies for the AJAX call; the same URL is also sent as the Referer.
    ref_url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
    data_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    # Same User-Agent as headers_1 plus the Referer; built once, not per page.
    ajax_headers = dict(headers_1, Referer=ref_url)
    data = {
        'first': 'false',
        'pn': 1,
        'kd': 'python',
        'sid': '15fc1c7c57184b6ab5137cb384bfa498',
    }
    for page in range(1, pages + 1):
        data['pn'] = page
        # The response body is not used; the request is made for its cookies.
        s.get(ref_url, headers=headers_1)
        get_data(data_url, ajax_headers, data)
def get_data(url, headers, data):
    """POST the search form and crawl the detail page of every listed position.

    Args:
        url: Address of the position-list JSON endpoint.
        headers: Request headers for the POST.
        data: Form fields for the POST; the caller sets the page number in 'pn'.
    """
    resp = s.post(url, headers=headers, data=data)
    payload = resp.json()
    results = payload['content']['positionResult']['result']
    for result in results:
        # Named position_id rather than id to avoid shadowing the builtin.
        position_id = result['positionId']
        data_detail_url = 'https://www.lagou.com/jobs/{}.html?show=ab8107555dd04359ad1490655d0066e3'.format(position_id)
        detail_data(data_detail_url)
def detail_data(url):
    """Fetch one job-detail page and print its title, summary fields and description.

    Parts of the page that cannot be found are skipped silently, so one
    incomplete page does not stop the crawl.

    Args:
        url: Address of the job-detail page.
    """
    res = s.get(url, headers=headers_1)
    html = etree.HTML(res.text)

    titles = html.xpath("//h2[@class='name']/text()")
    # Some responses come back without the expected nodes; skip instead of
    # failing with "list index out of range".
    if titles:
        print(titles[0])

    spans = html.xpath("//dd[@class='job_request']//span/text()")
    # Indexes 0, 1 and 3 are read below, so at least four entries are needed;
    # checking only for an empty list would still fail on a shorter one.
    if len(spans) >= 4:
        salary = spans[0]
        city = re_sub(spans[1])
        education = re_sub(spans[3])
        print(salary, city, education)

    # Job description paragraphs.
    for paragraph in html.xpath("//div[@class='job-detail']/p/text()"):
        print(paragraph)
def re_sub(data):
    """Return *data* with every whitespace character and '/' removed.

    Args:
        data: Text to clean.

    Returns:
        The cleaned string.
    """
    return re.sub(r'[\s/]', '', data)
# Start the crawl only when the file is run as a script, not when imported.
if __name__ == '__main__':
    main()