# Straight to the code; the crawler's approach is explained in the comments.
import requests
from lxml import etree
import time
import re
'''
1.需求分析
title gsmc gz addr jy xl fuli
入口地址:https://www.zhaopin.com/
2.源码实现
所有职位分类标签://div[@class='zp-jobNavigater-pop-list']/a
职位详细列表:https://sou.zhaopin.com/?jl=489&kw=Java%E5%BC%80%E5%8F%91&kt=3
3.代码实现
'''
# 1.获取职位标签
def get_job_tag(url):
    """Download the page at ``url`` and return its job-category labels.

    Args:
        url: Address of the page to fetch.

    Returns:
        The result of the XPath query: the text nodes of every ``<a>`` that
        is a direct child of ``<div class="zp-jobNavigater-pop-list">``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Send a desktop-browser User-Agent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    # Fetch the address the caller passed in (the original read an undefined
    # ``starturl`` here). requests has no default timeout, so bound the wait
    # to keep a stalled server from hanging the crawler.
    page_source = requests.get(url, headers=headers, timeout=10).text
    # Parse the markup into an element tree.
    tree = etree.HTML(page_source)
    # Collect the job-category labels and hand them back to the caller
    # (the original computed this list and then returned nothing).
    job_tag = tree.xpath("//div[@class='zp-jobNavigater-pop-list']/a/text()")
    return job_tag