Straight to the code this time. Compared with the Zhilian Zhaopin data from the previous article, a lot more data can be scraped from 51job (前程无忧).
URL: https://search.51job.com/list/040000,000000,0000,00,9,99,%20,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
Simplified: https://search.51job.com/list/040000,000000,0000,00,9,99,%20,2,1.html?
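The last field in the path (the 1 before .html) is the page number, so listing-page URLs can be generated by formatting it in. A quick sketch using the same template the spider's crawl() fills in below:

base = 'https://search.51job.com/list/040000,000000,0000,00,9,99,%2520,2,{}.html?'
page_urls = [base.format(page) for page in range(1, 4)]  # pages 1 to 3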
By comparison, the data gathered here is better suited for practicing data analysis.
The crawling method and steps are the same as in the Zhilian Zhaopin article; both rely on the third-party library requests.
import requests
import re
import os
import time
import json

class Spider(object):
    page_count = 0  # running count of saved records, shared across the class

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
        # keep the output directory on the instance instead of a module-level global
        self.path = './前程无忧招聘网/'
        if not os.path.exists(self.path):
            os.mkdir(self.path)

    def response(self, url, headers):
        """Request a URL and return the response, or None on failure."""
        try:
            # headers must be passed by keyword: requests.get(url, headers)
            # would send the dict as query parameters, not request headers
            response = requests.get(url, headers=headers)
            response.encoding = response.apparent_encoding
            return response
        except requests.RequestException:
            print('访问失败')
            return None

    def parse(self, response):
        """Extract each job title and its detail-page link from a listing page."""
        res = re.findall(r'<span>\s+<a target="_blank" title="(.*?)" href="(.*?)" .*?</a>',
                         response.text, re.S)
        return res

    def parse_S(self, response):
        """Pull salary, company name, city and headcount out of a detail page."""
        try:
            price = re.findall(r'<div class="cn".*?<strong>(.*?)</strong>', response.text, re.S)[0]
        except IndexError:
            price = None
        try:
            company_name = re.findall(r'class="cname">.*?title="(.*?)" class.*?<em.*?', response.text, re.S)[0]
        except IndexError:
            company_name = None
        try:
            add = re.findall(r'<p class="msg.*?title="(.*?) ', response.text, re.S)[0]
        except IndexError:
            add = None
        try:
            num = re.findall(r'<p class="msg ltype".*?招(\d+)人 ', response.text, re.S)[0]
        except IndexError:
            num = '若干人'
        detail = {}
        detail['公司名称'] = company_name
        detail['工作城市'] = add
        detail['招聘人数'] = num
        detail['工资情况'] = price
        return detail

    def getContent(self, url):
        response = self.response(url, self.headers)
        if response is None:  # request failed; skip parsing
            return None
        return self.parse_S(response)

    def save(self, items):
        Spider.page_count += 1
        print(Spider.page_count)
        # append one JSON object per line (JSON Lines): str(dict) is not valid
        # JSON, and ensure_ascii=False keeps the Chinese keys readable
        with open(self.path + "前程无忧招聘.json", "a", encoding='utf-8') as fp:
            fp.write(json.dumps(items, ensure_ascii=False) + '\n')

    def crawl(self, page):
        crawl_url = 'https://search.51job.com/list/040000,000000,0000,00,9,99,%2520,2,{}.html?'.format(page)
        response = self.response(crawl_url, self.headers)
        if response is None:
            return
        for name, url in self.parse(response):
            # build a fresh dict per job so records don't overwrite each other
            items = {'name': name, 'detail': self.getContent(url)}
            self.save(items)

    def main(self):
        for page in range(1, 101):  # listing pages are numbered from 1
            self.crawl(page)

if __name__ == '__main__':
    start = time.time()
    c = Spider()
    c.main()
    end = time.time()
    print(end - start)
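To sanity-check the parse_S patterns without hitting the site, you can run them against a small hand-written fragment. Note that the HTML below is a hypothetical stand-in shaped like a 51job detail page, not a captured one:

import re

html = '''<div class="cn"><h1>测试岗位</h1><strong>6-8千/月</strong></div>
<p class="cname"><a title="某某科技有限公司" class="catn"><em class="icon"></em></a></p>
<p class="msg ltype" title="长沙 |3-4年经验|本科|招2人 |01-01发布">...</p>'''

price = re.findall(r'<div class="cn".*?<strong>(.*?)</strong>', html, re.S)[0]
company = re.findall(r'class="cname">.*?title="(.*?)" class.*?<em.*?', html, re.S)[0]
num = re.findall(r'<p class="msg ltype".*?招(\d+)人 ', html, re.S)[0]
print(price, company, num)  # 6-8千/月 某某科技有限公司 2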
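Since save() appends one JSON object per line, the output loads straight into a DataFrame for the data-analysis practice mentioned at the top. A minimal sketch, assuming pandas is installed and the default output path is unchanged:

import json
import pandas as pd

with open('./前程无忧招聘网/前程无忧招聘.json', encoding='utf-8') as fp:
    rows = [json.loads(line) for line in fp if line.strip()]

df = pd.json_normalize(rows)  # flattens nested keys like detail.公司名称
print(df.head())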
If you see anything that could be improved, or anything that wasn't done well, I hope you'll point it out so we can discuss it.