import re
import random
import requests
from openpyxl import Workbook
from bs4 import BeautifulSoup
# Fetch the raw HTML of a page
def get_html(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: On connection failures or
            when the timeout expires.
    """
    head = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'
    }
    # Candidate proxy addresses; one is picked at random for every request.
    # NOTE(review): only the 'http' scheme is mapped below, so requests to
    # https:// URLs presumably bypass these proxies - confirm this is intended.
    proxiesl = ['183.62.196.10:3128', '112.115.57.20:3128', '183.129.244.16', '114.113.126.86']
    # Send the request
    r = requests.get(
        url,
        headers=head,
        proxies={'http': random.choice(proxiesl)},
        timeout=timeout,
    )
    # Force UTF-8 instead of relying on the encoding guessed from the headers.
    r.encoding = 'utf-8'
    return r.text
# Collected rows; each entry is [position, salary, education, experience, company]
datalist = []


# Clean the data: parse one result page into rows
def get_data(data):
    """Extract job postings from a result page and append them to ``datalist``.

    Each ``<li>`` under the ``job-content`` container contributes one row of
    the form ``[position, salary, education, experience, company]``. Items
    that lack any of the expected nodes or attributes are skipped so that a
    single odd entry (or a page without a result list) does not abort the
    whole crawl.

    Args:
        data: HTML text of a search-result page.

    Returns:
        The module-level ``datalist``, including the rows added by this call.
    """
    soup = BeautifulSoup(data, 'html.parser')
    container = soup.find('div', attrs={'class': 'job-content'})
    if container is None:
        # The page has no result list, so there is nothing to collect.
        return datalist

    for li in container.find_all('li'):
        job_info = li.find('div', attrs={'class': 'job-info'})
        condition = li.find('p', attrs={'class': 'condition clearfix'})
        company_info = li.find('div', attrs={'class': 'company-info nohover'})
        if job_info is None or condition is None or company_info is None:
            continue

        heading = job_info.find('h3')
        company_link = company_info.find('a')
        if heading is None or company_link is None:
            continue

        # The title attribute packs several fields separated by '_':
        # salary, work location, education, work experience.
        fields = condition.get('title', '').split('_')
        company_title = company_link.get('title')
        if len(fields) < 4 or company_title is None:
            continue

        # Position
        gangwei = heading.text.strip()
        # Salary
        xinzi = fields[0]
        # Education (fields[1], the work location, is not exported)
        xueli = fields[2]
        # Work experience
        jinyan = fields[3]
        # Company: the first two characters of the link title are dropped
        # (presumably a fixed prefix - verify against the page markup).
        qiye = company_title[2:]

        datalist.append([gangwei, xinzi, xueli, jinyan, qiye])
    return datalist
# Crawl the first five result pages for Beijing, Shanghai, Guangzhou and Shenzhen
list1 = ['010', '020', '050020', '050090']  # Beijing, Shanghai, Guangzhou, Shenzhen

# Search URL with placeholders for the city code (dqs) and the page index
# (curPage). The '&degradeFlag=0' parameter had been corrupted to
# '°radeFlag=0' ('&deg' decoded as an HTML entity), which glued the rest of
# the parameter onto the dqs value and broke the city filter.
url_template = (
    'https://www.liepin.com/zhaopin/?ckid=79036dc062e7d2ed&fromSearchBtn=2&init=-1'
    '&sfrom=click-pc_homepage-centre_searchbox-search_new'
    '&dqs={city}&degradeFlag=0'
    '&key=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90'
    '&headckid=2a5bfc993f2f70df&d_pageSize=40'
    '&siTag=ZFDYQyfloRvvhTxLnVV_Qg~F5FSJAXvyHmQyODXqGxdVw'
    '&d_headId=f39665a0eb9cccfef1dd0eebd3ac439e'
    '&d_ckId=30a683201b175a5dc632b9732087c357'
    '&d_sfrom=search_fp&d_curPage=2&curPage={page}'
)

for city_code in list1:
    for page in range(5):
        urls = url_template.format(city=city_code, page=page)
        data = get_html(urls)
        # get_data appends the parsed rows to the module-level datalist.
        get_data(data)
# Save the collected rows to an Excel workbook
def saveExccel():
    """Write the rows in ``datalist`` to an .xlsx file.

    Row 1 holds the column headers; every entry of ``datalist`` follows as
    one row, starting at row 2. The workbook is saved under a fixed relative
    file name, so it lands in the current working directory.
    """
    wb = Workbook()
    # A new Workbook already contains one worksheet. Reuse it instead of
    # calling create_sheet(), which left an empty default sheet in front of
    # the sheet holding the data.
    sheet = wb.active
    sheet.title = '人才网北、上、广、深招聘信息'

    headers = ('岗位', '薪资', '学历', '工作经验', '企业')
    for col_idx, header in enumerate(headers, start=1):
        sheet.cell(row=1, column=col_idx).value = header

    for row_idx, row in enumerate(datalist, start=2):
        for col_idx, value in enumerate(row, start=1):
            sheet.cell(row=row_idx, column=col_idx).value = value

    wb.save('人才网北上广深招聘信息.xlsx')


saveExccel()