Preliminary code
# -*- encoding: utf-8 -*-
from com.lagou.crawl.WebRequest import *
from com.lagou.crawl.mysqldb import SQL
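# WebRequest supplies get_home_proxy() and header_lagou(); SQL wraps the MySQL table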
import time, json, random, math, requests, logging
# Logging format
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(filename='lagou.log', level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)
crawl_positions = ['大数据']  # positions to crawl (Lagou expects the Chinese keyword, here "Big Data")
crawl_citys = ['成都']  # cities to crawl (here "Chengdu")
work_exps = ['3年及以下', '3-5年', '5-10年', '10年以上']  # experience filters: <=3 yrs, 3-5, 5-10, >10 yrs
proxy_list = get_home_proxy()  # fetch the proxy IP pool
def index_page():
logging.info('begin to sending request')
for position in crawl_positions:
for city in crawl_citys:
for work_exp in work_exps:
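                # positionAjax.json is the JSON endpoint behind Lagou's search page;
                # px=new sorts newest first, gj carries the work-experience filter,
                # and referer_url is the list page a browser would have come from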
crawl_url = 'https://www.lagou.com/jobs/positionAjax.json?gj={gj}&px=new&city={city}&needAddtionalResult=false'.format(
gj=work_exp, city=city)
referer_url = 'https://www.lagou.com/jobs/list_{position}?px=new&gj={gj}&city={city}'.format(
position=position, gj=work_exp, city=city)
                ses = requests.session()  # one session per (position, city, experience) query
                # Build the matching headers (Referer, User-Agent, ...) for this query
                header = header_lagou(position, city, work_exp)
                ses.headers.update(header)
                # Pick a random proxy for this session; assumes get_home_proxy()
                # returns dicts in requests' format, e.g. {'http': ..., 'https': ...}
                ses.proxies = random.choice(proxy_list)
try:
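                    # Warm up the session: GET the list page first so the server sets
                    # the cookies that the positionAjax.json endpoint expects to see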
ses.get(referer_url)
response = ses.post(url=crawl_url, data={
'first': 'true',
'pn': '1',
'kd': position
})
response.encoding = "utf-8"
data = json.loads(response.text)
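                    # Lagou returns msg == None on success; when it blocks a client the
                    # field carries a message (commonly "操作太频繁", i.e. "too frequent")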
                    if data['msg'] is None:
                        resultSize = data['content']['positionResult']['resultSize']  # positions per page
                        totalCount = data['content']['positionResult']['totalCount']  # total matching positions
                        total_page = get_page_num(totalCount, resultSize)  # pages to crawl
                        # Read the newest positionId saved last time; it decides whether
                        # this run is a full crawl or an incremental one
                        logging.info('begin to get latest update time from database')
                        latest_positionId = SQL().get_latest_positionId()
                        logging.info('begin to crawl data')
for page_num in range(1, total_page + 1):
                            if page_num == 1:
                                print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                                      "crawling position: {}, city: {}, experience: {}, total: {}, pages: {}, page {}".format(
                                          position, city, work_exp, totalCount, total_page, page_num))
                                dt = data['content']['positionResult']['result']
                                # break_flag turns True once get_result() meets the positionId
                                # stored by the previous run, i.e. older rows are already saved
                                job_list, break_flag = get_result(dt, latest_positionId)
                                if job_list:
                                    # Remember the newest positionId as the marker for the next run
                                    new_positionId = job_list[0].get("position_id")
                                    update_condition = (new_positionId, position, city, work_exp)
                                    # Save this page and advance the crawl marker
                                    save_result(job_list, update_condition)
                                print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), "crawl succeeded")
                                if break_flag:
                                    break
else:
                                time.sleep(random.randint(1, 10))  # random delay to lower the risk of being blocked
                                print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                                      "crawling position: {}, city: {}, experience: {}, total: {}, pages: {}, page {}".format(
                                          position, city, work_exp, totalCount, total_page, page_num))
                                resp = ses.post(url=crawl_url, data={
                                    'first': 'false',
                                    'pn': page_num,
                                    'kd': position
                                })
                                resp.encoding = "utf-8"
                                resp_result = json.loads(resp.text)
                                job_result = resp_result['content']['positionResult']['result']  # position details
                                job_list, break_flag = get_result(job_result, latest_positionId)
                                # The crawl marker was already advanced on page 1, so pass an
                                # empty update condition and just insert the rows
                                save_result(job_list, ())
                                if break_flag:
                                    break
                except Exception as e:
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), "crawl failed:", e)
    return
# Save the crawl results to the database
def save_result(result, update_condition):
    if not result:
        return
    sql = SQL()
    try:
        if len(update_condition) == 0:
            # Incremental page: only insert the rows
            for res in result:
                sql.insert('lagou_original', **res)
        else:
            # First page: advance the stored crawl marker, then insert
            sql.update_positionId(update_condition)
            for res in result:
                sql.insert('lagou_original', **res)
        logging.info("save data success")
    except Exception as e:
        logging.error("save data failed: %s", e)
# Normalize the raw API results and detect where the previous crawl left off
def get_result(results, latest_positionId):
    crawl_results = []
    flag = False
    for result in results:
        positionId = result['positionId']
        # Stop at the positionId stored by the previous run: from here on,
        # every row is already in the database
        if positionId and positionId == latest_positionId:
            flag = True
            break
crawl_results.append({
'position_id': positionId,
'position_name': result['positionName'],
'job_nature': result['jobNature'],
'education': result['education'],
'work_year': result['workYear'],
'salary': result['salary'],
'city': result['city'],
'position_advantage': result['positionAdvantage'],
'position_lables': ";".join(result['positionLables']),
'skill_lables': ";".join(result['skillLables']),
'is_school_job': result['isSchoolJob'],
'create_time': result['createTime'],
'company_full_name': result['companyFullName'],
'company_short_name': result['companyShortName'],
'finance_stage': result['financeStage'],
'company_size': result['companySize'],
'company_label_list': ";".join(result['companyLabelList']),
'district': result['district'],
'industry_field': result['industryField']
})
return (crawl_results, flag)
# Compute the number of pages to crawl
def get_page_num(totalCount, resultSize):
    res = math.ceil(totalCount / resultSize)  # round up: total positions / positions per page
    # Lagou displays at most 30 pages of results
    return min(res, 30)
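# e.g. get_page_num(120, 15) -> 8; get_page_num(900, 15) -> 30 (capped)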
if __name__ == '__main__':
index_page()
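Two helpers imported from com.lagou.crawl.WebRequest are not shown in this listing: get_home_proxy(), which the code above assumes returns a list of proxy dicts in the {'http': ..., 'https': ...} shape that requests understands, and header_lagou(), which builds the per-query request headers. The sketch below is only a guess at the shape of header_lagou(), assuming it does nothing more than fill in the headers Lagou's anti-crawler is commonly reported to check (Referer, User-Agent, X-Requested-With); the concrete values are illustrative, not the original helper.

# Hypothetical sketch of header_lagou(); the real helper lives in
# com.lagou.crawl.WebRequest and may differ.
from urllib.parse import quote

def header_lagou(position, city, work_exp):
    # The Referer must look like the list page a browser would have visited;
    # without it the positionAjax.json endpoint tends to reject the POST
    referer = 'https://www.lagou.com/jobs/list_{}?px=new&gj={}&city={}'.format(
        quote(position), quote(work_exp), quote(city))
    return {
        'Referer': referer,
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/70.0.3538.110 Safari/537.36'),
        'X-Requested-With': 'XMLHttpRequest',  # marks the POST as an Ajax call
        'Accept': 'application/json, text/javascript, */*; q=0.01',
    }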