Preliminary code
# -*- encoding: utf-8 -*-
from com.lagou.crawl.WebRequest import *
from com.lagou.crawl.mysqldb import SQL
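# WebRequest supplies get_home_proxy() and header_lagou(); SQL wraps the MySQL table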
import time, json, random, math, requests, logging
# Logging format
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(filename='lagou.log', level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)
crawl_positions = ['大数据']  # positions to crawl (Lagou expects the Chinese keyword, here "Big Data")
crawl_citys = ['成都']  # cities to crawl (here "Chengdu")
work_exps = ['3年及以下', '3-5年', '5-10年', '10年以上']  # experience filters: <=3 yrs, 3-5, 5-10, >10 yrs
proxy_list = get_home_proxy()  # fetch the proxy IP pool
def index_page():
logging.info('begin to sending request')
for position in crawl_positions:
for city in crawl_citys:
for work_exp in work_exps:
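                # positionAjax.json is the JSON endpoint behind Lagou's search page;
                # px=new sorts newest first, gj carries the work-experience filter,
                # and referer_url is the list page a browser would have come from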
crawl_url = 'https://www.lagou.com/jobs/positionAjax.json?gj={gj}&px=new&city={city}&needAddtionalResult=false'.format(
gj=work_exp, city=city)
referer_url = 'https://www.lagou.com/jobs/list_{position}?px=new&gj={gj}&city={city}'.format(
position=position, gj=work_exp, city=city)
                ses = requests.session()  # one session per (position, city, experience) query
                # Build the matching headers (Referer, User-Agent, ...) for this query
                header = header_lagou(position, city, work_exp)
                ses.headers.update(header)
                # Pick a random proxy for this session; assumes get_home_proxy()
                # returns dicts in requests' format, e.g. {'http': ..., 'https': ...}
                ses.proxies = random.choice(proxy_list)
try:
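                    # Warm up the session: GET the list page first so the server sets
                    # the cookies that the positionAjax.json endpoint expects to see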
ses.get(referer_url)
response = ses.post(url=crawl_url, data={
'first': 'true',
'pn': '1',
'kd': position
})
response.encoding = "utf-8"
data = json.loads(response.text)
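                    # Lagou returns msg == None on success; when it blocks a client the
                    # field carries a message (commonly "操作太频繁", i.e. "too frequent")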
                    if data['msg'] is None:
                        resultSize = data['content']['positionResult']['resultSize']  # positions per page
                        totalCount = data['content']['positionResult']['totalCount']  # total matching positions
                        total_page = get_page_num(totalCount, resultSize)  # pages to crawl
                        # Read the newest positionId saved last time; it decides whether
                        # this run is a full crawl or an incremental one
                        logging.info('begin to get latest update time from database')
                        latest_positionId = SQL().get_latest_positionId()
                        logging.info('begin to crawl data')
for page_num in range(1, total_page + 1):
                            if page_num == 1:
                                print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                                      "crawling position: {}, city: {}, experience: {}, total: {}, pages: {}, page {}".format(
                                          position, city, work_exp, totalCount, total_page, page_num))
                                dt = data['content']['positionResult']['result']
                                # break_flag turns True once get_result() meets the positionId
                                # stored by the previous run, i.e. older rows are already saved
                                job_list, break_flag = get_result(dt, latest_positionId)
                                if job_list:
                                    # Remember the newest positionId as the marker for the next run
                                    new_positionId = job_list[0].get("position_id")
                                    update_condition = (new_positionId, position, city, work_exp)
                                    # Save this page and advance the crawl marker
                                    save_result(job_list, update_condition)
                                print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), "crawl succeeded")
                                if break_flag:
                                    break
else:
                                time.sleep(random.randint(1, 10))  # random delay to lower the risk of being blocked
                                print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                                      "crawling position: {}, city: {}, experience: {}, total: {}, pages: {}, page {}".format(
                                          position, city, work_exp, totalCount, total_page, page_num))
                                resp = ses.post(url=crawl_url, data={
                                    'first': 'false',
                                    'pn': page_num,
                                    'kd': position
                                })
                                resp.encoding = "utf-8"
                                resp_result = json.loads(resp.text)
                                job_result = resp_result['content']['positionResult']['result']  # position details
                                job_list, break_flag = get_result(job_result, latest_positionId)
                                # The crawl marker was already advanced on page 1, so pass an
                                # empty update condition and just insert the rows
                                save_result(job_list, ())
                                if break_flag:
                                    break
                except Exception as e:
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), "crawl failed:", e)
    return
# Save the crawl results to the database
def save_result(result, update_condition):
    if not result:
        return
    sql = SQL()
    try:
        if len(update_condition) == 0:
            # Incremental page: only insert the rows
            for res in result:
                sql.insert('lagou_original', **res)
        else:
            # First page: advance the stored crawl marker, then insert
            sql.update_positionId(update_condition)
            for res in result:
                sql.insert('lagou_original', **res)
        logging.info("save data success")
    except Exception as e:
        logging.error("save data failed: %s", e)
# Normalize the raw API results and detect where the previous crawl left off
def get_result(results, latest_positionId):
    crawl_results = []
    flag = False
    for result in results:
        positionId = result['positionId']
        # Stop at the positionId stored by the previous run: from here on,
        # every row is already in the database
        if positionId and positionId == latest_positionId:
            flag = True
            break
crawl_results.append({
'position_id': positionId,
'position_name': result['positionName'],
'job_nature': result['jobNature'],
'education': result['education'],
'work_year': result['workYear'],
'salary': result['salary'],
'city': result['city'],
'position_advantage': result['positionAdvantage'],
'position_lables': ";".join(result['positionLables']),
'skill_lables': ";".join(result['skillLables']),
'is_school_job': result['isSchoolJob'],
'create_time': result['createTime'],
'company_full_name': result['companyFullName'],
'company_short_name': result['companyShortName'],
'finance_stage': result['financeStage'],
'company_size': result['companySize'],
'company_label_list': ";".join(result['companyLabelList']),
'district': result['district'],
'industry_field': result['industryField']
})
return (crawl_results, flag)
# Compute the number of pages to crawl
def get_page_num(totalCount, resultSize):
    res = math.ceil(totalCount / resultSize)  # round up: total positions / positions per page
    # Lagou displays at most 30 pages of results
    return min(res, 30)
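# e.g. get_page_num(120, 15) -> 8; get_page_num(900, 15) -> 30 (capped)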
if __name__ == '__main__':
index_page()
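Two helpers imported from com.lagou.crawl.WebRequest are not shown in this listing: get_home_proxy(), which the code above assumes returns a list of proxy dicts in the {'http': ..., 'https': ...} shape that requests understands, and header_lagou(), which builds the per-query request headers. The sketch below is only a guess at the shape of header_lagou(), assuming it does nothing more than fill in the headers Lagou's anti-crawler is commonly reported to check (Referer, User-Agent, X-Requested-With); the concrete values are illustrative, not the original helper.

# Hypothetical sketch of header_lagou(); the real helper lives in
# com.lagou.crawl.WebRequest and may differ.
from urllib.parse import quote

def header_lagou(position, city, work_exp):
    # The Referer must look like the list page a browser would have visited;
    # without it the positionAjax.json endpoint tends to reject the POST
    referer = 'https://www.lagou.com/jobs/list_{}?px=new&gj={}&city={}'.format(
        quote(position), quote(work_exp), quote(city))
    return {
        'Referer': referer,
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/70.0.3538.110 Safari/537.36'),
        'X-Requested-With': 'XMLHttpRequest',  # marks the POST as an Ajax call
        'Accept': 'application/json, text/javascript, */*; q=0.01',
    }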