2023年网络爬虫实训(第四天)

该代码实现了从腾讯招聘网站抓取特定岗位类别(如AI、算法与大数据)的职位信息,包括岗位名称、发布时间、工作地点和职责。同时,通过聚合数据平台获取世界杯赛程数据,解析并存储为JSON文件。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

任务1:完成腾讯招聘案例。掌握请求地址的分析方法,掌握二级页面的解析方式。 

岗位性质.json 文件内容示例:
{
     "category": [
         {
             "岗位性质": "设计类",
             "岗位性质代码": "40002",
             "子分类": {
                 "设计类": "40002001",
                 "游戏美术类": "40002002"
             }
         },
         {
             "岗位性质": "技术类",
             "岗位性质代码": "40001",
             "子分类": {
                 "技术研发类": "40001001",
                 "质量管理类": "40001002",
                 "技术运营类": "40001003",
                 "安全技术类": "40001004",
                 "AI、算法与大数据": "40001005",
                 "企管类": "40001006"
             }
         }
     ]
}



# https://careers.tencent.com/tencentcareer/api/post/Query?
# timestamp=1673052338226&
# parentCategoryId=40001&
# pageIndex=1&
# pageSize=10&
# language=zh-cn&
# area=cn

# 获取具体的岗位性质下的岗位
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1673053787347&categoryId=40001001,40001002&pageIndex=1&pageSize=10&language=zh-cn&area=cn
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1673052338226&CategoryId=40001001,40001002&pageIndex=1&pageSize=10&language=zh-cn&area=cn

# 岗位详情
# https://careers.tencent.com/tencentcareer/api/post/ByPostId?
# timestamp=1673056178640&
# postId=1575268332712501248&
# language=zh-cn

import requests
import json
import time
# { 岗位性质.json
#     "category": [
#         {
#             "岗位性质": "设计类",
#             "岗位性质代码": "40002",
#             "子分类": {
#                 "设计类": "40002001",
#                 "游戏美术类": "40002002"
#             }
#         },
#         {
#             "岗位性质": "技术类",
#             "岗位性质代码": "40001",
#             "子分类": {
#                 "技术研发类": "40001001",
#                 "质量管理类": "40001002",
#                 "技术运营类": "40001003",
#                 "安全技术类": "40001004",
#                 "AI、算法与大数据": "40001005",
#                 "企管类": "40001006"
#             }
#         }
#     ]
# }


# https://careers.tencent.com/tencentcareer/api/post/Query?
# timestamp=1673052338226&
# parentCategoryId=40001&
# pageIndex=1&
# pageSize=10&
# language=zh-cn&
# area=cn

# 获取具体的岗位性质下的岗位
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1673053787347&categoryId=40001001,40001002&pageIndex=1&pageSize=10&language=zh-cn&area=cn
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1673052338226&CategoryId=40001001,40001002&pageIndex=1&pageSize=10&language=zh-cn&area=cn

# 岗位详情
# https://careers.tencent.com/tencentcareer/api/post/ByPostId?
# timestamp=1673056178640&
# postId=1575268332712501248&
# language=zh-cn
class TXJob:
    """Scraper for Tencent careers (careers.tencent.com) job postings.

    Fetches one listing page for a given job-category code, enriches each
    posting with its detail-page requirements, prints every record, and
    writes the combined result to ./json解析腾讯招聘/json解析腾讯招聘5.json.
    """

    def __init__(self):
        # 1. Listing-page API endpoint.
        self.url = 'https://careers.tencent.com/tencentcareer/api/post/Query'
        # 2. Browser-like User-Agent so the API does not reject the request.
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}

    def get_response(self, timestamp, categoryId, pageSize):
        """Request the listing page and return the decoded JSON text.

        :param timestamp: 13-digit millisecond timestamp string.
        :param categoryId: sub-category code (e.g. "40001005"); several codes
            may be joined with commas, as the documented URLs above show.
        :param pageSize: number of postings returned on the single page.
        :return: response body as a UTF-8 string.
        """
        # 1. Build the query parameters (mirrors the documented Query URL).
        r_params = {
            "timestamp": timestamp,
            "categoryId": categoryId,
            "pageIndex": 1,
            "pageSize": pageSize,
            "language": "zh-cn",
            "area": "cn"
        }
        # 2. Send the request; timeout so a stalled connection cannot hang forever.
        response = requests.get(self.url, headers=self.header, params=r_params, timeout=10)
        return response.content.decode('utf-8')

    def parse_list_data(self, data):
        """Parse the listing JSON, enrich each posting with detail data,
        and dump all records to the output JSON file.

        :param data: raw JSON text returned by :meth:`get_response`.
        """
        # 1. json text -> Python object
        post = json.loads(data)
        # 2. Walk every posting; "Posts" is None (not []) when the query
        #    matches nothing, so guard against it.
        inf = []  # accumulated job records
        for p in post["Data"]["Posts"] or []:
            # One fresh record per posting.
            model = {
                '岗位名称': p['RecruitPostName'],
                '岗位发布时间': p['LastUpdateTime'],
                '工作地点': p['LocationName'],
                "岗位性质": p['CategoryName'],
                '岗位工作内容': p['Responsibility'],
            }
            # Detail page adds the job requirements into `model` in place.
            self.parse_detail_data(p['PostId'], model)
            inf.append(model)
        # `with` closes the file automatically — no explicit close() needed.
        with open("./json解析腾讯招聘/json解析腾讯招聘5.json", 'w', encoding='utf-8') as f:
            f.write(json.dumps(inf, ensure_ascii=False, indent=4))

    def parse_detail_data(self, post_id, model):
        """Fetch the detail page for *post_id* and store its requirements
        into *model* (mutated in place), then print the full record.
        """
        # Detail-page API endpoint.
        post_detail_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId'
        # language=zh-cn matches the documented ByPostId URL above.
        detail_params = {"postId": post_id, "language": "zh-cn"}
        data = json.loads(
            requests.get(post_detail_url, headers=self.header,
                         params=detail_params, timeout=10).content.decode('utf-8')
        )
        # 6. Store the job requirements.
        model['岗位要求'] = data["Data"]["Requirement"]
        print(model)

    def run(self, time_stamp, categoryId):
        """Entry point: fetch and parse one listing page for *categoryId*."""
        # NOTE: pageSize is postings per page, not page count — this fetches
        # the first 28 postings in a single request (original comment said
        # "28 pages", which was wrong).
        list_data = self.get_response(time_stamp, categoryId, pageSize=28)
        self.parse_list_data(list_data)

def get_categoryId():
    """Load the job-category mapping from ./json解析腾讯招聘/岗位性质.json.

    The file maps category groups (技术类, 设计类, ...) to their numeric
    codes and sub-category codes, e.g.
    技术类 -> 技术研发类, 质量管理类, 技术运营类, 安全技术类, AI、算法与大数据, 企管类

    :return: the parsed JSON document as a Python dict.
    :raises FileNotFoundError: if the mapping file does not exist.
    """
    # `with` closes the file automatically — the original's explicit
    # f.close() inside the with-block was redundant.  json.load reads
    # straight from the file object, no intermediate string needed.
    with open('./json解析腾讯招聘/岗位性质.json', 'r', encoding='utf-8') as f:
        return json.load(f)


if __name__ == '__main__':
    # Resolve the sub-category code for "AI、算法与大数据" inside the
    # technical (40001) group of the mapping file.
    category_map = get_categoryId()
    ai_category_id = category_map['category'][1]['子分类']['AI、算法与大数据']
    # The careers API expects a 13-digit millisecond timestamp string.
    millis = str(int(time.time() * 1000))
    spider = TXJob()
    spider.run(millis, ai_category_id)
    print('\n成功将输出内容存入:json解析腾讯招聘5.json 中!')

任务2:以https://dashboard.juhe.cn/data/index/my 聚合数据网独自挑选数据接口练习json请求数据与数据解析。提交py文件到第三次作业文件夹中 

import requests
import json

class WorldCup:
    """Client for the juhe.cn 2022 World Cup schedule API.

    Fetches the schedule JSON, prints each record, and writes the result
    to ./世界杯/世界杯数据.json as a list of
    [reason, day-records..., error_code] dicts.
    """

    def __init__(self):
        # 1. API endpoint.
        self.url = 'https://apis.juhe.cn/fapigw/worldcup2022/schedule'
        # 2. Request header.  NOTE(review): this User-Agent value is actually
        #    a Content-Type string ("application/x-www-form-urlencoded"), not
        #    a browser UA — the API accepts it, so it is kept for identical
        #    behavior, but it looks like a copy/paste mistake.
        self.header = {"User-Agent": "application/x-www-form-urlencoded"}

    def get_response(self):
        """Request the schedule endpoint and return the decoded JSON text."""
        # 1. Query parameters: the juhe.cn API key.
        params = {"key": "f39918c67cc9fd5e44f6d39248479763"}
        # 2. Send the request; timeout so a stall cannot hang the script.
        response = requests.get(self.url, headers=self.header, params=params, timeout=10)
        return response.content.decode('utf-8')

    @staticmethod
    def parser(data):
        """Parse the schedule JSON and dump it to 世界杯数据.json.

        :param data: raw JSON text from :meth:`get_response`.

        Output layout: [{'reason': ...}, one dict per schedule day,
        {'error_code': ...}].
        """
        # 1. json text -> Python object
        python_obj = json.loads(data)
        information = []
        # 2. Status fields bracket the day records.
        reason_model = {'reason': python_obj['reason']}
        error_model = {'error_code': python_obj['error_code']}
        information.append(reason_model)
        print(reason_model)
        for p in python_obj['result']["data"]:
            # BUG FIX: the original reused a single dict created outside the
            # loop, so every appended entry aliased the same object and the
            # output file repeated the *last* day's data for every day.
            # A fresh dict per iteration keeps each day's record distinct.
            day_model = {
                'schedule_date': p['schedule_date'],
                'schedule_date_format': p['schedule_date_format'],
                'schedule_week': p['schedule_week'],
                'schedule_current': p['schedule_current'],
                'schedule_list': p['schedule_list'],
            }
            information.append(day_model)
            print(day_model)
        information.append(error_model)
        print(error_model)
        # `with` closes the file automatically — no explicit close() needed.
        with open("./世界杯/世界杯数据.json", 'w', encoding='utf-8') as f:
            f.write(json.dumps(information, ensure_ascii=False, indent=4))

    def run(self):
        """Entry point: fetch the schedule and write the parsed JSON file."""
        data = self.get_response()
        self.parser(data)


if __name__ == '__main__':
    # Entry point: fetch the World Cup schedule and write ./世界杯/世界杯数据.json.
    WorldCup().run()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值