任务1:完成腾讯招聘案例。掌握请求地址的分析方法,掌握二级页面的解析方式。
岗位性质.json:
{
"category": [
{
"岗位性质": "设计类",
"岗位性质代码": "40002",
"子分类": {
"设计类": "40002001",
"游戏美术类": "40002002"
}
},
{
"岗位性质": "技术类",
"岗位性质代码": "40001",
"子分类": {
"技术研发类": "40001001",
"质量管理类": "40001002",
"技术运营类": "40001003",
"安全技术类": "40001004",
"AI、算法与大数据": "40001005",
"企管类": "40001006"
}
}
]
}
# https://careers.tencent.com/tencentcareer/api/post/Query?
# timestamp=1673052338226&
# parentCategoryId=40001&
# pageIndex=1&
# pageSize=10&
# language=zh-cn&
# area=cn
# 获取具体的岗位性质下的岗位
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1673053787347&categoryId=40001001,40001002&pageIndex=1&pageSize=10&language=zh-cn&area=cn
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1673052338226&CategoryId=40001001,40001002&pageIndex=1&pageSize=10&language=zh-cn&area=cn
# 岗位详情
# https://careers.tencent.com/tencentcareer/api/post/ByPostId?
# timestamp=1673056178640&
# postId=1575268332712501248&
# language=zh-cn
import requests
import json
import time
# 岗位性质.json:
# {
# "category": [
# {
# "岗位性质": "设计类",
# "岗位性质代码": "40002",
# "子分类": {
# "设计类": "40002001",
# "游戏美术类": "40002002"
# }
# },
# {
# "岗位性质": "技术类",
# "岗位性质代码": "40001",
# "子分类": {
# "技术研发类": "40001001",
# "质量管理类": "40001002",
# "技术运营类": "40001003",
# "安全技术类": "40001004",
# "AI、算法与大数据": "40001005",
# "企管类": "40001006"
# }
# }
# ]
# }
# https://careers.tencent.com/tencentcareer/api/post/Query?
# timestamp=1673052338226&
# parentCategoryId=40001&
# pageIndex=1&
# pageSize=10&
# language=zh-cn&
# area=cn
# 获取具体的岗位性质下的岗位
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1673053787347&categoryId=40001001,40001002&pageIndex=1&pageSize=10&language=zh-cn&area=cn
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1673052338226&CategoryId=40001001,40001002&pageIndex=1&pageSize=10&language=zh-cn&area=cn
# 岗位详情
# https://careers.tencent.com/tencentcareer/api/post/ByPostId?
# timestamp=1673056178640&
# postId=1575268332712501248&
# language=zh-cn
class TXJob:
    """Crawler for job postings on the Tencent careers site.

    Fetches the first page of the post list for a category, requests the
    detail endpoint once per posting to obtain its requirement text, prints
    each record, and dumps all records to a JSON file.
    """

    # Seconds to wait for the server; without a timeout requests.get can
    # block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # List (first-level) endpoint; query parameters are added per request.
        self.url = 'https://careers.tencent.com/tencentcareer/api/post/Query'
        # Browser-like User-Agent sent with every request.
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}

    def get_response(self, timestamp, categoryId, pageSize):
        """Request page 1 of the post list and return the body as text.

        Args:
            timestamp: Millisecond timestamp string sent as ``timestamp``.
            categoryId: Category code(s) sent as ``categoryId``.
            pageSize: Number of postings requested on the page.

        Returns:
            The response body decoded as UTF-8.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        r_params = {
            "timestamp": timestamp,
            "categoryId": categoryId,
            "pageIndex": 1,
            "pageSize": pageSize,
            "language": "zh-cn",
            "area": "cn",
        }
        response = requests.get(
            self.url,
            headers=self.header,
            params=r_params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.content.decode('utf-8')

    def parse_list_data(self, data, output_path="./json解析腾讯招聘/json解析腾讯招聘5.json"):
        """Parse the list response, enrich each posting, and save the result.

        Args:
            data: JSON text returned by ``get_response``.
            output_path: File the collected records are written to. Its
                parent directory is created when missing.

        Returns:
            The list of record dicts that was written to ``output_path``.
        """
        import os  # local import keeps this block self-contained

        payload = json.loads(data)
        # Guard against a missing or null "Data"/"Posts" so an empty result
        # produces an empty file instead of a TypeError.
        posts = (payload.get("Data") or {}).get("Posts") or []

        inf = []
        for p in posts:
            model = {
                '岗位名称': p['RecruitPostName'],
                '岗位发布时间': p['LastUpdateTime'],
                '工作地点': p['LocationName'],
                '岗位性质': p['CategoryName'],
                '岗位工作内容': p['Responsibility'],
            }
            # The requirement text is only available from the detail endpoint.
            self.parse_detail_data(p['PostId'], model)
            inf.append(model)

        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(inf, ensure_ascii=False, indent=4))
        return inf

    def parse_detail_data(self, post_id, model):
        """Fetch one posting's detail page and add its requirement to ``model``.

        Args:
            post_id: Value of the posting's ``PostId`` field.
            model: Record dict to update in place with the ``岗位要求`` key.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        post_detail_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId'
        # Send the same parameter set as the detail URL captured in the
        # header notes of this file (timestamp, postId, language).
        detail_params = {
            "timestamp": str(int(time.time() * 1000)),
            "postId": post_id,
            "language": "zh-cn",
        }
        response = requests.get(
            post_detail_url,
            headers=self.header,
            params=detail_params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        data = json.loads(response.content.decode('utf-8'))
        model['岗位要求'] = data["Data"]["Requirement"]
        print(model)

    def run(self, time_stamp, categoryId, pageSize=28):
        """Run the crawl: fetch the list page, then parse and save it.

        Args:
            time_stamp: Millisecond timestamp string for the list request.
            categoryId: Category code(s) to query.
            pageSize: Number of postings to request on page 1 (this is a
                per-page record count, not a number of pages).
        """
        list_data = self.get_response(time_stamp, categoryId, pageSize=pageSize)
        self.parse_list_data(list_data)
def get_categoryId(path='./json解析腾讯招聘/岗位性质.json'):
    """Load the job-category mapping from a JSON file.

    The original notes list the categories as: technology -> R&D, quality
    management, technical operations, security, AI/algorithms/big data,
    enterprise management; design -> design, game art; product -> product,
    game product, project.

    Args:
        path: Location of the category JSON file. Defaults to the file
            shipped next to this script.

    Returns:
        The parsed JSON document. The caller in this file indexes it as
        ``data['category'][i]['子分类'][name]``.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)
if __name__ == '__main__':
    # Look up the sub-category code to crawl from the category mapping.
    categories = get_categoryId()
    category_code = categories['category'][1]['子分类']['AI、算法与大数据']
    # The API expects a 13-digit (millisecond) timestamp.
    now_ms = str(int(time.time() * 1000))
    crawler = TXJob()
    crawler.run(now_ms, category_code)
    print('\n成功将输出内容存入:json解析腾讯招聘5.json 中!')
任务2:在聚合数据网(https://dashboard.juhe.cn/data/index/my)自行挑选一个数据接口,练习以JSON方式请求数据并解析数据。将py文件提交到“第三次作业”文件夹中。
import requests
import json
class WorldCup:
    """Crawler for the Juhe 2022 World Cup schedule API.

    Requests the schedule endpoint, prints the parsed entries, and writes
    them to a JSON file.
    """

    # Seconds to wait for the server; without a timeout requests.get can
    # block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    # Fields copied from every item of result.data into the output.
    SCHEDULE_FIELDS = (
        'schedule_date',
        'schedule_date_format',
        'schedule_week',
        'schedule_current',
        'schedule_list',
    )

    def __init__(self, key="f39918c67cc9fd5e44f6d39248479763"):
        """Set up the endpoint, headers and API key.

        Args:
            key: Juhe API key sent as the ``key`` query parameter.
        """
        self.url = 'https://apis.juhe.cn/fapigw/worldcup2022/schedule'
        # The value is a MIME type, so it belongs in Content-Type rather than
        # User-Agent.
        self.header = {"Content-Type": "application/x-www-form-urlencoded"}
        # NOTE(review): the default key is a credential committed to source;
        # prefer passing it in from configuration or the environment.
        self.key = key

    def get_response(self):
        """Request the schedule and return the response body as text.

        Returns:
            The response body decoded as UTF-8.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        params = {"key": self.key}
        response = requests.get(
            self.url,
            headers=self.header,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.content.decode('utf-8')

    @staticmethod
    def parser(data, output_path="./世界杯/世界杯数据.json"):
        """Parse the schedule response and save it as a JSON list.

        The output list is laid out as: one ``{"reason": ...}`` entry, one
        entry per schedule day, then one ``{"error_code": ...}`` entry.

        Args:
            data: JSON text returned by ``get_response``.
            output_path: File the entries are written to. Its parent
                directory is created when missing.

        Returns:
            The list of entries that was written to ``output_path``.
        """
        import os  # local import keeps this block self-contained

        payload = json.loads(data)

        reason_entry = {'reason': payload['reason']}
        error_entry = {'error_code': payload['error_code']}

        information = [reason_entry]
        print(reason_entry)

        # "result" may be missing or null (e.g. on an API error); treat that
        # as an empty schedule instead of raising TypeError.
        days = (payload.get('result') or {}).get('data') or []
        for day in days:
            # Build a fresh dict per day: reusing one dict would make every
            # list element alias the same object holding the last day's data.
            entry = {field: day[field] for field in WorldCup.SCHEDULE_FIELDS}
            information.append(entry)
            print(entry)

        information.append(error_entry)
        print(error_entry)

        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(information, ensure_ascii=False, indent=4))
        return information

    def run(self):
        """Run the crawl: fetch the schedule, then parse and save it."""
        data = self.get_response()
        self.parser(data)
if __name__ == '__main__':
    # Build the crawler, then fetch, parse and save the schedule.
    crawler = WorldCup()
    crawler.run()