# 任务1: 完成豆瓣电视案例的所有代码,并增加tv选项.掌握json解析数据的方式,熟练使用json并且能将最后获取的数据写入到json文件中
# 地址分析
# 电影的 标签 请求地址:
# https://movie.douban.com/j/search_tags?type=movie&source=index
# 电视剧的 标签 请求地址:
# https://movie.douban.com/j/search_tags?type=tv&source=index
# 请求参数:在get请求中,参数是直接拼接在地址后面的
# 原则上讲,?前面的才是地址,?后面的是请求这个地址需要携带的参数,多个参数通过 & 连接
# 但是,我们可以将带有请求参数的地址直接当成一个地址去发送请求
# 电影的请求地址
# https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=50&page_start=0
# 电视剧的请求地址
# https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%83%AD%E9%97%A8&page_limit=50&page_start=0
import json
import os

import requests
# Request headers carrying a desktop Chrome User-Agent string; passed as
# `headers=` to the requests.get calls that hit the Douban JSON endpoints.
HEADER = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
def get_tv_movie_tags():
    """Fetch the tag lists for movies and TV shows from Douban.

    Sends one GET request per category to the ``search_tags`` endpoint and
    reads the ``tags`` list out of each JSON response.

    Returns:
        dict: ``{"data": {"movie": info, "tv": info}}`` where ``info`` is the
        same dictionary object under both keys, with the entries ``type1``
        ("movie"), ``type2`` ("tv"), ``tags1`` (movie tags) and ``tags2``
        (TV tags).

    Raises:
        requests.RequestException: on a network failure, a timeout or a
            non-2xx HTTP status.
        KeyError: if a response has no ``tags`` entry.
    """
    # Tag endpoint; the category is selected through the `type` query parameter.
    request_url = "https://movie.douban.com/j/search_tags"

    tags = {}
    for kind in ('movie', 'tv'):
        response = requests.get(
            request_url,
            params={'type': kind, 'source': 'index'},
            headers=HEADER,
            timeout=10,  # never hang forever on a stalled connection
        )
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        tags[kind] = json.loads(response.content.decode('utf-8'))['tags']

    # Single dictionary holding the type names and tag lists of both categories.
    temp = {
        "type1": "movie",
        'type2': 'tv',
        "tags1": tags['movie'],
        "tags2": tags['tv'],
    }
    return {"data": {'movie': temp, 'tv': temp}}
class DB:
    """Crawler for one Douban subject listing (a category plus a tag).

    The crawler requests the subject list, downloads every cover image into
    ``IMAGE_DIR`` and collects ``{"title", "rate"}`` records in ``self.data``,
    which the ``run*`` methods dump to a JSON file inside ``JSON_DIR``.
    """

    IMAGE_DIR = './豆瓣'        # directory receiving the cover images
    JSON_DIR = './豆瓣json'     # directory receiving the JSON result files
    TIMEOUT = 10                # seconds allowed for each HTTP request

    # Characters that are not allowed in file names on common file systems;
    # each is replaced by "_" when a title is used as an image file name.
    _UNSAFE_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    def __init__(self, type, tag, nums=50):
        """Store the crawl settings.

        Args:
            type: category sent as the ``type`` query parameter
                (the script uses "movie" and "tv").
            tag: tag sent as the ``tag`` query parameter.
            nums: number of subjects requested (``page_limit``).
        """
        # `type` shadows the builtin, but the parameter name is kept so that
        # existing keyword callers continue to work.
        self.url = 'https://movie.douban.com/j/search_subjects'
        self.type = type
        self.tag = tag
        self.nums = nums
        # Collected records, filled by param_data().
        self.data = []

    def get_response(self):
        """Request the subject list and return the response body as text.

        Raises:
            requests.RequestException: on a network failure, a timeout or a
                non-2xx HTTP status.
        """
        get_params = {
            "type": self.type,
            "tag": self.tag,
            "page_limit": self.nums,
            "page_start": 0,
        }
        response = requests.get(
            self.url, params=get_params, headers=HEADER, timeout=self.TIMEOUT
        )
        response.raise_for_status()
        return response.content.decode('utf-8')

    def _save_cover(self, title, img_url):
        """Download the cover at *img_url* and store it as ``<title>.JPG``."""
        response = requests.get(img_url, headers=HEADER, timeout=self.TIMEOUT)
        # Do not write an HTML error page to disk under a .JPG name.
        response.raise_for_status()
        os.makedirs(self.IMAGE_DIR, exist_ok=True)
        # A raw title may contain "/" or ":" and would produce an invalid path.
        safe_title = title.translate(self._UNSAFE_CHARS)
        with open(f'{self.IMAGE_DIR}/{safe_title}.JPG', 'wb') as f:
            f.write(response.content)

    def param_data(self, data):
        """Parse the JSON text *data* and append one record per subject.

        For every entry of ``subjects`` the cover image is downloaded and a
        ``{"title": ..., "rate": ...}`` record is appended to ``self.data``.

        Raises:
            KeyError: if ``subjects`` or one of the expected fields is missing.
        """
        data_obj = json.loads(data)
        for d in data_obj['subjects']:
            record = {'title': d['title'], 'rate': d['rate']}
            self._save_cover(d['title'], d['cover'])
            self.data.append(record)

    def _run(self, filename):
        """Fetch, parse, print and dump the records to ``JSON_DIR/filename``."""
        self.param_data(self.get_response())
        print(self.data)
        os.makedirs(self.JSON_DIR, exist_ok=True)
        # The context manager guarantees the file is flushed and closed.
        with open(f'{self.JSON_DIR}/{filename}', 'w', encoding='utf-8') as f:
            json.dump(self.data, f, ensure_ascii=False)

    def run(self):
        """Run the crawler and write the result to ``豆瓣<type>.json``."""
        self._run(f'豆瓣{self.type}.json')

    def run1(self):
        """Run the crawler and write the result to ``豆瓣movie.json``."""
        self._run('豆瓣movie.json')

    def run2(self):
        """Run the crawler and write the result to ``豆瓣tv.json``."""
        self._run('豆瓣tv.json')
if __name__ == '__main__':
    # Fetch the tag lists of both categories first.
    tag_info = get_tv_movie_tags()['data']
    movie_info = tag_info['movie']
    tv_info = tag_info['tv']

    # Crawl the movie listing for the third movie tag.
    print('以下爬取豆瓣高分movie...\n')
    movie_spider = DB(movie_info['type1'], movie_info['tags1'][2])
    movie_spider.run1()

    # Crawl the TV listing for the third TV tag.
    print('爬取结束, 以下爬取豆瓣高分tv...\n')
    tv_spider = DB(tv_info['type2'], tv_info['tags2'][2])
    tv_spider.run2()

    print('\n爬取结束!')
# 任务2:json解析腾讯招聘:可以根据传入的参数,获取不同岗位性质的岗位,在列表页中,获取岗位名称,岗位发布时间,岗位工作地点,岗位性质, 岗位工作内容, 在详情页面中获取岗位要求,最终将得到的数据,写入到json文件中.
import requests # 导包
import json
import time
import os  # needed to create the output directory before writing files

HEADER = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
BASE_URL = "https://careers.tencent.com/tencentcareer/api/post"
OUTPUT_DIR = "./json解析腾讯招聘"
REQUEST_TIMEOUT = 10  # seconds allowed for each HTTP request


def _timestamp_ms():
    """Return the current time as a 13-digit millisecond timestamp string."""
    return str(int(time.time() * 1000))


def _fetch_and_save(endpoint, params, filename):
    """GET ``BASE_URL/endpoint``, save the raw body and return the parsed JSON.

    The response text is written unchanged to ``OUTPUT_DIR/filename`` and
    parsed directly from memory rather than being read back from disk.
    """
    response = requests.get(
        f"{BASE_URL}/{endpoint}",
        params=params,
        headers=HEADER,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    text = response.content.decode('utf-8')
    with open(f"{OUTPUT_DIR}/{filename}", 'w', encoding='utf-8') as f:
        f.write(text)
    return json.loads(text)


def _choose(prompt, options):
    """Prompt until the user enters a key of *options*; return its value."""
    while True:
        name = input(prompt).strip()
        if name in options:
            return options[name]
        # A mistyped name used to abort the whole script with a KeyError.
        print("输入无效,请重新输入.")


os.makedirs(OUTPUT_DIR, exist_ok=True)

# Step 1: list the job categories. The response has the form
# {"Code": 200, "Data": [{"CategoryId": "40001", "CategoryName": "技术类", ...}, ...]}.
load1 = _fetch_and_save(
    "ByCategories",
    {"timestamp": _timestamp_ms(), "language": "zh-cn"},
    "json解析腾讯招聘1.json",
)
print("工作类别有:")
categories = {}  # category name -> CategoryId
for data in load1["Data"]:
    categories[data["CategoryName"]] = data["CategoryId"]
    print(data["CategoryName"], end=' ')
category_id = _choose("\n请输入要查找的工作类别:\n", categories)

# Step 2: list the first page of posts of the chosen category. The empty
# filters are sent explicitly, matching the query string the site itself uses.
load2 = _fetch_and_save(
    "Query",
    {
        "timestamp": _timestamp_ms(),
        "countryId": "",
        "cityId": "",
        "bgIds": "",
        "productId": "",
        "categoryId": "",
        "parentCategoryId": category_id,
        "attrId": "",
        "keyword": "",
        "pageIndex": 1,
        "pageSize": 10,
        "language": "zh-cn",
        "area": "cn",
    },
    "json解析腾讯招聘2.json",
)
print("该工作类别的岗位名称有:")
posts = {}  # post name -> PostId (a later post with the same name wins)
for data in load2["Data"]["Posts"] or []:
    posts[data["RecruitPostName"]] = data["PostId"]
    print(data["RecruitPostName"])
if not posts:
    # Nothing to choose from; stop instead of prompting forever.
    raise SystemExit("该工作类别下没有岗位.")
post_id = _choose("请输入要查找的岗位名称:\n", posts)

# Step 3: fetch the detail page of the chosen post.
load3 = _fetch_and_save(
    "ByPostId",
    {"timestamp": _timestamp_ms(), "postId": post_id, "language": "zh-cn"},
    "json解析腾讯招聘3.json",
)
Information = load3['Data']
print("岗位名称:", Information["RecruitPostName"], "\n岗位发布时间:", Information["LastUpdateTime"], "\n岗位工作地点:", Information["LocationName"], "\n岗位性质:", Information["CategoryName"], "\n岗位工作内容:\n", Information["Responsibility"], "\n岗位要求:\n", Information["Requirement"])

# Step 4: keep only the requested fields and write them to the final JSON file.
wanted_fields = (
    'RecruitPostName',
    'LastUpdateTime',
    'LocationName',
    'CategoryName',
    'Responsibility',
    'Requirement',
)
temp = {field: Information[field] for field in wanted_fields}
with open(f"{OUTPUT_DIR}/json解析腾讯招聘4.json", 'w', encoding='utf-8') as f:
    json.dump(temp, f, ensure_ascii=False, indent=4)