Code
import requests
import time
import logging
from random import randint

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Create a Session object so the underlying connection is reused
session = requests.Session()
session.headers.update(headers)
def generate_params(page_index):
    """
    Build the query parameters for a request.
    """
    return {
        "timestamp": int(time.time() * 1000),
        "countryId": None,        # None values are omitted from the query string by requests
        "cityId": None,
        "bgIds": None,
        "productId": None,
        "categoryId": "40002002,40001002,40001004",  # category IDs
        "parentCategoryId": None,
        "attrId": 1,              # assume attrId is 1
        "keyword": None,
        "pageIndex": page_index,  # current page index
        "pageSize": 10,           # records per page
        "language": "zh-cn",      # language setting
        "area": "cn"              # region setting
    }
def get_page_data(page_index, url):
    """
    Fetch the data for a single page.
    """
    params = generate_params(page_index)
    try:
        logging.info(f"Fetching page {page_index}...")
        response = session.get(url, params=params, timeout=10)
        response.raise_for_status()  # raises an exception for non-2xx status codes

        # Handle encoding: log the declared vs. detected encoding once, on the first page
        if page_index == 1:
            logging.info(f"Declared encoding: {response.encoding}")
            logging.info(f"Apparent encoding: {response.apparent_encoding}")
        response.encoding = response.apparent_encoding  # use the detected encoding

        json_data = response.json()
        if not json_data:
            logging.warning(f"Page {page_index} returned no usable data!")
        return json_data
    except requests.exceptions.RequestException as e:
        # e.response is None when no response was received at all (e.g. a timeout)
        status = e.response.status_code if e.response is not None else "no response"
        logging.error(f"Request for page {page_index} failed, status: {status}")
        logging.error(f"Exception: {e}")
        return None
def crawl_pages(url, total_pages=10):
    """
    Crawl multiple pages of data.
    """
    for page_index in range(1, total_pages + 1):
        json_data = get_page_data(page_index, url)
        if json_data:
            logging.info(f"Page {page_index} crawled successfully!")
            print(f"Data for page {page_index}:")
            print(json_data)
        else:
            logging.error(f"Failed to crawl page {page_index}, skipping it...")
        time.sleep(randint(3, 6))  # random delay to avoid triggering anti-scraping measures
# Base URL
url = "https://careers.tencent.com/tencentcareer/api/post/Query"

# Crawl 10 pages of data
crawl_pages(url)
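If you want to keep the scraped data instead of only printing it, here is a minimal sketch of an alternative driver function that collects each page's JSON and writes it to disk. The save_pages helper and the tencent_jobs.json filename are my additions for illustration, not part of the original script; it reuses get_page_data and the same polite delay.

import json

def save_pages(url, total_pages=10, outfile="tencent_jobs.json"):
    """Collect each page's JSON payload and dump everything into one file."""
    pages = []
    for page_index in range(1, total_pages + 1):
        json_data = get_page_data(page_index, url)
        if json_data:
            pages.append(json_data)
        time.sleep(randint(3, 6))  # same random delay between requests
    with open(outfile, "w", encoding="utf-8") as f:
        json.dump(pages, f, ensure_ascii=False, indent=2)
    logging.info(f"Saved {len(pages)} pages to {outfile}")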
Results