import requests
import json
import time
import random
import os
import concurrent.futures
from datetime import datetime
from urllib.parse import urlencode
import hashlib
# Configuration
CONFIG = {
"user_agents": [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
],
"max_workers": 5, # 并发爬取的工作线程数
"timeout": 15, # 请求超时时间(秒)
"output_dir": "data/baidu_hotsearch/", # 数据存储目录
"cache_dir": "cache/", # 缓存目录
"categories": [
{"name": "实时热点", "tab": "realtime", "priority": 1},
{"name": "文娱榜", "tab": "entertainment", "priority": 2},
{"name": "小说榜", "tab": "novel", "priority": 3},
{"name": "电影榜", "tab": "movie", "priority": 4},
{"name": "游戏榜", "tab": "game", "priority": 5},
{"name": "汽车榜", "tab": "car", "priority": 6},
{"name": "电视剧榜", "tab": "teleplay", "priority": 7},
{"name": "纪录片榜", "tab": "documentary", "priority": 8},
]
}
# Create the output and cache directories
os.makedirs(CONFIG["output_dir"], exist_ok=True)
os.makedirs(CONFIG["cache_dir"], exist_ok=True)
def get_random_headers():
"""生成随机请求头"""
return {
'User-Agent': random.choice(CONFIG["user_agents"]),
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Connection': 'keep-alive',
'Referer': 'https://top.baidu.com/board',
'Pragma': 'no-cache',
'X-Requested-With': 'XMLHttpRequest'
}
def get_cache_key(category_tab):
"""生成缓存key"""
return hashlib.md5(f"baidu_hotsearch_{category_tab}".encode()).hexdigest()
def is_cache_valid(cache_path, max_age=1800):
"""检查缓存是否有效(30分钟内)"""
if not os.path.exists(cache_path):
return False
cache_mod_time = os.path.getmtime(cache_path)
return (time.time() - cache_mod_time) < max_age
def fetch_baidu_category(category, use_cache=True):
"""
爬取指定类别的百度热搜榜数据
:param category: 类别字典 {name:, tab:, priority:}
:param use_cache: 是否使用缓存
:return: 解析后的热搜列表
"""
cache_path = os.path.join(CONFIG["cache_dir"], f"{get_cache_key(category['tab'])}.json")
    # Check the cache first
if use_cache and is_cache_valid(cache_path):
try:
with open(cache_path, 'r', encoding='utf-8') as f:
return json.load(f)
except Exception:
            pass  # Cache read failed; fall through to a fresh fetch
    # Build the API URL
api_url = f"https://top.baidu.com/api/board?platform=wise&tab={category['tab']}"
try:
        # Random delay to avoid hammering the API
time.sleep(random.uniform(0.5, 1.5))
        # Send the API request
response = requests.get(
api_url,
headers=get_random_headers(),
timeout=CONFIG["timeout"]
)
        response.raise_for_status()  # Raise on non-2xx HTTP status codes
data = response.json()
        # Validate the structure of the API response
if not isinstance(data, dict) or not data.get('data') or not data['data'].get('cards'):
            raise ValueError("Unexpected API response structure")
hotsearch_list = []
        # Iterate over all card blocks
for card in data['data']['cards']:
            # Skip cards without content
if not card.get('content'):
continue
            # Process the hot-search items inside the card
for item in card['content']:
                # Handle the different item shapes
if item.get('word'):
                    # Standard hot-search entry
hot_item = {
'title': item['word'],
'heat': item.get('hotScore', '0'),
'hot_change': item.get('hotChange', ''),
'image_url': item.get('img', ''),
'label': item.get('label', ''),
'desc': item.get('desc', ''),
'category': category['name'],
'category_tab': category['tab'],
'source': '百度热搜'
}
                    # Build the item URL
if item.get('url'):
hot_item['url'] = item['url']
else:
query = item.get('query') or item.get('word')
if query:
encoded_query = urlencode({'wd': query})
hot_item['url'] = f"https://www.baidu.com/s?{encoded_query}"
hotsearch_list.append(hot_item)
elif item.get('query'):
                    # Query-style entry (no 'word' field)
hotsearch_list.append({
'title': item['query'],
'heat': item.get('hotScore', '0'),
'hot_change': item.get('hotChange', ''),
'image_url': item.get('img', ''),
'desc': item.get('desc', ''),
'category': category['name'],
'category_tab': category['tab'],
'source': '百度热搜',
'url': f"https://www.baidu.com/s?{urlencode({'wd': item['query']})}"
})
        # Sort by heat and assign ranks (guard against empty heat values)
        hotsearch_list.sort(key=lambda x: int(x['heat'] or 0), reverse=True)
for rank, item in enumerate(hotsearch_list, 1):
item['rank'] = rank
        # Refresh the cache
try:
with open(cache_path, 'w', encoding='utf-8') as f:
json.dump(hotsearch_list, f, ensure_ascii=False, indent=2)
except Exception as e:
print(f"缓存更新失败: {str(e)}")
return hotsearch_list
except Exception as e:
print(f"爬取分类'{category['name']}'失败: {str(e)}")
return []
def parallel_fetch_all_categories():
"""并行爬取所有分类的热搜数据"""
all_results = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=CONFIG['max_workers']) as executor:
        # Submit one fetch task per category
future_to_category = {
executor.submit(fetch_baidu_category, category): category['tab']
for category in CONFIG['categories']
}
        # Collect results as they complete
for future in concurrent.futures.as_completed(future_to_category):
category_tab = future_to_category[future]
try:
result = future.result()
if result:
                    # Record the category's display priority
priority = next((c['priority'] for c in CONFIG['categories'] if c['tab'] == category_tab), 99)
all_results[category_tab] = {
'priority': priority,
'category': category_tab,
'data': result
}
print(f"成功爬取分类 '{category_tab}': 获取 {len(result)} 条热搜")
except Exception as e:
print(f"分类 '{category_tab}' 爬取失败: {str(e)}")
return all_results
def save_results(all_results):
"""保存所有分类的热搜数据"""
# 1. 分别保存每个分类的数据
for tab, category_data in all_results.items():
filename = f"{tab}_hotsearch_{datetime.now().strftime('%Y%m%d_%H%M')}.json"
filepath = os.path.join(CONFIG["output_dir"], filename)
        # Attach metadata
result = {
'meta': {
'source': '百度热搜',
'category': next(c['name'] for c in CONFIG['categories'] if c['tab'] == tab),
'tab': tab,
'timestamp': datetime.now().isoformat(),
'count': len(category_data['data'])
},
'data': category_data['data']
}
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(result, f, ensure_ascii=False, indent=2)
print(f"已保存分类数据到 {filepath}")
# 2. 保存合并的热搜总榜(按热度排序)
all_hot_items = []
for tab, category_data in all_results.items():
all_hot_items.extend(category_data['data'])
    # Sort by heat, descending (guard against empty heat values)
    all_hot_items.sort(key=lambda x: int(x['heat'] or 0), reverse=True)
    # Reassign global ranks
for rank, item in enumerate(all_hot_items, 1):
item['global_rank'] = rank
    # Write the combined board
combined_filename = os.path.join(
CONFIG["output_dir"],
f"combined_hotsearch_{datetime.now().strftime('%Y%m%d_%H%M')}.json"
)
combined_result = {
'meta': {
'source': '百度热搜',
'timestamp': datetime.now().isoformat(),
'categories': [tab for tab in all_results.keys()],
'total_count': len(all_hot_items)
},
'data': all_hot_items
}
with open(combined_filename, 'w', encoding='utf-8') as f:
json.dump(combined_result, f, ensure_ascii=False, indent=2)
print(f"已保存总榜数据到 {combined_filename}")
def print_top_items(all_results, top_n=5):
"""打印各分类的热搜TOP N"""
print("\n百度热搜榜各分类TOP {}:".format(top_n))
print("{:<8} {:<5} {:<35} {:<10} {:<10}".format(
"分类", "排名", "标题", "热度", "变化趋势"
))
print("-" * 80)
    # Order categories by priority
sorted_results = sorted(all_results.items(), key=lambda x: x[1]['priority'])
for tab, category_data in sorted_results:
data = category_data['data'][:top_n]
category_name = next(c['name'] for c in CONFIG['categories'] if c['tab'] == tab)
print(f"[{category_name}]")
for item in data:
            title = (item['title'][:32] + "...") if len(item['title']) > 32 else item['title']
            heat = format(int(item['heat'] or 0), ",")  # thousands separator
change = item.get('hot_change', '') or item.get('hotChange', '')
print("{:<8} {:<5} {:<35} {:<10} {:<10}".format(
"",
item['rank'],
title,
heat,
change
))
print("")
def main():
"""主函数"""
print("开始爬取百度热搜多个分类数据...")
start_time = time.time()
    # Fetch all categories concurrently
all_results = parallel_fetch_all_categories()
if not all_results:
print("所有分类爬取失败")
return
    # Print the TOP lists
print_top_items(all_results)
    # Save the results
save_results(all_results)
elapsed = time.time() - start_time
print(f"爬取完成! 共处理 {len(all_results)} 个分类, 耗时: {elapsed:.2f}秒")
if __name__ == "__main__":
    main()

Save the data this code crawls as CSV.
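To address that request (CSV output instead of JSON), here is a minimal sketch. It reuses the all_results structure returned by parallel_fetch_all_categories() and the item fields built in fetch_baidu_category; the function name save_results_csv, the column list, and the utf-8-sig encoding are my own assumptions, not part of the original script.

import csv  # would normally sit with the other imports at the top of the script

def save_results_csv(all_results, output_dir=CONFIG["output_dir"]):
    """Sketch: write each category, plus a combined board, to CSV files."""
    timestamp = datetime.now().strftime('%Y%m%d_%H%M')
    # Columns follow the item dicts built in fetch_baidu_category;
    # keys missing from an item are written as empty strings.
    fieldnames = ['rank', 'title', 'heat', 'hot_change', 'label', 'desc',
                  'url', 'image_url', 'category', 'category_tab', 'source']
    all_items = []
    for tab, category_data in all_results.items():
        rows = category_data['data']
        all_items.extend(rows)
        filepath = os.path.join(output_dir, f"{tab}_hotsearch_{timestamp}.csv")
        # utf-8-sig writes a BOM so Excel displays the Chinese titles correctly
        with open(filepath, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames,
                                    extrasaction='ignore', restval='')
            writer.writeheader()
            writer.writerows(rows)
        print(f"Saved CSV to {filepath}")
    # Combined board across all categories, sorted by heat
    all_items.sort(key=lambda x: int(x['heat'] or 0), reverse=True)
    for rank, item in enumerate(all_items, 1):
        item['global_rank'] = rank
    combined_path = os.path.join(output_dir, f"combined_hotsearch_{timestamp}.csv")
    with open(combined_path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=['global_rank'] + fieldnames,
                                extrasaction='ignore', restval='')
        writer.writeheader()
        writer.writerows(all_items)
    print(f"Saved combined CSV to {combined_path}")

If this fits your needs, call save_results_csv(all_results) in main(), either alongside or instead of save_results(all_results).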