import requests
import time
import hashlib
import json
import pandas as pd
from urllib.parse import quote
from typing import List, Union, Dict, Any
class BilibiliCrawler:
    """Crawl Bilibili video search results for keywords and export them to CSV.

    For each keyword the crawler pages through the signed search endpoint,
    requests the detail endpoint for every returned ``bvid`` and appends one
    record per video to ``self.video_data``.
    """

    SEARCH_URL = "https://api.bilibili.com/x/web-interface/wbi/search/type"
    NAV_URL = "https://api.bilibili.com/x/web-interface/nav"
    # Key used for signing when the nav endpoint cannot supply one.
    FALLBACK_MIXIN_KEY = "ea1db124af3c7062474693fa704f4ff8"
    # Kept constant across pages: changing the size between pages shifts the
    # page window and makes later pages repeat earlier items.
    PAGE_SIZE = 20
    # Consecutive failed attempts tolerated for one search page.
    MAX_PAGE_RETRIES = 3

    def __init__(self, keywords: Union[str, List[str], None] = None, max_results: int = 30):
        """Set up the crawler.

        Args:
            keywords: A single keyword or a list of keywords. Defaults to
                ["高等数学", "概率论", "Python"] when omitted.
            max_results: Maximum number of videos collected per keyword.
        """
        if keywords is None:
            self.keywords = ["高等数学", "概率论", "Python"]
        elif isinstance(keywords, str):
            self.keywords = [keywords]
        else:
            # Copy so later mutation of the caller's list cannot alter the crawl.
            self.keywords = list(keywords)
        self.max_results = max_results
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
            'Referer': 'https://www.bilibili.com/',
            'Origin': 'https://www.bilibili.com',
            'Cookie': 'your_cookie_here'  # replace with a real cookie
        }
        self.video_data = []
        self.mixin_key = None  # cached mixin key
        self.request_count = 0  # number of HTTP requests issued so far

    def _get_with_412_retry(self, url: str, *, timeout: int, retry_delay: float,
                            retry_message: str, params: Union[Dict[str, Any], None] = None):
        """GET ``url`` and retry once after ``retry_delay`` seconds on HTTP 412.

        Every attempt is counted in ``self.request_count``.

        Raises:
            requests.exceptions.RequestException: On transport errors or when
                the final response has an error status.
        """
        response = self.session.get(url, params=params, headers=self.headers, timeout=timeout)
        self.request_count += 1
        if response.status_code == 412:
            print(retry_message)
            time.sleep(retry_delay)
            response = self.session.get(url, params=params, headers=self.headers, timeout=timeout)
            self.request_count += 1
        response.raise_for_status()
        return response

    def get_mixin_key(self) -> str:
        """Return the mixin key used for signing, fetching and caching it on first use.

        The key is built from the file-name stems of ``wbi_img.img_url`` and
        ``wbi_img.sub_url`` in the nav response. On any failure, or when the
        response yields an empty key, the hard-coded fallback key is returned
        and nothing is cached, so the next call tries the nav endpoint again.
        """
        if self.mixin_key:
            return self.mixin_key
        try:
            nav_res = self.session.get(self.NAV_URL, headers=self.headers, timeout=10)
            nav_res.raise_for_status()
            wbi_img = (nav_res.json().get('data') or {}).get('wbi_img') or {}
            img_key = wbi_img.get('img_url', '').split('/')[-1].split('.')[0]
            sub_key = wbi_img.get('sub_url', '').split('/')[-1].split('.')[0]
            # NOTE(review): public descriptions of the WBI scheme reorder the
            # characters of img_key + sub_key through a fixed index table before
            # truncating, and also sign a `wts` timestamp. This code only
            # concatenates and truncates - confirm against the current API.
            mixin_key = (img_key + sub_key)[:32]
            if not mixin_key:
                # An empty key would produce a meaningless signature.
                raise ValueError("nav response contains no wbi_img keys")
            self.mixin_key = mixin_key
            return mixin_key
        except Exception as e:  # deliberate best-effort: any failure uses the fallback
            print(f"获取混合密钥失败: {str(e)}")
            return self.FALLBACK_MIXIN_KEY

    def get_wbi_sign(self, params: Dict[str, Any]) -> str:
        """Return the ``w_rid`` signature for ``params``.

        The parameters are sorted by key, joined as a URL-quoted query string,
        suffixed with the mixin key and hashed with MD5. ``params`` itself is
        not modified.
        """
        mixin_key = self.get_mixin_key()
        query = '&'.join(f'{k}={quote(str(v))}' for k, v in sorted(params.items()))
        return hashlib.md5((query + mixin_key).encode()).hexdigest()

    def search_videos(self):
        """Crawl every configured keyword and fill ``self.video_data``."""
        total_target = self.max_results * len(self.keywords)
        total_videos = 0
        for keyword in self.keywords:
            print(f"\n开始爬取关键词: {keyword}")
            keyword_count = self._crawl_keyword(keyword, total_videos, total_target)
            total_videos += keyword_count
            print(f"关键词 '{keyword}' 爬取完成, 共获取 {keyword_count} 条数据")

    def _crawl_keyword(self, keyword: str, total_before: int, total_target: int) -> int:
        """Collect up to ``self.max_results`` videos for one keyword.

        Args:
            keyword: Search term.
            total_before: Videos already collected for earlier keywords
                (used only for the progress message).
            total_target: Overall target count (used only for the progress message).

        Returns:
            Number of videos appended to ``self.video_data`` for this keyword.
        """
        keyword_count = 0
        page = 1
        consecutive_failures = 0
        seen_bvids = set()  # a video is recorded at most once per keyword
        while keyword_count < self.max_results:
            params = {
                'search_type': 'video',
                'keyword': keyword,
                'page': page,
                'page_size': self.PAGE_SIZE,
            }
            params['w_rid'] = self.get_wbi_sign(params)
            try:
                response = self._get_with_412_retry(
                    self.SEARCH_URL,
                    params=params,
                    timeout=15,
                    retry_delay=3,
                    retry_message=f"第{page}页遇到412错误,正在重试...",
                )
                data = response.json()
                consecutive_failures = 0
                if data.get('code') != 0:
                    error_msg = data.get('message', '未知错误')
                    print(f"API返回错误: {error_msg} (代码: {data.get('code')})")
                    break
                results = (data.get('data') or {}).get('result')
                if not results:
                    print(f"关键词 '{keyword}' 第{page}页无结果")
                    break
                for video in results:
                    if keyword_count >= self.max_results:
                        break
                    bvid = video.get('bvid')
                    if not bvid or bvid in seen_bvids:
                        continue
                    seen_bvids.add(bvid)
                    video_info = self.get_video_details(bvid, keyword)
                    if video_info:
                        self.video_data.append(video_info)
                        keyword_count += 1
                        print(
                            f"已获取 {keyword_count}/{self.max_results} 条数据 | 关键词: {keyword} | 总进度: {total_before + keyword_count}/{total_target}")
                    # Throttle: a longer pause on every 5th request.
                    if self.request_count % 5 == 0:
                        time.sleep(2)
                    else:
                        time.sleep(0.5)
                page += 1
                time.sleep(1.5)  # delay between pages
            # JSON errors are listed first: newer requests versions raise a JSON
            # error that is also a RequestException, which would otherwise
            # shadow this branch.
            except json.JSONDecodeError as e:
                print(f"JSON解析失败({keyword}): {str(e)}")
                print(f"响应内容: {response.text[:200]}...")
                consecutive_failures += 1
            except requests.exceptions.RequestException as e:
                print(f"请求失败({keyword}): {str(e)}")
                consecutive_failures += 1
            except Exception as e:
                print(f"未知错误({keyword}): {str(e)}")
                break
            if consecutive_failures:
                # Bounded retry: the same page is retried, but never forever.
                if consecutive_failures >= self.MAX_PAGE_RETRIES:
                    print(f"关键词 '{keyword}' 第{page}页连续失败 {consecutive_failures} 次, 停止该关键词")
                    break
                time.sleep(5)  # back off before retrying the page
        return keyword_count

    def get_video_details(self, bvid: str, keyword: str) -> Union[Dict[str, Any], None]:
        """Fetch one video's title, owner and statistics.

        Args:
            bvid: Video identifier taken from a search result.
            keyword: Keyword the video was found under; stored in the record.

        Returns:
            A record dict, or None when the request fails, the API reports an
            error code, or the response carries no data.
        """
        detail_url = f"https://api.bilibili.com/x/web-interface/view?bvid={bvid}"
        try:
            response = self._get_with_412_retry(
                detail_url,
                timeout=10,
                retry_delay=2,
                retry_message=f"视频{bvid}详情页412错误,重试中...",
            )
            data = response.json()
            if data.get('code') != 0:
                error_msg = data.get('message', '未知错误')
                print(f"视频详情API错误({bvid}): {error_msg}")
                return None
            video_data = data.get('data', {})
            if not video_data:
                print(f"视频{bvid}详情数据为空")
                return None
            stat = video_data.get('stat', {})
            return {
                'bvid': bvid,
                '关键词': keyword,
                '标题': video_data.get('title', ''),
                '博主': video_data.get('owner', {}).get('name', ''),
                '播放量': stat.get('view', 0),
                '点赞量': stat.get('like', 0),
                '收藏量': stat.get('favorite', 0),
                '弹幕量': stat.get('danmaku', 0),
                '分享量': stat.get('share', 0),
                '硬币数': stat.get('coin', 0),
                '视频时长': video_data.get('duration', 0),
                # pubdate is formatted with time.localtime, i.e. in local time.
                '发布时间': time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(video_data.get('pubdate', 0)))
            }
        # JSON errors first, for the same reason as in the search loop.
        except json.JSONDecodeError as e:
            print(f"获取视频详情失败({bvid}): JSON解析错误 - {str(e)}")
            return None
        except requests.exceptions.RequestException as e:
            print(f"获取视频详情失败({bvid}): 请求错误 - {str(e)}")
            return None
        except Exception as e:
            print(f"获取视频详情失败({bvid}): 未知错误 - {str(e)}")
            return None

    def save_to_csv(self, filename='bilibili_videos.csv'):
        """Write the collected records to ``filename`` as CSV.

        Does nothing except print a message when no records were collected.
        The file is written with the ``utf_8_sig`` encoding and without the
        DataFrame index.
        """
        if not self.video_data:
            print("没有数据可保存")
            return
        df = pd.DataFrame(self.video_data)
        # Preferred column order; only columns actually present are kept.
        columns_order = [
            'bvid', '关键词', '标题', '博主', '播放量',
            '点赞量', '收藏量', '弹幕量', '分享量', '硬币数',
            '视频时长', '发布时间'
        ]
        df = df.reindex(columns=[col for col in columns_order if col in df.columns])
        df.to_csv(filename, index=False, encoding='utf_8_sig')
        print(f"已保存 {len(self.video_data)} 条数据到 {filename}")
# Usage example
if __name__ == "__main__":
    # Search terms to crawl; at most 20 videos are collected for each one.
    search_terms = [
        "概率论",
        "Python数据分析",
        "随机过程",
        "高等数学",
        "Scipy概率分布",
    ]
    bilibili_crawler = BilibiliCrawler(max_results=20, keywords=search_terms)
    bilibili_crawler.search_videos()
    # Export everything that was collected to a single CSV file.
    bilibili_crawler.save_to_csv('bilibili_math_programming_videos.csv')
# 问题:在运行上述代码之后,应如何对爬取到的数据进行可视化?