import requests
import time
import random
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
# ================ 核心配置 ================
KEYWORD = "石楠花" # search keyword sent to the API
MAX_PAGES = 1 # number of result pages to fetch (20 items per page); start with 1 when testing
# Output workbook name: embeds the keyword and the current date as YYYYMMDD.
OUTPUT_FILE = f"B站_{KEYWORD}_视频数据_{datetime.now().strftime('%Y%m%d')}.xlsx"
# Anti-scraping settings
# Pool of User-Agent strings; get_random_headers() picks one at random per call.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Linux; Android 12; SM-S908E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Mobile Safari/537.36"
]
# ================ 核心函数 ================
def get_random_headers():
    """Build the HTTP headers for one request.

    Returns:
        A new dict holding a User-Agent drawn at random from USER_AGENTS,
        plus fixed Accept-Language and Referer values.
    """
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    headers["Accept-Language"] = "zh-CN,zh;q=0.9,en;q=0.8"
    headers["Referer"] = "https://search.bilibili.com/"
    return headers
def convert_bilibili_number(num_str):
    """Convert a Bilibili-style count into an int.

    Accepts an int (returned unchanged) or any value whose string form is a
    plain number, optionally with thousands separators (",") and optionally
    suffixed with "万" (x10,000) or "亿" (x100,000,000).

    Args:
        num_str: The raw count, e.g. 1234, "1,234", "1.2万" or "3亿".

    Returns:
        The count as an int, or 0 when the value cannot be parsed
        (placeholders such as "--", None, empty strings, ...).
    """
    if isinstance(num_str, int):
        return num_str

    text = str(num_str).replace(',', '').strip()

    # Suffixes are checked in the same order as before: "万" first, then "亿".
    for suffix, multiplier in (('万', 10000), ('亿', 100000000)):
        if suffix in text:
            try:
                # round() rather than int(): the float product can land just
                # below the intended integer, and truncating would then be
                # off by one.
                return round(float(text.replace(suffix, '')) * multiplier)
            except (ValueError, OverflowError):
                # Malformed value such as a bare "万": report 0 like the
                # plain-number path instead of raising.
                return 0

    try:
        return int(text)
    except ValueError:
        pass

    # Fall back to float parsing so inputs such as 1234.0 are not lost;
    # the fractional part is truncated.
    try:
        return int(float(text))
    except (ValueError, OverflowError):
        return 0
def get_bilibili_via_api(keyword, max_pages=3):
    """Fetch video search results for a keyword from the Bilibili search API.

    Pages 1..max_pages are requested one after another (20 items per page).
    A page that fails (network error, HTTP error status, invalid JSON,
    unexpected payload) or that returns a non-zero API code is reported on
    stdout and skipped; rows collected so far are kept.

    Args:
        keyword: Search keyword.
        max_pages: Number of result pages to request.

    Returns:
        A list of dicts, one per video, in the row layout produced by
        _parse_video(). Empty when nothing could be fetched.
    """
    url = "https://api.bilibili.com/x/web-interface/search/type"
    results = []
    for page in range(1, max_pages + 1):
        # Normal pacing between requests; replaced by a longer back-off below
        # when the page fails.
        delay = random.uniform(1.5, 3.0)
        try:
            response = requests.get(
                url,
                params={
                    "search_type": "video",
                    "keyword": keyword,
                    "page": page,
                    "page_size": 20,
                },
                headers=get_random_headers(),
                timeout=10,
            )
            # Surface HTTP errors as such instead of as a JSON decode failure.
            response.raise_for_status()
            data = response.json()
            if data.get('code') != 0:
                print(f"API返回错误: {data.get('message')}")
            else:
                # A page without a result list is treated as empty rather
                # than as a failure.
                videos = (data.get('data') or {}).get('result') or []
                for video in videos:
                    results.append(_parse_video(video))
                print(f"已获取第{page}页数据,共{len(videos)}条")
        except Exception as e:
            # Deliberate best-effort: one bad page must not abort the crawl.
            print(f"第{page}页获取失败: {str(e)}")
            delay = 5
        # Throttle every request, including API-level errors, but do not
        # wait after the final page.
        if page < max_pages:
            time.sleep(delay)
    return results


def _parse_video(video):
    """Map one raw search-result entry onto the exported row layout."""
    pubdate = video.get('pubdate')
    return {
        "平台": "B站",
        "视频ID": video.get('bvid', ''),
        "标题": video.get('title', ''),
        "播放量": convert_bilibili_number(video.get('play', 0)),
        "点赞量": convert_bilibili_number(video.get('like', 0)),
        "评论数": convert_bilibili_number(video.get('comment', 0)),
        "弹幕数": convert_bilibili_number(video.get('danmaku', 0)),
        "作者": video.get('author', ''),
        "时长": video.get('duration', ''),
        # A missing or zero timestamp is exported as an empty string.
        "发布时间": datetime.fromtimestamp(pubdate).strftime('%Y-%m-%d') if pubdate else '',
        "采集时间": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }
def save_to_excel(data, filename):
    """Write the collected rows to an Excel workbook.

    Args:
        data: Rows to export; passed directly to pandas.DataFrame.
        filename: Path of the workbook to write.

    Prints a notice and writes nothing when data is empty; otherwise prints
    the output path and the number of rows written.
    """
    if data:
        frame = pd.DataFrame(data)
        # index=False keeps the DataFrame's row index out of the sheet.
        frame.to_excel(filename, index=False)
        print(f"数据已保存至: {filename}")
        print(f"共{len(frame)}条记录")
    else:
        print("没有数据可保存")
# ================ 主执行程序 ================
if __name__ == "__main__":
    # Script entry point: fetch the search results, export them, report the
    # elapsed time, then wait for Enter so the console output stays visible.
    print(f"【B站视频数据采集开始】关键词: {KEYWORD}")
    started_at = time.time()
    try:
        rows = get_bilibili_via_api(KEYWORD, MAX_PAGES)
        if not rows:
            print("未获取到有效数据")
        else:
            save_to_excel(rows, OUTPUT_FILE)
    except Exception as exc:
        # Any failure in fetching or exporting is reported, not re-raised.
        print(f"程序异常终止: {exc}")
    elapsed = time.time() - started_at
    print(f"任务完成! 耗时: {elapsed:.2f}秒")
    input("按Enter键退出...")