"""Bilibili video metadata crawler.

Usage:
    1. Extract your own cookie from a logged-in browser session and paste it
       into the `cookie` variable in `__main__`.
    2. Create the database first:
           CREATE DATABASE bilibili_data CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
    3. Set the aid range to crawl; CSV export is optional.

Although Bilibili has switched to bvid as its public video ID, avid (aid) is
still supported, and a video's bvid can be obtained by querying with its avid.
Choose the crawl range carefully. This code is for learning purposes only;
use for any illegal purpose is strictly prohibited.
"""
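# Third-party dependencies (install with: pip install requests pymysql tqdm)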
import csv
import logging
import random
import time

import pymysql
import requests
from tqdm import tqdm
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class BilibiliCrawler:
def __init__(self, cookie, mysql_config):
"""
初始化爬虫
:param cookie: B站cookie字符串
:param mysql_config: MySQL配置字典
"""
self.base_url = "https://api.bilibili.com/x/web-interface/view"
        self.user_info_url = "https://api.bilibili.com/x/space/acc/info"  # user info API
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Referer": "https://www.bilibili.com/",
"Cookie": cookie
}
self.mysql_config = mysql_config
self.conn = None
self.cursor = None
def connect_to_mysql(self):
"""连接到MySQL数据库"""
try:
self.conn = pymysql.connect(**self.mysql_config)
self.cursor = self.conn.cursor()
            # Create the table if it does not exist (includes the added UP owner fields)
self.cursor.execute("""
CREATE TABLE IF NOT EXISTS bilibili_videos (
                    aid BIGINT PRIMARY KEY,
bvid VARCHAR(20),
title VARCHAR(255),
                    up_mid BIGINT,
up_name VARCHAR(100),
up_sex VARCHAR(10),
view_count INT,
danmaku_count INT,
reply_count INT,
favorite_count INT,
pubdate DATETIME,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci
""")
self.conn.commit()
logger.info("成功连接到MySQL数据库并创建表")
except Exception as e:
logger.error(f"连接MySQL失败: {e}")
raise
def close_mysql(self):
"""关闭MySQL连接"""
if self.cursor:
self.cursor.close()
if self.conn:
self.conn.close()
logger.info("MySQL连接已关闭")
def get_user_info(self, mid):
"""
获取用户信息
:param mid: 用户mid
:return: 用户性别或None(如果请求失败)
"""
try:
params = {"mid": mid}
            response = requests.get(self.user_info_url, params=params, headers=self.headers, timeout=10)
data = response.json()
if data["code"] == 0 and "data" in data:
return data["data"].get("sex", "保密")
else:
logger.warning(f"获取用户 mid={mid} 信息失败: {data.get('message', '未知错误')}")
return "保密"
except Exception as e:
logger.error(f"请求用户 mid={mid} 信息时出错: {e}")
return "保密"
def get_video_info(self, aid):
"""
获取单个视频的信息
:param aid: 视频的aid
:return: 视频信息字典或None(如果请求失败)
"""
try:
params = {"aid": aid}
            response = requests.get(self.base_url, params=params, headers=self.headers, timeout=10)
data = response.json()
if data["code"] == 0 and "data" in data:
video_data = data["data"]
up_mid = video_data.get("owner", {}).get("mid", 0)
                # Fetch the UP owner's sex
up_sex = self.get_user_info(up_mid)
                # Extract the statistics block
stat = video_data.get("stat", {})
return {
"aid": aid,
"bvid": video_data.get("bvid", ""),
"title": video_data.get("title", ""),
"up_mid": up_mid,
"up_name": video_data.get("owner", {}).get("name", ""),
"up_sex": up_sex,
"view_count": stat.get("view", 0),
"danmaku_count": stat.get("danmaku", 0),
"reply_count": stat.get("reply", 0),
"favorite_count": stat.get("favorite", 0),
"pubdate": video_data.get("pubdate", 0)
}
else:
logger.warning(f"获取视频 aid={aid} 失败: {data.get('message', '未知错误')}")
return None
except Exception as e:
logger.error(f"请求视频 aid={aid} 时出错: {e}")
return None
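    # A hedged convenience sketch, not in the original script: resolve an aid to its
    # bvid via the same view API, illustrating the avid -> bvid mapping mentioned in
    # the module docstring.
    def aid_to_bvid(self, aid):
        """Return the bvid for an aid, or None if the video cannot be fetched."""
        info = self.get_video_info(aid)
        return info["bvid"] if info else None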
def save_to_mysql(self, video_info):
"""
将视频信息保存到MySQL
:param video_info: 视频信息字典
"""
try:
            # Convert the Unix timestamp to MySQL DATETIME format
pubdate = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(video_info["pubdate"]))
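            # Upsert: ON DUPLICATE KEY UPDATE makes re-crawls idempotent, refreshing
            # the stats of any row whose aid already exists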
sql = """
INSERT INTO bilibili_videos (
aid, bvid, title, up_mid, up_name, up_sex,
view_count, danmaku_count, reply_count, favorite_count, pubdate
)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
bvid = VALUES(bvid),
title = VALUES(title),
up_mid = VALUES(up_mid),
up_name = VALUES(up_name),
up_sex = VALUES(up_sex),
view_count = VALUES(view_count),
danmaku_count = VALUES(danmaku_count),
reply_count = VALUES(reply_count),
favorite_count = VALUES(favorite_count),
pubdate = VALUES(pubdate)
"""
self.cursor.execute(sql, (
video_info["aid"],
video_info["bvid"],
video_info["title"],
video_info["up_mid"],
video_info["up_name"],
video_info["up_sex"],
video_info["view_count"],
video_info["danmaku_count"],
video_info["reply_count"],
video_info["favorite_count"],
pubdate
))
self.conn.commit()
except Exception as e:
self.conn.rollback()
logger.error(f"保存视频 aid={video_info['aid']} 到MySQL失败: {e}")
def batch_crawl(self, start_aid, end_aid, batch_size=100, delay_range=(1, 3)):
"""
批量爬取视频信息
:param start_aid: 起始aid
:param end_aid: 结束aid
:param batch_size: 每批处理的视频数量
:param delay_range: 每批之间的延迟范围(秒)
"""
self.connect_to_mysql()
try:
total_videos = end_aid - start_aid + 1
success_count = 0
fail_count = 0
            # Progress bar via tqdm
            with tqdm(total=total_videos, desc="Crawling") as pbar:
for current_aid in range(start_aid, end_aid + 1):
video_info = self.get_video_info(current_aid)
if video_info:
self.save_to_mysql(video_info)
success_count += 1
else:
fail_count += 1
pbar.update(1)
                    # Rest after every batch_size videos to avoid hammering the API
if (current_aid - start_aid + 1) % batch_size == 0 and current_aid < end_aid:
delay = random.uniform(*delay_range)
                        logger.info(
                            f"Processed {current_aid - start_aid + 1}/{total_videos} videos; sleeping {delay:.2f} s...")
time.sleep(delay)
logger.info(f"爬取完成! 成功: {success_count}, 失败: {fail_count}, 总计: {total_videos}")
except KeyboardInterrupt:
logger.warning("用户中断爬取过程")
except Exception as e:
logger.error(f"爬取过程中发生错误: {e}")
finally:
self.close_mysql()
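    # A hedged sketch, not part of the original script: retry an aid a few times with
    # exponential backoff before counting it as failed. `max_retries` and `base_delay`
    # are illustrative names; note it will also retry aids that genuinely do not exist.
    def get_video_info_with_retry(self, aid, max_retries=3, base_delay=2.0):
        """Call get_video_info up to max_retries times with exponential backoff."""
        for attempt in range(max_retries):
            result = self.get_video_info(aid)
            if result is not None:
                return result
            # Sleep base_delay * 2**attempt seconds plus jitter before retrying
            time.sleep(base_delay * (2 ** attempt) + random.uniform(0, 1))
        return None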
def export_to_csv(self, filename="bilibili_videos.csv"):
"""
将数据库中的数据导出为CSV文件
:param filename: 导出的文件名
"""
try:
self.connect_to_mysql()
self.cursor.execute("SELECT * FROM bilibili_videos")
results = self.cursor.fetchall()
            # Column names from the cursor description
columns = [desc[0] for desc in self.cursor.description]
            # Write the CSV (csv is imported at the top of the module)
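            # utf-8-sig writes a BOM so Excel detects UTF-8 and displays Chinese titles correctly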
with open(filename, 'w', newline='', encoding='utf-8-sig') as csvfile:
writer = csv.writer(csvfile)
                writer.writerow(columns)  # header row
                writer.writerows(results)  # data rows
logger.info(f"成功导出数据到 {filename}")
except Exception as e:
logger.error(f"导出CSV失败: {e}")
finally:
self.close_mysql()
# Usage example
if __name__ == "__main__":
    # Configuration: paste your own cookie here (copy it from your browser's
    # developer tools while logged in to bilibili.com)
    cookie = "your_cookie_string_here"
mysql_config = {
"host": "localhost",
"user": "root",
"password": "19218278482",
"database": "bilibili_data",
"charset": "utf8mb4"
}
    # Create the crawler instance
crawler = BilibiliCrawler(cookie=cookie, mysql_config=mysql_config)
    # Batch-crawl video info (e.g. aids 170000 through 170100; keep the range small)
    crawler.batch_crawl(start_aid=170000, end_aid=170100, batch_size=20, delay_range=(2, 5))
    # Optional: export the data to a CSV file
# crawler.export_to_csv()