Python爬虫某国内第一大视频网站:批量爬取视频信息(播放数量、点赞数量、收藏数量)及UP主的信息,爬取结果存储于MySQL数据库中,也可导出CSV,附带源码。

使用方法:在浏览器里提取自己的cookie,复制到main里的cookie变量,并设置aid范围;导出CSV是可选项。先执行 create database bilibili_data CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; 创建数据库。虽然B站视频已改为bvid,但仍保留了avid,并且可以通过avid抓取到bvid。请谨慎选择爬取范围;代码仅供学习使用,严禁用于非法用途。

import requests
import time
import random
import pymysql
from tqdm import tqdm
import json
import logging

# Configure module-wide logging: timestamped INFO-level messages
# (format: "time - LEVEL - message") emitted to stderr.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class BilibiliCrawler:
    """Crawl Bilibili video metadata by aid and persist it to MySQL.

    Workflow: for each aid, fetch the video's metadata from the public
    web-interface API (the response also carries the bvid), enrich it with
    the uploader's gender from the user-info API, and upsert one row per
    video into the ``bilibili_videos`` table. The table can optionally be
    exported to CSV afterwards.

    Fixes over the original version:
    - every HTTP request now carries a timeout so a stalled connection
      cannot hang the crawl indefinitely;
    - ``aid`` and ``up_mid`` columns are BIGINT — modern Bilibili aids and
      mids exceed the signed 32-bit INT range;
    - ``export_to_csv`` logs the real output filename (the original logged
      a literal placeholder).
    """

    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 10

    def __init__(self, cookie, mysql_config):
        """
        Initialize the crawler.

        :param cookie: raw Cookie header string copied from a logged-in
            browser session (required by some API endpoints)
        :param mysql_config: keyword arguments passed straight to
            :func:`pymysql.connect` (host, user, password, database, ...)
        """
        self.base_url = "https://api.bilibili.com/x/web-interface/view"
        # User profile endpoint.  NOTE(review): this endpoint has
        # historically been subject to anti-crawl changes (WBI signing);
        # failures are tolerated and fall back to "保密" below.
        self.user_info_url = "https://api.bilibili.com/x/space/acc/info"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Referer": "https://www.bilibili.com/",
            "Cookie": cookie
        }
        self.mysql_config = mysql_config
        # Lazily opened in connect_to_mysql(); closed in close_mysql().
        self.conn = None
        self.cursor = None

    def connect_to_mysql(self):
        """Open the MySQL connection and create the target table if absent.

        ``aid`` and ``up_mid`` are BIGINT because current Bilibili ids
        exceed 2**31 - 1.  CREATE TABLE IF NOT EXISTS does not alter a
        pre-existing table, so drop/migrate old INT-typed tables manually.

        :raises Exception: re-raises any connection/DDL failure after logging.
        """
        try:
            self.conn = pymysql.connect(**self.mysql_config)
            self.cursor = self.conn.cursor()
            self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS bilibili_videos (
                aid BIGINT PRIMARY KEY,
                bvid VARCHAR(20),
                title VARCHAR(255),
                up_mid BIGINT,
                up_name VARCHAR(100),
                up_sex VARCHAR(10),
                view_count INT,
                danmaku_count INT,
                reply_count INT,
                favorite_count INT,
                pubdate DATETIME,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            ) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci
            """)
            self.conn.commit()
            logger.info("成功连接到MySQL数据库并创建表")
        except Exception as e:
            logger.error(f"连接MySQL失败: {e}")
            raise

    def close_mysql(self):
        """Close the cursor and connection if they were opened."""
        if self.cursor:
            self.cursor.close()
        if self.conn:
            self.conn.close()
        logger.info("MySQL连接已关闭")

    def get_user_info(self, mid):
        """
        Fetch an uploader's gender from the user-info API.

        :param mid: the uploader's numeric mid
        :return: the gender string reported by the API, or "保密"
            ("private") on any failure — this method never raises, so one
            broken profile cannot abort the whole crawl.
        """
        try:
            params = {"mid": mid}
            # Timeout prevents a dead connection from hanging the crawl.
            response = requests.get(self.user_info_url, params=params,
                                    headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
            data = response.json()

            if data["code"] == 0 and "data" in data:
                return data["data"].get("sex", "保密")
            else:
                logger.warning(f"获取用户 mid={mid} 信息失败: {data.get('message', '未知错误')}")
                return "保密"
        except Exception as e:
            logger.error(f"请求用户 mid={mid} 信息时出错: {e}")
            return "保密"

    def get_video_info(self, aid):
        """
        Fetch one video's metadata and statistics.

        :param aid: numeric av-id of the video
        :return: dict with keys aid, bvid, title, up_mid, up_name, up_sex,
            view_count, danmaku_count, reply_count, favorite_count and
            pubdate (Unix timestamp), or None if the request/lookup failed.
        """
        try:
            params = {"aid": aid}
            response = requests.get(self.base_url, params=params,
                                    headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
            data = response.json()

            if data["code"] == 0 and "data" in data:
                video_data = data["data"]
                up_mid = video_data.get("owner", {}).get("mid", 0)

                # Secondary request: uploader gender (best-effort).
                up_sex = self.get_user_info(up_mid)

                stat = video_data.get("stat", {})

                # NOTE(review): "favorite" is the collection count, not the
                # like count; add stat.get("like") if likes are needed.
                return {
                    "aid": aid,
                    "bvid": video_data.get("bvid", ""),
                    "title": video_data.get("title", ""),
                    "up_mid": up_mid,
                    "up_name": video_data.get("owner", {}).get("name", ""),
                    "up_sex": up_sex,
                    "view_count": stat.get("view", 0),
                    "danmaku_count": stat.get("danmaku", 0),
                    "reply_count": stat.get("reply", 0),
                    "favorite_count": stat.get("favorite", 0),
                    "pubdate": video_data.get("pubdate", 0)
                }
            else:
                logger.warning(f"获取视频 aid={aid} 失败: {data.get('message', '未知错误')}")
                return None
        except Exception as e:
            logger.error(f"请求视频 aid={aid} 时出错: {e}")
            return None

    def save_to_mysql(self, video_info):
        """
        Upsert one video's info into ``bilibili_videos``.

        Uses INSERT ... ON DUPLICATE KEY UPDATE so re-crawling an aid
        refreshes the existing row instead of failing on the primary key.

        :param video_info: dict as produced by :meth:`get_video_info`
        """
        try:
            # Convert the Unix timestamp to a MySQL DATETIME string
            # (local timezone, matching the original behavior).
            pubdate = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(video_info["pubdate"]))

            sql = """
            INSERT INTO bilibili_videos (
                aid, bvid, title, up_mid, up_name, up_sex, 
                view_count, danmaku_count, reply_count, favorite_count, pubdate
            )
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
            bvid = VALUES(bvid),
            title = VALUES(title),
            up_mid = VALUES(up_mid),
            up_name = VALUES(up_name),
            up_sex = VALUES(up_sex),
            view_count = VALUES(view_count),
            danmaku_count = VALUES(danmaku_count),
            reply_count = VALUES(reply_count),
            favorite_count = VALUES(favorite_count),
            pubdate = VALUES(pubdate)
            """

            self.cursor.execute(sql, (
                video_info["aid"],
                video_info["bvid"],
                video_info["title"],
                video_info["up_mid"],
                video_info["up_name"],
                video_info["up_sex"],
                video_info["view_count"],
                video_info["danmaku_count"],
                video_info["reply_count"],
                video_info["favorite_count"],
                pubdate
            ))
            self.conn.commit()
        except Exception as e:
            # Roll back the failed statement so later inserts still work.
            self.conn.rollback()
            logger.error(f"保存视频 aid={video_info['aid']} 到MySQL失败: {e}")

    def batch_crawl(self, start_aid, end_aid, batch_size=100, delay_range=(1, 3)):
        """
        Crawl a contiguous inclusive range of aids and store each result.

        Sleeps a random delay after every ``batch_size`` videos to avoid
        hammering the API.  Handles Ctrl-C gracefully and always closes
        the database connection.

        :param start_aid: first aid (inclusive)
        :param end_aid: last aid (inclusive)
        :param batch_size: number of videos processed between pauses
        :param delay_range: (min, max) seconds for the random pause
        """
        self.connect_to_mysql()

        try:
            total_videos = end_aid - start_aid + 1
            success_count = 0
            fail_count = 0

            # tqdm progress bar over the whole range.
            with tqdm(total=total_videos, desc="爬取进度") as pbar:
                for current_aid in range(start_aid, end_aid + 1):
                    video_info = self.get_video_info(current_aid)

                    if video_info:
                        self.save_to_mysql(video_info)
                        success_count += 1
                    else:
                        fail_count += 1

                    pbar.update(1)

                    # Pause between batches (but not after the final aid).
                    if (current_aid - start_aid + 1) % batch_size == 0 and current_aid < end_aid:
                        delay = random.uniform(*delay_range)
                        logger.info(
                            f"已处理 {current_aid - start_aid + 1}/{total_videos} 个视频,休息 {delay:.2f} 秒...")
                        time.sleep(delay)

            logger.info(f"爬取完成! 成功: {success_count}, 失败: {fail_count}, 总计: {total_videos}")

        except KeyboardInterrupt:
            logger.warning("用户中断爬取过程")
        except Exception as e:
            logger.error(f"爬取过程中发生错误: {e}")
        finally:
            self.close_mysql()

    def export_to_csv(self, filename="bilibili_videos.csv"):
        """
        Dump the entire ``bilibili_videos`` table to a CSV file.

        Written with utf-8-sig encoding so Excel detects the encoding.

        :param filename: path of the CSV file to create/overwrite
        """
        try:
            self.connect_to_mysql()
            self.cursor.execute("SELECT * FROM bilibili_videos")
            results = self.cursor.fetchall()

            # Column names come from the cursor description.
            columns = [desc[0] for desc in self.cursor.description]

            import csv
            with open(filename, 'w', newline='', encoding='utf-8-sig') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(columns)   # header row
                writer.writerows(results)  # data rows

            # Bug fix: original logged a literal placeholder instead of
            # the actual filename.
            logger.info(f"成功导出数据到 {filename}")
        except Exception as e:
            logger.error(f"导出CSV失败: {e}")
        finally:
            self.close_mysql()


# Usage example
if __name__ == "__main__":
    # SECURITY: never hard-code real credentials in source files — the
    # original version of this script leaked a live Bilibili session
    # cookie and a real MySQL password.  Paste your own browser cookie
    # and database password below before running.
    cookie = "PASTE_YOUR_BILIBILI_COOKIE_HERE"

    mysql_config = {
        "host": "localhost",
        "user": "root",
        "password": "YOUR_MYSQL_PASSWORD",
        "database": "bilibili_data",
        "charset": "utf8mb4"
    }

    # Build the crawler with your credentials.
    crawler = BilibiliCrawler(cookie=cookie, mysql_config=mysql_config)

    # Batch-crawl aids 10,000,000–11,000,000 (one million videos!).
    # Shrink this range drastically for testing — the original comment
    # suggested a ~100-video range such as 170000–170100.
    crawler.batch_crawl(start_aid=10000000, end_aid=11000000, batch_size=20, delay_range=(2, 5))

    # Optional: export the collected data to a CSV file.
    # crawler.export_to_csv()

代码仅用作学习使用严禁用于非法用途

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值