
This article introduces the stats collector (StatsCollector) in the Scrapy framework. Through a concrete example, it shows how to use the stats collector to track crawler state, such as recording 404 pages and their count. It explains the main stats-collection functions and provides a complete Python spider example.

Section 354: Building a Search Engine with a Distributed Python Crawler, Scrapy In Depth: Stats Collection



Scrapy provides a convenient mechanism for collecting stats. The data is stored as key/value pairs, and the values are usually counters. This mechanism is called the Stats Collector and can be accessed through the stats attribute of the Crawler API.
The stats collector is always available, whether stats collection is enabled or disabled, so you can always import it into your own module and use its API (to increment values or set new stat keys). The goal is to keep stats collection simple: collecting a stat in your spider, in a Scrapy extension, or in any other code that uses the stats collector should never take more than one line of code.

Another feature of the stats collector is that it is very efficient when enabled, and extremely efficient (almost unnoticeable) when disabled.

The stats collector keeps one stats table per spider. The table is opened automatically when the spider is opened and closed automatically when the spider is closed.
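As a quick illustration, the sketch below shows how a custom extension might obtain the stats collector through the Crawler API and dump the whole stats table when the spider closes. The class name StatsLoggerExtension and the log message are illustrative only, not part of the original article; to enable such an extension you would register it under the EXTENSIONS setting.

from scrapy import signals

class StatsLoggerExtension(object):
    def __init__(self, stats):
        self.stats = stats                                   # the shared stats collector instance

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler.stats)                             # stats collector exposed by the Crawler API
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_closed(self, spider):
        # dump everything collected in this spider's stats table
        spider.logger.info('Collected stats: %s', self.stats.get_stats())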

Stats collection functions

stats.set_value('stat_name', value)    set a stat to the given value
stats.inc_value('stat_name')           increment a stat value by 1
stats.max_value('stat_name', value)    set the stat only if the new value is greater than the current one
stats.min_value('stat_name', value)    set the stat only if the new value is lower than the current one
stats.get_value('stat_name')           get a stat value
stats.get_stats()                      get all collected stats
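For reference, here is a minimal illustrative spider (the spider name, start URL, and stat keys such as pages_crawled are made up for this sketch) that exercises each of the calls above through self.crawler.stats:

import scrapy

class StatsDemoSpider(scrapy.Spider):
    name = 'stats_demo'
    start_urls = ['http://www.dict.cn/']

    def parse(self, response):
        stats = self.crawler.stats
        stats.set_value('spider_name', self.name)                   # set a value
        stats.inc_value('pages_crawled')                            # increment by 1
        stats.max_value('max_response_size', len(response.body))    # keep the largest value seen
        stats.min_value('min_status_seen', response.status)         # keep the smallest value seen
        print(stats.get_value('pages_crawled'))                     # read a single stat
        print(stats.get_stats())                                    # read the whole stats table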

# -*- coding: utf-8 -*-

import scrapy
from scrapy.http import Request, FormRequest


class PachSpider(scrapy.Spider):                # spider class, must inherit from scrapy.Spider
    name = 'pach'                               # spider name
    allowed_domains = ['www.dict.cn']           # allowed crawl domain

    def start_requests(self):                   # start-request function, replaces start_urls
        return [Request(
            url='http://www.dict.cn/9999998888',
            callback=self.parse
        )]

    # Use the stats collector to record every 404 URL and the number of 404 pages
    handle_httpstatus_list = [404]              # do not filter out 404 responses

    def __init__(self):
        self.fail_urls = []                     # list that stores the 404 URLs

    def parse(self, response):                  # callback function
        if response.status == 404:              # if the response status code is 404
            self.fail_urls.append(response.url)                   # append the URL to the list
            self.crawler.stats.inc_value('failed_url')            # increment the stat by 1 on every call
            print(self.fail_urls)                                 # print the list of 404 URLs
            print(self.crawler.stats.get_value('failed_url'))     # print the collected stat value
        else:
            title = response.css('title::text').extract()
            print(title)
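If you also want the collected 404 URLs to appear in the final stats dump (Scrapy logs all collected stats at the end of the crawl as long as the STATS_DUMP setting is left at its default of True), you could add a closed() method to the spider above. This is an optional extension of the example, not part of the original code:

    # Optional addition to PachSpider: copy the 404 URL list into the stats table on close
    def closed(self, reason):
        self.crawler.stats.set_value('failed_urls', ','.join(self.fail_urls))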