Runtime error: network buffer size and bandwidth issues

Runtime error:
Action.c(8): Error -26601: Decompression function (wgzMemDecompressBuffer) failed, return code=-5 (Z_BUF_ERROR), inSize=0, inUse=0, outUse=0

Analysis:
This error means a large response packet was not downloaded completely (or was otherwise corrupted), so decompressing the buffer failed.
The problem appeared in scripts that request full pages; the interface (API) scripts, which only return response messages, never hit it.

Solution 1:
1. Open the Run-Time Settings;
2. Internet Protocol -> Preferences -> Options -> General -> Network Buffer Size (note: my default Network Buffer Size was 12288; I increased it to 122880). A script-level alternative is sketched after the screenshot below.


[img]http://dl2.iteye.com/upload/attachment/0111/0228/41c0e84d-7020-3876-9afb-b6a3283707d9.png[/img]
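
If you would rather keep the change inside the script than in the GUI, LoadRunner's web_set_sockets_option also accepts a NETWORK_BUFFER_SIZE key that overrides this run-time setting for the current Vuser. A minimal sketch (the URL is a placeholder; 122880 simply mirrors the value chosen above):

Action()
{
    // Override the Network Buffer Size run-time setting for this Vuser;
    // the value is in bytes and mirrors the 122880 set in the GUI above.
    web_set_sockets_option("NETWORK_BUFFER_SIZE", "122880");

    // Placeholder request -- replace with the actual page under test.
    web_url("home",
        "URL=http://example.com/",
        "Resource=0",
        "Mode=HTML",
        LAST);

    return 0;
}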

Solution 2:
Single machine (it seems a single machine can only drive the bandwidth up to about 100 M).
[img]http://dl2.iteye.com/upload/attachment/0111/0238/35e23c82-5d08-3ee4-8b0e-9706e5d5e6e1.png[/img]

Analysis: after applying the first method, the bandwidth errors were gone, but there were still many checkpoint failures. After applying the second method as well, the checkpoint errors disappeared too.
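
For reference, the checkpoints mentioned here are the usual text checks registered before a request: when a page body arrives truncated, the expected text is missing and the check fails. A minimal sketch with placeholder text and URL:

Action()
{
    // Register a text checkpoint: the step fails if "Welcome" is not
    // found in the response -- which is exactly what happens when the
    // page body arrives truncated.
    web_reg_find("Text=Welcome", "Fail=NotFound", LAST);

    web_url("home",
        "URL=http://example.com/",
        LAST);

    return 0;
}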

In the Run-Time Settings, uncheck Download non-HTML resources (do not download resources); what this setting skips is sketched after the screenshot below.

[img]http://dl2.iteye.com/upload/attachment/0111/0779/f2abd7ac-f281-3e24-80a3-67e849378f9f.png[/img]
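
What this option skips are the requests recorded as page resources. In a recorded script they appear under the EXTRARES section of a step (images, scripts, stylesheets); with Download non-HTML resources unchecked, replay fetches only the HTML itself and ignores those items. A sketch with placeholder URLs:

Action()
{
    // With "Download non-HTML resources" unchecked, the items listed
    // under EXTRARES are not fetched at replay time.
    web_url("page",
        "URL=http://example.com/index.html",
        "Resource=0",
        "Mode=HTML",
        EXTRARES,
        "Url=/static/logo.png", ENDITEM,
        LAST);

    return 0;
}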

[img]http://dl2.iteye.com/upload/attachment/0111/0238/35e23c82-5d08-3ee4-8b0e-9706e5d5e6e1.png[/img] (Figure 1)

Run after applying the setting:

[img]http://dl2.iteye.com/upload/attachment/0111/0246/f923d735-621d-3978-a812-b686243f137a.png[/img] (Figure 2)


Question: the bytes/sec in Figure 1 and Figure 2 differ by only a factor of two, so why does Figure 2 show no bandwidth at all?