监测深度学习模型训练时CPU和GPU的使用情况

监测代码:

import pynvml
import torch
import psutil
import os

# Handle to the current Python process; sampled below for CPU % and RSS memory.
process = psutil.Process(os.getpid())  # PID of this Python interpreter

def print_resource_usage(phase, device=None):
    """Print CPU/RAM usage of the current process and, optionally, GPU memory stats.

    :param phase: Label for the current phase (e.g. "Before Computation",
        "After Computation").
    :param device: GPU device to report on; a ``torch.device`` or a string such
        as ``'cuda:0'``. When None, or when CUDA is unavailable, only CPU stats
        are printed and NVML is never initialized (so this also works on
        machines without an NVIDIA driver).
    :raises ValueError: if ``device`` is neither a ``torch.device`` nor a
        ``'cuda:N'``-style string (only checked when CUDA is available).

    Note: ``cpu_percent(interval=1)`` blocks for one second to take its sample.
    """
    # Sample CPU utilisation (blocking 1 s window) and resident memory of this process.
    cpu_usage = process.cpu_percent(interval=1)
    memory_usage = process.memory_info().rss / (1024 ** 2)  # bytes -> MB

    if device is None or not torch.cuda.is_available():
        # CPU-only path: do not touch NVML at all.
        print(f"[{phase}] CPU Usage: {cpu_usage:.2f}% | Memory Usage: {memory_usage:.2f} MB")
        return

    # Resolve the GPU index from either a torch.device or a 'cuda:N' string.
    # Done BEFORE nvmlInit so a bad argument cannot leak an NVML session.
    if isinstance(device, torch.device):
        gpu_index = device.index if device.index is not None else 0
    elif isinstance(device, str) and 'cuda' in device:
        gpu_index = int(device.split(':')[-1])  # e.g. 'cuda:0' -> 0
    else:
        raise ValueError("Invalid device format. Please provide a valid torch.device or string like 'cuda:0'.")

    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)

        # Device-wide memory as seen by the driver (all processes combined).
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        total_memory = mem_info.total / (1024 ** 2)  # MB
        used_memory = mem_info.used / (1024 ** 2)    # MB

        # Memory tracked by PyTorch's caching allocator for this process.
        gpu_memory_allocated = torch.cuda.memory_allocated(device) / (1024 ** 2)
        gpu_memory_reserved = torch.cuda.memory_reserved(device) / (1024 ** 2)

        # Find this process among the compute processes running on the GPU.
        # usedGpuMemory can be None on some drivers, so guard the division.
        process_gpu_memory = 0
        for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
            if proc.pid == os.getpid() and proc.usedGpuMemory is not None:
                process_gpu_memory = proc.usedGpuMemory / (1024 ** 2)  # bytes -> MB

        print(f"[{phase}] CPU Usage: {cpu_usage:.2f}% | CPU Memory Usage: {memory_usage:.2f} MB | "
              f"GPU {device} | GPU Memory (NVML): {used_memory:.2f}/{total_memory:.2f} MB | "
              f"GPU Memory Allocated (PyTorch): {gpu_memory_allocated:.2f} MB | GPU Memory Reserved (PyTorch): {gpu_memory_reserved:.2f} MB | "
              f"Current Process GPU Memory Usage: {process_gpu_memory:.2f} MB")
    finally:
        # Always release NVML, even if a query above raises.
        pynvml.nvmlShutdown()

示例用法

# Example: monitor GPU device torch.device('cuda:5')
print_resource_usage("Before Computation", device=torch.device('cuda:5'))
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值