监测深度学习模型训练时CPU和GPU的使用情况

监测代码:

import pynvml
import torch
import psutil
import os

# Handle to the current Python process; sampled below for CPU % and RSS memory.
process = psutil.Process(os.getpid())  # PID of this Python interpreter

def print_resource_usage(phase, device=None):
    """Print CPU/RAM usage of the current process and, optionally, GPU memory stats.

    :param phase: Label for the current phase (e.g. "Before Computation",
        "After Computation").
    :param device: GPU device to report on; a ``torch.device`` or a string such
        as ``'cuda:0'``. When None, or when CUDA is unavailable, only CPU stats
        are printed and NVML is never initialized (so this also works on
        machines without an NVIDIA driver).
    :raises ValueError: if ``device`` is neither a ``torch.device`` nor a
        ``'cuda:N'``-style string (only checked when CUDA is available).

    Note: ``cpu_percent(interval=1)`` blocks for one second to take its sample.
    """
    # Sample CPU utilisation (blocking 1 s window) and resident memory of this process.
    cpu_usage = process.cpu_percent(interval=1)
    memory_usage = process.memory_info().rss / (1024 ** 2)  # bytes -> MB

    if device is None or not torch.cuda.is_available():
        # CPU-only path: do not touch NVML at all.
        print(f"[{phase}] CPU Usage: {cpu_usage:.2f}% | Memory Usage: {memory_usage:.2f} MB")
        return

    # Resolve the GPU index from either a torch.device or a 'cuda:N' string.
    # Done BEFORE nvmlInit so a bad argument cannot leak an NVML session.
    if isinstance(device, torch.device):
        gpu_index = device.index if device.index is not None else 0
    elif isinstance(device, str) and 'cuda' in device:
        gpu_index = int(device.split(':')[-1])  # e.g. 'cuda:0' -> 0
    else:
        raise ValueError("Invalid device format. Please provide a valid torch.device or string like 'cuda:0'.")

    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)

        # Device-wide memory as seen by the driver (all processes combined).
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        total_memory = mem_info.total / (1024 ** 2)  # MB
        used_memory = mem_info.used / (1024 ** 2)    # MB

        # Memory tracked by PyTorch's caching allocator for this process.
        gpu_memory_allocated = torch.cuda.memory_allocated(device) / (1024 ** 2)
        gpu_memory_reserved = torch.cuda.memory_reserved(device) / (1024 ** 2)

        # Find this process among the compute processes running on the GPU.
        # usedGpuMemory can be None on some drivers, so guard the division.
        process_gpu_memory = 0
        for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
            if proc.pid == os.getpid() and proc.usedGpuMemory is not None:
                process_gpu_memory = proc.usedGpuMemory / (1024 ** 2)  # bytes -> MB

        print(f"[{phase}] CPU Usage: {cpu_usage:.2f}% | CPU Memory Usage: {memory_usage:.2f} MB | "
              f"GPU {device} | GPU Memory (NVML): {used_memory:.2f}/{total_memory:.2f} MB | "
              f"GPU Memory Allocated (PyTorch): {gpu_memory_allocated:.2f} MB | GPU Memory Reserved (PyTorch): {gpu_memory_reserved:.2f} MB | "
              f"Current Process GPU Memory Usage: {process_gpu_memory:.2f} MB")
    finally:
        # Always release NVML, even if a query above raises.
        pynvml.nvmlShutdown()

示例用法

# Example: monitor GPU device torch.device('cuda:5')
print_resource_usage("Before Computation", device=torch.device('cuda:5'))
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值