Monitoring code:
import os

import psutil
import pynvml
import torch

process = psutil.Process(os.getpid())

def print_resource_usage(phase, device=None):
    """
    Print resource usage for the current phase: CPU usage, host memory, and GPU memory.

    :param phase: label for the current phase (e.g. "Before Computation", "After Computation")
    :param device: GPU device, either a torch.device object or a string such as 'cuda:0'.
    """
    cpu_usage = process.cpu_percent(interval=1)
    memory_info = process.memory_info()
    memory_usage = memory_info.rss / (1024 ** 2)  # resident set size in MB

    pynvml.nvmlInit()
    if device is not None and torch.cuda.is_available():
        # Resolve the GPU index from either a torch.device or a 'cuda:N' string.
        if isinstance(device, torch.device):
            gpu_index = device.index if device.index is not None else 0
        elif isinstance(device, str) and 'cuda' in device:
            gpu_index = int(device.split(':')[-1])
        else:
            raise ValueError("Invalid device format. Please provide a valid torch.device or a string like 'cuda:0'.")

        # Device-wide memory as reported by the NVIDIA driver.
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        total_memory = mem_info.total / (1024 ** 2)
        used_memory = mem_info.used / (1024 ** 2)
        free_memory = mem_info.free / (1024 ** 2)

        # PyTorch's own view of memory on this device.
        gpu_memory_allocated = torch.cuda.memory_allocated(device) / (1024 ** 2)
        gpu_memory_reserved = torch.cuda.memory_reserved(device) / (1024 ** 2)

        # GPU memory the driver attributes to this process.
        process_gpu_memory = 0
        for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
            # usedGpuMemory can be None on some drivers/permission setups.
            if proc.pid == os.getpid() and proc.usedGpuMemory is not None:
                process_gpu_memory = proc.usedGpuMemory / (1024 ** 2)

        print(f"[{phase}] CPU Usage: {cpu_usage:.2f}% | CPU Memory Usage: {memory_usage:.2f} MB | "
              f"GPU {gpu_index} Memory: {used_memory:.2f}/{total_memory:.2f} MB used, {free_memory:.2f} MB free | "
              f"GPU Memory Allocated (PyTorch): {gpu_memory_allocated:.2f} MB | "
              f"GPU Memory Reserved (PyTorch): {gpu_memory_reserved:.2f} MB | "
              f"Current Process GPU Memory Usage: {process_gpu_memory:.2f} MB")
    else:
        print(f"[{phase}] CPU Usage: {cpu_usage:.2f}% | Memory Usage: {memory_usage:.2f} MB")
    pynvml.nvmlShutdown()
Example usage:
print_resource_usage("Before Computation", device=torch.device('cuda:5'))
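A minimal sketch of bracketing an actual GPU computation with the function above, so the before/after readings can be compared; the matrix sizes and the 'cuda:0' device index are arbitrary assumptions for illustration:

import torch

device = torch.device('cuda:0')  # assumed device index for illustration
print_resource_usage("Before Computation", device=device)

# Allocate two large matrices on the GPU and multiply them.
a = torch.randn(4096, 4096, device=device)
b = torch.randn(4096, 4096, device=device)
c = a @ b
torch.cuda.synchronize(device)  # ensure the kernel has finished before measuring

print_resource_usage("After Computation", device=device)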