Four methods for distributed training in PyTorch.
I will generate a dummy dataset of 20 numbers (0 through 19), use a batch size of 10, and train on two GPUs. Before running, execute:
export CUDA_VISIBLE_DEVICES="0,1"
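As a quick sanity check (a minimal sketch, not part of the original scripts), you can confirm that PyTorch only sees the two exported GPUs:

import torch

# With CUDA_VISIBLE_DEVICES="0,1", PyTorch should report exactly 2 visible devices
print(torch.cuda.device_count())  # expected output: 2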
1. Native PyTorch (mp.spawn)
import os
import torch
import torch.distributed as dist
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
class CustomDataset(Dataset):
    """
    Custom dataset class
    """
    def __init__(self):
        # Create the data 0 to 19
        self.data = torch.arange(20)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
def setup(rank, world_size):
    """
    Initialize the distributed environment
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
def cleanup():
    """
    Clean up the distributed environment
    """
    dist.destroy_process_group()
def run_demo(rank, world_size):
    print(f"Running on rank {rank}")
    # Set up the distributed environment
    setup(rank, world_size)
    # Bind this process to its GPU
    torch.cuda.set_device(rank)
    # Create the dataset and the distributed sampler
    dataset = CustomDataset()
    sampler = DistributedSampler(dataset,
                                 num_replicas=world_size,
                                 rank=rank,
                                 shuffle=False)
    # Create the data loader
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=10,      # 10 samples per batch
        sampler=sampler,    # use the distributed sampler
        pin_memory=True
    )
    print(f"\nGPU {rank}