Multi-GPU distributed training greatly speeds up training. Combined with PyTorch's official mixed-precision training (AMP), it also reduces GPU memory usage, which in turn allows a larger batch_size.
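Before the multi-process example further down, here is a minimal single-GPU sketch of the autocast/GradScaler pattern; the linear model and random batches are toy placeholders, not part of the original recipe.
import torch
import torch.nn as nn

model = nn.Linear(10, 2).cuda()          # toy model, stands in for a real network
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = torch.cuda.amp.GradScaler()

for step in range(100):
    data = torch.randn(32, 10).cuda()    # toy batch, stands in for a real DataLoader
    target = torch.randint(0, 2, (32,)).cuda()
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():      # forward pass and loss run in mixed precision
        loss = criterion(model(data), target)
    scaler.scale(loss).backward()        # backward on the scaled loss to avoid fp16 underflow
    scaler.step(optimizer)               # unscales the gradients, then calls optimizer.step()
    scaler.update()                      # adjusts the loss scale for the next iteration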
Common commands:
1. Check whether CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
2. Count the available GPUs
gpu_cnt = torch.cuda.device_count()
3. Choose which GPUs are visible
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"  # only use GPUs 0, 1, 2, 3
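The last two commands interact: torch.cuda.device_count() only counts the GPUs left visible by CUDA_VISIBLE_DEVICES, and the variable must be set before the first CUDA call, otherwise it has no effect. A small sketch (the value "0,1" is just an example):
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # set before CUDA is initialized

import torch
print(torch.cuda.device_count())            # counts only the visible GPUs (2 here, on a machine with at least two)
device = "cuda" if torch.cuda.is_available() else "cpu"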
Multi-process GPU distributed training example:
import os
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
# Build the dataset at module level so every spawned process can access it
train_dataset = ...  # load the dataset
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # Initialize the process group (NCCL backend for multi-GPU training)
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
def main(rank, world_size):  # mp.spawn passes the process rank as the first argument
    setup(rank, world_size)
    # Fix the random seeds for reproducibility
    torch.manual_seed(18)
    torch.cuda.manual_seed_all(18)
    torch.backends.cudnn.deterministic = True
    torch.cuda.set_device(rank)  # each process works on a single GPU
    # DistributedSampler splits the dataset across the processes
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    # shuffle must be False here, otherwise it conflicts with the sampler
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=..., sampler=train_sampler, shuffle=False)
    # Build the model and the loss function
    model = ...
    criterion = ...  # loss function, e.g. nn.CrossEntropyLoss()
    # Move the model to the current GPU and wrap it with DDP
    model = DDP(model.cuda(), device_ids=[rank])
    # Initialize the optimizer and the gradient scaler for mixed precision
    optimizer = optim.SGD(model.parameters(), lr=0.01)  # pick your own learning rate
    scaler = torch.cuda.amp.GradScaler()
    for epoch in range(100):
        # Reseed the sampler every epoch so the data is reshuffled differently
        train_sampler.set_epoch(epoch)
        for batch_idx, (data, target) in enumerate(train_loader):
            data = data.cuda()
            target = target.cuda()
            # Forward pass and loss computation inside the autocast context
            with torch.cuda.amp.autocast():
                output = model(data)
                loss = criterion(output, target)
            optimizer.zero_grad()
            # Backward pass on the scaled loss
            scaler.scale(loss).backward()
            # Optimizer step (GradScaler unscales the gradients first)
            scaler.step(optimizer)
            # Update the loss scale for the next iteration
            scaler.update()
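    # Clean shutdown of the process group when this worker is done (optional but recommended)
    dist.destroy_process_group()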
if __name__ == "__main__":
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"  # only use GPUs 0, 1, 2, 3
    world_size = torch.cuda.device_count()  # world_size is the number of visible GPUs
    mp.spawn(main,
             args=(world_size,),
             nprocs=world_size,
             join=True)
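To run it, launch the script once with plain Python, e.g. python train_ddp.py (the file name is just an example); mp.spawn then starts world_size worker processes, one per visible GPU, and passes each worker its rank as the first argument of main.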