The setup used here is fairly simple: it does not involve torch.multiprocessing at all, only part of DistributedDataParallel. A code snippet is given below for reference:
### test_distributed_gpu.py
import argparse

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import torch.distributed as dist  ################
from tqdm import tqdm


class dummyDataset(torch.utils.data.Dataset):
    """Toy point-cloud dataset: 50k random clouds of 2048 points with binary labels."""
    def __init__(self):
        super(dummyDataset, self).__init__()
        N = 50000
        self.data = torch.randn(N, 2048, 3)
        self.label = (torch.rand(N, 1) > 0.5).long()

    def __getitem__(self, item):
        jitter = torch.rand(2048, 3)
        return self.data[item] + jitter, self.label[item]

    def __len__(self):
        return len(self.data)


class dummyModel(nn.Module):
    """Toy PointNet-style classifier: per-point convs, global max pooling, 1x1 conv head."""
    def __init__(self):
        super(dummyModel, self).__init__()
        self.layer1 = nn.Sequential(nn.Conv1d(3, 64, 1),
                                    nn.BatchNorm1d(64),
                                    nn.ReLU())
        self.layer2 = nn.Sequential(nn.Conv1d(64, 256, 1),
                                    nn.BatchNorm1d(256),
                                    nn.ReLU())
        self.fc = nn.Conv1d(256, 2, 1)

    def forward(self, data):
        """
        :param data: [B, 3, N]
        :return: [B, 2]
        """
        x = self.layer2(self.layer1(data))         # [B, 256, N]
        x = torch.max(x, dim=-1, keepdim=True)[0]  # [B, 256, 1]
        x = self.fc(x).squeeze(-1)                 # [B, 2]
        return x


if __name__ == '__main__':
    torch.manual_seed(1234)
    parser = argparse.ArgumentParser(description='DDP')
    parser.add_argument('--local_rank', default=-1, type=int, help='node rank for distributed training')  ################
    parser.add_argument('--lr', type=float, help='lr')
    args = parser.parse_args()

    # initialise the process group and bind this process to its GPU
    dist.init_process_group(backend='nccl')  ################
    torch.cuda.set_device(args.local_rank)  ################
    print(f'current rank -> {args.local_rank}')
    device = torch.device("cuda", args.local_rank)  ################

    ### configuring the dataset
    train_dataset = dummyDataset()
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)  ################
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=16,
                                               num_workers=2,
                                               shuffle=False,  ################
                                               sampler=train_sampler)  ################

    ### configuring the model
    model = dummyModel().to(device)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank])  ################

    ### configuring the optimizer
    optimizer = optim.SGD(
        model.parameters(),
        lr=args.lr,
        momentum=0.98,
        weight_decay=0.000001,
    )
    scheduler = optim.lr_scheduler.ExponentialLR(
        optimizer,
        gamma=0.95,
    )

    ### configuring the loss
    loss_func = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(100):
        # reshuffle the per-process data split for this epoch
        train_sampler.set_epoch(epoch)  ################
        print(f"lr at epoch {epoch} -> {optimizer.param_groups[0]['lr']}")
        for input_pcd, labels in tqdm(train_loader):  # input_pcd: [B, 2048, 3]
            input_pcd = input_pcd.permute(0, 2, 1).contiguous().to(device)  # [B, 3, 2048]
            output = model(input_pcd)  # [B, 2]
            labels = labels[:, 0].to(device)  # [B, 1] -> [B]
            optimizer.zero_grad()
            loss = loss_func(output, labels)  # ([B, 2], [B])
            loss.backward()
            optimizer.step()
        if epoch > 0 and epoch % 5 == 0:
            scheduler.step()
        # save from rank 0 only, so that every process does not write the same file
        if epoch > 0 and epoch % 10 == 0 and args.local_rank == 0:
            state = {'epoch': epoch,
                     'state_dict': model.module.state_dict(),  ################
                     'optimizer': optimizer.state_dict(),
                     'scheduler': scheduler.state_dict(),
                     }
            torch.save(state, 'test_model.pth')
            print(f'saving epoch -> {epoch}')
Compared with single-GPU training, the lines that need to change have been marked with ################.
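Because the checkpoint stores the unwrapped weights via model.module.state_dict(), resuming looks roughly like the sketch below. It assumes model (DDP-wrapped), optimizer, scheduler and device have been constructed exactly as in the script above, and uses the same file name:

# resume sketch: assumes model, optimizer, scheduler and device exist as in the script above
checkpoint = torch.load('test_model.pth', map_location=device)   # map to this process's GPU
model.module.load_state_dict(checkpoint['state_dict'])           # weights were saved from model.module
optimizer.load_state_dict(checkpoint['optimizer'])
scheduler.load_state_dict(checkpoint['scheduler'])
start_epoch = checkpoint['epoch'] + 1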
How to launch the script:
#!/bin/bash
CUDA_VISIBLE_DEVICES=0,1,2,3 \
python -m torch.distributed.launch \
--nproc_per_node=4 \
--master_port 29501 \
test_distributed_gpu.py \
--lr 0.005
nproc_per_node == number of visible devices, i.e. set it to the number of GPUs you want to train with.
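As a quick sanity check that the launch flags took effect, each spawned process can report its rank and the world size right after init_process_group. This is a minimal standalone sketch (the file name check_world.py is made up for illustration), launched the same way as the training script:

# check_world.py -- minimal sanity check, launched with torch.distributed.launch as above
import argparse
import torch
import torch.distributed as dist

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', default=-1, type=int)
args = parser.parse_args()

dist.init_process_group(backend='nccl')
torch.cuda.set_device(args.local_rank)
# world size should equal nproc_per_node (4 with the launch script above)
print(f'rank {dist.get_rank()} / world size {dist.get_world_size()}')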
A few points to note:
- For DDP training the DataLoader's shuffle must be left at False (the default); setting it to True together with a sampler raises an error. Shuffling is instead handled by calling the sampler's set_epoch method at the start of every epoch; without it, every epoch sees the data in the same order (see the sketch after this list).
- If you hit RuntimeError: Address already in use, add --master_port 29501 to the launch command to pin the port; any free port number will do.
- If the program terminates but GPU memory is not released (this often happens when the job is ended with Ctrl+C or suspended with Ctrl+Z), run netstat -ntlp to find the port and its PID, then kill it with kill -9 <pid>; alternatively, run fuser -v /dev/nvidia* to list the zombie processes on each GPU and kill those PIDs by hand to free the memory.
- pdb debugging should be done on a single GPU, i.e. with CUDA_VISIBLE_DEVICES set to one card and nproc_per_node=1; running pdb across multiple processes is still problematic.
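To see the set_epoch point in isolation, the following small sketch (separate from the training script, names made up for illustration) prints the indices a DistributedSampler yields for one rank; changing the epoch changes the order, while never calling set_epoch gives the same order every time:

# sampler_demo.py -- illustration only; num_replicas/rank are given explicitly, so no process group is needed
import torch
from torch.utils.data import TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.arange(8))
sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)

sampler.set_epoch(0)
print(list(sampler))  # index order seen by rank 0 in epoch 0
sampler.set_epoch(1)
print(list(sampler))  # a different order; without set_epoch both epochs would be identical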