- Multi-GPU training implementation from scratch

```python
import torch
from torch import nn

# Copy the model parameters to a given GPU
def get_params(params, device):
    new_params = [p.clone().to(device) for p in params]
    for p in new_params:
        p.requires_grad_()
    return new_params

# Sum all vectors and broadcast the result back to every GPU
def allreduce(data):
    for i in range(1, len(data)):
        data[0][:] += data[i].to(data[0].device)
    for i in range(1, len(data)):
        data[i][:] = data[0].to(data[i].device)

# Evenly distribute a minibatch of data across multiple GPUs
data = torch.arange(20).reshape(4, 5)
devices = [torch.device('cuda:0'), torch.device('cuda:1')]
split = nn.parallel.scatter(data, devices)

def split_batch(X, y, devices):
    """Split X and y across multiple devices."""
    assert X.shape[0] == y.shape[0]
    return (nn.parallel.scatter(X, devices),
            nn.parallel.scatter(y, devices))

# Multi-GPU training on a single minibatch
# (`model`, `loss`, and the minibatch `SGD` helper are defined elsewhere)
def train_batch(X, y, device_params, devices, lr):
    X_shards, y_shards = split_batch(X, y, devices)
    # Compute the loss separately on each GPU
    ls = [loss(model(X_shard, device_W), y_shard).sum()
          for X_shard, y_shard, device_W in zip(
              X_shards, y_shards, device_params)]
    for l in ls:  # Backpropagation runs separately on each GPU
        l.backward()
    with torch.no_grad():
        # Sum the gradients across GPUs and broadcast the result
        for i in range(len(device_params[0])):
            allreduce([device_params[c][i].grad
                       for c in range(len(devices))])
        # Update the parameters separately on each GPU
        for param in device_params:
            SGD(param, lr, X.shape[0])  # Use the full batch size
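A minimal sketch of how these helpers could be wired into an outer training loop, assuming the same `model`, `loss`, and `SGD` names as above plus a hypothetical `train_iter` data loader and an initial parameter list `params` (these driver details are not in the original notes):

```python
def train(params, train_iter, num_gpus, lr, num_epochs=10):
    devices = [torch.device(f'cuda:{i}') for i in range(num_gpus)]
    # Give every GPU its own full copy of the model parameters
    device_params = [get_params(params, d) for d in devices]
    for epoch in range(num_epochs):
        for X, y in train_iter:
            # Forward/backward on each GPU, allreduce the gradients,
            # then update every parameter replica
            train_batch(X, y, device_params, devices, lr)
        torch.cuda.synchronize(devices[0])
```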
- Concise multi-GPU implementation

```python
import torch
from torch import nn

# Compared with single-GPU training, this is the only line that changes
net = nn.DataParallel(net, device_ids=devices)
```
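For context, a self-contained sketch of how `nn.DataParallel` is typically used end to end; the toy network, dummy data, and optimizer below are illustrative assumptions, not part of the notes:

```python
import torch
from torch import nn

devices = [torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())]

# A toy classifier; any nn.Module works the same way
net = nn.Sequential(nn.Linear(20, 64), nn.ReLU(), nn.Linear(64, 10))
net = nn.DataParallel(net, device_ids=devices).to(devices[0])

loss = nn.CrossEntropyLoss()
trainer = torch.optim.SGD(net.parameters(), lr=0.1)

X = torch.randn(128, 20, device=devices[0])           # dummy minibatch
y = torch.randint(0, 10, (128,), device=devices[0])

trainer.zero_grad()
# DataParallel scatters X across the GPUs, replicates the model,
# and gathers the per-GPU outputs back onto devices[0]
l = loss(net(X), y)
l.backward()
trainer.step()
```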
- For accuracy, batch_size=1 is usually the best case【QA】
- Large oscillations in the validation accuracy are mostly driven by lr【QA】
- When batch_size is increased, lr should also be increased (see the sketch after this list)【QA】
- If the network contains batch normalization, lr can be set a bit larger【QA】
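As a concrete illustration of the batch_size/lr rule above, the widely used linear scaling heuristic grows lr in proportion to the batch size; this particular rule is a common convention, not something stated in the original notes:

```python
def scale_lr(base_lr, base_batch_size, new_batch_size):
    """Linear scaling heuristic: increase lr in proportion to batch_size."""
    return base_lr * new_batch_size / base_batch_size

# A run tuned at batch_size=32 with lr=0.01, scaled up to batch_size=256
print(scale_lr(0.01, 32, 256))  # 0.08
```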