Environment
- OS: Ubuntu 14.04
- Python version: 3.7
- PyTorch version: 1.4.0
- IDE: PyCharm
0. 写在前面
训练一个 CNN 模型往往需要一些时间,那么在训练过程中需要适时地保存模型。如果手动或意外中断了训练,那么之后从某个 checkpoint 恢复训练,而不用从头开始。这里记一下如何保存、加载、训练模型。
1. 保存和加载
模型在硬盘中存储为 .pkl 的二进制格式。一般地,可以保存整个模型,但推荐的是保存模型的参数。
torch.save函数实现模型的保存
import torch
import torchvision

# Download a ResNet-50 pretrained on ImageNet, then persist it in two ways.
model = torchvision.models.resnet50(pretrained=True)
torch.save(model, 'resnet50.pkl')  # save the entire model object (pickles the class too)
torch.save(model.state_dict(), 'resnet50_state_dict.pkl')  # save only the parameters (recommended)
torch.load函数实现模型的加载
import torch

model = torch.load('resnet50.pkl')  # load the entire model object back
若加载的是模型的参数,则需要构建一个模型,再调用 load_state_dict 方法载入参数
import torch
import torchvision

state_dict = torch.load('resnet50_state_dict.pkl')  # load the saved parameters
# Build a model with the same architecture, then copy the parameters into it.
model = torchvision.models.resnet50(pretrained=False)
model.load_state_dict(state_dict)
# expected output: <All keys matched successfully>
2. 训练代码
训练一个 ResNet50 模型,完成 TinyMind人民币面值识别 任务。数据集文件夹中的目录结构见 PyTorch学习笔记(二)读取数据。
├── data.py
├── models
├── rmbdata
└── train.py
以下为 train.py 中较为通用的代码
首先,导入需要的模块
import os
import time
import copy
import argparse
import torch
from torchvision.transforms import Compose, Resize, CenterCrop, RandomErasing, RandomGrayscale, ToTensor
from torch.utils.data import DataLoader
import torchvision
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR
from torch.utils.tensorboard import SummaryWriter
from data import RMBFaceValueDataset
接着,定义训练模型的函数
def train_model(model, criterion, optimizer, scheduler, num_epochs: int = 36) -> None:
    """Train and validate the model, checkpointing the best-val-accuracy weights.

    Relies on module-level globals defined in the ``__main__`` section:
    ``use_gpu``, ``device_ids``, ``device``, ``start_epoch``, ``dataloaders``,
    ``dataset_sizes``, ``writer`` and ``args``.

    Params:
        model: torch.nn.Module
            Neural network model (possibly wrapped in DataParallel)
        criterion: torch.nn.Module
            Loss function
        optimizer: torch.optim.Optimizer
            Optimization strategy
        scheduler: torch.optim.lr_scheduler._LRScheduler
            Learning rate scheduler
        num_epochs: int
            Last epoch index of training (training runs start_epoch..num_epochs)

    Returns:
        None. The best weights seen so far are written into the periodic
        checkpoint files under ``args.save_path``; nothing is returned.
        (The original docstring claimed the trained model is returned, but
        there is no return statement.)
    """
    since = time.time()
    # Initialize the best validation accuracy and a snapshot of the weights.
    best_acc = 0.0
    if use_gpu and len(device_ids) > 1:  # model wrapped in DataParallel: real module is model.module
        best_model_wts = copy.deepcopy(model.module.state_dict())
    else:
        best_model_wts = copy.deepcopy(model.state_dict())
    # start_epoch > 1 when resuming from a checkpoint (set in __main__).
    for epoch in range(start_epoch, num_epochs + 1):
        epoch_since = time.time()
        print('Epoch {}/{}'.format(epoch, num_epochs))
        # Each epoch has a training and a validation phase.
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode (enables dropout/BN updates)
            else:
                model.eval()  # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0
            tic_batch = time.time()
            # Iterate over the data of the current phase.
            for i, (inputs, labels) in enumerate(dataloaders[phase]):
                inputs, labels = inputs.to(device), labels.to(device)
                # Zero the parameter gradients before each forward/backward pass.
                optimizer.zero_grad()
                # Forward pass; track gradient history only in the train phase.
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    # Backward + optimize only in the training phase.
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                # Accumulate statistics (loss is batch-mean, so re-weight by batch size).
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                if phase == 'train':
                    # Record training loss for each iteration (global step index).
                    curr_iter = (epoch - 1) * len(dataloaders['train']) + i + 1
                    writer.add_scalar('Training_Loss', loss.item(), curr_iter)
                if i % args.print_freq == 0:
                    # NOTE(review): scheduler.get_lr() is deprecated in newer PyTorch
                    # in favour of get_last_lr(), and may report a distorted value when
                    # called mid-schedule — confirm against the installed version.
                    print(
                        'Epoch {}/{}-batch:{}/{} lr:{:.4f} {} Loss: {:.6f} Acc: {:.4f} Time: {:.4f}batch/sec'.format(
                            epoch, num_epochs, i, round(dataset_sizes[phase] / args.batch_size) - 1,
                            scheduler.get_lr()[0], phase,
                            loss.item(), torch.sum(preds == labels.data).item() / labels.size(0),
                            args.print_freq / (time.time() - tic_batch)
                        )
                    )
                    tic_batch = time.time()
            # Phase-level metrics; dataset_sizes is a module-level global.
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            # Record training acc and validation acc for each epoch.
            writer.add_scalars('accuracy', {phase: epoch_acc}, epoch)
            # Deep-copy the state_dict with the highest validation accuracy so far.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                if use_gpu and len(device_ids) > 1:  # DataParallel: unwrap to .module
                    best_model_wts = copy.deepcopy(model.module.state_dict())
                else:
                    best_model_wts = copy.deepcopy(model.state_dict())
        # NOTE(review): passing epoch to scheduler.step(epoch) is deprecated in
        # newer PyTorch; plain scheduler.step() once per epoch is the modern form.
        scheduler.step(epoch)
        if epoch % args.save_epoch_freq == 0:
            if not os.path.exists(args.save_path):
                os.makedirs(args.save_path)
            # Checkpoint carries the BEST weights so far (not necessarily this
            # epoch's weights), plus optimizer state and epoch for resuming.
            checkpoint = {
                'model_state_dict': best_model_wts,  # save the model state_dict of the highest acc
                'optim_state_dict': optimizer.state_dict(),
                'epoch': epoch
            }
            torch.save(checkpoint, os.path.join(args.save_path, "epoch_" + str(epoch) + ".pth"))
        # Record the weight distribution of conv and fc layers in TensorBoard.
        for name, param in os.path.splitext and model.named_parameters():
            layer, attr = os.path.splitext(name)  # split e.g. 'layer1.0.conv1.weight' into (layer path, '.weight')
            if 'conv' in layer or 'fc' in layer:
                writer.add_histogram('{}_{}'.format(layer, attr[1:]), param, epoch)
        epoch_elapsed = time.time() - epoch_since
        print('Time taken for the epoch: {:.0f}m {:.0f}s'.format(epoch_elapsed // 60, epoch_elapsed % 60))
    time_elapsed = time.time() - since
    print('\nTraining complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    print('-' * 30)
然后,是相当于主函数的内容 if __name__ == '__main__',包括
- 从程序接收训练的参数设置
- 构造数据集和加载器
- 得到模型(从 torchvision.models 得到,因此也不需要单独的 model 模块)
- 是否从已有的 checkpoint 继续训练
- 是否使用 GPU
- 定义损失函数、优化器和学习率调整
- 最后调用训练函数
if __name__ == '__main__':
    # ---------- command-line configuration ----------
    parser = argparse.ArgumentParser(description='Train ResNet50 in PyTorch with RMB face value dataset')
    parser.add_argument('--data-dir', type=str, default=os.path.join(os.curdir, 'rmbdata'))
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--num-classes', type=int, default=9)
    parser.add_argument('--num-epochs', type=int, default=36)
    parser.add_argument('--lr', type=float, default=0.03)
    parser.add_argument('--num-workers', type=int, default=4)
    parser.add_argument('--gpus', type=str, default='0,1')  # comma-separated GPU ids, e.g. '0,1'
    parser.add_argument('--print-freq', type=int, default=1)
    parser.add_argument('--save-epoch-freq', type=int, default=1)
    parser.add_argument('--save-path', type=str, default=os.path.join(os.curdir, 'models'))
    parser.add_argument('--resume', type=str, default='', help='For training from one checkpoint')
    parser.add_argument('--log-dir', type=str, default=os.path.join(os.curdir, 'logs'))
    args = parser.parse_args()

    # ---------- GPU info ----------
    use_gpu = torch.cuda.is_available()
    device = torch.device("cuda" if use_gpu else "cpu")
    if use_gpu:
        torch.cuda.manual_seed_all(227)  # fixed seed for reproducibility
        print('Number of GPUs available:', torch.cuda.device_count())
        print('GPU device name:', torch.cuda.get_device_name())

    # ========== dataset and dataloader ==========
    # Transforms: augmentation (grayscale + random erasing) on train only;
    # val/test get the deterministic resize/crop pipeline.
    data_transforms = {
        'train': Compose([
            Resize(256),
            CenterCrop(224),
            RandomGrayscale(p=0.8),
            ToTensor(),
            RandomErasing(p=0.8, scale=(0.33, 0.67), ratio=(0.3, 3.3))
        ]),
        'val': Compose([Resize(256), CenterCrop(224), ToTensor()]),
        'test': Compose([Resize(256), CenterCrop(224), ToTensor()])
    }
    # BUGFIX: honour the --data-dir argument instead of a hard-coded path
    # (previously data_dir was re-hard-coded to './rmbdata', silently
    # ignoring the CLI option).
    data_dir = args.data_dir
    # Map RMB face-value folder names to class indices.
    class_to_idx = {
        '0.1': 0, '0.2': 1, '0.5': 2,
        '1.0': 3, '2.0': 4, '5.0': 5,
        '10.0': 6, '50.0': 7, '100.0': 8
    }
    image_datasets = {x: RMBFaceValueDataset(
        os.path.join(data_dir, x), transform=data_transforms[x], class_to_idx=class_to_idx)
        for x in ['train', 'val', 'test']
    }
    # BUGFIX: shuffle only the training split. Shuffling val/test adds
    # nondeterminism for no benefit and is not standard practice.
    dataloaders = {x: DataLoader(
        image_datasets[x],
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        sampler=None,
        shuffle=(x == 'train'),
        drop_last=False
    ) for x in ['train', 'val', 'test']}
    # Split sizes, used by train_model for epoch-level loss/accuracy averaging.
    dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val', 'test']}

    # ========== model, loss, optimizer, scheduler ==========
    model = torchvision.models.resnet50(pretrained=False, num_classes=args.num_classes)
    criterion = CrossEntropyLoss()
    optimizer = SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=0.0001)
    scheduler = StepLR(optimizer, step_size=7, gamma=0.1)
    start_epoch = 1

    # Visualize the model as a computational graph in TensorBoard.
    if not os.path.exists(args.log_dir):
        # BUGFIX: makedirs (not mkdir) so nested --log-dir paths work,
        # consistent with the save-path handling in train_model.
        os.makedirs(args.log_dir)
    writer = SummaryWriter(log_dir=args.log_dir)
    input_tensor = torch.randn((1, 3, 224, 224))
    writer.add_graph(model=model, input_to_model=input_tensor, verbose=False)

    # ---------- optionally resume from a checkpoint ----------
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optim_state_dict'])
            scheduler.last_epoch = checkpoint['epoch']  # keep the lr schedule in sync
            start_epoch = checkpoint['epoch'] + 1  # first epoch of the resumed run
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # ---------- device placement ----------
    device_ids = [int(i) for i in args.gpus.strip().split(',')]
    if use_gpu:
        # Move the model to GPU.
        model.to(device)
        # Move tensors inside the (possibly resumed) optimizer state to GPU;
        # otherwise optimizer.step() fails with a cpu/cuda device mismatch.
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.to(device)
        # Use multi-GPU data parallelism when more than one id was given.
        if len(device_ids) > 1:
            # FIX: reuse device_ids computed above instead of re-parsing args.gpus.
            model = torch.nn.DataParallel(model, device_ids=device_ids)

    train_model(model, criterion, optimizer, scheduler, num_epochs=args.num_epochs)
    writer.close()
使用 GPU 训练时,加载 optimizer 后,需要将其中的参数放到 GPU,参考了解决办法 pytorch重载optimizer参数时报错:RuntimeError: expected device cpu but got device cuda:0的解决方法。
本文介绍了在PyTorch中如何保存和加载模型,特别是在训练CNN时保存参数以便中断后恢复训练。此外,还展示了训练ResNet50模型进行人民币面值识别任务的通用代码,包括数据加载、模型构建、训练参数设置和GPU使用。
1524

被折叠的 条评论
为什么被折叠?



