Common Errors
AttributeError: 'module' object has no attribute 'empty'
Running the following commands on machine 138:
conda activate pytorch0.2.0
python
import torch
x=torch.empty(5, 3)
This raises the error shown in the title.
Analysis: the module has no attribute 'empty', i.e. the torch module lacks an empty attribute. This shows that in PyTorch 0.2.0 the torch module does not include this attribute.
Verification: the same code runs successfully under PyTorch 1.0 on machine 139.
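On versions that predate torch.empty (it was added around the 0.4 API overhaul), an uninitialized tensor can be created with the legacy constructor instead; a minimal equivalent sketch:
import torch

# legacy equivalent of torch.empty(5, 3): an uninitialized 5x3 FloatTensor
x = torch.Tensor(5, 3)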
RuntimeError: received 0 items of ancdata
train_loader = DataLoader(train_dset, batch_size=32, shuffle=False, num_workers=3)
for batch_idx, (d, la) in enumerate(train_loader):
This error occurs while the DataLoader is loading data. PyTorch's multi-worker tensor sharing is implemented by opening files, and the number of open files is limited (check it with `ulimit -a`). When the number of tensors to share exceeds the open-files limit, this error is raised.
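The same limit can also be inspected from Python via the standard resource module; a minimal sketch:
import resource

# (soft, hard) limits on open file descriptors, the value `ulimit -n` reports
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(soft, hard)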
There are two solutions:
1. Raise the open-files limit:
You cannot simply run `sudo ulimit -n`; instead, execute:
sudo sh -c "ulimit -n 65535 && exec su $LOGNAME"
The explanation is as follows:
ulimit is a shell builtin like cd, not a separate program. sudo looks for a binary to run, but there is no ulimit binary, which is why you get the error message. You need to run it in a shell.
However, while you do need to be root to raise the limit to 65535, you probably don’t want to run your program as root. So after you raise the limit you should switch back to the current user.
To do this, run:
sudo sh -c "ulimit -n 65535 && exec su $LOGNAME"
and you will get a new shell, without root privileges, but with the raised limit. The exec causes the new shell to replace the process with sudo privileges, so after you exit that shell, you won’t accidentally end up as root again.
2. Change the multi-worker tensor-sharing strategy to file_system (the default, file_descriptor, is limited by the open-files count):
torch.multiprocessing.set_sharing_strategy('file_system')
Method 2 works.
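To confirm the strategy actually took effect (it must be set before the DataLoader workers start), torch.multiprocessing also exposes the matching getter; a minimal sketch:
import torch.multiprocessing as mp

mp.set_sharing_strategy('file_system')  # call once, before creating DataLoaders
print(mp.get_sharing_strategy())        # -> 'file_system'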
tensorboard: command not found
Reinstall tensorboard (recommended):
pip install tb-nightly
http://www.voidcn.com/article/p-yfxuhjvo-brr.html
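A quick way to verify the reinstall is to log a scalar and launch the viewer; a minimal sketch, assuming a PyTorch version recent enough to ship torch.utils.tensorboard:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()           # writes event files under ./runs/ by default
writer.add_scalar('loss', 0.5, 0)  # tag, value, global step
writer.close()
# then launch the viewer with: tensorboard --logdir runs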
Failed to initialize NVML: Driver/library version mismatch (Ubuntu)
# list the installed nvidia packages
sudo dpkg --list | grep nvidia*
# show the version of the kernel driver currently loaded
cat /proc/driver/nvidia/version
# check whether the nvidia kernel modules are loaded
lsmod | grep nvidia
# list the processes using /dev/nvidia*
sudo fuser -v /dev/nvidia*
# find the PIDs holding GPU resources
sudo lsof -n -w /dev/nvidia*
If these look normal, the mismatch usually means the user-space NVIDIA libraries were upgraded while an old kernel module is still loaded; reloading the nvidia kernel modules (or simply rebooting) brings the driver and libraries back in sync.
Reading, saving, and inspecting .mat files in Python
import scipy.io as sio

# Save .mat files
sio.savemat('testpython.mat', {'a': 1, 'b': 2, 'c': 3, 'd': 4})
sio.savemat('testpython2.mat', {'x': [[1, 2, 3, 4], [5, 6, 7, 8]]})
# Load a .mat file
data = sio.loadmat('testpython.mat')
x1 = data['a']
x2 = data['b']
x3 = data['c']
x4 = data['d']
# List the variables stored in a .mat file
print(sio.whosmat('testpython.mat'))
Creating a validation set
《深度学习之PyTorch实战计算机视觉》 (Deep Learning with PyTorch for Computer Vision), Tang Jinmin, p. 182
7.2.1 Validation and test datasets
Take 2,500 cat images and 2,500 dog images from the training set to form a validation set of 5,000 images, reorganizing the flat train/ folder into per-class train/ and valid/ folders. Before:
chapter3/
    dogsandcats/
        train/
            dog.183.jpg
            cat.2.jpg
            cat.17.jpg
            dog.186.jpg
            cat.27.jpg
            dog.193.jpg
After:
chapter3/
    dogsandcats/
        train/
            dog/
                dog.183.jpg
                dog.186.jpg
                dog.193.jpg
            cat/
                cat.17.jpg
                cat.2.jpg
                cat.27.jpg
        valid/
            dog/
                dog.173.jpg
                dog.156.jpg
                dog.123.jpg
            cat/
                cat.172.jpg
                cat.20.jpg
                cat.21.jpg
- Command-line script (split.py)
import os
import random
import shutil
import sys

def moveSomeFileToNewDir(srcDir, tarDir, rate):
    # randomly pick a fraction `rate` of the files in srcDir and move them to tarDir
    srcPaths = os.listdir(srcDir)
    tarPaths = random.sample(srcPaths, int(len(srcPaths) * rate))
    for name in tarPaths:
        shutil.move(os.path.join(srcDir, name), os.path.join(tarDir, name))

#fileDir = r"train"
#valDir = r'valid'
#moveSomeFileToNewDir(fileDir, valDir, 0.2)
# read the arguments from the command line
moveSomeFileToNewDir(sys.argv[1], sys.argv[2], float(sys.argv[3]))
The train folder holds 25,000 images; to move 5,000 of them (20%) into the valid folder, run: python split.py train valid 0.2
- Python script
import os
import numpy as np
from glob import glob

#path = '/home/yhr/data/DogsVSCats/train'
path = '/home/yhr/data/kaggle_DogsVSCats'
files = glob(os.path.join(path, '*/*.jpg'))  # e.g. dog.1224.jpg
print('Total no of images', len(files))      # 25000

# create valid/ plus the per-class subfolders on the first run
if not os.path.exists(os.path.join(path, 'valid')):
    os.mkdir(os.path.join(path, 'valid'))
    for t in ['train', 'valid']:
        for folder in ['dog/', 'cat/']:
            os.mkdir(os.path.join(path, t, folder))

no_of_images = len(files)
# move images from the flat train folder into train/dog and train/cat
for i in range(no_of_images):
    folder = files[i].split('/')[-1].split('.')[0]  # 'dog' or 'cat'
    image = files[i].split('/')[-1]
    os.rename(files[i], os.path.join(path, 'train', folder, image))

# move 2500 random images per class from train/<class> to valid/<class>
for folder in ['dog/', 'cat/']:
    sub_path = os.path.join(path, 'train', folder)
    files = glob(os.path.join(sub_path, '*.jpg'))
    no_of_images = len(files)
    shuffle = np.random.permutation(no_of_images)
    for i in shuffle[:2500]:
        #shutil.copyfile(files[i], '../chapter3/dogsandcats/valid/')
        folder = files[i].split('/')[-1].split('.')[0]
        image = files[i].split('/')[-1]
        os.rename(files[i], os.path.join(path, 'valid', folder, image))
Loading the data into PyTorch tensors, batch by batch
- Resize all images to the same size
- Normalize the dataset with its mean and standard deviation
- Convert the image dataset into PyTorch tensors
import torch
from torchvision import transforms
from torchvision.datasets import ImageFolder

simple_transform = transforms.Compose([transforms.Resize((224, 224)),
                                       transforms.ToTensor(),
                                       transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
train = ImageFolder('/home/yhr/data/DogsVSCats/train/', simple_transform)
valid = ImageFolder('/home/yhr/data/DogsVSCats/valid/', simple_transform)
train_data_gen = torch.utils.data.DataLoader(train, shuffle=True, batch_size=64, num_workers=3)
valid_data_gen = torch.utils.data.DataLoader(valid, batch_size=64, num_workers=3)
dataset_sizes = {'train': len(train_data_gen.dataset), 'valid': len(valid_data_gen.dataset)}
dataloaders = {'train': train_data_gen, 'valid': valid_data_gen}
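A quick shape check on a single batch confirms the pipeline; a minimal sketch (the class indices come from ImageFolder's alphabetical folder ordering, so cat is 0 and dog is 1):
# fetch one batch and inspect its shape
imgs, labels = next(iter(dataloaders['train']))
print(imgs.shape)   # torch.Size([64, 3, 224, 224])
print(labels[:8])   # class indices assigned by ImageFolder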
Displaying images
import numpy as np
import matplotlib.pyplot as plt

def imshow(inp):
    """Imshow for Tensor."""
    inp = inp.numpy().transpose((1, 2, 0))  # CHW -> HWC
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    inp = std * inp + mean                  # undo the normalization
    inp = np.clip(inp, 0, 1)
    plt.imshow(inp)

imshow(train[50][0])
Building the model
import torch.nn as nn
from torchvision import models

model_ft = models.resnet18(pretrained=True)
# replace the final fully connected layer with a 2-class head
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 2)
if torch.cuda.is_available():
    model_ft = model_ft.cuda()
Training the model
import copy
import time

def train_model(model, criterion, optimizer, scheduler, num_epochs=5):
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'valid']:
            if phase == 'train':
                print("training...")
                scheduler.step()
                model.train(True)   # Set model to training mode
            else:
                print("validating")
                model.train(False)  # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0
            # Iterate over data.
            for data in dataloaders[phase]:
                # get the inputs
                inputs, labels = data
                # move them to the GPU if one is available
                if torch.cuda.is_available():
                    inputs, labels = inputs.cuda(), labels.cuda()
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                outputs = model(inputs)
                _, preds = torch.max(outputs.data, 1)
                loss = criterion(outputs, labels)
                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()
                # statistics (loss is a per-batch mean, so weight it by the batch size)
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = float(running_corrects) / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            # deep copy the model
            if phase == 'valid' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
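The call site is not shown in the original; a minimal sketch following the standard torchvision transfer-learning recipe (the optimizer and scheduler hyperparameters are illustrative assumptions, not values from the source):
import torch.optim as optim
from torch.optim import lr_scheduler

criterion = nn.CrossEntropyLoss()
# fine-tune all parameters of the modified ResNet
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# decay the learning rate by a factor of 10 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=5)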
Extracting pretrained features
from torch.autograd import Variable
from torchvision.models import resnet34

is_cuda = torch.cuda.is_available()

my_resnet = resnet34(pretrained=True)
if is_cuda:
    my_resnet = my_resnet.cuda()
# drop the final fc layer, keeping everything up to the avgpool output
m = nn.Sequential(*list(my_resnet.children())[:-1])
# freeze the pretrained weights
for p in my_resnet.parameters():
    p.requires_grad = False

# For training data
# Iterate through the train data and store the calculated features and the labels
trn_labels = []
trn_features = []
for batch_idx, (d, la) in enumerate(train_loader):
    o = m(Variable(d.cuda()))
    o = o.view(o.size(0), -1)
    trn_labels.extend(la)
    trn_features.extend(o.cpu().data)

# For validation data
# Iterate through the validation data and store the calculated features and the labels
val_labels = []
val_features = []
for d, la in val_loader:
    o = m(Variable(d.cuda()))
    o = o.view(o.size(0), -1)
    val_labels.extend(la)
    val_features.extend(o.cpu().data)
A custom PyTorch dataset
from torch.utils.data import Dataset, DataLoader

class FeaturesDataset(Dataset):
    def __init__(self, featlst, labellst):
        self.featlst = featlst
        self.labellst = labellst
    def __getitem__(self, index):
        return (self.featlst[index], self.labellst[index])
    def __len__(self):
        return len(self.labellst)

# Creating datasets for train and validation
trn_feat_dset = FeaturesDataset(trn_features, trn_labels)
val_feat_dset = FeaturesDataset(val_features, val_labels)
# Creating data loaders for train and validation
trn_feat_loader = DataLoader(trn_feat_dset, batch_size=64, shuffle=True)
val_feat_loader = DataLoader(val_feat_dset, batch_size=64)
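These feature loaders can then feed a small classifier head instead of running the full ResNet every epoch; a minimal sketch (the 512 input size matches resnet34's flattened avgpool output; the training hyperparameters are assumptions):
import torch.optim as optim

# a single linear layer on top of the frozen 512-d resnet34 features
clf = nn.Linear(512, 2)
if is_cuda:
    clf = clf.cuda()
optimizer = optim.SGD(clf.parameters(), lr=0.001, momentum=0.9)
criterion = nn.CrossEntropyLoss()

for feat, target in trn_feat_loader:
    if is_cuda:
        feat, target = feat.cuda(), target.cuda()
    optimizer.zero_grad()
    loss = criterion(clf(feat), target)
    loss.backward()
    optimizer.step()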
Dataset handling
| Problem | Method |
|---|---|
| Merge folders: combine the CUB dataset's train and test into a single data folder | `cp -Rap train/* test/` then `mv test data` |