Linear Regression
Optimization function: mini-batch stochastic gradient descent (SGD)
$$(\mathbf{w},b) \leftarrow (\mathbf{w},b) - \frac{\eta}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} \partial_{(\mathbf{w},b)} l^{(i)}(\mathbf{w},b)$$
Reading the dataset
import random
import torch

def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)  # shuffle the indices so examples are read in random order
    for i in range(0, num_examples, batch_size):
        j = torch.LongTensor(indices[i: min(i + batch_size, num_examples)])  # the last batch may be smaller than batch_size
        yield features.index_select(0, j), labels.index_select(0, j)
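As a quick sanity check, the sketch below builds the kind of synthetic dataset data_iter expects and reads a single mini-batch. num_inputs, num_examples, true_w, true_b and the noise level are illustrative assumptions, not values fixed by these notes:

import numpy as np
import torch

num_inputs = 2          # assumed number of input features
num_examples = 1000     # assumed dataset size
true_w = [2, -3.4]      # assumed true weights
true_b = 4.2            # assumed true bias

features = torch.tensor(np.random.normal(0, 1, (num_examples, num_inputs)), dtype=torch.float32)
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float32)  # small Gaussian noise

batch_size = 10
for X, y in data_iter(batch_size, features, labels):
    print(X, y)  # one mini-batch of 10 examples
    break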
Initializing model parameters
import numpy as np

# weights drawn from a normal distribution with std 0.01; bias initialized to zero; both track gradients
w = torch.tensor(np.random.normal(0, 0.01, (num_inputs, 1)), dtype=torch.float32)
b = torch.zeros(1, dtype=torch.float32)
w.requires_grad_(requires_grad=True)
b.requires_grad_(requires_grad=True)
Defining the optimization function
def sgd(params, lr, batch_size):
    # mini-batch stochastic gradient descent: update each parameter in place
    for param in params:
        param.data -= lr * param.grad / batch_size  # .data bypasses autograd tracking
Training
The gradients accumulated on w and b must be cleared after every parameter update; the full loop is sketched below.
# clear the gradients
w.grad.data.zero_()
b.grad.data.zero_()
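For completeness, a minimal sketch of the training loop these lines belong to. The model linreg, the loss squared_loss, the learning rate lr and num_epochs below are assumptions consistent with the from-scratch setup, not values given in these notes:

def linreg(X, w, b):
    # the linear regression model: Xw + b
    return torch.mm(X, w) + b

def squared_loss(y_hat, y):
    # squared loss (halved); reshape y so it matches the shape of y_hat
    return (y_hat - y.view(y_hat.size())) ** 2 / 2

lr = 0.03        # assumed learning rate
num_epochs = 3   # assumed number of epochs

for epoch in range(num_epochs):
    for X, y in data_iter(batch_size, features, labels):
        l = squared_loss(linreg(X, w, b), y).sum()
        l.backward()                  # compute gradients of the mini-batch loss
        sgd([w, b], lr, batch_size)   # update the parameters
        w.grad.data.zero_()           # clear the gradients
        b.grad.data.zero_()
    with torch.no_grad():
        train_l = squared_loss(linreg(features, w, b), labels)
        print('epoch %d, loss %f' % (epoch + 1, train_l.mean().item()))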
Concise implementation of linear regression with PyTorch
torch.set_default_tensor_type('torch.FloatTensor')
Reading the dataset
import torch.utils.data as Data

batch_size = 10

# combine the features and labels of the dataset
dataset = Data.TensorDataset(features, labels)

# put the dataset into a DataLoader
data_iter = Data.DataLoader(
    dataset=dataset,          # torch TensorDataset format
    batch_size=batch_size,    # mini-batch size
    shuffle=True,             # whether to shuffle the data
    num_workers=2,            # number of worker processes used to read the data
)
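Reading one mini-batch from this DataLoader works the same way as with the hand-written data_iter above:

for X, y in data_iter:
    print(X, '\n', y)
    break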
Defining the model
import torch.nn as nn

class LinearNet(nn.Module):
    def __init__(self, n_feature):
        super(LinearNet, self).__init__()  # call the parent constructor
        self.linear = nn.Linear(n_feature, 1)  # function prototype: `torch.nn.Linear(in_features, out_features, bias=True)`

    def forward(self, x):
        y = self.linear(x)
        return y
net = LinearNet(num_inputs)
# three ways to build the same network with nn.Sequential
# method one
net = nn.Sequential(
    nn.Linear(num_inputs, 1)
    # other layers can be added here
)

# method two
net = nn.Sequential()
net.add_module('linear', nn.Linear(num_inputs, 1))
# net.add_module ......

# method three
from collections import OrderedDict
net = nn.Sequential(OrderedDict([
    ('linear', nn.Linear(num_inputs, 1))
    # ......
]))
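Printing the network is a quick way to confirm that all three constructions contain the same single linear layer:

print(net)     # a Sequential containing one Linear(num_inputs, 1) layer named 'linear'
print(net[0])  # the linear layer itself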
Initializing model parameters
from torch.nn import init
init.normal_(net[0].weight, mean=0.0, std=0.01)
init.constant_(net[0].bias, val=0.0) # or you can use `net[0].bias.data.fill_(0)` to modify it directly
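The concise version still needs a loss, an optimizer and a training loop. A minimal sketch, assuming a learning rate of 0.03 and 3 epochs (both illustrative values):

loss = nn.MSELoss()                                      # mean squared error loss
optimizer = torch.optim.SGD(net.parameters(), lr=0.03)   # assumed learning rate

num_epochs = 3  # assumed number of epochs
for epoch in range(1, num_epochs + 1):
    for X, y in data_iter:
        output = net(X)
        l = loss(output, y.view(-1, 1))
        optimizer.zero_grad()   # clear accumulated gradients
        l.backward()
        optimizer.step()
    print('epoch %d, loss: %f' % (epoch, l.item()))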
Softmax and Classification Models
Using the raw outputs of the output layer directly raises two problems:
1. On the one hand, because the output values of the output layer have no fixed range, it is hard to interpret them intuitively. For example, in the example just given, an output value of 10 signals strong confidence that the image is a cat, since it is 100 times the outputs of the other two classes; but if $o_1 = o_3 = 10^3$, the same output value of 10 would instead correspond to a very low probability that the image is a cat.
2. On the other hand, because the true labels are discrete values, the error between these discrete labels and output values of uncertain range is hard to measure.
softmax
$$\hat{y}_1, \hat{y}_2, \hat{y}_3 = \text{softmax}(o_1, o_2, o_3)$$
where
$$\hat{y}_1 = \frac{\exp(o_1)}{\sum_{i=1}^3 \exp(o_i)}, \quad \hat{y}_2 = \frac{\exp(o_2)}{\sum_{i=1}^3 \exp(o_i)}, \quad \hat{y}_3 = \frac{\exp(o_3)}{\sum_{i=1}^3 \exp(o_i)}.$$
It is easy to see that $\hat{y}_1 + \hat{y}_2 + \hat{y}_3 = 1$ and $0 \leq \hat{y}_1, \hat{y}_2, \hat{y}_3 \leq 1$, so $\hat{y}_1, \hat{y}_2, \hat{y}_3$ form a valid probability distribution; for instance, if $\hat{y}_2 = 0.8$, then whatever the other two values are, the probability that the image is a cat is 80%.
Moreover, $\underset{i}{\arg\max}\, o_i = \underset{i}{\arg\max}\, \hat{y}_i$, i.e. the softmax operation does not change the predicted class.
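A small numerical illustration of both properties, reusing the values from the discussion above (an output of 10 for a confident cat, and the $o_1 = o_3 = 10^3$ case):

import torch

o = torch.tensor([[10.0, 0.1, 0.1], [1000.0, 10.0, 1000.0]])
y_hat = torch.softmax(o, dim=1)
print(y_hat)                 # each row sums to 1; in the second row the output 10 gets probability close to 0
print(o.argmax(dim=1))       # the predicted class ...
print(y_hat.argmax(dim=1))   # ... is unchanged by the softmax operation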
Cross-entropy loss function
The squared loss is overly strict for classification. A way to improve on this is to use a measure better suited to comparing two probability distributions. Cross entropy is a commonly used one:
$$H\left(\boldsymbol{y}^{(i)}, \boldsymbol{\hat{y}}^{(i)}\right) = -\sum_{j=1}^q y_j^{(i)} \log \hat{y}_j^{(i)},$$
Assuming the training set contains $n$ examples, the cross-entropy loss function is defined as
$$\ell(\boldsymbol{\Theta}) = \frac{1}{n} \sum_{i=1}^n H\left(\boldsymbol{y}^{(i)}, \boldsymbol{\hat{y}}^{(i)}\right),$$
Obtaining the Fashion-MNIST training set and reading the data
Here we use the torchvision package, which serves the PyTorch deep learning framework and is mainly used for building computer vision models. torchvision consists of the following parts:
1. torchvision.datasets: functions for loading data and interfaces to commonly used datasets;
2. torchvision.models: commonly used model architectures (including pretrained models), e.g. AlexNet, VGG, ResNet;
3. torchvision.transforms: common image transformations, e.g. cropping and rotation;
4. torchvision.utils: other useful utilities.
Obtaining the dataset
import torchvision
import torchvision.transforms as transforms
mnist_train = torchvision.datasets.FashionMNIST(root='/home/kesci/input/FashionMNIST2065', train=True, download=True, transform=transforms.ToTensor())
mnist_test = torchvision.datasets.FashionMNIST(root='/home/kesci/input/FashionMNIST2065', train=False, download=True, transform=transforms.ToTensor())
# batch_size and num_workers are assumed to be defined beforehand (e.g. batch_size = 256)
train_iter = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
test_iter = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)
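A quick check on the shapes of one batch read from train_iter, assuming batch_size = 256 as the example value above:

X, y = next(iter(train_iter))
print(X.shape, y.shape)  # e.g. torch.Size([256, 1, 28, 28]) torch.Size([256])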
Summing a multi-dimensional Tensor along a dimension
X = torch.tensor([[1, 2, 3], [4, 5, 6]])
print(X.sum(dim=0, keepdim=True))   # dim=0: sum down each column, keep the reduced dimension
print(X.sum(dim=1, keepdim=True))   # dim=1: sum across each row, keep the reduced dimension
print(X.sum(dim=0, keepdim=False))  # dim=0: sum down each column, drop the reduced dimension
print(X.sum(dim=1, keepdim=False))  # dim=1: sum across each row, drop the reduced dimension
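For reference, with the X defined above the four calls print:

# X.sum(dim=0, keepdim=True)  -> tensor([[5, 7, 9]])   shape (1, 3)
# X.sum(dim=1, keepdim=True)  -> tensor([[ 6], [15]])  shape (2, 1)
# X.sum(dim=0, keepdim=False) -> tensor([5, 7, 9])     shape (3,)
# X.sum(dim=1, keepdim=False) -> tensor([ 6, 15])      shape (2,)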
Defining the loss function
def cross_entropy(y_hat, y):
    # pick out, for each example, the predicted probability of its true class, then take -log
    return - torch.log(y_hat.gather(1, y.view(-1, 1)))
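A small illustration of how gather selects, for each example, the predicted probability of its true class; the probabilities here are made up for demonstration:

y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = torch.tensor([0, 2])
print(y_hat.gather(1, y.view(-1, 1)))  # tensor([[0.1000], [0.5000]])
print(cross_entropy(y_hat, y))         # the elementwise -log of those gathered probabilities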
Defining the accuracy metric
def accuracy(y_hat, y):
    # fraction of examples whose highest-scoring class matches the true label
    return (y_hat.argmax(dim=1) == y).float().mean().item()
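Using the same toy y_hat and y as above:

print(accuracy(y_hat, y))  # 0.5: the first prediction (class 2) is wrong, the second is right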
Model prediction
X, y = next(iter(test_iter))  # take one batch from the test iterator (the Python 2 style .next() no longer works)
Concise implementation
class LinearNet(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(LinearNet, self).__init__()
        self.linear = nn.Linear(num_inputs, num_outputs)

    def forward(self, x):  # x has shape (batch, 1, 28, 28)
        y = self.linear(x.view(x.shape[0], -1))  # flatten each image before the linear layer
        return y
# net = LinearNet(num_inputs, num_outputs)
class FlattenLayer(nn.Module):
    def __init__(self):
        super(FlattenLayer, self).__init__()

    def forward(self, x):  # x has shape (batch, *, *, ...)
        return x.view(x.shape[0], -1)  # flatten everything except the batch dimension
from collections import OrderedDict

net = nn.Sequential(
    # FlattenLayer(),
    # LinearNet(num_inputs, num_outputs)
    OrderedDict([
        ('flatten', FlattenLayer()),
        ('linear', nn.Linear(num_inputs, num_outputs))])  # our own LinearNet(num_inputs, num_outputs) would also work here
)
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.1)
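A minimal training loop for this model, assuming the train_iter from the Fashion-MNIST section above and an illustrative num_epochs of 5:

num_epochs = 5  # assumed number of epochs

for epoch in range(num_epochs):
    train_loss_sum, train_acc_sum, n = 0.0, 0.0, 0
    for X, y in train_iter:
        y_hat = net(X)
        l = loss(y_hat, y)          # nn.CrossEntropyLoss averages over the batch
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
        train_loss_sum += l.item() * y.shape[0]
        train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
        n += y.shape[0]
    print('epoch %d, loss %.4f, train acc %.3f' % (epoch + 1, train_loss_sum / n, train_acc_sum / n))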
Multilayer Perceptron
Activation Functions
1. The ReLU function
$$\text{ReLU}(x) = \max(x, 0).$$
2. The sigmoid function
$$\text{sigmoid}(x) = \frac{1}{1 + \exp(-x)}.$$
3. The tanh function
$$\text{tanh}(x) = \frac{1 - \exp(-2x)}{1 + \exp(-2x)}.$$
$$\text{tanh}'(x) = 1 - \text{tanh}^2(x).$$
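A quick sketch applying the three activations to a small tensor with PyTorch; the input values are arbitrary:

import torch

x = torch.arange(-2.0, 3.0)   # tensor([-2., -1., 0., 1., 2.])
print(torch.relu(x))          # tensor([0., 0., 0., 1., 2.])
print(torch.sigmoid(x))       # values in (0, 1); sigmoid(0) = 0.5
print(torch.tanh(x))          # values in (-1, 1); tanh(0) = 0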