Derivation of the gradients of the loss function L with respect to the parameters:
Gradient of the loss function L with respect to W, X and b of a fully connected layer
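The full derivation is in the post linked in the code comments below; for reference, the results, written with the same shapes as the implementation (X is n x n_in, W is n_in x n_out, Z = XW + b, a = f(Z), and dL/da is the gradient flowing in from the next layer), are:

\frac{\partial L}{\partial Z} = \frac{\partial L}{\partial a} \odot f'(Z),\qquad
\frac{\partial L}{\partial W} = X^{\top}\,\frac{\partial L}{\partial Z},\qquad
\frac{\partial L}{\partial b} = \sum_{i=1}^{n}\left(\frac{\partial L}{\partial Z}\right)_{i,:},\qquad
\frac{\partial L}{\partial X} = \frac{\partial L}{\partial Z}\,W^{\top}

These four expressions are exactly what FullyConnected._bwd computes below.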
Gradient of the CrossEntropy loss with respect to the activation-layer input Z, when the activation is softmax
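Here too only the result is restated (the derivation is in the second linked post): with \hat{y} = \mathrm{softmax}(Z) and L = -\sum_k y_k \log \hat{y}_k for a one-hot label y, the combined gradient simplifies to

\frac{\partial L}{\partial Z} = \hat{y} - y

which is what CrossEntropy.grad returns and what fit passes into DFN.backward.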
Code implementation:
network.py
import time
from collections import OrderedDict

import numpy as np


# Parameter initialization (He / Kaiming uniform)
class HeUniform:
    def __call__(self, weight_shape):
        n_in = weight_shape[0]
        b = np.sqrt(6 / n_in)
        return np.random.uniform(-b, b, size=weight_shape)


class SGD:
    def __init__(self, lr):
        self.lr = lr

    def __call__(self, params, params_grad):
        return params - self.lr * params_grad


class Sigmoid:
    def __call__(self, z):
        return self.forward(z)

    def forward(self, z):
        return 1 / (1 + np.exp(-z))

    def grad(self, x):
        ex = np.exp(-x)
        return ex / ((1 + ex) ** 2)


def softmax(x):
    e_x = np.exp(x)
    # e_x.sum(axis=-1, keepdims=True): sum over each row
    # e_x / e_x.sum(axis=-1, keepdims=True): divide each element of a row by that row's sum
    return e_x / e_x.sum(axis=-1, keepdims=True)


def minibatch(X, batch_size, shuffle=True):
    n = X.shape[0]
    n_batches = int(np.ceil(n / batch_size))
    idx = np.arange(n)
    if shuffle:
        np.random.shuffle(idx)

    def mb_generator():
        for i in range(n_batches):
            yield idx[i * batch_size: (i + 1) * batch_size]

    return mb_generator(), n_batches
class FullyConnected:
    def __init__(self, n_out, acti_fn=None):
        self.X = None                  # layer input
        self.params = {}               # parameter values (W, b)
        self.gradients = {}            # parameter gradients (W, b)
        self.optimizer = SGD(lr=0.01)
        self.n_in = None               # input dimension, equal to X.shape[1] (number of features)
        self.n_out = n_out             # output dimension
        self.acti_fn = Sigmoid() if acti_fn else None  # activation function
        self.init_weight = HeUniform()
        self.is_initialized = False

    def _init_params(self):
        b = np.zeros((1, self.n_out))
        W = self.init_weight((self.n_in, self.n_out))
        self.gradients = {'W': np.zeros_like(W), 'b': np.zeros_like(b)}
        self.params = {'W': W, 'b': b}
        self.is_initialized = True

    def forward(self, X):
        if not self.is_initialized:
            self.n_in = X.shape[1]
            self._init_params()
        W = self.params['W']
        b = self.params['b']
        Z = X @ W + b
        a = self.acti_fn.forward(Z) if self.acti_fn else Z
        self.X = X
        return a

    # Key part; see https://blog.youkuaiyun.com/qq_37613112/article/details/148426857 for the derivation
    def backward(self, dLda):
        dx, dw, db = self._bwd(dLda)
        self.gradients['W'] += dw
        self.gradients['b'] += db
        return dx

    def _bwd(self, dLda):
        W = self.params['W']
        b = self.params['b']
        Z = self.X @ W + b
        dz = dLda * self.acti_fn.grad(Z) if self.acti_fn else dLda
        dx = dz @ W.T
        dw = self.X.T @ dz
        db = np.sum(dz, axis=0)
        return dx, dw, db

    # Clear the gradients after a batch has been processed
    def flush_gradients(self):
        self.X = None
        for k, v in self.gradients.items():
            self.gradients[k] = np.zeros_like(v)

    # Update the parameters with SGD
    def update(self):
        for k, v in self.gradients.items():
            if k in self.params:
                self.params[k] = self.optimizer(self.params[k], v)
class CrossEntropy:
    def __call__(self, y_pred, y_true):
        return self.loss(y_pred, y_true)

    def loss(self, y_pred, y_true):
        eps = np.finfo(float).eps
        return -np.sum(y_true * np.log(y_pred + eps))

    # See https://blog.youkuaiyun.com/qq_37613112/article/details/148710092 for the derivation
    def grad(self, y_pred, y_true):
        return y_pred - y_true
class DFN(object):
    # Two-layer neural network
    def __init__(self, hidden_dims_1=None, hidden_dims_2=None, loss=CrossEntropy()):
        self.hidden_dim1 = hidden_dims_1
        self.hidden_dim2 = hidden_dims_2
        self.loss = loss
        self.is_initialized = False

    def _set_params(self):
        self.layers = OrderedDict()
        self.layers['FC1'] = FullyConnected(
            n_out=self.hidden_dim1,
            acti_fn='sigmoid'  # any truthy value selects the Sigmoid activation in FullyConnected
        )
        self.layers['FC2'] = FullyConnected(
            n_out=self.hidden_dim2
        )
        self.is_initialized = True

    def forward(self, X):
        out = X
        for k, v in self.layers.items():
            out = v.forward(out)
        return out

    def backward(self, grad):
        out = grad
        for k, v in reversed(list(self.layers.items())):
            out = v.backward(out)

    def update(self):
        for k, v in reversed(list(self.layers.items())):
            v.update()
        self.flush_gradients()

    def flush_gradients(self):
        for k, v in self.layers.items():
            v.flush_gradients()

    def fit(self, X_train, Y_train, n_epochs=20, batch_size=64):
        if not self.is_initialized:
            self._set_params()
        pre_loss = np.inf
        for i in range(n_epochs):
            start_time = time.time()
            epoch_loss = 0.0
            batch_generator, n_batches = minibatch(X_train, batch_size)
            for j, batch_idx in enumerate(batch_generator):
                X_batch, Y_batch = X_train[batch_idx], Y_train[batch_idx]
                out = self.forward(X_batch)
                y_pred = softmax(out)
                batch_loss = self.loss(y_pred, Y_batch)
                grad = self.loss.grad(y_pred, Y_batch)
                self.backward(grad)
                self.update()
                epoch_loss += batch_loss
            epoch_loss /= X_train.shape[0]
            print(
                f'[Epoch: {i}] Avg. loss: {epoch_loss} Delta: {pre_loss - epoch_loss} ({(time.time() - start_time) / 60.0}m/epoch)')
            pre_loss = epoch_loss

    def evaluate(self, X_test, Y_test, batch_size=128):
        batch_generator, n_batches = minibatch(X_test, batch_size)
        acc = 0.0
        for j, batch_idx in enumerate(batch_generator):
            X_batch, Y_batch = X_test[batch_idx], Y_test[batch_idx]
            out = self.forward(X_batch)
            y_pred = np.argmax(out, axis=1)
            Y_batch = np.argmax(Y_batch, axis=1)
            acc += np.sum(y_pred == Y_batch)
        return acc / X_test.shape[0]
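Before wiring the layer into the full network, the backward pass can be sanity-checked against a numerical gradient. The following sketch is not part of the original code: the file name, the tiny random input, and the expected tolerance are made up for illustration, and it only assumes the FullyConnected, CrossEntropy and softmax definitions from network.py above.

gradient_check.py
import numpy as np
from network import CrossEntropy, FullyConnected, softmax

np.random.seed(0)
X = np.random.randn(4, 5)
Y = np.eye(3)[np.array([0, 2, 1, 0])]      # one-hot targets for 3 classes

layer = FullyConnected(n_out=3)            # output layer, no activation
loss_fn = CrossEntropy()

out = layer.forward(X)                     # first call initializes W and b
layer.backward(loss_fn.grad(softmax(out), Y))
analytic = layer.gradients['W'].copy()     # analytic dL/dW from _bwd

def loss_of(W):
    # forward pass with a given weight matrix, returning the scalar loss
    layer.params['W'] = W
    return loss_fn(softmax(layer.forward(X)), Y)

# central-difference numerical gradient of the loss w.r.t. W
W0 = layer.params['W'].copy()
numeric = np.zeros_like(W0)
eps = 1e-6
for i in range(W0.shape[0]):
    for j in range(W0.shape[1]):
        Wp, Wm = W0.copy(), W0.copy()
        Wp[i, j] += eps
        Wm[i, j] -= eps
        numeric[i, j] = (loss_of(Wp) - loss_of(Wm)) / (2 * eps)
layer.params['W'] = W0

print(np.max(np.abs(analytic - numeric)))  # expected to be tiny, roughly 1e-8 or below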
util.py
import numpy as np


def load_data(path='./data/mnist/mnist.npz'):
    f = np.load(path)
    X_train, y_train = f['x_train'], f['y_train']
    X_test, y_test = f['x_test'], f['y_test']
    f.close()
    return (X_train, y_train), (X_test, y_test)


def pre_process():
    (X_train, y_train), (X_test, y_test) = load_data()
    # np.eye(10): 10x10 identity matrix
    # Suppose y_train.astype(int) is [2, 3, 0, ...]
    # then y_train becomes
    # [
    #     [0, 0, 1, ...],
    #     [0, 0, 0, 1, ...],
    #     [1, ...]
    # ]
    # which is exactly one-hot encoding
    y_train = np.eye(10)[y_train.astype(int)]
    y_test = np.eye(10)[y_test.astype(int)]
    # X_train.shape[0]: number of samples
    # The reshape below flattens the 3-D image array into a 2-D matrix
    X_train = X_train.reshape(-1, X_train.shape[1] * X_train.shape[2]).astype('float32')
    X_test = X_test.reshape(-1, X_test.shape[1] * X_test.shape[2]).astype('float32')
    indices = np.random.permutation(range(X_train.shape[0]))[:20000]
    X_train, y_train = X_train[indices], y_train[indices]
    X_train /= 255
    X_train = (X_train - 0.5) * 2
    X_test /= 255
    X_test = (X_test - 0.5) * 2
    return X_train, y_train, X_test, y_test
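The np.eye indexing trick used in pre_process can be checked in isolation; a minimal example with made-up labels:

import numpy as np

labels = np.array([2, 0, 1])
one_hot = np.eye(4)[labels]   # row i of the identity matrix is the one-hot vector for class i
print(one_hot)
# [[0. 0. 1. 0.]
#  [1. 0. 0. 0.]
#  [0. 1. 0. 0.]]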
manual.py
import random

import numpy as np

from network import DFN
from util import pre_process

np.random.seed(42)
random.seed(42)

if __name__ == "__main__":
    model = DFN(hidden_dims_1=200, hidden_dims_2=10)
    X_train, y_train, X_test, y_test = pre_process()
    model.fit(X_train, y_train, n_epochs=20)
    evaluate = model.evaluate(X_test, y_test)
    print(evaluate)
Run results:
Comparison with the results of the PyTorch version of the code:
Code
import random
import time

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from util import pre_process

torch.manual_seed(42)
random.seed(42)
np.random.seed(42)


class NNDFN(nn.Module):
    def __init__(self, hidden_dims_1=100, hidden_dims_2=10):
        super(NNDFN, self).__init__()
        self.fc1 = nn.Linear(784, hidden_dims_1)
        self.sigmoid = nn.Sigmoid()
        self.fc2 = nn.Linear(hidden_dims_1, hidden_dims_2)
        self.slope = 1
        self.intercept = 0
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, mode='fan_in')
                nn.init.zeros_(m.bias)

    def forward(self, X):
        return self.fc2(self.sigmoid(self.fc1(X)))


if __name__ == '__main__':
    X_train, y_train, X_test, y_test = pre_process()
    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.float32)
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset)
    test_dataset = TensorDataset(X_test, y_test)
    test_loader = DataLoader(test_dataset)

    model = NNDFN()
    loss_fn = nn.CrossEntropyLoss()  # combines softmax and the cross-entropy loss
    optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)
    pre_loss = np.inf
    for epoch in range(20):
        epoch_loss = 0.0
        start_time = time.time()
        for idx, (inputs, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            y_pred = model.forward(inputs)
            loss = loss_fn(y_pred, torch.argmax(labels, dim=1))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        epoch_loss /= X_train.shape[0]
        print(
            f'[Epoch: {epoch}] Avg. loss: {epoch_loss} Delta: {pre_loss - epoch_loss} ({(time.time() - start_time) / 60.0}m/epoch)')
        pre_loss = epoch_loss

    acc = 0
    for idx, (inputs, labels) in enumerate(test_loader):
        out = model.forward(inputs)
        # torch.max over dim=1 returns each row's maximum value and its index; pred holds the indices
        _, pred = torch.max(out, dim=1)
        acc += (pred == torch.argmax(labels, dim=1)).sum().item()
    print(f'acc:{acc / X_test.shape[0]}')
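As the comment on loss_fn says, nn.CrossEntropyLoss applies log-softmax internally, which is why the PyTorch model outputs raw logits while the manual version applies softmax explicitly before CrossEntropy. A quick standalone check of that equivalence (random tensors, not part of the training script):

import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)
targets = torch.randint(0, 10, (4,))

a = torch.nn.CrossEntropyLoss()(logits, targets)
b = F.nll_loss(F.log_softmax(logits, dim=1), targets)
print(torch.allclose(a, b))  # True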