目录
FullyConnectedNets.ipynb
上一个作业中实现了一个两层全连接网络,但是它并非结构化的,因为它在同一个函数中计算整个网络的梯度。这种做法在浅层神经网络中还容易实现,但是在深度神经网络就难以实现了。所以需要构造一个结构化的神经网络。
首先,每一层实现一个forward()和一个backward()函数。
forward()主要计算输出,结构如下:
def layer_forward(x, w):
""" Receive input x and weights w """
# Do some computation ...
z = # ... compute some intermediate value
# Do more computation ...
out = # the output
cache = (x, w, z, out) # save the values needed to compute gradients
return out, cache
backward()主要计算梯度,结构如下:
def layer_backward(dout, cache):
"""
Receive the upstream derivative dloss/doutputs and the values saved in
cache, then compute dloss/dx and dloss/dw.
"""
# Unpack the values stored in cache
x, w, z, out = cache
# Use the cached values to compute the gradients
dx = # assign
dw = # assign
return dx, dw
刚开始运行下面的代码时可能会出错。懂得解决的话自己解决就好了,不懂的话可以参考这篇文章。
# Standard setup, as in previous assignments
from __future__ import print_function
import time
import numpy as np
import matplotlib.pyplot as plt
from cs231n.classifiers.fc_net import *
from cs231n.data_utils import get_CIFAR10_data
from cs231n.gradient_check import eval_numerical_gradient, eval_numerical_gradient_array
from cs231n.solver import Solver
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set the default figure size
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# Automatically reload external modules
%load_ext autoreload
%autoreload 2
def rel_error(x, y):
    """Return the maximum relative error between arrays x and y."""
    numerator = np.abs(x - y)
    # Clamp the denominator away from zero to avoid division blow-ups
    denominator = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(numerator / denominator)
加载预处理过(减均值)的CIFAR-10数据,函数内部和以前的差不多,只不过现在封装起来了。
data = get_CIFAR10_data()
for k, v in list(data.items()):
# Print the shape of each data split
print(('%s: ' % k, v.shape))
'''('X_train: ', (49000, 3, 32, 32))
('y_train: ', (49000,))
('X_val: ', (1000, 3, 32, 32))
('y_val: ', (1000,))
('X_test: ', (1000, 3, 32, 32))
('y_test: ', (1000,))
'''
affine层的forward和backward
先完成cs231n/layers.py里的affine_forward()再运行下面代码
# Check that affine_forward() is implemented correctly
num_inputs = 2
input_shape = (4, 5, 6)
output_dim = 3
# np.prod() computes the product of all elements of an array
input_size = num_inputs * np.prod(input_shape)
weight_size = output_dim * np.prod(input_shape)
# reshape only accepts integers, hence the *input_shape unpacking
x = np.linspace(-0.1, 0.5, num=input_size).reshape(num_inputs, *input_shape)
w = np.linspace(-0.2, 0.3, num=weight_size).reshape(np.prod(input_shape), output_dim)
b = np.linspace(-0.3, 0.1, num=output_dim)
out, _ = affine_forward(x, w, b)
correct_out = np.array([[ 1.49834967, 1.70660132, 1.91485297],
[ 3.25553199, 3.5141327, 3.77273342]])
# Compare our output with the reference; the difference should be below 1e-9.
print('Testing affine_forward function:')
print('difference: ', rel_error(out, correct_out))
# Testing affine_forward function:
# difference: 9.769849468192957e-10
先完成cs231n/layers.py里的affine_backward()再运行下面代码
# Fix the random seed so the results are reproducible
np.random.seed(231)
x = np.random.randn(10, 2, 3)
w = np.random.randn(6, 5)
b = np.random.randn(5)
dout = np.random.randn(10, 5)
# Compute numerical gradients
dx_num = eval_numerical_gradient_array(lambda x: affine_forward(x, w, b)[0], x, dout)
dw_num = eval_numerical_gradient_array(lambda w: affine_forward(x, w, b)[0], w, dout)
db_num = eval_numerical_gradient_array(lambda b: affine_forward(x, w, b)[0], b, dout)
# Compute analytic gradients (our own implementation)
_, cache = affine_forward(x, w, b)
dx, dw, db = affine_backward(dout, cache)
# The relative errors should all be below 1e-10
print('Testing affine_backward function:')
print('dx error: ', rel_error(dx_num, dx))
print('dw error: ', rel_error(dw_num, dw))
print('db error: ', rel_error(db_num, db))
# Testing affine_backward function:
# dx error: 5.399100368651805e-11
# dw error: 9.904211865398145e-11
# db error: 2.4122867568119087e-11
ReLU层的forward和backward
**先完成cs231n/layers.py里的relu_forward()和relu_backward()**再运行下面代码
# Test relu_forward()
x = np.linspace(-0.5, 0.5, num=12).reshape(3, 4)
# Compute the output
out, _ = relu_forward(x)
correct_out = np.array([[ 0., 0., 0., 0., ],
[ 0., 0., 0.04545455, 0.13636364,],
[ 0.22727273, 0.31818182, 0.40909091, 0.5, ]])
# The relative error should be around 5e-8
print('Testing relu_forward function:')
print('difference: ', rel_error(out, correct_out))
# Testing relu_forward function:
# difference: 4.999999798022158e-08
# Test relu_backward()
np.random.seed(231)
x = np.random.randn(10, 10)
# randn() only accepts integers, hence the *x.shape unpacking
dout = np.random.randn(*x.shape)
# Compute the numerical gradient
dx_num = eval_numerical_gradient_array(lambda x: relu_forward(x)[0], x, dout)
# Compute the analytic gradient
_, cache = relu_forward(x)
dx = relu_backward(dout, cache)
# The relative error should be around 3e-12
print('Testing relu_backward function:')
print('dx error: ', rel_error(dx_num, dx))
# Testing relu_backward function:
# dx error: 3.2756349136310288e-12
affine层和ReLU层的forward和backward合并
在cs231n/layer_utils.py中实现了合并,代码如下:
def affine_relu_forward(x, w, b):
    """Convenience layer: an affine transform followed by a ReLU."""
    # Affine forward pass
    fc_out, fc_cache = affine_forward(x, w, b)
    # ReLU forward pass
    out, relu_cache = relu_forward(fc_out)
    return out, (fc_cache, relu_cache)
def affine_relu_backward(dout, cache):
    """Backward pass for the affine-ReLU convenience layer."""
    fc_cache, relu_cache = cache
    # Backprop through the ReLU first, then through the affine layer
    dfc = relu_backward(dout, relu_cache)
    dx, dw, db = affine_backward(dfc, fc_cache)
    return dx, dw, db
运行下面代码,验证affine+ReLU的正确性
from cs231n.layer_utils import affine_relu_forward, affine_relu_backward
# Fix the random seed
np.random.seed(231)
x = np.random.randn(2, 3, 4)
w = np.random.randn(12, 10)
b = np.random.randn(10)
dout = np.random.randn(2, 10)
# Forward pass
out, cache = affine_relu_forward(x, w, b)
# Backward pass
dx, dw, db = affine_relu_backward(dout, cache)
# Compute numerical gradients
dx_num = eval_numerical_gradient_array(lambda x: affine_relu_forward(x, w, b)[0], x, dout)
dw_num = eval_numerical_gradient_array(lambda w: affine_relu_forward(x, w, b)[0], w, dout)
db_num = eval_numerical_gradient_array(lambda b: affine_relu_forward(x, w, b)[0], b, dout)
print('Testing affine_relu_forward:')
print('dx error: ', rel_error(dx_num, dx))
print('dw error: ', rel_error(dw_num, dw))
print('db error: ', rel_error(db_num, db))
# Testing affine_relu_forward:
# dx error: 2.299579177309368e-11
# dw error: 8.162011105764925e-11
# db error: 7.826724021458994e-12
损失层:softmax和SVM
在cs231n/layers.py里实现了两个损失,代码如下:
def svm_loss(x, y):
    """
    Multiclass SVM (hinge) loss and its gradient, vectorized.

    Inputs:
    - x: scores of shape (N, C); x[i, j] is the score of input i for class j
    - y: labels of shape (N,)

    Returns:
    - loss: scalar hinge loss averaged over the batch
    - dx: gradient of the loss with respect to the scores x
    """
    num_train = x.shape[0]
    rows = np.arange(num_train)
    # Correct-class score per sample, kept as a column so it broadcasts
    correct = x[rows, y].reshape(-1, 1)
    # Hinge margins with a margin of 1; the correct class must not contribute
    margins = np.maximum(0, x - correct + 1.0)
    margins[rows, y] = 0
    loss = margins.sum() / num_train
    # Gradient: 1 wherever the margin is positive, and the correct-class
    # column accumulates minus the count of positive margins in its row
    positive = margins > 0
    dx = positive.astype(x.dtype)
    dx[rows, y] -= np.sum(positive, axis=1)
    dx /= num_train
    return loss, dx
def softmax_loss(x, y):
    """
    Softmax (cross-entropy) loss and its gradient, vectorized.

    Inputs:
    - x: scores of shape (N, C)
    - y: labels of shape (N,)

    Returns:
    - loss: scalar cross-entropy loss averaged over the batch
    - dx: gradient of the loss with respect to the scores x
    """
    # Subtract the per-row max so every shifted logit is <= 0; exp() then
    # lies in (0, 1] and cannot overflow. The shift does not change the loss.
    shifted = x - np.max(x, axis=1, keepdims=True)
    log_partition = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    log_probs = shifted - log_partition
    num_train = x.shape[0]
    rows = np.arange(num_train)
    # Only the log-probability of the correct class enters the loss
    loss = -np.sum(log_probs[rows, y]) / num_train
    # Gradient: softmax probabilities, minus 1 at the correct class
    dx = np.exp(log_probs)
    dx[rows, y] -= 1
    dx /= num_train
    return loss, dx
验证两个损失层的正确性
np.random.seed(231)
num_classes, num_inputs = 10, 50
x = 0.001 * np.random.randn(num_inputs, num_classes)
y = np.random.randint(num_classes, size=num_inputs)
# Compute the numerical and analytic gradients
dx_num = eval_numerical_gradient(lambda x: svm_loss(x, y)[0], x, verbose=False)
loss, dx = svm_loss(x, y)
# The loss should be about num_classes - 1 = 9, and the dx error around 1e-9
print('Testing svm_loss:')
print('loss: ', loss)
print('dx error: ', rel_error(dx_num, dx))
# Testing svm_loss:
# loss: 8.999602749096233
# dx error: 1.4021566006651672e-09
dx_num = eval_numerical_gradient(lambda x: softmax_loss(x, y)[0], x, verbose=False)
loss, dx = softmax_loss(x, y)
# The loss should be about ln(10) ≈ 2.3, and the dx error around 1e-8
print('\nTesting softmax_loss:')
print('loss: ', loss)
print('dx error: ', rel_error(dx_num, dx))
# Testing softmax_loss:
# loss: 2.302545844500738
# dx error: 9.384673161989355e-09
训练两层全连接网络
完成cs231n/classifiers/fc_net.py里的TwoLayerNet类后再运行以下代码,测试代码是否正确
np.random.seed(231)
N, D, H, C = 3, 5, 50, 7
X = np.random.randn(N, D)
y = np.random.randint(C, size=N)
std = 1e-3
# Initialize the model
model = TwoLayerNet(input_dim=D, hidden_dim=H, num_classes=C, weight_scale=std)
# Check the initialization
print('Testing initialization ... ')
W1_std = abs(model.params['W1'].std() - std)
b1 = model.params['b1']
W2_std = abs(model.params['W2'].std() - std)
b2 = model.params['b2']
# Both weight stds should differ from std by less than std / 10
assert W1_std < std / 10, 'First layer weights do not seem right'
assert np.all(b1 == 0), 'First layer biases do not seem right'
assert W2_std < std / 10, 'Second layer weights do not seem right'
assert np.all(b2 == 0), 'Second layer biases do not seem right'
# Check the scores computed by the test-time forward pass
print('Testing test-time forward pass ... ')
model.params['W1'] = np.linspace(-0.7, 0.3, num=D*H).reshape(D, H)
model.params['b1'] = np.linspace(-0.1, 0.9, num=H)
model.params['W2'] = np.linspace(-0.3, 0.4, num=H*C).reshape(H, C)
model.params['b2'] = np.linspace(-0.9, 0.1, num=C)
X = np.linspace(-5.5, 4.5, num=N*D).reshape(D, N).T
scores = model.loss(X)
correct_scores = np.asarray(
[[11.53165108, 12.2917344, 13.05181771, 13.81190102, 14.57198434, 15.33206765, 16.09215096],
[12.05769098, 12.74614105, 13.43459113, 14.1230412, 14.81149128, 15.49994135, 16.18839143],
[12.58373087, 13.20054771, 13.81736455, 14.43418138, 15.05099822, 15.66781506, 16.2846319 ]])
scores_diff = np.abs(scores - correct_scores).sum()
assert scores_diff < 1e-6, 'Problem with test-time forward pass'
# Check the training loss (no regularization)
print('Testing training loss (no regularization)')
y = np.asarray([0, 5, 1])
loss, grads = model.loss(X, y)
correct_loss = 3.4702243556
assert abs(loss - correct_loss) < 1e-10, 'Problem with training-time loss'
# Check the training loss (with regularization)
model.reg = 1.0
loss, grads = model.loss(X, y)
correct_loss = 26.5948426952
assert abs(loss - correct_loss) < 1e-10, 'Problem with regularization loss'
# Gradient checks, both with and without regularization
for reg in [0.0, 0.7]:
print('Running numeric gradient check with reg = ', reg)
model.reg = reg
loss, grads = model.loss(X, y)
for name in sorted(grads):
f = lambda _: model.loss(X, y)[0]
grad_num = eval_numerical_gradient(f, model.params[name], verbose=False)
print('%s relative error: %.2e' % (name, rel_error(grad_num, grads[name])))
查看cs231n/solver.py,然后使用一个Solver实例训练一个两层网络,使其在验证集上至少达到50%的准确率
# Train a two-layer net with SGD; target >= 50% validation accuracy
model = TwoLayerNet(reg=0.7)
solver = None
solver = Solver(model,data,
update_rule='sgd',
optim_config={
'learning_rate': 1e-3,
},
lr_decay=0.95,
num_epochs=10, batch_size=100,
print_every=100)
solver.train()
可视化训练损失和准确率
# Plot the training loss
plt.subplot(2, 1, 1)
plt.title('Training loss')
plt.plot(solver.loss_history, 'o')
plt.xlabel('Iteration')
# Plot the accuracies
plt.subplot(2, 1, 2)
plt.title('Accuracy')
plt.plot(solver.train_acc_history, '-o', label='train')
plt.plot(solver.val_acc_history, '-o', label='val')
# [0.5] places the dashed line at 0.5; len(solver.val_acc_history) sets its length
plt.plot([0.5] * len(solver.val_acc_history), 'k--')
plt.xlabel('Epoch')
plt.legend(loc='lower right')
plt.gcf().set_size_inches(15, 12)
plt.show()
训练多层全连接网络
完成cs231n/classifiers/fc_net.py里的FullyConnectedNet类后再运行下面的代码
梯度的相对误差应该在1e-6左右或者更小
# Gradient-check a multi-layer net, with and without regularization
np.random.seed(231)
N, D, H1, H2, C = 2, 15, 20, 30, 10
X = np.random.randn(N, D)
y = np.random.randint(C, size=(N,))
for reg in [0, 3.14]:
print('Running check with reg = ', reg)
model = FullyConnectedNet([H1, H2], input_dim=D, num_classes=C,
reg=reg, weight_scale=5e-2, dtype=np.float64)
loss, grads = model.loss(X, y)
print('Initial loss: ', loss)
for name in sorted(grads):
f = lambda _: model.loss(X, y)[0]
grad_num = eval_numerical_gradient(f, model.params[name], verbose=False, h=1e-5)
print('%s relative error: %.2e' % (name, rel_error(grad_num, grads[name])))
另一个检查神经网络是否正常的方法是:使用小数据集在较复杂的网络上训练,使其在训练集上达到100%的准确率,也就是过拟合。
# Overfit 50 training examples with a three-layer net by tuning the
# learning rate and the weight initialization scale
num_train = 50
small_data = {
'X_train': data['X_train'][:num_train],
'y_train': data['y_train'][:num_train],
'X_val': data['X_val'],
'y_val': data['y_val'],
}
# The original values were weight_scale = 1e-2 and learning_rate = 1e-4
# Either weight_scale = 1 or learning_rate = 1e-2 is enough to overfit
weight_scale = 1e-2
learning_rate = 1e-2
model = FullyConnectedNet([100, 100],
weight_scale=weight_scale, dtype=np.float64)
solver = Solver(model, small_data,
print_every=10, num_epochs=20, batch_size=25,
update_rule='sgd',
optim_config={
'learning_rate': learning_rate,
}
)
solver.train()
# Plot the loss history
plt.plot(solver.loss_history, 'o')
plt.title('Training loss history')
plt.xlabel('Iteration')
plt.ylabel('Training loss')
plt.show()
使用一个5层的神经网络,调整学习率和初始化标准差,使得50个训练数据在epochs=20时达到过拟合
# Overfit 50 training examples with a five-layer net within 20 epochs
num_train = 50
small_data = {
'X_train': data['X_train'][:num_train],
'y_train': data['y_train'][:num_train],
'X_val': data['X_val'],
'y_val': data['y_val'],
}
# The original value was weight_scale = 1e-5
learning_rate = 1e-3
weight_scale = 5.7e-2
model = FullyConnectedNet([100, 100, 100, 100],
weight_scale=weight_scale, dtype=np.float64)
solver = Solver(model, small_data,
print_every=10, num_epochs=20, batch_size=25,
update_rule='sgd',
optim_config={
'learning_rate': learning_rate,
}
)
solver.train()
# Plot the loss history
plt.plot(solver.loss_history, 'o')
plt.title('Training loss history')
plt.xlabel('Iteration')
plt.ylabel('Training loss')
plt.show()
Q:训练三层网络和训练五层网络相比,有什么不同吗?
A:在训练五层网络的时候,损失对weight_scale更敏感,调整weight_scale能使神经网络更容易收敛到局部最小。原因是五层网络的复杂度使模型有更多可能性的同时,也会使损失函数更复杂,从而更难优化,也就更依赖于初始化。
fc_net.py
两层全连接网络
from builtins import range
from builtins import object
import numpy as np
from cs231n.layers import *
from cs231n.layer_utils import *
class TwoLayerNet(object):
    """
    Modular two-layer fully connected network with a ReLU nonlinearity and a
    softmax loss. Architecture: affine - relu - affine - softmax.

    Note that this class does not implement gradient descent itself;
    parameter optimization is handled by the Solver class.
    """

    def __init__(self, input_dim=3*32*32, hidden_dim=100, num_classes=10,
                 weight_scale=1e-3, reg=0.0):
        """
        Initialize weights from N(0, weight_scale^2) and biases to zero.

        - weight_scale: standard deviation used for weight initialization
        """
        self.params = {}
        self.reg = reg
        # First-layer and second-layer weights (drawn in this order so the
        # RNG stream matches the original) and zero biases
        self.params['W1'] = weight_scale * np.random.randn(input_dim, hidden_dim)
        self.params['W2'] = weight_scale * np.random.randn(hidden_dim, num_classes)
        self.params['b1'] = np.zeros(hidden_dim)
        self.params['b2'] = np.zeros(num_classes)
计算两层全连接网络的损失和梯度
def loss(self, X, y=None):
    """
    Compute loss and gradients for the two-layer network.

    Inputs:
    - X: input data of shape (N, d_1, ..., d_k)
    - y: labels of shape (N,)

    Returns:
    If y is None, runs a test-time forward pass and returns:
    - scores: array of shape (N, C)
    Otherwise returns:
    - loss: scalar loss (data loss + L2 regularization)
    - grads: dict mapping parameter names to their gradients
    """
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    # Forward pass: affine-relu, then the final affine layer
    hidden, ar_cache = affine_relu_forward(X, W1, b1)
    scores, fc_cache = affine_forward(hidden, W2, b2)
    # Test mode: just return the class scores
    if y is None:
        return scores
    grads = {}
    # Softmax data loss; dscores has shape (N, C). Division by N is
    # already done inside softmax_loss.
    loss, dscores = softmax_loss(scores, y)
    # Add the L2 regularization term
    loss += 0.5 * self.reg * (np.sum(W1 ** 2) + np.sum(W2 ** 2))
    # Backward pass through both layers
    dhidden, dW2, db2 = affine_backward(dscores, fc_cache)
    _, dW1, db1 = affine_relu_backward(dhidden, ar_cache)
    grads['W1'] = dW1 + self.reg * W1
    grads['b1'] = db1
    grads['W2'] = dW2 + self.reg * W2
    grads['b2'] = db2
    return loss, grads
多层全连接网络
class FullyConnectedNet(object):
    """
    A fully connected network with L layers:
    {affine - relu} x (L - 1) - affine - softmax
    """

    def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
                 dropout=0, use_batchnorm=False, reg=0.0,
                 weight_scale=1e-2, dtype=np.float32, seed=None):
        """
        Inputs:
        - hidden_dims: list of hidden-layer sizes
        - dropout: unused for now
        - use_batchnorm: whether to use batch normalization
        - dtype: float32 is faster but less precise, so use float64 when
          gradient-checking
        - seed: random seed for dropout; unused for now
        """
        self.use_batchnorm = use_batchnorm
        self.use_dropout = dropout > 0
        self.reg = reg
        # Total layer count = number of hidden layers plus the output layer
        self.num_layers = 1 + len(hidden_dims)
        self.dtype = dtype
        self.params = {}
        # Initialize weights and biases layer by layer; tracking prev_dim
        # also makes the degenerate case hidden_dims == [] work
        prev_dim = input_dim
        for i, D in enumerate(hidden_dims):
            self.params['W' + str(i + 1)] = weight_scale * np.random.randn(prev_dim, D)
            self.params['b' + str(i + 1)] = np.zeros(D)
            prev_dim = D
        self.params['W' + str(self.num_layers)] = weight_scale * np.random.randn(prev_dim, num_classes)
        self.params['b' + str(self.num_layers)] = np.zeros(num_classes)
        # Batchnorm/dropout initialization omitted for now
        # Fix: cast every parameter to the requested dtype. Previously the
        # dtype argument was ignored for parameters, which always stayed
        # float64 regardless of the setting.
        for k, v in self.params.items():
            self.params[k] = v.astype(dtype)
计算多层全连接网络的损失和梯度
def loss(self, X, y=None):
    """
    Compute loss and gradients for the multi-layer network.

    If y is None, runs a test-time forward pass and returns the scores of
    shape (N, C); otherwise returns (loss, grads).
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'
    # Dropout / batchnorm mode switching would normally go here
    # Forward pass: (L - 1) affine-relu layers followed by a plain affine
    # layer, same pattern as the two-layer net but in a loop
    L = self.num_layers
    caches = {}
    activation = X
    for layer in range(1, L):
        W = self.params['W' + str(layer)]
        b = self.params['b' + str(layer)]
        activation, caches[layer - 1] = affine_relu_forward(activation, W, b)
    W_last = self.params['W' + str(L)]
    b_last = self.params['b' + str(L)]
    scores, caches[L - 1] = affine_forward(activation, W_last, b_last)
    # Test mode (y is None): just return the scores
    if mode == 'test':
        return scores
    grads = {}
    # Softmax data loss and the gradient at the scores
    loss, upstream = softmax_loss(scores, y)
    # The last affine layer is not followed by a ReLU, so handle it
    # separately: add its regularization and backprop through it first
    loss += 0.5 * self.reg * np.sum(W_last ** 2)
    upstream, dW, db = affine_backward(upstream, caches[L - 1])
    grads['W' + str(L)] = dW + self.reg * W_last
    grads['b' + str(L)] = db
    # Backprop through the remaining affine-relu layers, last to first
    for layer in range(L - 1, 0, -1):
        W = self.params['W' + str(layer)]
        loss += 0.5 * self.reg * np.sum(W ** 2)
        upstream, dW, db = affine_relu_backward(upstream, caches[layer - 1])
        grads['W' + str(layer)] = dW + self.reg * W
        grads['b' + str(layer)] = db
    return loss, grads
FullyConnectedNets.ipynb中还用到了solver.py,今天不想写了,明天(要不就是后天或大后天)写到下一篇去好了。