Low-Level Implementation of Neural Network Layers
The code below is taken from: https://github.com/sharedeeply/cs231n-assignment-solution
Batch Normalization
Forward pass of batch normalization. The mean and variance are computed per feature, i.e. over all samples at the same position.
import numbers

import numpy as np


def batchnorm_forward(x, gamma, beta, bn_param):
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)

    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))

    if mode == 'train':
        mu = np.mean(x, axis=0)
        sigma2 = np.var(x, axis=0)
        x_hat = (x - mu) / np.sqrt(sigma2 + eps)
        out = gamma * x_hat + beta
        # Keep running estimates of the mean and variance during training;
        # these are the statistics used at test time.
        running_mean = momentum * running_mean + (1 - momentum) * mu
        running_var = momentum * running_var + (1 - momentum) * sigma2
        inv_sigma = 1. / np.sqrt(sigma2 + eps)
        cache = (x, x_hat, gamma, mu, inv_sigma)
    elif mode == 'test':
        x_hat = (x - running_mean) / np.sqrt(running_var + eps)
        out = gamma * x_hat + beta
        cache = None
    else:
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

    # Store the updated running means back into bn_param
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var

    return out, cache
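As a quick sanity check of the forward pass (a minimal sketch; the shapes and the shift/scale applied to x are made up for illustration), the training-mode output should have per-feature mean near 0 and variance near 1 when gamma = 1 and beta = 0:

np.random.seed(0)
x = 10 * np.random.randn(100, 5) + 3          # N=100 samples, D=5 features
gamma, beta = np.ones(5), np.zeros(5)
bn_param = {'mode': 'train'}

out, _ = batchnorm_forward(x, gamma, beta, bn_param)
print(out.mean(axis=0))   # close to 0 for every feature
print(out.var(axis=0))    # close to 1 for every feature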
Backward pass of batch normalization. When taking derivatives, note that both the mean and the variance are themselves functions of x.
def batchnorm_backward(dout, cache):
    x, x_hat, gamma, mu, inv_sigma = cache
    N = x.shape[0]

    dbeta = np.sum(dout, axis=0)
    dgamma = np.sum(x_hat * dout, axis=0)
    # Gradients w.r.t. the batch variance and mean; the extra term of dmu that
    # involves sum(x - mu) vanishes because that sum is zero.
    dvar = np.sum(-0.5 * inv_sigma ** 3 * (x - mu) * gamma * dout, axis=0)
    dmu = np.sum(-1 * inv_sigma * gamma * dout, axis=0)
    dx = gamma * dout * inv_sigma + (2 / N) * (x - mu) * dvar + (1 / N) * dmu

    return dx, dgamma, dbeta
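These formulas can be checked numerically. The num_grad helper below is a sketch written for this note (it is not part of the assignment code); it approximates the gradient with centered differences and contracts it with the upstream gradient:

def num_grad(f, x, df, h=1e-5):
    # Centered-difference approximation of d(sum(f(x) * df)) / dx.
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        old = x[idx]
        x[idx] = old + h
        pos = f(x).copy()
        x[idx] = old - h
        neg = f(x).copy()
        x[idx] = old
        grad[idx] = np.sum((pos - neg) * df) / (2 * h)
        it.iternext()
    return grad

np.random.seed(1)
x = np.random.randn(4, 3)
gamma = np.random.randn(3)
beta = np.random.randn(3)
dout = np.random.randn(4, 3)

f = lambda x: batchnorm_forward(x, gamma, beta, {'mode': 'train'})[0]
_, cache = batchnorm_forward(x, gamma, beta, {'mode': 'train'})
dx, dgamma, dbeta = batchnorm_backward(dout, cache)
print(np.max(np.abs(dx - num_grad(f, x, dout))))   # should be tiny, roughly 1e-8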
ReLU
def relu_forward(x):
    out = x * (x > 0)
    cache = x
    return out, cache


def relu_backward(dout, cache):
    x = cache
    dx = dout * (x > 0)
    return dx
Dropout
def dropout_forward(x, dropout_param):
    p, mode = dropout_param['p'], dropout_param['mode']
    if 'seed' in dropout_param:
        np.random.seed(dropout_param['seed'])

    mask = None
    out = None

    if mode == 'train':
        # Inverted dropout: dividing by p during training means nothing needs to
        # be done at test time (alternatively, skip the division here and
        # multiply by p at test time).
        mask = (np.random.rand(*x.shape) < p) / p
        out = x * mask
    elif mode == 'test':
        out = x

    cache = (dropout_param, mask)
    out = out.astype(x.dtype, copy=False)

    return out, cache
def dropout_backward(dout, cache):
    dropout_param, mask = cache
    mode = dropout_param['mode']

    dx = None
    if mode == 'train':
        # Only the units that were kept receive gradient.
        dx = dout * mask
    elif mode == 'test':
        dx = dout
    return dx
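A quick way to see why the mask is divided by p (inverted dropout): the expected value of the training-time output matches the test-time output. A rough numerical check, with p as the keep probability and sizes chosen only for illustration:

np.random.seed(2)
x = np.random.randn(500, 500) + 10
out_train, _ = dropout_forward(x, {'mode': 'train', 'p': 0.6})
out_test, _ = dropout_forward(x, {'mode': 'test', 'p': 0.6})

print(x.mean(), out_train.mean(), out_test.mean())  # all roughly equal (about 10)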
Convolution
The convolution layer here is implemented with for loops. It is somewhat crude, but it helps us understand what the operation actually does.
def conv_forward_naive(x, w, b, conv_param):
    out = None
    stride = conv_param['stride']
    padding = conv_param['pad']
    if isinstance(stride, numbers.Number):
        stride = (stride, stride)
    if isinstance(padding, numbers.Number):
        padding = [(padding, padding), (padding, padding)]
    else:
        padding = [(i,) * 2 for i in padding]

    pad = [(0, 0), (0, 0)]
    pad.extend(padding)
    x_pad = np.pad(x, pad_width=pad, mode='constant', constant_values=0)

    n, c, pad_h, pad_w = x_pad.shape
    f, w_c, hh, ww = w.shape
    assert c == w_c, 'input channels must equal to filter channels'
    out_h = (pad_h - hh) // stride[0] + 1
    out_w = (pad_w - ww) // stride[1] + 1
    out = np.zeros(shape=(n, f, out_h, out_w))

    for i in range(n):               # each sample
        for j in range(f):           # each filter
            for _w in range(out_w):      # horizontal direction
                for _h in range(out_h):  # vertical direction
                    # Each output position maps to an input window offset by a
                    # multiple of the stride.
                    out[i, j, _h, _w] = np.sum(
                        x_pad[i, :, _h*stride[0]: _h*stride[0]+hh,
                                    _w*stride[1]: _w*stride[1]+ww] * w[j]) + b[j]

    cache = (x, w, b, conv_param)
    return out, cache
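A small usage sketch (all shapes and parameter values are chosen just for illustration): with a 4x4 input, a 3x3 filter, stride 1 and padding 1, the spatial output size stays 4x4, since (H + 2*pad - HH) // stride + 1 = 4.

np.random.seed(3)
x = np.random.randn(2, 3, 4, 4)      # (N, C, H, W)
w = np.random.randn(5, 3, 3, 3)      # (F, C, HH, WW)
b = np.random.randn(5)
conv_param = {'stride': 1, 'pad': 1}

out, _ = conv_forward_naive(x, w, b, conv_param)
print(out.shape)                      # (2, 5, 4, 4)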
def conv_backward_naive(dout, cache):
    dx, dw, db = None, None, None
    x, w, b, conv_param = cache
    stride = conv_param['stride']
    padding = conv_param['pad']
    if isinstance(stride, numbers.Number):
        # isinstance checks whether a scalar was passed
        stride = (stride, stride)
    if isinstance(padding, numbers.Number):
        padding = [(padding, padding), (padding, padding)]
    else:
        padding = [(i,) * 2 for i in padding]

    pad = [(0, 0), (0, 0)]
    pad.extend(padding)
    x_pad = np.pad(x, pad_width=pad, mode='constant', constant_values=0)

    n, c, pad_h, pad_w = x_pad.shape
    f, w_c, hh, ww = w.shape
    assert c == w_c, 'input channels must equal to filter channels'
    out_h = (pad_h - hh) // stride[0] + 1
    out_w = (pad_w - ww) // stride[1] + 1

    dw = np.zeros_like(w)
    db = np.zeros_like(b)
    dx_pad = np.zeros_like(x_pad)

    for i in range(n):               # each sample
        for j in range(f):           # each filter
            for _w in range(out_w):      # horizontal direction
                for _h in range(out_h):  # vertical direction
                    window = x_pad[i, :, _h*stride[0]: _h*stride[0]+hh,
                                         _w*stride[1]: _w*stride[1]+ww]
                    dw[j] += dout[i, j, _h, _w] * window
                    db[j] += dout[i, j, _h, _w]
                    dx_pad[i, :, _h*stride[0]: _h*stride[0]+hh,
                                 _w*stride[1]: _w*stride[1]+ww] += dout[i, j, _h, _w] * w[j]

    # Strip the padding to recover dx (also works when the padding is zero).
    dx = dx_pad[:, :, pad[2][0]:pad_h - pad[2][1], pad[3][0]:pad_w - pad[3][1]]
    return dx, dw, db
From the implementation above we can see that the gradient of w is the sum of the contributions accumulated while sliding over the whole image.
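This can be verified numerically as well; the sketch below reuses the hypothetical num_grad helper from the batch-normalization check above, with shapes chosen only for illustration:

np.random.seed(4)
x = np.random.randn(2, 3, 5, 5)
w = np.random.randn(2, 3, 3, 3)
b = np.random.randn(2)
conv_param = {'stride': 1, 'pad': 1}
dout = np.random.randn(2, 2, 5, 5)

_, cache = conv_forward_naive(x, w, b, conv_param)
dx, dw, db = conv_backward_naive(dout, cache)

fw = lambda w: conv_forward_naive(x, w, b, conv_param)[0]
print(np.max(np.abs(dw - num_grad(fw, w, dout))))   # should be small, around 1e-8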
Pooling
def max_pool_forward_naive(x, pool_param):
    out = None
    n, c, h, w = x.shape
    pool_h = pool_param['pool_height']
    pool_w = pool_param['pool_width']
    stride = pool_param['stride']
    out_h = 1 + (h - pool_h) // stride
    out_w = 1 + (w - pool_w) // stride
    out = np.zeros(shape=(n, c, out_h, out_w))

    for i in range(n):
        for j in range(c):
            for _h in range(out_h):
                for _w in range(out_w):
                    out[i, j, _h, _w] = np.max(
                        x[i, j, _h*stride: _h*stride+pool_h, _w*stride: _w*stride+pool_w])

    cache = (x, pool_param)
    return out, cache
def max_pool_backward_naive(dout, cache):
    dx = None
    x, pool_param = cache
    n, c, h, w = x.shape
    pool_h = pool_param['pool_height']
    pool_w = pool_param['pool_width']
    stride = pool_param['stride']
    out_h = 1 + (h - pool_h) // stride
    out_w = 1 + (w - pool_w) // stride
    dx = np.zeros_like(x)

    for i in range(n):
        for j in range(c):
            for _h in range(out_h):
                for _w in range(out_w):
                    window = x[i, j, _h*stride: _h*stride+pool_h,
                                     _w*stride: _w*stride+pool_w]
                    # np.argmax returns the position of the maximum in the
                    # flattened window; np.unravel_index converts it back to 2D
                    # coordinates within the window. Only that position, i.e. the
                    # value kept by max pooling, receives the upstream gradient.
                    indices = np.unravel_index(np.argmax(window), (pool_h, pool_w))
                    dx[i, j, _h*stride+indices[0], _w*stride+indices[1]] += dout[i, j, _h, _w]

    return dx
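A tiny worked example (values chosen by hand for illustration) shows the gradient being routed only to the max position of each pooling window:

x = np.array([[[[1., 2., 5., 4.],
                [3., 0., 1., 2.],
                [6., 2., 0., 1.],
                [1., 3., 2., 4.]]]])          # shape (1, 1, 4, 4)
pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

out, cache = max_pool_forward_naive(x, pool_param)
print(out[0, 0])        # [[3., 5.], [6., 4.]]

dout = np.ones_like(out)
dx = max_pool_backward_naive(dout, cache)
print(dx[0, 0])         # ones exactly at the positions of 3, 5, 6 and 4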