def layer_forward(x, w):
    """ Receive inputs x and weights w """
    # Do some computations ...
    z = # ... some intermediate value
    # Do some more computations ...
    out = # the output
    cache = (x, w, z, out) # Values we need to compute gradients
    return out, cache
The backward pass will receive upstream derivatives and the cache object,
and will return gradients with respect to the inputs and weights, like this:
def layer_backward(dout, cache):
    """
    Receive derivative of loss with respect to outputs and cache,
    and compute derivative with respect to inputs.
    """
    # Unpack cache values
    x, w, z, out = cache
    # Use values in cache to compute derivatives
    dx = # Derivative of loss with respect to x
    dw = # Derivative of loss with respect to w
    return dx, dw
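Once a layer is written in this forward/backward form, the standard sanity check is to compare its analytic gradients against centered-difference numerical gradients. The sketch below is self-contained and only illustrative: the helper num_grad and the test sizes are my own choices (not part of the assignment code), and it assumes the affine_forward / affine_backward functions from layers.py further down.

import numpy as np

def num_grad(f, x, df, h=1e-5):
    """Centered-difference numerical gradient of f at x, contracted with df."""
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        old = x[ix]
        x[ix] = old + h; fp = f(x)
        x[ix] = old - h; fm = f(x)
        x[ix] = old
        grad[ix] = np.sum((fp - fm) * df) / (2 * h)
        it.iternext()
    return grad

# Example: check affine_backward against the numerical gradient.
x = np.random.randn(4, 3, 5)     # N=4 inputs of shape (3, 5), so D=15
w = np.random.randn(15, 7)
b = np.random.randn(7)
dout = np.random.randn(4, 7)

_, cache = affine_forward(x, w, b)
dx, dw, db = affine_backward(dout, cache)
dx_num = num_grad(lambda x: affine_forward(x, w, b)[0], x, dout)
print(np.max(np.abs(dx - dx_num)))   # should be on the order of 1e-8 or smaller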
__coauthor__ = 'Deeplayer'
# 6.22.2016 #

import numpy as np
from layer_utils import *

class TwoLayerNet(object):
    """
    A two-layer fully-connected neural network with ReLU nonlinearity and
    softmax loss that uses a modular layer design. We assume an input dimension
    of D, a hidden dimension of H, and perform classification over C classes.
    The architecture should be affine - relu - affine - softmax.
    Note that this class does not implement gradient descent; instead, it
    will interact with a separate Solver object that is responsible for running
    optimization.
    The learnable parameters of the model are stored in the dictionary
    self.params that maps parameter names to numpy arrays.
    """
    def __init__(self, input_dim=3*32*32, hidden_dim=100, num_classes=10,
                 weight_scale=1e-3, reg=0.0):
        """
        Initialize a new network.
        Inputs:
        - input_dim: An integer giving the size of the input
        - hidden_dim: An integer giving the size of the hidden layer
        - num_classes: An integer giving the number of classes to classify
        - weight_scale: Scalar giving the standard deviation for random
          initialization of the weights.
        - reg: Scalar giving L2 regularization strength.
        """
        self.params = {}
        self.reg = reg
        self.params['W1'] = weight_scale * np.random.randn(input_dim, hidden_dim)
        self.params['b1'] = np.zeros((1, hidden_dim))
        self.params['W2'] = weight_scale * np.random.randn(hidden_dim, num_classes)
        self.params['b2'] = np.zeros((1, num_classes))
    def loss(self, X, y=None):
        """
        Compute loss and gradient for a minibatch of data.
        Inputs:
        - X: Array of input data of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].
        Returns:
        If y is None, then run a test-time forward pass of the model and return:
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.
        If y is not None, then run a training-time forward and backward pass and
        return a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with the same keys as self.params, mapping parameter
          names to gradients of the loss with respect to those parameters.
        """
        scores = None
        N = X.shape[0]
        # Unpack variables from the params dictionary
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        h1, cache1 = affine_relu_forward(X, W1, b1)
        out, cache2 = affine_forward(h1, W2, b2)
        scores = out                                  # (N,C)
        # If y is None then we are in test mode so just return scores
        if y is None:
            return scores
        loss, grads = 0, {}
        data_loss, dscores = softmax_loss(scores, y)
        reg_loss = 0.5 * self.reg * np.sum(W1*W1) + 0.5 * self.reg * np.sum(W2*W2)
        loss = data_loss + reg_loss
        # Backward pass: compute gradients
        dh1, dW2, db2 = affine_backward(dscores, cache2)
        dX, dW1, db1 = affine_relu_backward(dh1, cache1)
        # Add the regularization gradient contribution
        dW2 += self.reg * W2
        dW1 += self.reg * W1
        grads['W1'] = dW1
        grads['b1'] = db1
        grads['W2'] = dW2
        grads['b2'] = db2
        return loss, grads
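A minimal smoke test of the class, assuming the layer functions listed below (layers.py / layer_utils) are importable; the data and sizes here are random and purely illustrative:

import numpy as np

model = TwoLayerNet(input_dim=20, hidden_dim=30, num_classes=10, reg=1e-3)
X = np.random.randn(5, 20)                  # 5 fake examples
y = np.random.randint(10, size=5)

scores = model.loss(X)                      # test-time: (5, 10) scores
loss, grads = model.loss(X, y)              # training-time: loss and gradients

# One hand-rolled SGD step (in practice the Solver class does this)
for p in model.params:
    model.params[p] -= 1e-2 * grads[p]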
---> layers.py
__coauthor__ = 'Deeplayer'
# 6.22.2016

import numpy as np
def affine_forward(x, w, b):
    """
    Computes the forward pass for an affine (fully-connected) layer.
    The input x has shape (N, d_1, ..., d_k) and contains a minibatch of N
    examples, where each example x[i] has shape (d_1, ..., d_k). We will
    reshape each input into a vector of dimension D = d_1 * ... * d_k, and
    then transform it to an output vector of dimension M.
    Inputs:
    - x: A numpy array containing input data, of shape (N, d_1, ..., d_k)
    - w: A numpy array of weights, of shape (D, M)
    - b: A numpy array of biases, of shape (M,)
    Returns a tuple of:
    - out: output, of shape (N, M)
    - cache: (x, w, b)
    """
    out = None
    # Reshape x into rows
    N = x.shape[0]
    x_row = x.reshape(N, -1)         # (N,D)
    out = np.dot(x_row, w) + b       # (N,M)
    cache = (x, w, b)
    return out, cache
def affine_backward(dout, cache):
    """
    Computes the backward pass for an affine layer.
    Inputs:
    - dout: Upstream derivative, of shape (N, M)
    - cache: Tuple of:
      - x: Input data, of shape (N, d_1, ..., d_k)
      - w: Weights, of shape (D, M)
      - b: Biases, of shape (M,)
    Returns a tuple of:
    - dx: Gradient with respect to x, of shape (N, d_1, ..., d_k)
    - dw: Gradient with respect to w, of shape (D, M)
    - db: Gradient with respect to b, of shape (1, M)
    """
    x, w, b = cache
    dx, dw, db = None, None, None
    dx = np.dot(dout, w.T)                      # (N,D)
    dx = np.reshape(dx, x.shape)                # (N,d_1,...,d_k)
    x_row = x.reshape(x.shape[0], -1)           # (N,D)
    dw = np.dot(x_row.T, dout)                  # (D,M)
    db = np.sum(dout, axis=0, keepdims=True)    # (1,M)
    return dx, dw, db
def relu_forward(x):
    """
    Computes the forward pass for a layer of rectified linear units (ReLUs).
    Input:
    - x: Inputs, of any shape
    Returns a tuple of:
    - out: Output, of the same shape as x
    - cache: x
    """
    out = None
    out = ReLU(x)
    cache = x
    return out, cache
def relu_backward(dout, cache):
    """
    Computes the backward pass for a layer of rectified linear units (ReLUs).
    Input:
    - dout: Upstream derivatives, of any shape
    - cache: Input x, of same shape as dout
    Returns:
    - dx: Gradient with respect to x
    """
    dx, x = None, cache
    dx = dout.copy()    # copy so we do not modify the upstream gradient in place
    dx[x <= 0] = 0
    return dx
def svm_loss(x, y):
    """
    Computes the loss and gradient for multiclass SVM classification.
    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
      for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C
    Returns a tuple of:
    - loss: Scalar giving the loss
    - dx: Gradient of the loss with respect to x
    """
    N = x.shape[0]
    correct_class_scores = x[np.arange(N), y]
    margins = np.maximum(0, x - correct_class_scores[:, np.newaxis] + 1.0)
    margins[np.arange(N), y] = 0
    loss = np.sum(margins) / N
    num_pos = np.sum(margins > 0, axis=1)
    dx = np.zeros_like(x)
    dx[margins > 0] = 1
    dx[np.arange(N), y] -= num_pos
    dx /= N
    return loss, dx
def softmax_loss(x, y):
    """
    Computes the loss and gradient for softmax classification.
    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
      for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C
    Returns a tuple of:
    - loss: Scalar giving the loss
    - dx: Gradient of the loss with respect to x
    """
    probs = np.exp(x - np.max(x, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    N = x.shape[0]
    loss = -np.sum(np.log(probs[np.arange(N), y])) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    return loss, dx

def ReLU(x):
    """ReLU non-linearity."""
    return np.maximum(0, x)
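The networks above also call the composed helpers affine_relu_forward and affine_relu_backward, which come from layer_utils but are not listed in this post. A minimal sketch of what they look like, built only from the layers.py functions above (the exact file contents in the assignment may differ slightly):

---> layer_utils.py (sketch)
def affine_relu_forward(x, w, b):
    """Affine transform followed by a ReLU."""
    a, fc_cache = affine_forward(x, w, b)
    out, relu_cache = relu_forward(a)
    cache = (fc_cache, relu_cache)
    return out, cache

def affine_relu_backward(dout, cache):
    """Backward pass for the affine-relu convenience layer."""
    fc_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db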
---> optim.py
__coauthor__ = 'Deeplayer'
# 6.22.2016

import numpy as np

def sgd(w, dw, config=None):
    """
    Performs vanilla stochastic gradient descent.
    config format:
    - learning_rate: Scalar learning rate.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-2)
    w -= config['learning_rate'] * dw
    return w, config
def sgd_momentum(w, dw, config=None):
    """
    Performs stochastic gradient descent with momentum.
    config format:
    - learning_rate: Scalar learning rate.
    - momentum: Scalar between 0 and 1 giving the momentum value.
      Setting momentum = 0 reduces to sgd.
    - velocity: A numpy array of the same shape as w and dw used to store a moving
      average of the gradients.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('momentum', 0.9)
    v = config.get('velocity', np.zeros_like(w))
    next_w = None
    v = config['momentum'] * v - config['learning_rate'] * dw
    next_w = w + v
    config['velocity'] = v
    return next_w, config
def rmsprop(x, dx, config=None):
    """
    Uses the RMSProp update rule, which uses a moving average of squared gradient
    values to set adaptive per-parameter learning rates.
    config format:
    - learning_rate: Scalar learning rate.
    - decay_rate: Scalar between 0 and 1 giving the decay rate for the squared
      gradient cache.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - cache: Moving average of second moments of gradients.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('decay_rate', 0.99)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('cache', np.zeros_like(x))
    next_x = None
    cache = config['cache']
    decay_rate = config['decay_rate']
    learning_rate = config['learning_rate']
    epsilon = config['epsilon']
    cache = decay_rate * cache + (1 - decay_rate) * (dx**2)
    x += - learning_rate * dx / (np.sqrt(cache) + epsilon)
    config['cache'] = cache
    next_x = x
    return next_x, config
def adam(x, dx, config=None):
    """
    Uses the Adam update rule, which incorporates moving averages of both the
    gradient and its square and a bias correction term.
    config format:
    - learning_rate: Scalar learning rate.
    - beta1: Decay rate for moving average of first moment of gradient.
    - beta2: Decay rate for moving average of second moment of gradient.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - m: Moving average of gradient.
    - v: Moving average of squared gradient.
    - t: Iteration number.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('beta1', 0.9)
    config.setdefault('beta2', 0.999)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('m', np.zeros_like(x))
    config.setdefault('v', np.zeros_like(x))
    config.setdefault('t', 0)
    next_x = None
    m = config['m']
    v = config['v']
    beta1 = config['beta1']
    beta2 = config['beta2']
    learning_rate = config['learning_rate']
    epsilon = config['epsilon']
    t = config['t']
    t += 1
    m = beta1 * m + (1 - beta1) * dx
    v = beta2 * v + (1 - beta2) * (dx**2)
    m_bias = m / (1 - beta1**t)
    v_bias = v / (1 - beta2**t)
    x += - learning_rate * m_bias / (np.sqrt(v_bias) + epsilon)
    next_x = x
    config['m'] = m
    config['v'] = v
    config['t'] = t
    return next_x, config
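All of the update rules above share one interface: they take a parameter array, its gradient, and a config dict, and return the updated parameter together with the config, which carries the optimizer state into the next call. A toy loop on a made-up quadratic objective (the objective, sizes and learning rate are only for illustration):

import numpy as np

w = np.random.randn(10)
config = {'learning_rate': 1e-1}    # the remaining fields are filled in by setdefault
for step in xrange(200):
    dw = 2 * (w - 3.0)              # gradient of the toy objective sum((w - 3)^2)
    w, config = adam(w, dw, config)
# config now also holds the Adam state 'm', 'v' and 't'; w has moved toward 3.0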
__coauthor__ = 'Deeplayer'
# 6.22.2016

import numpy as np
from layer_utils import *

class FullyConnectedNet(object):
    """
    A fully-connected neural network with an arbitrary number of hidden layers,
    ReLU nonlinearities, and a softmax loss function. This will also implement
    dropout and batch normalization as options. For a network with L layers,
    the architecture will be
    {affine - [batch norm] - relu - [dropout]} x (L - 1) - affine - softmax
    where batch normalization and dropout are optional, and the {...} block is
    repeated L - 1 times.
    Similar to the TwoLayerNet above, learnable parameters are stored in the
    self.params dictionary and will be learned using the Solver class.
    """
    def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
                 dropout=0, use_batchnorm=False, reg=0.0,
                 weight_scale=1e-2, dtype=np.float32, seed=None):
        self.use_batchnorm = use_batchnorm
        self.use_dropout = dropout > 0
        self.reg = reg
        self.num_layers = 1 + len(hidden_dims)
        self.dtype = dtype
        self.params = {}
        layers_dims = [input_dim] + hidden_dims + [num_classes]
        for i in xrange(self.num_layers):
            self.params['W' + str(i+1)] = weight_scale * np.random.randn(layers_dims[i], layers_dims[i+1])
            self.params['b' + str(i+1)] = np.zeros((1, layers_dims[i+1]))
            if self.use_batchnorm and i < len(hidden_dims):
                self.params['gamma' + str(i+1)] = np.ones((1, layers_dims[i+1]))
                self.params['beta' + str(i+1)] = np.zeros((1, layers_dims[i+1]))
        # When using dropout we need to pass a dropout_param dictionary to each
        # dropout layer so that the layer knows the dropout probability and the mode
        # (train / test). You can pass the same dropout_param to each dropout layer.
        self.dropout_param = {}
        if self.use_dropout:
            self.dropout_param = {'mode': 'train', 'p': dropout}
            if seed is not None:
                self.dropout_param['seed'] = seed
        # With batch normalization we need to keep track of running means and
        # variances, so we need to pass a special bn_param object to each batch
        # normalization layer. You should pass self.bn_params[0] to the forward pass
        # of the first batch normalization layer, self.bn_params[1] to the forward
        # pass of the second batch normalization layer, etc.
        self.bn_params = []
        if self.use_batchnorm:
            self.bn_params = [{'mode': 'train'} for i in xrange(self.num_layers - 1)]
        # Cast all parameters to the correct datatype
        for k, v in self.params.iteritems():
            self.params[k] = v.astype(dtype)
    def loss(self, X, y=None):
        """
        Compute loss and gradient for the fully-connected net.
        Input / output: Same as TwoLayerNet above.
        """
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'
        # Set train/test mode for batchnorm params and dropout param since they
        # behave differently during training and testing.
        if self.dropout_param is not None:
            self.dropout_param['mode'] = mode
        if self.use_batchnorm:
            for bn_param in self.bn_params:
                bn_param['mode'] = mode
        scores = None
        h, cache1, cache2, cache3, bn, out = {}, {}, {}, {}, {}, {}
        out[0] = X
        # Forward pass: compute loss
        for i in xrange(self.num_layers-1):
            # Unpack variables from the params dictionary
            W, b = self.params['W' + str(i+1)], self.params['b' + str(i+1)]
            if self.use_batchnorm:
                gamma, beta = self.params['gamma' + str(i+1)], self.params['beta' + str(i+1)]
                h[i], cache1[i] = affine_forward(out[i], W, b)
                bn[i], cache2[i] = batchnorm_forward(h[i], gamma, beta, self.bn_params[i])
                out[i+1], cache3[i] = relu_forward(bn[i])
            else:
                out[i+1], cache3[i] = affine_relu_forward(out[i], W, b)
        W, b = self.params['W' + str(self.num_layers)], self.params['b' + str(self.num_layers)]
        scores, cache = affine_forward(out[self.num_layers-1], W, b)
        # If test mode return early
        if mode == 'test':
            return scores
        loss, reg_loss, grads = 0.0, 0.0, {}
        data_loss, dscores = softmax_loss(scores, y)
        for i in xrange(self.num_layers):
            reg_loss += 0.5 * self.reg * np.sum(self.params['W' + str(i+1)]*self.params['W' + str(i+1)])
        loss = data_loss + reg_loss
        # Backward pass: compute gradients
        dout, dbn, dh = {}, {}, {}
        t = self.num_layers-1
        dout[t], grads['W'+str(t+1)], grads['b'+str(t+1)] = affine_backward(dscores, cache)
        for i in xrange(t):
            if self.use_batchnorm:
                dbn[t-1-i] = relu_backward(dout[t-i], cache3[t-1-i])
                dh[t-1-i], grads['gamma'+str(t-i)], grads['beta'+str(t-i)] = batchnorm_backward(dbn[t-1-i], cache2[t-1-i])
                dout[t-1-i], grads['W'+str(t-i)], grads['b'+str(t-i)] = affine_backward(dh[t-1-i], cache1[t-1-i])
            else:
                dout[t-1-i], grads['W'+str(t-i)], grads['b'+str(t-i)] = affine_relu_backward(dout[t-i], cache3[t-1-i])
        # Add the regularization gradient contribution
        for i in xrange(self.num_layers):
            grads['W'+str(i+1)] += self.reg * self.params['W' + str(i+1)]
        return loss, grads
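As with TwoLayerNet, a quick smoke test with random data (no batchnorm or dropout; the sizes are arbitrary and only for illustration):

import numpy as np

net = FullyConnectedNet([40, 30], input_dim=20, num_classes=10,
                        reg=1e-3, dtype=np.float64)
X = np.random.randn(5, 20)
y = np.random.randint(10, size=5)
loss, grads = net.loss(X, y)
assert set(grads.keys()) == set(net.params.keys())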
(figure from CS231n: Convolutional Neural Networks for Visual Recognition)
Typically, a pooling layer uses a 2x2 window.
Some argue that pooling layers are not necessary at all, as in Striving for Simplicity: The All Convolutional Net. In addition, discarding pooling layers has been found to be important for generative models such as variational autoencoders (VAEs) and generative adversarial networks (GANs). It seems likely that future architectures will use fewer pooling layers, or drop them entirely.
__coauthor__ = 'Deeplayer'
# 6.25.2016 #

def max_pool_forward_naive(x, pool_param):
    """
    A naive implementation of the forward pass for a max pooling layer.
    Inputs:
    - x: Input data, of shape (N, C, H, W)
    - pool_param: dictionary with keys 'pool_height', 'pool_width' and 'stride'
    Returns a tuple of:
    - out: Output data, of shape (N, C, H_new, W_new)
    - cache: (x, pool_param)
    """
    HH, WW = pool_param['pool_height'], pool_param['pool_width']
    s = pool_param['stride']
    N, C, H, W = x.shape
    H_new = 1 + (H - HH) / s
    W_new = 1 + (W - WW) / s
    out = np.zeros((N, C, H_new, W_new))
    for i in xrange(N):
        for j in xrange(C):
            for k in xrange(H_new):
                for l in xrange(W_new):
                    window = x[i, j, k*s:HH+k*s, l*s:WW+l*s]
                    out[i, j, k, l] = np.max(window)
    cache = (x, pool_param)
    return out, cache
def max_pool_backward_naive(dout, cache):
    """
    A naive implementation of the backward pass for a max pooling layer.
    Routes each upstream gradient to the position of the max in its window.
    """
    x, pool_param = cache
    HH, WW = pool_param['pool_height'], pool_param['pool_width']
    s = pool_param['stride']
    N, C, H, W = x.shape
    H_new = 1 + (H - HH) / s
    W_new = 1 + (W - WW) / s
    dx = np.zeros_like(x)
    for i in xrange(N):
        for j in xrange(C):
            for k in xrange(H_new):
                for l in xrange(W_new):
                    window = x[i, j, k*s:HH+k*s, l*s:WW+l*s]
                    m = np.max(window)
                    dx[i, j, k*s:HH+k*s, l*s:WW+l*s] = (window == m) * dout[i, j, k, l]
    return dx
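A quick shape check of the naive pooling pair, using a tiny random batch and the common 2x2 window with stride 2 (the sizes here are arbitrary test values):

import numpy as np

x = np.random.randn(2, 3, 4, 4)
dout = np.random.randn(2, 3, 2, 2)
pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

out, cache = max_pool_forward_naive(x, pool_param)   # out.shape == (2, 3, 2, 2)
dx = max_pool_backward_naive(dout, cache)            # dx.shape  == (2, 3, 4, 4)
print(out.shape, dx.shape)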