Recognizing handwritten digits with backpropagation
For the concepts, data loading, and other steps covered before, see the article "neural network"; this post only records the code implementation.
Data initialization:
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
import scipy.optimize as op

input_layer_size = 400   # 20x20 input images
hidden_layer_size = 25   # 25 hidden units
num_labels = 10          # labels 1..10 (label 10 stands for the digit 0)
# loading and visualizing data
data = sio.loadmat('ex4data1.mat')
X = data['X']
y = data['y']
m = X.shape[0]
# expand the labels into one-hot rows: label k -> a 1 in column k-1
Y = np.zeros((np.size(y), 10))
for i in range(Y.shape[0]):
    Y[i, y[i] - 1] = 1
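As a side note, the one-hot expansion can also be done without the loop, using broadcasting; a minimal sketch, equivalent to the loop above assuming y is an (m, 1) column of labels 1..10:

# compare each label against 1..10 to get an (m, 10) boolean matrix, then cast to float
Y = (y == np.arange(1, num_labels + 1)).astype(float)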
Displaying random examples:
def displayData(x):
    # each example is a square image stored as one row of x
    example_width = int(np.round(np.sqrt(np.size(x, 1))))
    m, n = x.shape
    example_height = int(n / example_width)
    display_rows = int(np.floor(np.sqrt(m)))
    display_cols = int(np.ceil(m / display_rows))
    pad = 1
    display_array = -np.ones((pad + display_rows * (example_height + pad),
                              pad + display_cols * (example_width + pad)))
    curr_ex = 0
    for j in range(display_rows):
        for i in range(display_cols):
            if curr_ex >= m:
                break
            # normalize each example by its maximum absolute value
            max_val = np.max(np.abs(x[curr_ex, :]))
            display_array[pad + j * (example_height + pad):pad + j * (example_height + pad) + example_height,
                          pad + i * (example_width + pad):pad + i * (example_width + pad) + example_width] \
                = x[curr_ex, :].reshape((example_height, example_width)) / max_val
            curr_ex += 1
        if curr_ex >= m:
            break
    plt.figure()
    # the .mat images are stored column-major, so transpose before displaying
    plt.imshow(display_array.T, cmap='gray', extent=[-1, 1, -1, 1])
    plt.axis('off')
    plt.show()
sel = np.random.permutation(m)
sel = sel[0:100]
displayData(X[sel, :])
Loading parameters:
# loading pre-trained parameters
print('Loading Saved Neural Network Parameters')
para = sio.loadmat('ex4weights.mat')
Theta1 = para['Theta1']
Theta2 = para['Theta2']
para = np.append(Theta1, Theta2)   # unroll Theta1 and Theta2 into one flat vector
Forward propagation and the cost function:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# derivative of the sigmoid function: g'(z) = g(z) * (1 - g(z))
def sigmoidGradient(z):
    return sigmoid(z) * (1 - sigmoid(z))

# cost function and gradients (backpropagation)
def nnCostFunction(para, input_layer_size, hidden_layer_size, num_labels, X, y, lamd):
    # recover Theta1 and Theta2 from the unrolled parameter vector
    Theta1 = para[0:hidden_layer_size * (input_layer_size + 1)].reshape((hidden_layer_size, input_layer_size + 1))
    Theta2 = para[hidden_layer_size * (input_layer_size + 1):].reshape((num_labels, hidden_layer_size + 1))
    m = X.shape[0]
    # forward propagation
    a1 = np.c_[np.ones(m), X]
    z2 = a1.dot(Theta1.T)
    a2 = sigmoid(z2)
    a2 = np.c_[np.ones(a2.shape[0]), a2]
    a3 = sigmoid(a2.dot(Theta2.T))
    # regularized cross-entropy cost (bias columns are not regularized)
    J = np.sum(-y * np.log(a3) - (1 - y) * np.log(1 - a3)) / m \
        + lamd * (np.sum(Theta1[:, 1:] ** 2) + np.sum(Theta2[:, 1:] ** 2)) / (2 * m)
    # backpropagation
    delta3 = a3 - y
    delta2 = delta3.dot(Theta2)
    delta2 = delta2[:, 1:] * sigmoidGradient(z2)
    Delta1 = delta2.T.dot(a1)
    Delta2 = delta3.T.dot(a2)
    Theta1_grad = Delta1 / m + (lamd / m) * Theta1
    Theta2_grad = Delta2 / m + (lamd / m) * Theta2
    # remove the regularization term from the bias columns
    Theta1_grad[:, 0] = Theta1_grad[:, 0] - (lamd / m) * Theta1[:, 0]
    Theta2_grad[:, 0] = Theta2_grad[:, 0] - (lamd / m) * Theta2[:, 0]
    grad = np.append(Theta1_grad, Theta2_grad)
    return J, grad
print('Feedforward using neural network')
lamd = 0
J, grad = nnCostFunction(para, input_layer_size, hidden_layer_size, num_labels, X, Y, lamd)
print('Cost at parameters (loaded from ex4weights): %f , (this value should be about 0.287629)' % J)
At this point all the forward-propagation activations have been computed, the gradients for the hidden and output layers have been obtained by backpropagation, and the regularized cost function is in place.
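For reference, the quantities computed inside nnCostFunction can be written out as follows (this just restates what the code above does):

$$J = \frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\Big[-y_k^{(i)}\log a_{3,k}^{(i)} - \big(1-y_k^{(i)}\big)\log\big(1-a_{3,k}^{(i)}\big)\Big] + \frac{\lambda}{2m}\Big(\sum_{j\ge 1}\Theta_1^2 + \sum_{j\ge 1}\Theta_2^2\Big)$$

$$\delta_3 = a_3 - y,\qquad \delta_2 = (\delta_3\,\Theta_2)_{:,1:}\odot g'(z_2),\qquad \Theta_l^{\text{grad}} = \frac{1}{m}\Delta_l + \frac{\lambda}{m}\Theta_l\ \text{(bias column unregularized)}$$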
The output:
Feedforward using neural network
Cost at parameters (loaded from ex4weights): 0.287629 , (this value should be about 0.287629)
Regularized cost check:
lamd = 1
J, grad = nnCostFunction(para, input_layer_size, hidden_layer_size, num_labels, X, Y, lamd)
print('Cost at parameters (loaded from ex4weights): %f (this value should be about 0.383770)' % J)
The output:
Cost at parameters (loaded from ex4weights): 0.383770 (this value should be about 0.383770)
Random parameter initialization:
def randInitializeWeights(L_in, L_out):
    # break symmetry: initialize weights uniformly in [-epsilon_init, epsilon_init]
    epsilon_init = 0.12
    w = np.random.rand(L_out, 1 + L_in) * 2 * epsilon_init - epsilon_init
    return w
initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)
initial_para = np.append(initial_Theta1, initial_Theta2)
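The fixed epsilon_init = 0.12 above works fine for this architecture. A common heuristic is to scale epsilon with the layer sizes instead; a sketch of that optional variant (the randInitializeWeightsScaled name below is hypothetical, not part of the original code):

def randInitializeWeightsScaled(L_in, L_out):
    # scale epsilon with the fan-in/fan-out of the layer: sqrt(6) / sqrt(L_in + L_out)
    epsilon_init = np.sqrt(6) / np.sqrt(L_in + L_out)
    return np.random.rand(L_out, 1 + L_in) * 2 * epsilon_init - epsilon_init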
Gradient checking:
def debugInitWeights(fout, fin):
    # deterministic "random" weights so the check is reproducible
    w = np.sin(np.arange(fout * (fin + 1)) + 1).reshape(fout, fin + 1) / 10
    return w

# numerical gradient approximation
def computeNumericalGradient(J, theta, args):
    numgrad = np.zeros(np.size(theta))
    perturb = np.zeros(np.size(theta))
    epsilon = 1e-4
    for i in range(np.size(theta)):
        # perturb one parameter at a time and use the centered difference
        perturb[i] = epsilon
        loss1, _ = J(theta - perturb, *args)
        loss2, _ = J(theta + perturb, *args)
        numgrad[i] = (loss2 - loss1) / (2 * epsilon)
        perturb[i] = 0
    return numgrad
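What computeNumericalGradient implements is the centered-difference approximation

$$\frac{\partial J}{\partial \theta_i} \approx \frac{J(\theta + \epsilon e_i) - J(\theta - \epsilon e_i)}{2\epsilon},\qquad \epsilon = 10^{-4},$$

where $e_i$ is the unit vector along the i-th parameter.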
def checkNNGradient(lamd):
    # build a small network and a tiny data set so the check runs quickly
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    theta1 = debugInitWeights(hidden_layer_size, input_layer_size)
    theta2 = debugInitWeights(num_labels, hidden_layer_size)
    x = debugInitWeights(m, input_layer_size - 1)
    y = 1 + (np.arange(m) + 1) % num_labels
    y = y.reshape(m, 1)
    Y = np.zeros((m, num_labels))
    for i in range(Y.shape[0]):
        Y[i, y[i] - 1] = 1
    nn_params = np.concatenate((theta1.flatten(), theta2.flatten()))
    cost, grad = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, x, Y, lamd)
    numgrad = computeNumericalGradient(nnCostFunction, nn_params,
                                       (input_layer_size, hidden_layer_size, num_labels, x, Y, lamd))
    print(numgrad, '\n', grad)
    print('The above two columns you get should be very similar.\n'
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)')
    # relative difference between the numerical and analytical gradients
    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9).\n'
          '\nRelative Difference: ', diff)
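The calls that produce the output below are not shown in the original listing; presumably the check is run twice, first without regularization and then with lambda = 3 (the same value used for the debug-cost check further down). A minimal sketch of those calls:

print('Checking Backpropagation')
checkNNGradient(0)   # unregularized check
checkNNGradient(3)   # regularized check, lambda = 3 (assumed from the later output)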
The output:
Checking Backpropagation
[ 1.23162247e-02 1.73828185e-04 2.61455146e-04 1.08701450e-04
3.92471369e-03 1.90101250e-04 2.22272334e-04 5.00872543e-05
-8.08459407e-03 3.13170623e-05 -2.17840346e-05 -5.48569878e-05
-1.26669105e-02 -1.56130209e-04 -2.45506164e-04 -1.09164882e-04
-5.59342546e-03 -2.00036570e-04 -2.43630216e-04 -6.32313668e-05
3.09347722e-01 1.61067138e-01 1.47036522e-01 1.58268577e-01
1.57616707e-01 1.47236360e-01 1.08133003e-01 5.61633717e-02
5.19510542e-02 5.47353405e-02 5.53082757e-02 5.17752619e-02
1.06270372e-01 5.57611045e-02 5.05568118e-02 5.38805142e-02
5.47407215e-02 5.02929547e-02]
[ 1.23162247e-02 1.73828184e-04 2.61455144e-04 1.08701450e-04
3.92471369e-03 1.90101252e-04 2.22272331e-04 5.00872547e-05
-8.08459407e-03 3.13170587e-05 -2.17840341e-05 -5.48569864e-05
-1.26669105e-02 -1.56130210e-04 -2.45506163e-04 -1.09164881e-04
-5.59342547e-03 -2.00036572e-04 -2.43630220e-04 -6.32313673e-05
3.09347722e-01 1.61067138e-01 1.47036522e-01 1.58268577e-01
1.57616707e-01 1.47236360e-01 1.08133003e-01 5.61633717e-02
5.19510542e-02 5.47353405e-02 5.53082757e-02 5.17752619e-02
1.06270372e-01 5.57611045e-02 5.05568118e-02 5.38805141e-02
5.47407215e-02 5.02929547e-02]
The above two columns you get should be very similar.
(Left-Your Numerical Gradient, Right-Analytical Gradient)
If your backpropagation implementation is correct, then
the relative difference will be small (less than 1e-9).
Regularized gradient check:
[ 0.01231622 0.05473167 0.00872866 -0.04529945 0.00392471 -0.01657483
0.03964147 0.05941158 -0.00808459 -0.03260995 -0.0600212 -0.03224923
-0.01266691 0.05928031 0.03877176 -0.01738336 -0.00559343 -0.04525927
0.008749 0.05471348 0.30934772 0.21562498 0.15550372 0.11286043
0.10008125 0.13047143 0.108133 0.11552487 0.07667816 0.02209407
-0.00469114 0.01958089 0.10627037 0.11519755 0.08957408 0.03660632
-0.00294313 0.00523372]
[ 0.01231622 0.05473167 0.00872866 -0.04529945 0.00392471 -0.01657483
0.03964147 0.05941158 -0.00808459 -0.03260995 -0.0600212 -0.03224923
-0.01266691 0.05928031 0.03877176 -0.01738336 -0.00559343 -0.04525927
0.008749 0.05471348 0.30934772 0.21562498 0.15550372 0.11286043
0.10008125 0.13047143 0.108133 0.11552487 0.07667816 0.02209407
-0.00469114 0.01958089 0.10627037 0.11519755 0.08957408 0.03660632
-0.00294313 0.00523372]
The above two columns you get should be very similar.
(Left-Your Numerical Gradient, Right-Analytical Gradient)
If your backpropagation implementation is correct, then
the relative difference will be small (less than 1e-9).
As can be seen, the analytical gradients from backpropagation match the numerical approximation closely.
Cost check:
lamd = 3
debug_J = nnCostFunction(para, input_layer_size, hidden_layer_size, num_labels, X, Y, lamd)[0]
print('Cost at (fixed) debugging parameters (w/ lambda = %f): %f (for lambda = 3, this value should be about 0.576051)'
      % (lamd, debug_J))
The output:
Cost at (fixed) debugging parameters (w/ lambda = 3.000000): 0.576051 (for lambda = 3, this value should be about 0.576051)
Training the network:
# wrappers so fmin_cg can query the cost and the gradient separately
def costFun(para, input_layer_size, hidden_layer_size, num_labels, X, y, lamd):
    return nnCostFunction(para, input_layer_size, hidden_layer_size, num_labels, X, y, lamd)[0]

def gradFun(para, input_layer_size, hidden_layer_size, num_labels, X, y, lamd):
    return nnCostFunction(para, input_layer_size, hidden_layer_size, num_labels, X, y, lamd)[1]

print('Training neural network')
lamd = 1
result = op.fmin_cg(f=costFun, x0=initial_para, fprime=gradFun,
                    args=(input_layer_size, hidden_layer_size, num_labels, X, Y, lamd),
                    maxiter=50, disp=False)
theta1 = result[0:hidden_layer_size * (input_layer_size + 1)].reshape(hidden_layer_size, input_layer_size + 1)
theta2 = result[hidden_layer_size * (input_layer_size + 1):].reshape(num_labels, hidden_layer_size + 1)
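Because costFun and gradFun each call nnCostFunction, every iteration does the forward and backward pass twice. As an optional alternative (a sketch, not what the original code does), scipy.optimize.minimize with jac=True accepts a single function that returns both the cost and the gradient:

# minimize with a function that returns (cost, grad) in one call
res = op.minimize(fun=nnCostFunction, x0=initial_para, jac=True, method='CG',
                  args=(input_layer_size, hidden_layer_size, num_labels, X, Y, lamd),
                  options={'maxiter': 50})
result = res.x   # the optimized, unrolled parameter vector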
Visualizing the hidden layer:
print('Visualizing Neural Network')
displayData(theta1[:, 1:])
Checking the model's predictions:
def predict(theta1, theta2, x):
    m = x.shape[0]
    # forward propagation through both layers
    h1 = sigmoid(np.c_[np.ones(m), x].dot(theta1.T))
    h2 = sigmoid(np.c_[np.ones(m), h1].dot(theta2.T))
    # argmax is 0-based while the labels run from 1 to 10, hence the p + 1 below
    p = np.argmax(h2, 1)
    return p
p = predict(theta1, theta2, X)
print('Training Set Accuracy: ', np.mean(np.double(p+1 == y.flatten())) * 100)
The output:
Training Set Accuracy: 96.17999999999999
This is slightly below the reference figure the instructor gave last time, but it is enough to show how powerful a neural network can be.