主要内容:
L2:Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization
week1: Practical aspects of Deep Learning
1.如何划分训练集、验证集、测试集
2.高偏差和高方差的理解以及如何处理
3.几种正则化方法(L2正则化、dropout、数据增强、早停)
4.加快训练过程的方法(标准化输入、权重参数初始化)
5.梯度检查
主要案例:
1.权重初始化。构建三层神经网络,使用三种不同的方法进行权重初始化,分析对比,体会不同。
- 使用0来初始化参数。
- 使用随机数来初始化参数。
- 使用抑梯度异常初始化参数(参见视频中的梯度消失和梯度爆炸)
2.正则化。构建三层神经网络,不使用正则化,使用L2正则化,使用dropout。对比分析三种模型效果。
3.梯度检查。可验证反向传播的梯度与梯度的数值近似值之间的接近程度。
一、权重初始化
1.预先需要的一些函数
import numpy as np
import matplotlib.pyplot as plt
import pylab
import sklearn
import sklearn.datasets
def load_dataset():
    """Create the two-circles toy data used by the initialization experiments.

    Draws 300 training and 100 test points with ``make_circles`` (noise 0.05),
    shows a scatter plot of the training points, and returns both splits in
    column-per-sample layout.

    Returns:
        ``(train_X, train_Y, test_X, test_Y)``: each X is the transposed
        sample matrix and each Y is the label vector reshaped to one row.
    """
    # Reseed NumPy's global generator before each draw; the two splits use
    # different seeds.
    np.random.seed(1)
    train_X, train_Y = sklearn.datasets.make_circles(n_samples=300, noise=.05)
    np.random.seed(2)
    test_X, test_Y = sklearn.datasets.make_circles(n_samples=100, noise=.05)

    # Visualize the training points, colored by label.
    first_feature, second_feature = train_X[:, 0], train_X[:, 1]
    plt.scatter(first_feature, second_feature, c=train_Y, s=40, cmap=plt.cm.Spectral)
    pylab.show()

    # Switch from row-per-sample to column-per-sample.
    train_X, test_X = train_X.T, test_X.T
    train_Y = train_Y.reshape((1, -1))
    test_Y = test_Y.reshape((1, -1))
    return train_X, train_Y, test_X, test_Y
# Logistic sigmoid activation
def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``."""
    return 1 / (1 + np.exp(-x))
# Rectified linear unit activation
def relu(x):
    """Return the element-wise maximum of 0 and *x*."""
    return np.maximum(0, x)
# Forward pass
def forward_propagation(X, parameters):
    """Run LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.

    Args:
        X: Input matrix multiplied from the left by ``W1`` (callers in this
            file pass one sample per column).
        parameters: Dict holding ``W1, b1, W2, b2, W3, b3``.

    Returns:
        ``(a3, cache)`` where ``a3`` is the sigmoid output and ``cache`` is
        the tuple ``(z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)`` that
        backpropagation unpacks.
    """
    W1, b1 = parameters["W1"], parameters["b1"]
    W2, b2 = parameters["W2"], parameters["b2"]
    W3, b3 = parameters["W3"], parameters["b3"]

    # Hidden layer 1
    z1 = np.dot(W1, X) + b1
    a1 = relu(z1)
    # Hidden layer 2
    z2 = np.dot(W2, a1) + b2
    a2 = relu(z2)
    # Output layer
    z3 = np.dot(W3, a2) + b3
    a3 = sigmoid(z3)

    return a3, (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)
# Backward pass: gradients of the cross-entropy cost
def backward_propagation(X, Y, cache):
    """Compute gradients for the three-layer network.

    Args:
        X: Input matrix; ``X.shape[1]`` is used as the sample count ``m``.
        Y: Labels, subtracted from the output activation.
        cache: Tuple ``(z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)``.

    Returns:
        Dict with keys ``dz3, dW3, db3, da2, dz2, dW2, db2, da1, dz1, dW1,
        db1``.
    """
    m = X.shape[1]
    (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache

    def weight_and_bias_grads(dz, layer_input):
        # dW is dz times the transposed layer input; db sums dz over columns.
        return np.dot(dz, layer_input.T), np.sum(dz, axis=1, keepdims=True)

    # The 1/m factor is folded into dz3 so no later gradient divides by m.
    dz3 = 1. / m * (a3 - Y)
    dW3, db3 = weight_and_bias_grads(dz3, a2)

    # (a > 0) cast to integers is the ReLU derivative.
    da2 = np.dot(W3.T, dz3)
    dz2 = da2 * np.int64(a2 > 0)
    dW2, db2 = weight_and_bias_grads(dz2, a1)

    da1 = np.dot(W2.T, dz2)
    dz1 = da1 * np.int64(a1 > 0)
    dW1, db1 = weight_and_bias_grads(dz1, X)

    return {
        "dz3": dz3, "dW3": dW3, "db3": db3,
        "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
        "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1,
    }
def update_parameters(parameters, grads, learning_rate):
    """Apply one gradient-descent step to every ``W``/``b`` pair.

    The dict passed in is updated (each entry is rebound to a new array) and
    also returned.

    Args:
        parameters: Dict with ``W1, b1, ..., WL, bL``.
        grads: Dict with the matching ``dW1, db1, ..., dWL, dbL``.
        learning_rate: Step size.

    Returns:
        The same ``parameters`` dict.
    """
    # Two entries (W and b) per layer.
    layer_count = len(parameters) // 2
    for layer in range(1, layer_count + 1):
        suffix = str(layer)
        for prefix in ("W", "b"):
            name = prefix + suffix
            parameters[name] = parameters[name] - learning_rate * grads["d" + name]
    return parameters
# Binary cross-entropy loss
def compute_loss(a3, Y):
    """Return the mean cross-entropy between predictions *a3* and labels *Y*.

    The mean is taken over ``Y.shape[1]``; NaN terms are skipped by
    ``np.nansum``.
    """
    sample_count = Y.shape[1]
    positive_term = np.multiply(-np.log(a3), Y)
    negative_term = np.multiply(-np.log(1 - a3), 1 - Y)
    return 1. / sample_count * np.nansum(positive_term + negative_term)
# Predict labels with trained parameters
def predict(X, y, parameters):
    """Classify the columns of *X* and print the accuracy against *y*.

    Args:
        X: Input matrix; ``X.shape[1]`` sets the number of predictions.
        y: True labels; row 0 is compared with the predictions.
        parameters: Trained parameter dict passed to ``forward_propagation``.

    Returns:
        A ``(1, m)`` int32 array of 0/1 predictions.
    """
    m = X.shape[1]
    # Explicit int32 rather than the `np.int` alias used in some tutorials.
    p = np.zeros((1, m), dtype=np.int32)
    a3, _ = forward_propagation(X, parameters)

    # Threshold the sigmoid output at 0.5.
    p[0, :] = a3[0, :] > 0.5

    print("Accuracy: " + str(np.mean((p[0, :] == y[0, :]))))
    return p
# Draw the decision boundary
def plot_decision_boundary(model, X, y):
    """Plot *model*'s predictions over a grid together with the data points.

    Args:
        model: Callable that receives grid points (one ``(x1, x2)`` pair per
            row) and returns values reshapeable to the grid.
        X: Data whose rows 0 and 1 are the two plotted coordinates.
        y: Labels used to color the scatter plot.
    """
    margin, step = 1, 0.01
    x_lo, x_hi = X[0, :].min() - margin, X[0, :].max() + margin
    y_lo, y_hi = X[1, :].min() - margin, X[1, :].max() + margin

    # Evenly spaced grid covering the data plus a margin on each side.
    grid_x, grid_y = np.meshgrid(np.arange(x_lo, x_hi, step), np.arange(y_lo, y_hi, step))
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    Z = model(grid_points).reshape(grid_x.shape)

    plt.contourf(grid_x, grid_y, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=np.squeeze(y), cmap=plt.cm.Spectral)
    plt.show()
def predict_dec(parameters, X):
    """Return a boolean array marking where the network output exceeds 0.5."""
    a3, _ = forward_propagation(X, parameters)
    return a3 > 0.5
2.加载数据集,构建模型
# Load the dataset (this also shows the training scatter plot).
train_X, train_Y, test_X, test_Y = load_dataset()
# Three-layer network: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
def model(X, Y, learning_rate=0.01, num_iterations=15000, print_cost=True, initialization="he", is_polt=True):
    """Train the three-layer network with a selectable weight initialization.

    Args:
        X: Input data; ``X.shape[0]`` sets the input layer size.
        Y: Labels passed to the loss and to backpropagation.
        learning_rate: Gradient-descent step size.
        num_iterations: Number of gradient-descent iterations.
        print_cost: When true, print the cost every 1000 iterations.
        initialization: One of ``"zeros"``, ``"random"`` or ``"he"``.
        is_polt: When true, plot the recorded costs after training (the
            misspelled name is kept so existing keyword callers still work).

    Returns:
        The parameter dict after training.

    Raises:
        ValueError: If ``initialization`` is not a supported name.
    """
    costs = []
    # Layer sizes: input, two hidden layers, one output unit.
    layers_dims = [X.shape[0], 10, 5, 1]

    # Pick the initializer.
    if initialization == "zeros":
        parameters = initialize_parameters_zeros(layers_dims)
    elif initialization == "random":
        parameters = initialize_parameters_random(layers_dims)
    elif initialization == "he":
        parameters = initialize_parameters_he(layers_dims)
    else:
        # The previous bare `exit` was a no-op expression, so execution fell
        # through and crashed later on the unbound `parameters`.
        raise ValueError("错误的初始化参数!程序退出")

    for i in range(num_iterations):
        a3, cache = forward_propagation(X, parameters)
        cost = compute_loss(a3, Y)
        grads = backward_propagation(X, Y, cache)
        parameters = update_parameters(parameters, grads, learning_rate)

        # Record (and optionally print) the cost every 1000 iterations.
        if i % 1000 == 0:
            costs.append(cost)
            if print_cost:
                print("第" + str(i) + "次迭代,成本值为:" + str(cost))

    if is_polt:
        plt.plot(costs)
        plt.ylabel('cost')
        # One point per 1000 iterations, matching the sampling above.
        plt.xlabel('iterations (per thousands)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

    return parameters
3.定义三种不同初始化权重的方法,并测试效果
# All-zero initialization
def initialize_parameters_zeros(layers_dims):
    """Return weights and biases filled with zeros.

    Args:
        layers_dims: Layer sizes, e.g. ``[2, 10, 5, 1]``.

    Returns:
        Dict with ``W{l}`` of shape ``(layers_dims[l], layers_dims[l-1])``
        and ``b{l}`` of shape ``(layers_dims[l], 1)`` for each layer ``l``.
    """
    parameters = {}
    layer_pairs = zip(layers_dims[:-1], layers_dims[1:])
    for layer, (n_in, n_out) in enumerate(layer_pairs, start=1):
        W = np.zeros((n_out, n_in))
        b = np.zeros((n_out, 1))

        # Sanity-check the shapes before storing.
        assert W.shape == (n_out, n_in)
        assert b.shape == (n_out, 1)

        parameters['W' + str(layer)] = W
        parameters['b' + str(layer)] = b
    return parameters
# Large random initialization
def initialize_parameters_random(layers_dims):
    """Return standard-normal weights scaled by 10 and zero biases.

    The generator is seeded with 3 on every call, so the result is the same
    for the same ``layers_dims``.

    Args:
        layers_dims: Layer sizes.

    Returns:
        Dict with ``W{l}`` and ``b{l}`` for each layer ``l``.
    """
    np.random.seed(3)
    parameters = {}
    layer_pairs = zip(layers_dims[:-1], layers_dims[1:])
    for layer, (n_in, n_out) in enumerate(layer_pairs, start=1):
        # The factor 10 is deliberate: it shows what overly large weights do.
        W = np.random.randn(n_out, n_in) * 10
        b = np.zeros((n_out, 1))

        # Sanity-check the shapes before storing.
        assert W.shape == (n_out, n_in)
        assert b.shape == (n_out, 1)

        parameters['W' + str(layer)] = W
        parameters['b' + str(layer)] = b
    return parameters
# He initialization
def initialize_parameters_he(layers_dims):
    """Return He-initialized weights and zero biases.

    Each weight matrix is standard-normal noise scaled by
    ``sqrt(2 / fan_in)``. The generator is seeded with 3 on every call.

    Args:
        layers_dims: Layer sizes.

    Returns:
        Dict with ``W{l}`` and ``b{l}`` for each layer ``l``.
    """
    np.random.seed(3)
    parameters = {}
    layer_pairs = zip(layers_dims[:-1], layers_dims[1:])
    for layer, (n_in, n_out) in enumerate(layer_pairs, start=1):
        W = np.random.randn(n_out, n_in) * np.sqrt(2 / n_in)
        b = np.zeros((n_out, 1))

        # Sanity-check the shapes before storing.
        assert W.shape == (n_out, n_in)
        assert b.shape == (n_out, 1)

        parameters['W' + str(layer)] = W
        parameters['b' + str(layer)] = b
    return parameters
# Experiment 1: train with all-zero initialization and report the results.
parameters = model(train_X, train_Y, initialization = "zeros",is_polt=True)
print ("训练集:")
predictions_train = predict(train_X, train_Y, parameters)
print ("测试集:")
predictions_test = predict(test_X, test_Y, parameters)
print("predictions_train = " + str(predictions_train))
print("predictions_test = " + str(predictions_test))
# Title and axis limits are set on the current axes before the boundary is drawn.
plt.title("Model with Zeros initialization")
axes = plt.gca()
axes.set_xlim([-1.5, 1.5])
axes.set_ylim([-1.5, 1.5])
# The grid arrives one point per row; predict_dec is given its transpose.
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)
# Experiment 2: train with large random initialization and report the results.
parameters = model(train_X, train_Y, initialization = "random",is_polt=True)
print("训练集:")
predictions_train = predict(train_X, train_Y, parameters)
print("测试集:")
predictions_test = predict(test_X, test_Y, parameters)
print(predictions_train)
print(predictions_test)
# Title and axis limits are set on the current axes before the boundary is drawn.
plt.title("Model with large random initialization")
axes = plt.gca()
axes.set_xlim([-1.5, 1.5])
axes.set_ylim([-1.5, 1.5])
# The grid arrives one point per row; predict_dec is given its transpose.
plot_decision_boundary(lambda x:predict_dec(parameters, x.T), train_X, train_Y)
# Experiment 3: train with He initialization and report the results.
parameters = model(train_X, train_Y, initialization = "he",is_polt=True)
print("训练集:")
predictions_train = predict(train_X, train_Y, parameters)
print("测试集:")
predictions_test = predict(test_X, test_Y, parameters)
# Title and axis limits are set on the current axes before the boundary is drawn.
plt.title("Model with He initialization")
axes = plt.gca()
axes.set_xlim([-1.5, 1.5])
axes.set_ylim([-1.5, 1.5])
# The grid arrives one point per row; predict_dec is given its transpose.
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)
测试结果及分析
1.权重初始化为0,cost一直不变,说明模型一直没有学习,预测准确率也很差。通常,将所有权重初始化为零会导致网络无法打破对称性。 这意味着每一层中的每个神经元都将学习相同的东西。权重𝑊[𝑙]应该随机初始化以打破对称性;将偏差𝑏[𝑙]初始化为零是可以的。
2.随机初始化权重。误差开始很高,这是因为由于具有较大的随机权重,最后一个激活(sigmoid)输出的结果非常接近于0或1,而当它出现错误时,它会导致非常高的损失。初始化参数不好会导致梯度消失、爆炸,这会减慢优化算法。如果对这个网络进行更长时间的训练,将看到更好的结果,但是使用过大的随机数初始化会减慢优化的速度。初始化为小的参数值会更好。
3.抑梯度异常初始化。亦叫He初始化,由He等人提出。建议激活函数是ReLU时使用。
总结:
1.不同的初始化方法可能导致性能最终不同。
2.随机初始化有助于打破对称,使得不同隐藏层的单元可以学习到不同的参数。
3.初始化时,初始值不宜过大。
4.He初始化搭配ReLU激活函数常常可以得到不错的效果。
二、正则化
1.预先需要的一些函数
import numpy as np
import matplotlib.pyplot as plt
import pylab
import scipy.io
def relu(x):
    """Return the element-wise maximum of 0 and *x* (ReLU activation)."""
    rectified = np.maximum(0, x)
    return rectified
def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def initialize_parameters(layers_dims):
    """Return weights scaled by ``1 / sqrt(fan_in)`` and zero biases.

    The generator is seeded with 3 on every call.

    Args:
        layers_dims: Layer sizes.

    Returns:
        Dict with ``W{l}`` of shape ``(layers_dims[l], layers_dims[l-1])``
        and ``b{l}`` of shape ``(layers_dims[l], 1)``.
    """
    np.random.seed(3)
    parameters = {}
    layer_pairs = zip(layers_dims[:-1], layers_dims[1:])
    for layer, (n_in, n_out) in enumerate(layer_pairs, start=1):
        parameters['W' + str(layer)] = np.random.randn(n_out, n_in) / np.sqrt(n_in)
        parameters['b' + str(layer)] = np.zeros((n_out, 1))
    return parameters
def forward_propagation(X, parameters):
    """Run LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.

    Args:
        X: Input matrix multiplied from the left by ``W1``.
        parameters: Dict holding ``W1, b1, W2, b2, W3, b3``.

    Returns:
        ``(A3, cache)`` where ``cache`` is the tuple
        ``(Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)``.
    """
    W1, b1 = parameters["W1"], parameters["b1"]
    W2, b2 = parameters["W2"], parameters["b2"]
    W3, b3 = parameters["W3"], parameters["b3"]

    # Hidden layer 1
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    # Hidden layer 2
    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    # Output layer
    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    return A3, (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)
def compute_cost(A3, Y):
    """Return the mean cross-entropy between predictions *A3* and labels *Y*.

    The mean is taken over ``Y.shape[1]``; NaN terms are skipped by
    ``np.nansum``.
    """
    m = Y.shape[1]
    positive_term = np.multiply(-np.log(A3), Y)
    negative_term = np.multiply(-np.log(1 - A3), 1 - Y)
    return 1 / m * np.nansum(positive_term + negative_term)
def backward_propagation(X, Y, cache):
    """Compute unregularized gradients for the three-layer network.

    Args:
        X: Input matrix; ``X.shape[1]`` is used as the sample count ``m``.
        Y: Labels, subtracted from the output activation.
        cache: Tuple ``(Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)``.

    Returns:
        Dict with keys ``dZ3, dW3, db3, dA2, dZ2, dW2, db2, dA1, dZ1, dW1,
        db1``.
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    def weight_and_bias_grads(dZ, layer_input):
        # dW is dZ times the transposed layer input; db sums dZ over columns.
        return np.dot(dZ, layer_input.T), np.sum(dZ, axis=1, keepdims=True)

    # The 1/m factor is folded into dZ3 so no later gradient divides by m.
    dZ3 = 1. / m * (A3 - Y)
    dW3, db3 = weight_and_bias_grads(dZ3, A2)

    # (A > 0) cast to integers is the ReLU derivative.
    dA2 = np.dot(W3.T, dZ3)
    dZ2 = dA2 * np.int64(A2 > 0)
    dW2, db2 = weight_and_bias_grads(dZ2, A1)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = dA1 * np.int64(A1 > 0)
    dW1, db1 = weight_and_bias_grads(dZ1, X)

    return {
        "dZ3": dZ3, "dW3": dW3, "db3": db3,
        "dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
        "dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1,
    }
def update_parameters(parameters, grads, learning_rate):
    """Apply one gradient-descent step to every ``W``/``b`` pair.

    The dict passed in is updated (each entry is rebound to a new array) and
    also returned.
    """
    # Two entries (W and b) per layer.
    layer_count = len(parameters) // 2
    for layer in range(1, layer_count + 1):
        w_key, b_key = 'W' + str(layer), "b" + str(layer)
        parameters[w_key] = parameters[w_key] - learning_rate * grads['d' + w_key]
        parameters[b_key] = parameters[b_key] - learning_rate * grads["d" + b_key]
    return parameters
def predict(X, y, parameters):
    """Classify the columns of *X* and print the accuracy against *y*.

    Args:
        X: Input matrix; ``X.shape[1]`` sets the number of predictions.
        y: True labels; row 0 is compared with the predictions.
        parameters: Trained parameter dict passed to ``forward_propagation``.

    Returns:
        A ``(1, m)`` int64 array of 0/1 predictions.
    """
    m = X.shape[1]
    p = np.zeros((1, m), dtype=np.int64)
    a3, _ = forward_propagation(X, parameters)

    # Threshold the sigmoid output at 0.5.
    p[0, :] = a3[0, :] > 0.5

    print("Accuracy: " + str(np.mean((p[0, :] == y[0, :]))))
    return p
def plot_decision_boundary(model, X, y):
    """Plot *model*'s predictions over a grid together with the data points.

    Args:
        model: Callable that receives grid points (one ``(x1, x2)`` pair per
            row) and returns values reshapeable to the grid.
        X: Data whose rows 0 and 1 are the two plotted coordinates.
        y: Labels used to color the scatter plot.
    """
    margin, step = 1, 0.01
    x_lo, x_hi = X[0, :].min() - margin, X[0, :].max() + margin
    y_lo, y_hi = X[1, :].min() - margin, X[1, :].max() + margin

    # Evenly spaced grid covering the data plus a margin on each side.
    grid_x, grid_y = np.meshgrid(np.arange(x_lo, x_hi, step), np.arange(y_lo, y_hi, step))
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    Z = model(grid_points).reshape(grid_x.shape)

    plt.contourf(grid_x, grid_y, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=np.squeeze(y), cmap=plt.cm.Spectral)
    plt.show()
def predict_dec(parameters, X):
    """Return a boolean array marking where the network output exceeds 0.5."""
    a3, _ = forward_propagation(X, parameters)
    return a3 > 0.5
def load_2D_dataset(path='C:/Users/Dell/Desktop/datasets/data.mat'):
    """Load the 2-D dataset from a MATLAB file and plot the training points.

    Args:
        path: Location of the ``.mat`` file. The default is the original
            author's local path; pass your own path on another machine.

    Returns:
        ``(train_X, train_Y, test_X, test_Y)``, the transposes of the file's
        ``X``, ``y``, ``Xval`` and ``yval`` entries.
    """
    # The loaded object is dict-like, with keys X, y, Xval and yval.
    data = scipy.io.loadmat(path)
    train_X = data['X'].T
    train_Y = data['y'].T
    test_X = data['Xval'].T
    test_Y = data['yval'].T

    plt.scatter(train_X[0, :], train_X[1, :], c=np.squeeze(train_Y), s=40, cmap=plt.cm.Spectral)
    pylab.show()

    # Shapes recorded in the original notes for the course file:
    # train_X (2, 211), train_Y (1, 211), test_X (2, 200), test_Y (1, 200).
    return train_X, train_Y, test_X, test_Y
2.加载数据集,构建模型
# Load the dataset (this also shows the training scatter plot).
train_X, train_Y, test_X, test_Y = load_2D_dataset()
# Build and train the network, optionally with L2 regularization or dropout
def model(X, Y, learning_rate=0.3, num_iterations=30000, print_cost=True, is_plot=True, lambd=0, keep_prob=1):
    """Train the three-layer network with optional regularization.

    Args:
        X: Input data; ``X.shape[0]`` sets the input layer size.
        Y: Labels passed to the cost and to backpropagation.
        learning_rate: Gradient-descent step size.
        num_iterations: Number of gradient-descent iterations.
        print_cost: When true, print the cost every 10000 iterations.
        is_plot: When true, plot the recorded costs after training.
        lambd: L2 regularization strength; 0 disables L2.
        keep_prob: Probability of keeping a unit under dropout; 1 disables
            dropout.

    Returns:
        The parameter dict after training.

    Raises:
        ValueError: If ``keep_prob`` is not in ``(0, 1]``.
        AssertionError: If L2 and dropout are both requested.
    """
    # Validate once, before any work. The previous in-loop bare `exit` was a
    # no-op expression, so a bad keep_prob crashed later on an unbound `a3`;
    # keep_prob <= 0 would divide by zero inside the dropout code.
    if not 0 < keep_prob <= 1:
        raise ValueError("keep_prob参数错误!程序退出。")
    # L2 and dropout could be combined, but this experiment uses at most one.
    assert (lambd == 0 or keep_prob == 1)

    use_l2 = lambd != 0
    use_dropout = keep_prob < 1

    costs = []
    layers_dims = [X.shape[0], 20, 3, 1]
    parameters = initialize_parameters(layers_dims)

    for i in range(num_iterations):
        # Forward pass, with or without randomly dropped units.
        if use_dropout:
            a3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)
        else:
            a3, cache = forward_propagation(X, parameters)

        # Cost, with or without the L2 penalty.
        if use_l2:
            cost = compute_cost_with_regularization(a3, Y, parameters, lambd)
        else:
            cost = compute_cost(a3, Y)

        # Backward pass matching the regularization in use.
        if use_l2:
            grads = backward_propagation_with_regularization(X, Y, cache, lambd)
        elif use_dropout:
            grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)
        else:
            grads = backward_propagation(X, Y, cache)

        parameters = update_parameters(parameters, grads, learning_rate)

        # Record every 1000 iterations; print every 10000.
        if i % 1000 == 0:
            costs.append(cost)
            if print_cost and i % 10000 == 0:
                print("第" + str(i) + "次迭代,成本值为:" + str(cost))

    if is_plot:
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('iterations (x1,000)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

    return parameters
3. 不使用正则化、使用L2、使用dropout
# Cost with the L2 penalty added
def compute_cost_with_regularization(A3, Y, parameters, lambd):
    """Return the cross-entropy cost plus the L2 weight penalty.

    The penalty is ``lambd / (2 * m)`` times the sum of squared entries of
    ``W1``, ``W2`` and ``W3``, with ``m = Y.shape[1]``.

    Args:
        A3: Network output passed to ``compute_cost``.
        Y: Labels.
        parameters: Dict containing ``W1``, ``W2`` and ``W3``.
        lambd: Regularization strength.

    Returns:
        The regularized cost.
    """
    m = Y.shape[1]
    W1, W2, W3 = parameters["W1"], parameters["W2"], parameters["W3"]

    cross_entropy_cost = compute_cost(A3, Y)

    squared_weight_total = np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3))
    L2_regularization_cost = lambd * squared_weight_total / (2 * m)

    return cross_entropy_cost + L2_regularization_cost
# Backward pass whose weight gradients include the L2 term
def backward_propagation_with_regularization(X, Y, cache, lambd):
    """Compute gradients of the L2-regularized cost.

    Each ``dW`` gets the extra term ``lambd * W / m``; bias gradients are not
    regularized.

    Args:
        X: Input matrix; ``X.shape[1]`` is used as the sample count ``m``.
        Y: Labels.
        cache: Tuple ``(Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)``.
        lambd: Regularization strength.

    Returns:
        Dict with keys ``dZ3, dW3, db3, dA2, dZ2, dW2, db2, dA1, dZ1, dW1,
        db1``.
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    def weight_and_bias_grads(dZ, layer_input, W):
        # Data term averaged over m, plus the L2 term on the weights only.
        dW = (1 / m) * np.dot(dZ, layer_input.T) + ((lambd * W) / m)
        db = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
        return dW, db

    dZ3 = A3 - Y
    dW3, db3 = weight_and_bias_grads(dZ3, A2, W3)

    # (A > 0) cast to integers is the ReLU derivative.
    dA2 = np.dot(W3.T, dZ3)
    dZ2 = dA2 * np.int64(A2 > 0)
    dW2, db2 = weight_and_bias_grads(dZ2, A1, W2)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = dA1 * np.int64(A1 > 0)
    dW1, db1 = weight_and_bias_grads(dZ1, X, W1)

    return {
        "dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
        "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
        "dZ1": dZ1, "dW1": dW1, "db1": db1,
    }
# LINEAR -> RELU + DROPOUT -> LINEAR -> RELU + DROPOUT -> LINEAR -> SIGMOID.
def forward_propagation_with_dropout(X, parameters, keep_prob=0.5):
    """Forward pass with inverted dropout on both hidden layers.

    Args:
        X: Input matrix multiplied from the left by ``W1``.
        parameters: Dict holding ``W1, b1, W2, b2, W3, b3``.
        keep_prob: Probability of keeping each hidden unit.

    Returns:
        ``(A3, cache)`` where ``cache`` is the tuple ``(Z1, D1, A1, W1, b1,
        Z2, D2, A2, W2, b2, Z3, A3, W3, b3)``; ``D1``/``D2`` are the boolean
        keep masks and ``A1``/``A2`` are the masked, rescaled activations.
    """
    # NOTE: reseeding on every call means each forward pass with the same
    # layer shapes draws the same masks.
    np.random.seed(1)

    W1, b1 = parameters["W1"], parameters["b1"]
    W2, b2 = parameters["W2"], parameters["b2"]
    W3, b3 = parameters["W3"], parameters["b3"]

    # Hidden layer 1: drop units, then divide by keep_prob so the expected
    # activation is unchanged.
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    D1 = np.random.rand(*A1.shape) < keep_prob
    A1 = (A1 * D1) / keep_prob

    # Hidden layer 2: same treatment.
    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    D2 = np.random.rand(*A2.shape) < keep_prob
    A2 = (A2 * D2) / keep_prob

    # Output layer (no dropout).
    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    return A3, (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)
def backward_propagation_with_dropout(X, Y, cache, keep_prob):
    """Backward pass matching ``forward_propagation_with_dropout``.

    The masks stored in the cache are re-applied to ``dA1``/``dA2`` and the
    result is divided by ``keep_prob``, mirroring the forward pass.

    Args:
        X: Input matrix; ``X.shape[1]`` is used as the sample count ``m``.
        Y: Labels.
        cache: Tuple ``(Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3,
            b3)``.
        keep_prob: Keep probability used in the forward pass.

    Returns:
        Dict with keys ``dZ3, dW3, db3, dA2, dZ2, dW2, db2, dA1, dZ1, dW1,
        db1``.
    """
    m = X.shape[1]
    (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) = cache

    def weight_and_bias_grads(dZ, layer_input):
        # Both gradients are averaged over the m samples.
        dW = 1. / m * np.dot(dZ, layer_input.T)
        db = 1. / m * np.sum(dZ, axis=1, keepdims=True)
        return dW, db

    dZ3 = A3 - Y
    dW3, db3 = weight_and_bias_grads(dZ3, A2)

    # Zero the gradient of dropped units and rescale the kept ones.
    dA2 = (np.dot(W3.T, dZ3) * D2) / keep_prob
    dZ2 = dA2 * np.int64(A2 > 0)
    dW2, db2 = weight_and_bias_grads(dZ2, A1)

    dA1 = (np.dot(W2.T, dZ2) * D1) / keep_prob
    dZ1 = dA1 * np.int64(A1 > 0)
    dW1, db1 = weight_and_bias_grads(dZ1, X)

    return {
        "dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
        "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
        "dZ1": dZ1, "dW1": dW1, "db1": db1,
    }
# Experiment 1: baseline without any regularization.
# Banner typo fixed: "不适用" (not applicable) -> "不使用" (not using),
# matching the two banners that follow.
print("======不使用正则化======")
parameters = model(train_X, train_Y,is_plot=True)
print("训练集:")
predictions_train = predict(train_X, train_Y, parameters)
print("测试集:")
predictions_test = predict(test_X, test_Y, parameters)
# Title and axis limits are set on the current axes before the boundary is drawn.
plt.title("Model without regularization")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
# The grid arrives one point per row; predict_dec is given its transpose.
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)
# Experiment 2: L2 regularization with lambd = 0.7.
print("======使用L2 Regularization======")
parameters = model(train_X, train_Y, lambd=0.7,is_plot=True)
print("训练集:")
predictions_train = predict(train_X, train_Y, parameters)
print("测试集:")
predictions_test = predict(test_X, test_Y, parameters)
# Title and axis limits are set on the current axes before the boundary is drawn.
plt.title("Model with L2-regularization")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
# The grid arrives one point per row; predict_dec is given its transpose.
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)
# Experiment 3: dropout with keep_prob = 0.86.
print("======使用dropout======")
parameters = model(train_X, train_Y, keep_prob = 0.86, learning_rate = 0.3)
print ("训练集:")
predictions_train = predict(train_X, train_Y, parameters)
print ("测试集:")
predictions_test = predict(test_X, test_Y, parameters)
# Title and axis limits are set on the current axes before the boundary is drawn.
plt.title("Model with dropout")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
# The grid arrives one point per row; predict_dec is given its transpose.
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)
测试结果及分析
1.在无正则化时,分割曲线有了明显的过拟合特性。
2.L2正则化实际上在做什么?L2正则化依赖于“权重较小的模型比权重较大的模型更简单”这样的假设,因此,通过在成本函数中惩罚权重的平方值,可以将所有权重逐渐压缩到较小的值。λ是可以使用开发集调整的超参数。L2正则化会使决策边界更加平滑。如果λ太大,也可能会“过度平滑”,从而导致模型高偏差。
3.dropout是一种正则化技术。仅在训练期间使用dropout,在测试期间不要使用。在正向和反向传播期间均应用dropout。在训练期间,将每个dropout层除以keep_prob,以保持激活的期望值相同。
4.正则化会损害训练集的性能,这是因为它限制了网络过拟合训练集的能力。 但它最终可以提供更好的测试准确性。
三、梯度检验
1.预先需要的一些函数
import numpy as np
def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``."""
    return 1 / (1 + np.exp(-x))
def relu(x):
    """Return the element-wise maximum of 0 and *x* (ReLU activation)."""
    return np.maximum(0, x)
# Flatten the parameter dict into one column vector
def dictionary_to_vector(parameters):
    """Stack ``W1, b1, W2, b2, W3, b3`` into a single column vector.

    Args:
        parameters: Dict containing the six arrays above.

    Returns:
        ``(theta, keys)``: ``theta`` is the ``(n, 1)`` concatenation of the
        flattened arrays in that order, and ``keys`` lists, for each row of
        ``theta``, the name of the array it came from.
    """
    ordered_names = ["W1", "b1", "W2", "b2", "W3", "b3"]
    # Each array is flattened to a column before stacking.
    columns = [np.reshape(parameters[name], (-1, 1)) for name in ordered_names]

    keys = []
    for name, column in zip(ordered_names, columns):
        keys += [name] * column.shape[0]

    theta = np.concatenate(columns, axis=0)
    return theta, keys
def vector_to_dictionary(theta):
    """Rebuild the parameter dict from a 47-row vector.

    The slice boundaries and shapes are fixed for a 4 -> 5 -> 3 -> 1 network.

    Args:
        theta: Vector whose first 47 rows hold ``W1, b1, W2, b2, W3, b3``
            in that order.

    Returns:
        Dict with ``W1`` (5, 4), ``b1`` (5, 1), ``W2`` (3, 5), ``b2`` (3, 1),
        ``W3`` (1, 3) and ``b3`` (1, 1).
    """
    # (name, start row, stop row, target shape)
    layout = (
        ("W1", 0, 20, (5, 4)),
        ("b1", 20, 25, (5, 1)),
        ("W2", 25, 40, (3, 5)),
        ("b2", 40, 43, (3, 1)),
        ("W3", 43, 46, (1, 3)),
        ("b3", 46, 47, (1, 1)),
    )
    return {name: theta[start:stop].reshape(shape) for name, start, stop, shape in layout}
# Flatten the gradient dict into one column vector
def gradients_to_vector(gradients):
    """Stack ``dW1, db1, dW2, db2, dW3, db3`` into a single column vector.

    Args:
        gradients: Dict containing at least the six arrays above.

    Returns:
        The ``(n, 1)`` concatenation of the flattened arrays in that order.
    """
    ordered_names = ["dW1", "db1", "dW2", "db2", "dW3", "db3"]
    # Each gradient is flattened to a column before stacking.
    columns = [np.reshape(gradients[name], (-1, 1)) for name in ordered_names]
    return np.concatenate(columns, axis=0)
def gradient_check_n_test_case():
    """Return a fixed random test case for the N-dimensional gradient check.

    The generator is seeded with 1, so the values are reproducible.

    Returns:
        ``(x, y, parameters)``: ``x`` is a (4, 3) input, ``y`` is the label
        vector ``[1, 1, 0]`` and ``parameters`` holds random ``W1`` (5, 4),
        ``b1`` (5, 1), ``W2`` (3, 5), ``b2`` (3, 1), ``W3`` (1, 3) and
        ``b3`` (1, 1).
    """
    np.random.seed(1)
    x = np.random.randn(4, 3)
    y = np.array([1, 1, 0])

    # Drawn in this order so the random stream matches the layout below.
    shapes = (
        ("W1", (5, 4)),
        ("b1", (5, 1)),
        ("W2", (3, 5)),
        ("b2", (3, 1)),
        ("W3", (1, 3)),
        ("b3", (1, 1)),
    )
    parameters = {name: np.random.randn(*shape) for name, shape in shapes}
    return x, y, parameters
2.一维梯度检查
# One-dimensional gradient check: the model is J(theta) = theta * x
def forward_propagation(x, theta):
    """Return the cost ``J = theta * x`` of the one-dimensional toy model."""
    return np.dot(theta, x)
def backward_propagation(x, theta):
    """Return dJ/dtheta for ``J = theta * x``, which is simply *x*."""
    return x
def gradient_check(x, theta, epsilon=1e-7):
    """Compare the analytic gradient with a centered finite difference.

    Args:
        x: Input of the one-dimensional model.
        theta: Parameter at which the gradient is checked.
        epsilon: Half-width of the finite-difference step.

    Returns:
        The relative difference
        ``|grad - approx| / (|grad| + |approx|)``. A message is printed
        saying whether it is below 1e-7.
    """
    # Numerical estimate from the cost at theta + epsilon and theta - epsilon.
    J_plus = forward_propagation(x, theta + epsilon)
    J_minus = forward_propagation(x, theta - epsilon)
    gradapprox = (J_plus - J_minus) / (2 * epsilon)

    # Analytic gradient.
    grad = backward_propagation(x, theta)

    # Normalized distance between the two.
    difference = np.linalg.norm(grad - gradapprox) / (np.linalg.norm(grad) + np.linalg.norm(gradapprox))

    message = "梯度检查:梯度正常!" if difference < 1e-7 else "梯度检查:梯度超出阈值!"
    print(message)
    return difference
# Run the one-dimensional check at x = 2, theta = 4 and print the difference.
print("======一维梯度检查======")
x, theta = 2, 4
difference = gradient_check(x, theta)
print("difference = " + str(difference))
3.N维梯度检查
# N-dimensional gradient check: forward pass that also returns the cost
def forward_propagation_n(X, Y, parameters):
    """Run the three-layer network and compute the cross-entropy cost.

    Args:
        X: Input matrix; ``X.shape[1]`` is used as the sample count ``m``.
        Y: Labels.
        parameters: Dict holding ``W1, b1, W2, b2, W3, b3``.

    Returns:
        ``(cost, cache)`` where ``cache`` is the tuple
        ``(Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)``.
    """
    m = X.shape[1]
    W1, b1 = parameters["W1"], parameters["b1"]
    W2, b2 = parameters["W2"], parameters["b2"]
    W3, b3 = parameters["W3"], parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    # Cross-entropy averaged over the m samples.
    positive_term = np.multiply(-np.log(A3), Y)
    negative_term = np.multiply(-np.log(1 - A3), 1 - Y)
    cost = (1 / m) * np.sum(positive_term + negative_term)

    return cost, (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)
def backward_propagation_n(X, Y, cache):
    """Compute the gradients that ``gradient_check_n`` verifies.

    Args:
        X: Input matrix; ``X.shape[1]`` is used as the sample count ``m``.
        Y: Labels.
        cache: Tuple ``(Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)``.

    Returns:
        Dict with keys ``dZ3, dW3, db3, dA2, dZ2, dW2, db2, dA1, dZ1, dW1,
        db1``.
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    def weight_and_bias_grads(dZ, layer_input):
        # Both gradients are averaged over the m samples.
        dW = 1. / m * np.dot(dZ, layer_input.T)
        db = 1. / m * np.sum(dZ, axis=1, keepdims=True)
        return dW, db

    dZ3 = A3 - Y
    dW3, db3 = weight_and_bias_grads(dZ3, A2)

    # (A > 0) cast to integers is the ReLU derivative.
    dA2 = np.dot(W3.T, dZ3)
    dZ2 = dA2 * np.int64(A2 > 0)
    dW2, db2 = weight_and_bias_grads(dZ2, A1)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = dA1 * np.int64(A1 > 0)
    dW1, db1 = weight_and_bias_grads(dZ1, X)

    return {
        "dZ3": dZ3, "dW3": dW3, "db3": db3,
        "dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
        "dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1,
    }
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7, threshold=2e-7):
    """Check backpropagation gradients against centered finite differences.

    Every scalar parameter is nudged by ``+epsilon`` and ``-epsilon``, the
    cost is re-evaluated with ``forward_propagation_n``, and the resulting
    numerical gradient is compared with the analytic one.

    Args:
        parameters: Dict with ``W1, b1, W2, b2, W3, b3``.
        gradients: Dict with ``dW1, db1, dW2, db2, dW3, db3`` from
            ``backward_propagation_n``.
        X: Input data.
        Y: Labels.
        epsilon: Half-width of the finite-difference step.
        threshold: Largest relative difference still reported as correct.
            The former hard-coded 1e-7 sits at the round-off floor for
            ``epsilon=1e-7``, so a correct backward pass could land just
            above it and be reported as a mistake.

    Returns:
        The relative difference
        ``|grad - approx| / (|grad| + |approx|)``.
    """
    # Flatten parameters and gradients into matching column vectors.
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    gradapprox = np.zeros((num_parameters, 1))

    for i in range(num_parameters):
        # np.copy returns an independent array, so nudging one entry leaves
        # parameters_values untouched.
        thetaplus = np.copy(parameters_values)
        thetaplus[i, 0] = thetaplus[i, 0] + epsilon
        J_plus, _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))

        thetaminus = np.copy(parameters_values)
        thetaminus[i, 0] = thetaminus[i, 0] - epsilon
        J_minus, _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))

        # Centered difference for parameter i.
        gradapprox[i] = (J_plus - J_minus) / (2 * epsilon)

    # Normalized distance between the analytic and numerical gradients.
    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    difference = numerator / denominator

    if difference > threshold:
        print(
            "\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print(
            "\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")
    return difference
# Run the N-dimensional check on the fixed test case: one forward pass for
# the cache, one backward pass for the gradients, then the comparison.
print("======N维梯度检查======")
X, Y, parameters = gradient_check_n_test_case()
cost, cache = forward_propagation_n(X, Y, parameters)
gradients = backward_propagation_n(X, Y, cache)
difference = gradient_check_n(parameters, gradients, X, Y)
- 梯度检验可验证反向传播的梯度与梯度的数值近似值之间的接近度(使用正向传播进行计算)。
- 梯度检验很慢,因此不会在每次训练中都运行它。通常,为了确保代码正确而运行。注意梯度检验不适用于dropout,运行不带dropout的梯度检验算法以确保你的backprop是正确的,然后再添加dropout。