Python Basics
$ python3
>>> 1 + 2
3
>>> 1 - 2
-1
>>> 4 * 5
20
>>> 7 / 5
1.4
>>> 3 ** 2
9
>>> type(10)
<class 'int'>
>>> type(2.718)
<class 'float'>
>>> type("hello")
<class 'str'>
>>> x = 10
>>> print(x)
10
>>> x = 100
>>> print(x)
100
>>> y = 3.14
>>> x * y
314.0
>>> type(x * y)
<class 'float'>
>>> a = [1, 2, 3, 4, 5]
>>> print(a)
[1, 2, 3, 4, 5]
>>> len(a)
5
>>> a[0]
1
>>> a[4]
5
>>> a[4] = 99
>>> print(a)
[1, 2, 3, 4, 99]
>>> a[0:2]
[1, 2]
>>> a[1:]
[2, 3, 4, 99]
>>> a[:3]
[1, 2, 3]
>>> a[:-1]
[1, 2, 3, 4]
>>> a[:-2]
[1, 2, 3]
>>> me = {'height' : 100}
>>> me['height']
100
>>> me['height'] = 70
>>> print(me)
{'height': 70}
>>> me['weight'] = 80
>>> print(me)
{'height': 70, 'weight': 80}
>>> hungry = True
>>> sleepy = False
>>> type(hungry)
<class 'bool'>
>>> not hungry
False
>>> hungry and sleepy
False
>>> hungry or sleepy
True
>>> hungry = True
>>> if hungry:
...     print("I'm hungry")
...
I'm hungry
>>> hungry = False
>>> if hungry:
...     print("I'm hungry")
... else:
...     print("I'm not hungry")
...     print("I'm sleepy")
...
I'm not hungry
I'm sleepy
>>> for i in [1, 2, 3]:
...     print(i)
...
1
2
3
>>> def hello():
... print("Hello world")
...
>>> hello()
Hello world
>>> def hello(object):
... print("Hello " + object + "!")
...
>>> hello("cat")
Hello cat!
>>>
Class
class Man:
    def __init__(self, name):
        self.name = name
        print("Initialized!")

    def hello(self):
        print("Hello " + self.name + "!")

    def goodbye(self):
        print("Good-bye " + self.name + "!")

m = Man("David")
m.hello()
m.goodbye()
NumPy
>>> import numpy as np
>>> x = np.array([1.0, 2.0, 3.0])
>>> print(x)
[1. 2. 3.]
>>> type(x)
<class 'numpy.ndarray'>
>>> x = np.array([1.0, 2.0, 3.0])
>>> y = np.array([2.0, 4.0, 6.0])
>>> x + y
array([3., 6., 9.])
>>> x - y
array([-1., -2., -3.])
>>> x * y
array([ 2., 8., 18.])
>>> x / y
array([0.5, 0.5, 0.5])
>>> x = np.array([1.0, 2.0, 3.0])
>>> x / 2.0
array([0.5, 1. , 1.5])
>>> A = np.array([[1,2], [3,4]])
>>> print(A)
[[1 2]
[3 4]]
>>> A.shape
(2, 2)
>>> A.dtype
dtype('int64')
>>> B = np.array([[3,0], [0,6]])
>>> A + B
array([[ 4, 2],
[ 3, 10]])
>>> A * B
array([[ 3, 0],
[ 0, 24]])
>>> print(A)
[[1 2]
[3 4]]
>>> A * 10
array([[10, 20],
[30, 40]])
>>> A = np.array([[1,2], [3,4]])
>>> B = np.array([10, 20])
>>> A * B
array([[10, 40],
[30, 80]])
>>> X = np.array([[51,55], [14, 19], [0, 4]])
>>> print(X)
[[51 55]
[14 19]
[ 0 4]]
>>> X[0] # get the first row of this array
array([51, 55])
>>> X[0][1] # get the element at (0, 1)
55
>>> for row in X:
... print(row)
...
[51 55]
[14 19]
[0 4]
>>> X = X.flatten()
>>> print(X)
[51 55 14 19 0 4]
>>> X[np.array([0, 2, 4])] # get the elements at indices 0, 2, and 4
array([51, 14, 0])
>>> X > 15 # elements greater than 15
array([ True, True, False, True, False, False])
>>> X[X > 15]
array([51, 55, 19])
>>>
Matplotlib
Plot the sin function
import numpy as np
import matplotlib.pyplot as plt
x = np.arange(0, 6, 0.1)
y = np.sin(x)
plt.plot(x, y)
plt.show()
Plot the graphs of the sin and cos functions
import numpy as np
import matplotlib.pyplot as plt
x = np.arange(0, 6, 0.1)
y1 = np.sin(x)
y2 = np.cos(x)
plt.plot(x, y1, label="sin")
plt.plot(x, y2, linestyle = "--", label="cos")
plt.xlabel("x")
plt.ylabel("y")
plt.title('sin & cos')
plt.legend()
plt.show()
Display the image
import matplotlib.pyplot as plt
from matplotlib.image import imread
img = imread('lena.png') # the path of the image
plt.imshow(img)
plt.show()
Perceptron
How the perceptron works
The behavior of a perceptron can be expressed by the equation below:
$$y = \begin{cases} 0 & (w_1 x_1 + w_2 x_2 \le \theta) \\ 1 & (w_1 x_1 + w_2 x_2 > \theta) \end{cases}$$
Simple implementation
def AND(x1, x2):
    w1, w2, theta = 0.5, 0.5, 0.7
    tmp = w1 * x1 + w2 * x2
    if tmp <= theta:
        return 0
    elif tmp > theta:
        return 1

print(AND(0, 0))  # 0
print(AND(0, 1))  # 0
print(AND(1, 0))  # 0
print(AND(1, 1))  # 1
Introducing weights and bias
$b$ is called the bias; $w_1$ and $w_2$ are called weights.
$$y = \begin{cases} 0 & (b + w_1 x_1 + w_2 x_2 \le 0) \\ 1 & (b + w_1 x_1 + w_2 x_2 > 0) \end{cases}$$
Use NumPy to implement a simple neuron:
import numpy as np
x = np.array([0, 1]) # input
w = np.array([0.5, 0.5]) # weight
b = -0.7 # bias
print(w * x)
print(np.sum(w * x))
print(np.sum(w * x) + b)
Use a bias and weights to implement an AND gate:
def AND(x1, x2):
    x = np.array([x1, x2])
    w = np.array([0.5, 0.5])
    b = -0.7
    tmp = np.sum(w * x) + b
    if tmp <= 0:
        return 0
    else:
        return 1
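Its behavior matches the theta-based version above (return values shown as comments):
print(AND(0, 0))  # 0
print(AND(0, 1))  # 0
print(AND(1, 0))  # 0
print(AND(1, 1))  # 1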
NAND gate, OR gate
def NAND(x1, x2):
    x = np.array([x1, x2])
    w = np.array([-0.5, -0.5])  # only the weights and bias differ from AND
    b = 0.7
    tmp = np.sum(w * x) + b
    if tmp <= 0:
        return 0
    else:
        return 1

def OR(x1, x2):
    x = np.array([x1, x2])
    w = np.array([0.5, 0.5])  # only the bias differs from AND
    b = -0.2
    tmp = np.sum(w * x) + b
    if tmp <= 0:
        return 0
    else:
        return 1
Use the AND, NAND, and OR gates to build an XOR gate
def XOR(x1, x2):
    s1 = NAND(x1, x2)
    s2 = OR(x1, x2)
    y = AND(s1, s2)
    return y
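A quick truth-table check, assuming the AND, NAND, and OR definitions above:
print(XOR(0, 0))  # 0
print(XOR(0, 1))  # 1
print(XOR(1, 0))  # 1
print(XOR(1, 1))  # 0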
Neural Network
$$y = \begin{cases} 0 & (b + w_1 x_1 + w_2 x_2 \le 0) \\ 1 & (b + w_1 x_1 + w_2 x_2 > 0) \end{cases}$$
Introduce $h(x)$:
$$y = h(b + w_1 x_1 + w_2 x_2)$$
$$h(x) = \begin{cases} 0 & (x \le 0) \\ 1 & (x > 0) \end{cases}$$
Activation function
$$a = b + w_1 x_1 + w_2 x_2$$
$$y = h(a)$$
Sigmoid function
$h(x) = \frac{1}{1 + \exp(-x)}$, where $\exp(-x)$ denotes $e^{-x}$.
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
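A quick check of the function defined above on a small array; NumPy broadcasting handles array inputs directly:
import numpy as np
x = np.array([-1.0, 1.0, 2.0])
print(sigmoid(x))  # [0.26894142 0.73105858 0.88079708]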
Step function
import numpy as np
import matplotlib.pylab as plt

# def step_function(x):
#     if x > 0:
#         return 1
#     else:
#         return 0

# def step_function(x):
#     y = x > 0
#     return y.astype(np.int)

def step_function(x):
    return np.array(x > 0, dtype=np.int64)

x = np.arange(-5.0, 5.0, 0.1)
y = step_function(x)
plt.plot(x, y)
plt.ylim(-0.1, 1.1)  # range of the y-axis
plt.show()
Sigmoid function and step function:
import numpy as np
import matplotlib.pylab as plt

def step_function(x):
    return np.array(x > 0, dtype=np.int64)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

x = np.arange(-5.0, 5.0, 0.1)
y1 = step_function(x)
y2 = sigmoid(x)
plt.plot(x, y1, linestyle="--")
plt.plot(x, y2)
plt.ylim(-0.1, 1.1)  # range of the y-axis
plt.show()
ReLU function
$$h(x) = \begin{cases} x & (x > 0) \\ 0 & (x \le 0) \end{cases}$$
def relu(x):
    return np.maximum(0, x)
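Because np.maximum compares each element with 0, relu works element-wise on arrays:
x = np.array([-2.0, 0.0, 3.0])
print(relu(x))  # [0. 0. 3.]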
Multidimensional arrays
>>> import numpy as np
>>> A = np.array([1, 2, 3, 4])
>>> print(A)
[1 2 3 4]
>>> np.ndim(A)
1
>>> A.shape
(4,)
>>> A.shape[0]
4
>>> B = np.array([[1,2], [3,4], [5,6]])
>>> print(B)
[[1 2]
[3 4]
[5 6]]
>>> np.ndim(B)
2
>>> B.shape
(3, 2)
Matrix multiplication
>>> A = np.array([[1,2], [3,4]])
>>> A.shape
(2, 2)
>>> B = np.array([[5,6], [7,8]])
>>> B.shape
(2, 2)
>>> np.dot(A, B)
array([[19, 22],
[43, 50]])
>>> A = np.array([[1,2,3], [4,5,6]])
>>> A.shape
(2, 3)
>>> B = np.array([[1,2], [3,4], [5,6]])
>>> B.shape
(3, 2)
>>> np.dot(A, B)
array([[22, 28],
[49, 64]])
>>> C = np.array([[1,2], [3,4]])
>>> C.shape
(2, 2)
>>> A.shape
(2, 3)
>>> np.dot(A, C)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ValueError: shapes (2,3) and (2,2) not aligned: 3 (dim 1) != 2 (dim 0)
>>> A = np.array([[1,2], [3, 4], [5,6]])
>>> A.shape
(3, 2)
>>> B = np.array([7,8])
>>> B.shape
(2,)
>>> np.dot(A, B)
array([23, 53, 83])
Inner product in a neural network
>>> X = np.array([1, 2])
>>> X.shape
(2,)
>>> W = np.array([[1, 3, 5], [2, 4, 6]])
>>> print(W)
[[1 3 5]
[2 4 6]]
>>> W.shape
(2, 3)
>>> Y = np.dot(X, W)
>>> print(Y)
[ 5 11 17]
Multi-layer neural network
In $w^{(1)}_{12}$, the superscript $(1)$ indicates a weight of layer 1; the first subscript (1) is the index of the neuron in the next layer, and the second subscript (2) is the index of the neuron in the previous layer.
In other words, weight subscripts are ordered as "index in the next layer, index in the previous layer".
Implementing signal transmission between layers
For example:
$$a^{(1)}_1 = w^{(1)}_{11} x_1 + w^{(1)}_{12} x_2 + b^{(1)}_1$$
Using matrix multiplication, the weighted sum of layer 1 can be written as:
$$A^{(1)} = X W^{(1)} + B^{(1)}$$
where $A^{(1)}$, $X$, $W^{(1)}$, and $B^{(1)}$ are:
$$A^{(1)} = (a^{(1)}_1 \ a^{(1)}_2 \ a^{(1)}_3), \quad X = (x_1 \ x_2), \quad B^{(1)} = (b^{(1)}_1 \ b^{(1)}_2 \ b^{(1)}_3), \quad W^{(1)} = \begin{pmatrix} w^{(1)}_{11} & w^{(1)}_{21} & w^{(1)}_{31} \\ w^{(1)}_{12} & w^{(1)}_{22} & w^{(1)}_{32} \end{pmatrix}$$
X = np.array([1.0, 0.5])
W1 = np.array([[0.1, 0.3, 0.5], [0.2, 0.4, 0.6]])
B1 = np.array([0.1, 0.2, 0.3])
print(W1.shape) # (2, 3)
print(X.shape) # (2,)
print(B1.shape) # (3,)
A1 = np.dot(X, W1) + B1 # (2,) * (2, 3) + (3,) = (3,)
$W1$ is a $2 \times 3$ array, $X$ is a one-dimensional array with two elements, and sigmoid is used as the activation function:
Z1 = sigmoid(A1)
print(A1) # [0.3, 0.7, 1.1]
print(Z1) # [0.57444252, 0.66818777, 0.75026011]
The output of layer 1 becomes the input of layer 2:
W2 = np.array([[0.1, 0.4], [0.2, 0.5], [0.3, 0.6]])
B2 = np.array([0.1, 0.2])
print(Z1.shape) # (3,)
print(W2.shape) # (3, 2)
print(B2.shape) # (2,)
A2 = np.dot(Z1, W2) + B2 # (3,) * (3, 2) + (2,) = (2,)
Z2 = sigmoid(A2) # (2,)
Activation function of the output layer
def identity_function(x):
    return x

W3 = np.array([[0.1, 0.3], [0.2, 0.4]])
B3 = np.array([0.1, 0.2])
A3 = np.dot(Z2, W3) + B3
Y = identity_function(A3)  # or Y = A3
Complete implementation:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def identity_function(x):
    return x

def init_network():
    network = {}
    network['W1'] = np.array([[0.1, 0.3, 0.5], [0.2, 0.4, 0.6]])
    network['b1'] = np.array([0.1, 0.2, 0.3])
    network['W2'] = np.array([[0.1, 0.4], [0.2, 0.5], [0.3, 0.6]])
    network['b2'] = np.array([0.1, 0.2])
    network['W3'] = np.array([[0.1, 0.3], [0.2, 0.4]])
    network['b3'] = np.array([0.1, 0.2])
    return network

def forward(network, x):
    W1, W2, W3 = network['W1'], network['W2'], network['W3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']
    a1 = np.dot(x, W1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    z2 = sigmoid(a2)
    a3 = np.dot(z2, W3) + b3
    y = identity_function(a3)
    return y

network = init_network()
x = np.array([1.0, 0.5])
y = forward(network, x)
print(y)  # [0.31682708 0.69627909]
Softmax function
$$y_k = \frac{\exp(a_k)}{\sum^{n}_{i=1} \exp(a_i)}$$
>>> import numpy as np
>>> a = np.array([0.3, 2.9, 4.0])
>>> exp_a = np.exp(a)
>>> print(exp_a)
[ 1.34985881 18.17414537 54.59815003]
>>> sum_exp_a = np.sum(exp_a)
>>> print(sum_exp_a)
74.1221542101633
>>> y = exp_a / sum_exp_a
>>> print(y)
[0.01821127 0.24519181 0.73659691]
Softmax function
def softmax(a):
    exp_a = np.exp(a)
    sum_exp_a = np.sum(exp_a)
    y = exp_a / sum_exp_a
    return y
$$y_k = \frac{\exp(a_k)}{\sum^{n}_{i=1} \exp(a_i)} = \frac{C\exp(a_k)}{C\sum^{n}_{i=1} \exp(a_i)} = \frac{\exp(a_k + \log C)}{\sum^{n}_{i=1} \exp(a_i + \log C)} = \frac{\exp(a_k + C')}{\sum^{n}_{i=1} \exp(a_i + C')}$$
>>> a = np.array([1010, 1000, 990])
>>> np.exp(a) / np.sum(np.exp(a))
<stdin>:1: RuntimeWarning: invalid value encountered in divide
array([nan, nan, nan])
>>> c = np.max(a)
>>> a - c
array([ 0, -10, -20])
>>> np.exp(a - c) / np.sum(np.exp(a - c))
array([9.99954600e-01, 4.53978686e-05, 2.06106005e-09])
def softmax(a):
    c = np.max(a)
    exp_a = np.exp(a - c)  # guard against overflow
    sum_exp_a = np.sum(exp_a)
    y = exp_a / sum_exp_a
    return y
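With the overflow countermeasure in place, the earlier problematic input no longer produces nan, and the outputs still sum to 1:
a = np.array([1010, 1000, 990])
y = softmax(a)
print(y)          # [9.99954600e-01 4.53978686e-05 2.06106005e-09]
print(np.sum(y))  # 1.0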
Deep Learning
Mean squared error
$$E = \frac{1}{2} \sum_k (y_k - t_k)^2$$
import numpy as np

y = [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
t = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]

def mean_squared_error(y, t):
    return 0.5 * np.sum((y - t)**2)

print(mean_squared_error(np.array(y), np.array(t)))  # 0.09750000000000003
Cross-entropy error
$$E = - \sum_k t_k \log y_k$$
def cross_entropy_error(y, t):
    delta = 1e-7  # avoid log(0)
    return -np.sum(t * np.log(y + delta))
Mini-batch learning
$$E = - \frac{1}{N} \sum_n \sum_k t_{nk} \log y_{nk}$$
Mini-batch version of the cross-entropy error:
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    batch_size = y.shape[0]
    return -np.sum(t * np.log(y + 1e-7)) / batch_size
    # if t holds class labels instead of one-hot vectors:
    # return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
Numerical differentiation
$$\frac{df(x)}{dx} = \lim_{h \rightarrow 0} \frac{f(x + h) - f(x)}{h}$$
def numerical_diff(f, x):
    h = 1e-4
    return (f(x + h) - f(x - h)) / (2 * h)  # central difference
Partial derivatives
$$\frac{\partial f}{\partial x_0}, \quad \frac{\partial f}{\partial x_1}$$
Gradient
$$\left( \frac{\partial f}{\partial x_0}, \frac{\partial f}{\partial x_1} \right)$$
def numerical_gradient(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)  # array with the same shape as x

    for idx in range(x.size):
        tmp_val = x[idx]
        # compute f(x + h)
        x[idx] = tmp_val + h
        fxh1 = f(x)
        # compute f(x - h)
        x[idx] = tmp_val - h
        fxh2 = f(x)
        grad[idx] = (fxh1 - fxh2) / (2 * h)
        x[idx] = tmp_val  # restore the original value
    return grad
Gradient method
$$x_0 = x_0 - \eta \frac{\partial f}{\partial x_0}$$
$$x_1 = x_1 - \eta \frac{\partial f}{\partial x_1}$$
Gradient descent:
def gradient_descent(f, init_x, lr=0.01, step_num=100):
    x = init_x
    for i in range(step_num):
        grad = numerical_gradient(f, x)
        x -= lr * grad
    return x
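For example, minimizing $f(x_0, x_1) = x_0^2 + x_1^2$ from the starting point $(-3.0, 4.0)$ converges essentially to the minimum at the origin:
def function_2(x):
    return x[0]**2 + x[1]**2

init_x = np.array([-3.0, 4.0])
print(gradient_descent(function_2, init_x=init_x, lr=0.1, step_num=100))
# approximately [-6.1e-10  8.1e-10], i.e. very close to (0, 0)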
Gradient of a neural network
Consider a neural network whose weights form a $2 \times 3$ matrix $W$, and let $L$ denote the loss function.
$$W = \begin{pmatrix} w_{11} & w_{12} & w_{13} \\ w_{21} & w_{22} & w_{23} \end{pmatrix}$$
The gradient is written as $\frac{\partial L}{\partial W}$:
$$\frac{\partial L}{\partial W} = \begin{pmatrix} \frac{\partial L}{\partial w_{11}} & \frac{\partial L}{\partial w_{12}} & \frac{\partial L}{\partial w_{13}} \\ \frac{\partial L}{\partial w_{21}} & \frac{\partial L}{\partial w_{22}} & \frac{\partial L}{\partial w_{23}} \end{pmatrix}$$
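Below is a minimal sketch (not from the text) of estimating this gradient numerically for a single-layer network. It reuses the softmax and cross_entropy_error defined earlier and a 2-D variant of numerical_gradient, since the version above only walks a 1-D array; W, x, and t are made-up example values:
def numerical_gradient_2d(f, x):
    # same central-difference idea as numerical_gradient, but iterates over all
    # entries of a multi-dimensional array
    h = 1e-4
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        tmp = x[idx]
        x[idx] = tmp + h
        fxh1 = f(x)
        x[idx] = tmp - h
        fxh2 = f(x)
        grad[idx] = (fxh1 - fxh2) / (2 * h)
        x[idx] = tmp
        it.iternext()
    return grad

W = np.random.randn(2, 3)   # example weights of the single layer
x = np.array([0.6, 0.9])    # example input
t = np.array([0, 0, 1])     # example one-hot label

def loss(W_):
    y = softmax(np.dot(x, W_))
    return cross_entropy_error(y, t)

dW = numerical_gradient_2d(loss, W)
print(dW)  # gradient with the same shape as W: (2, 3)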
Implementing simple layers
Multiplication layer
class MulLayer:
    def __init__(self):
        self.x = None
        self.y = None

    def forward(self, x, y):
        self.x = x
        self.y = y
        out = x * y
        return out

    def backward(self, dout):
        dx = dout * self.y
        dy = dout * self.x
        return dx, dy
Addition layer
class AddLayer:
    def __init__(self):
        pass

    def forward(self, x, y):
        out = x + y
        return out

    def backward(self, dout):
        dx = dout * 1
        dy = dout * 1
        return dx, dy
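A quick check of MulLayer and AddLayer on the classic "buy 2 apples and 3 oranges with 10% tax" computation graph (the prices and counts are just example values):
apple, apple_num = 100, 2
orange, orange_num = 150, 3
tax = 1.1

mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_fruit_layer = AddLayer()
mul_tax_layer = MulLayer()

# forward
apple_price = mul_apple_layer.forward(apple, apple_num)         # 200
orange_price = mul_orange_layer.forward(orange, orange_num)     # 450
all_price = add_fruit_layer.forward(apple_price, orange_price)  # 650
price = mul_tax_layer.forward(all_price, tax)                   # about 715

# backward, in reverse order of the forward pass
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice)                    # 1.1, 650
dapple_price, dorange_price = add_fruit_layer.backward(dall_price)   # 1.1, 1.1
dapple, dapple_num = mul_apple_layer.backward(dapple_price)          # 2.2, 110
dorange, dorange_num = mul_orange_layer.backward(dorange_price)      # 3.3, 165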
ReLU layer
class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx
Backpropagation
The chain rule: $\frac{\partial z}{\partial x} = \frac{\partial z}{\partial t} \frac{\partial t}{\partial x}$

ReLU: $y = \begin{cases} x & (x > 0) \\ 0 & (x \le 0) \end{cases}$, so $\frac{\partial y}{\partial x} = \begin{cases} 1 & (x > 0) \\ 0 & (x \le 0) \end{cases}$

Sigmoid: $y = \frac{1}{1 + \exp(-x)}$; for the final "/" node ($y = \frac{1}{x}$), $\frac{\partial y}{\partial x} = -\frac{1}{x^2} = -y^2$

Forward: $x \xrightarrow{\times(-1)} -x \xrightarrow{\exp} \exp(-x) \xrightarrow{+1} 1 + \exp(-x) \xrightarrow{/} y$

Backward (right to left through the same nodes): $\frac{\partial L}{\partial y} \xrightarrow{/} -\frac{\partial L}{\partial y} y^2 \xrightarrow{+1} -\frac{\partial L}{\partial y} y^2 \xrightarrow{\exp} -\frac{\partial L}{\partial y} y^2 \exp(-x) \xrightarrow{\times(-1)} \frac{\partial L}{\partial y} y^2 \exp(-x)$

$$\frac{\partial L}{\partial y} y^2 \exp(-x) = \frac{\partial L}{\partial y} \frac{1}{(1 + \exp(-x))^2} \exp(-x) = \frac{\partial L}{\partial y} \frac{1}{1 + \exp(-x)} \frac{\exp(-x)}{1 + \exp(-x)} = \frac{\partial L}{\partial y} y (1 - y)$$
class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        out = 1 / (1 + np.exp(-x))
        self.out = out
        return out

    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out
        return dx
$$\frac{\partial L}{\partial X} = \frac{\partial L}{\partial Y} \cdot W^T, \qquad \frac{\partial L}{\partial W} = X^T \cdot \frac{\partial L}{\partial Y}$$
$$W = \begin{pmatrix} w_{11} & w_{12} & w_{13} \\ w_{21} & w_{22} & w_{23} \end{pmatrix}, \qquad W^T = \begin{pmatrix} w_{11} & w_{21} \\ w_{12} & w_{22} \\ w_{13} & w_{23} \end{pmatrix}$$
$$X = (x_0, x_1, \ldots, x_n), \qquad \frac{\partial L}{\partial X} = \left(\frac{\partial L}{\partial x_0}, \frac{\partial L}{\partial x_1}, \ldots, \frac{\partial L}{\partial x_n}\right)$$
Affine layer
class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x):
        self.x = x
        out = np.dot(x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return dx
Softmax-with-Loss layer
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None  # loss
        self.y = None     # output of softmax
        self.t = None     # teacher data (one-hot vector)

    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size
        return dx
Tricks
SGD
$$W \leftarrow W - \eta \frac{\partial L}{\partial W}$$
class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        for key in params.keys():
            params[key] -= self.lr * grads[key]
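A usage sketch of how these optimizer classes are meant to be driven; network and get_mini_batch are hypothetical stand-ins for a model exposing params and gradient() and for a mini-batch loader:
# hypothetical training loop: `network.params` and `network.gradient(x, t)` are
# assumed to be dicts with matching keys; `get_mini_batch()` is a made-up loader
optimizer = SGD(lr=0.01)
for i in range(10000):
    x_batch, t_batch = get_mini_batch()
    grads = network.gradient(x_batch, t_batch)
    optimizer.update(network.params, grads)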
Momentum
$$v \leftarrow \alpha v - \eta \frac{\partial L}{\partial W}$$
$$W \leftarrow W + v$$
class Momentum:
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None

    def update(self, params, grads):
        if self.v is None:
            self.v = {}
            for key, val in params.items():
                self.v[key] = np.zeros_like(val)

        for key in params.keys():
            self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]
            params[key] += self.v[key]
AdaGrad
$$h \leftarrow h + \frac{\partial L}{\partial W} \odot \frac{\partial L}{\partial W}$$
$$W \leftarrow W - \eta \frac{1}{\sqrt{h}} \frac{\partial L}{\partial W}$$
class AdaGrad:
    def __init__(self, lr=0.01):
        self.lr = lr
        self.h = None

    def update(self, params, grads):
        if self.h is None:
            self.h = {}
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)

        for key in params.keys():  # fixed: iterate over keys, not items
            self.h[key] += grads[key] * grads[key]
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)
Batch Normalization
$$\mu_B \leftarrow \frac{1}{m} \sum_{i=1}^{m} x_i$$
$$\sigma_B^2 \leftarrow \frac{1}{m} \sum_{i=1}^{m} (x_i - \mu_B)^2$$
$$\hat{x}_i \leftarrow \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \varepsilon}}$$
$$y_i \leftarrow \gamma \hat{x}_i + \beta$$
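A minimal NumPy sketch of the forward normalization above for a (batch, features) array; this is not the text's implementation, and gamma, beta, and eps stand for the per-feature scale, shift, and the small constant that prevents division by zero:
import numpy as np

def batch_norm_forward(x, gamma, beta, eps=1e-7):
    mu = x.mean(axis=0)                    # mu_B: per-feature mean over the batch
    var = x.var(axis=0)                    # sigma_B^2: per-feature variance
    x_hat = (x - mu) / np.sqrt(var + eps)  # normalize
    return gamma * x_hat + beta            # scale and shift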
Dropout
class Dropout:
    def __init__(self, dropout_ratio=0.5):
        self.dropout_ratio = dropout_ratio
        self.mask = None

    def forward(self, x, train_flg=True):
        if train_flg:
            self.mask = np.random.rand(*x.shape) > self.dropout_ratio
            return x * self.mask
        else:
            return x * (1.0 - self.dropout_ratio)

    def backward(self, dout):
        return dout * self.mask
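A quick check with a made-up input: during training roughly dropout_ratio of the activations are zeroed at random, while at test time the full signal is kept but scaled by (1 - dropout_ratio):
drop = Dropout(dropout_ratio=0.5)
x = np.ones((2, 5))
print(drop.forward(x, train_flg=True))   # some entries randomly zeroed
print(drop.forward(x, train_flg=False))  # every entry scaled to 0.5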
Convolutional Neural Network
Suppose the input size is $(H, W)$, the filter size is $(FH, FW)$, the output size is $(OH, OW)$, the padding is $P$, and the stride is $S$. The output size is then given by:
$$OH = \frac{H + 2P - FH}{S} + 1, \qquad OW = \frac{W + 2P - FW}{S} + 1$$
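A quick numeric check of the formula with example values (a 7x7 input, 3x3 filter, no padding, stride 1):
H, W, FH, FW, P, S = 7, 7, 3, 3, 0, 1
OH = (H + 2 * P - FH) // S + 1
OW = (W + 2 * P - FW) // S + 1
print(OH, OW)  # 5 5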
Data with $C$ channels, height $H$, and width $W$ has the shape $(C, H, W)$. Filters are written the same way, in (channel, height, width) order: a filter with $C$ channels, height $FH$ (filter height), and width $FW$ (filter width) is written $(C, FH, FW)$.
$$(C, H, W) \ast (C, FH, FW) \rightarrow (1, OH, OW)$$
Convolution with multiple filters
$$(C, H, W) \ast (FN, C, FH, FW) \rightarrow (FN, OH, OW)$$
Processing flow of the convolution operation
$$(C, H, W) \ast (FN, C, FH, FW) \rightarrow (FN, OH, OW) + (FN, 1, 1) \rightarrow (FN, OH, OW)$$
Processing flow of the convolution operation (batched)
$$(N, C, H, W) \ast (FN, C, FH, FW) \rightarrow (N, FN, OH, OW) + (FN, 1, 1) \rightarrow (N, FN, OH, OW)$$
class Convolution:
    def __init__(self, W, b, stride=1, pad=0):
        self.W = W
        self.b = b
        self.stride = stride
        self.pad = pad

    def forward(self, x):
        FN, C, FH, FW = self.W.shape
        N, C, H, W = x.shape
        out_h = int(1 + (H + 2*self.pad - FH) / self.stride)
        out_w = int(1 + (W + 2*self.pad - FW) / self.stride)

        col = im2col(x, FH, FW, self.stride, self.pad)
        col_W = self.W.reshape(FN, -1).T  # unroll the filters
        out = np.dot(col, col_W) + self.b

        out = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)
        return out
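A hedged shape check: it assumes an im2col(x, FH, FW, stride, pad) helper like the one this implementation is usually paired with, and the input and filter sizes are example values:
x = np.random.rand(10, 3, 7, 7)   # (N, C, H, W)
W_ = np.random.rand(8, 3, 3, 3)   # (FN, C, FH, FW)
b = np.zeros(8)
conv = Convolution(W_, b, stride=1, pad=0)
print(conv.forward(x).shape)      # (10, 8, 5, 5), matching the output-size formula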
Pooling layer
class Pooling:
    def __init__(self, pool_h, pool_w, stride=1, pad=0):
        self.pool_h = pool_h
        self.pool_w = pool_w
        self.stride = stride
        self.pad = pad

    def forward(self, x):
        N, C, H, W = x.shape
        out_h = int(1 + (H - self.pool_h) / self.stride)
        out_w = int(1 + (W - self.pool_w) / self.stride)

        # unroll the input (1)
        col = im2col(x, self.pool_h, self.pool_w, self.stride, self.pad)
        col = col.reshape(-1, self.pool_h*self.pool_w)

        # take the max of each row (2)
        out = np.max(col, axis=1)

        # reshape back (3)
        out = out.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2)
        return out