深度学习入门

Python Basic


$ python3

>>> 1 + 2
3

>>> 1 - 2
-1

>>> 4 * 5
20

>>> 7 / 5
1.4

>>> 3 ** 2
9

>>> type(10)
<class 'int'>

>>> type(2.718)
<class 'float'>

>>> type("hello")
<class 'str'>

>>> x = 10
>>> print(x)
10

>>> x = 100
>>> print(x)
100

>>> y = 3.14
>>> x * y
314.0

>>> type(x * y)
<class 'float'>

>>> a = [1, 2, 3, 4, 5]
>>> print(a)
[1, 2, 3, 4, 5]

>>> len(a)
5

>>> a[0]
1

>>> a[4]
5

>>> a[4] = 99
>>> print(a)
[1, 2, 3, 4, 99]

>>> a[0:2]
[1, 2]

>>> a[1:]
[2, 3, 4, 99]

>>> a[:3]
[1, 2, 3]

>>> a[:-1]
[1, 2, 3, 4]

>>> a[:-2]
[1, 2, 3]

>>> me = {'height' : 100}
>>> me['height']
100

>>> me['height'] = 70
>>> print(me)
{'height': 70}

>>> me['weight'] = 80
>>> print(me)
{'height': 70, 'weight': 80}

>>> hungry = True
>>> sleepy = False
>>> type(hungry)
<class 'bool'>

>>> not hungry
False

>>> hungry and sleepy

False

>>> hungry or sleepy
True

>>> hungry = True
>>> if hungry:
...     print("I'm hungry")
...

I'm hungry

>>> hungry = False
>>> if hungry:
...     print("I'm hungry")
... else:
...     print("I'm not hungry")
...     print("I'm sleepy")
...

I'm not hungry
I'm sleepy

>>> for i in [1, 2, 3]:
...     print(i)
...

1
2
3

>>> def hello():
...     print("Hello world")
...

>>> hello()
Hello world

>>> def hello(object):
...     print("Hello " + object + "!")
...

>>> hello("cat")
Hello cat!

>>>

Class


class Man:
    """A named person that can print a greeting and a farewell."""

    def __init__(self, name):
        """Remember ``name`` and print a confirmation message."""
        self.name = name
        print("Initialized!")

    def hello(self):
        """Print a greeting addressed to the stored name."""
        greeting = "Hello " + self.name + "!"
        print(greeting)

    def goodbye(self):
        """Print a farewell addressed to the stored name."""
        farewell = "Good-bye " + self.name + "!"
        print(farewell)

m = Man("David")
m.hello()
m.goodbye()

Numpy


>>> import numpy as np
>>> x = np.array([1.0, 2.0, 3.0])
>>> print(x)
[1. 2. 3.]

>>> type(x)
<class 'numpy.ndarray'>

>>> x = np.array([1.0, 2.0, 3.0])
>>> y = np.array([2.0, 4.0, 6.0])
>>> x + y
array([3., 6., 9.])

>>> x - y
array([-1., -2., -3.])

>>> x * y
array([ 2., 8., 18.])

>>> x / y
array([0.5, 0.5, 0.5])

>>> x = np.array([1.0, 2.0, 3.0])
>>> x / 2.0
array([0.5, 1. , 1.5])

>>> A = np.array([[1,2], [3,4]])
>>> print(A)
[[1 2]
[3 4]]

>>> A.shape
(2, 2)

>>> A.dtype
dtype('int64')

>>> B = np.array([[3,0], [0,6]])
>>> A + B
array([[ 4, 2],
[ 3, 10]])

>>> A * B
array([[ 3, 0],
[ 0, 24]])

>>> print(A)
[[1 2]
[3 4]]

>>> A * 10
array([[10, 20],
[30, 40]])

>>> A = np.array([[1,2], [3,4]])
>>> B = np.array([10, 20])
>>> A * B
array([[10, 40],
[30, 80]])

>>> X = np.array([[51,55], [14, 19], [0, 4]])
>>> print(X)
[[51 55]
[14 19]
[ 0 4]]

>>> X[0] # get the first row of this array
array([51, 55])

>>> X[0][1] # get the number of (0, 1)
55

>>> for row in X:
...     print(row)
...

[51 55]
[14 19]
[0 4]

>>> X = X.flatten()
>>> print(X)
[51 55 14 19 0 4]

>>> X[np.array([0, 2, 4])] # get the number whose the index is 0, 2, 4
array([51, 14, 0])

>>> X > 15 # filter the number that greater than 15
array([ True, True, False, True, False, False])

>>> X[X > 15]
array([51, 55, 19])

>>>

Matplotlib

Plot the sin function

import numpy as np
import matplotlib.pyplot as plt

x = np.arange(0, 6, 0.1)
y = np.sin(x)

plt.plot(x, y)
plt.show()

Plot the sin and cos functions

import numpy as np
import matplotlib.pyplot as plt

  

x = np.arange(0, 6, 0.1)
y1 = np.sin(x)
y2 = np.cos(x)

plt.plot(x, y1, label="sin")
plt.plot(x, y2, linestyle = "--", label="cos")

plt.xlabel("x")
plt.ylabel("y")
plt.title('sin & cos')

plt.legend()
plt.show()

Display the image

import matplotlib.pyplot as plt
from matplotlib.image import imread

img = imread('lena.png') # the path of the image
plt.imshow(img)
plt.show()

Perceptron

感知机的运行原理

The equation below can represent the behavior of the perceptron

$$y = \begin{cases} 0 & (w_1 x_1 + w_2 x_2 \le \theta) \\ 1 & (w_1 x_1 + w_2 x_2 \gt \theta) \end{cases}$$

Simple implementation


def AND(x1, x2):
    """Perceptron AND gate using a fixed threshold.

    Returns 1 when ``0.5 * x1 + 0.5 * x2`` exceeds the threshold 0.7 and
    0 when it does not.
    """
    weights = (0.5, 0.5)
    threshold = 0.7
    weighted_sum = weights[0] * x1 + weights[1] * x2

    if weighted_sum > threshold:
        return 1
    if weighted_sum <= threshold:
        return 0

print(AND(0, 0))
print(AND(0, 1))
print(AND(1, 0))
print(AND(1, 1))

导入权重和偏置

$b$ is called the bias; $w_1$ and $w_2$ are called the weights.

$$y = \begin{cases} 0 & (b + w_1 x_1 + w_2 x_2 \le 0) \\ 1 & (b + w_1 x_1 + w_2 x_2 \gt 0) \end{cases}$$

use numpy to complete a simple neuron

import numpy as np
x = np.array([0, 1]) # input
w = np.array([0.5, 0.5]) # weight
b = -0.7 # bias

print(w * x)
print(np.sum(w * x))
print(np.sum(w * x) + b)

use bias and weight to complete an AND gate

def AND(x1, x2):
    """Perceptron AND gate expressed with weights and a bias.

    Returns 1 when ``sum(w * x) + b`` is positive, otherwise 0.
    """
    inputs = np.array([x1, x2])
    weights = np.array([0.5, 0.5])
    bias = -0.7
    activation = (weights * inputs).sum() + bias
    return 0 if activation <= 0 else 1

NAND gate, OR gate

def NAND(x1, x2):
    """Perceptron NAND gate.

    Same structure as the AND gate; only the weights and the bias differ.
    Returns 1 when ``sum(w * x) + b`` is positive, otherwise 0.
    """
    inputs = np.array([x1, x2])
    weights = np.array([-0.5, -0.5])
    bias = 0.7
    activation = (weights * inputs).sum() + bias
    return 0 if activation <= 0 else 1


def OR(x1, x2):
    """Perceptron OR gate.

    Same structure as the AND gate; only the bias differs.
    Returns 1 when ``sum(w * x) + b`` is positive, otherwise 0.
    """
    inputs = np.array([x1, x2])
    weights = np.array([0.5, 0.5])
    bias = -0.2
    activation = (weights * inputs).sum() + bias
    return 0 if activation <= 0 else 1

Use the AND, NAND and OR gates to build an XOR gate

def XOR(x1, x2):
    """XOR gate built as a two-layer combination: AND(NAND(x1, x2), OR(x1, x2))."""
    nand_out = NAND(x1, x2)
    or_out = OR(x1, x2)
    return AND(nand_out, or_out)

Neural Network

$$y = \begin{cases} 0 & (b + w_1 x_1 + w_2 x_2 \le 0) \\ 1 & (b + w_1 x_1 + w_2 x_2 \gt 0) \end{cases}$$

引入 $h(x)$

$$y = h(b + w_1 x_1 + w_2 x_2)$$

$$h(x) = \begin{cases} 0 & (x \le 0) \\ 1 & (x \gt 0) \end{cases}$$

激活函数

activation function

$$a = b + w_1 x_1 + w_2 x_2$$
$$y = h(a)$$

sigmoid 函数

Sigmoid function

$h(x) = \frac{1}{1 + \exp(-x)}$, where $\exp(-x)$ represents $e^{-x}$.

def sigmoid(x):
    """Return the element-wise sigmoid ``1 / (1 + exp(-x))`` of ``x``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

Step function


import numpy as np
import matplotlib.pylab as plt

# def step_function(x):
# if x > 0:
#     return 1
# else:
#     return 0


# def step_function(x):
# y = x > 0
# return y.astype(np.int)

def step_function(x):
    """Return 1 where ``x > 0`` and 0 elsewhere, as an int64 NumPy array."""
    is_positive = x > 0
    return np.array(is_positive, dtype=np.int64)


# Evaluate the step function on [-5.0, 5.0) in steps of 0.1 and plot it.
x = np.arange(-5.0, 5.0, 0.1)
y = step_function(x)
plt.plot(x, y)
plt.ylim(-0.1, 1.1)  # set the displayed range of the y axis
plt.show()

Sigmoid function and step function:


import numpy as np
import matplotlib.pylab as plt


# def step_function(x):
#     if x > 0:
#         return 1
#     else:
#         return 0


# def step_function(x):
#     y = x > 0
#     return y.astype(np.int)


def step_function(x):
    """Return 1 where ``x > 0`` and 0 elsewhere, as an int64 NumPy array."""
    is_positive = x > 0
    return np.array(is_positive, dtype=np.int64)


def sigmoid(x):
    """Return the element-wise sigmoid ``1 / (1 + exp(-x))`` of ``x``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator


# Evaluate both activation functions on [-5.0, 5.0) in steps of 0.1 and
# draw them on the same axes (step function dashed, sigmoid solid).
x = np.arange(-5.0, 5.0, 0.1)
y1 = step_function(x)
y2 = sigmoid(x)
plt.plot(x, y1, linestyle = "--")
plt.plot(x, y2)
plt.ylim(-0.1, 1.1)  # set the displayed range of the y axis
plt.show()

ReLU函数

ReLU function

$$h(x) = \begin{cases} x & (x \gt 0) \\ 0 & (x \le 0) \end{cases}$$

def relu(x):
    """Return the element-wise ReLU of ``x``, i.e. ``max(0, x)``."""
    rectified = np.maximum(0, x)
    return rectified

多维数组

>>> import numpy as np
>>> A = np.array([1, 2, 3, 4])
>>> print(A)
[1 2 3 4]
>>> np.ndim(A)
1
>>> A.shape
(4,)
>>> A.shape[0]
4

>>> B = np.array([[1,2], [3,4], [5,6]])
>>> print(B)
[[1 2]
 [3 4]
 [5 6]]
>>> np.ndim(B)
2
>>> B.shape
(3, 2)

矩阵乘法

>>> A = np.array([[1,2], [3,4]])
>>> A.shape
(2, 2)
>>> B = np.array([[5,6], [7,8]])
>>> B.shape
(2, 2)
>>> np.dot(A, B)
array([[19, 22],
       [43, 50]])
>>> A = np.array([[1,2,3], [4,5,6]])
>>> A.shape
(2, 3)

>>> B = np.array([[1,2], [3,4], [5,6]])
>>> B.shape
(3, 2)

>>> np.dot(A, B)
array([[22, 28],
       [49, 64]])
>>> C = np.array([[1,2], [3,4]])  
>>> C.shape
(2, 2)  
>>> A.shape  
(2, 3)  
>>> np.dot(A, C)  
Traceback (most recent call last):  
	File "<stdin>", line 1, in <module>  
ValueError: shapes (2,3) and (2,2) not aligned: 3 (dim 1) != 2 (dim 0)

>>> A = np.array([[1,2], [3, 4], [5,6]])  
>>> A.shape  
(3, 2)  
  
>>> B = np.array([7,8])    
>>> B.shape  
(2,)  
  
>>> np.dot(A, B)  
array([23, 53, 83])
>>> A = np.array([[1,2], [3, 4], [5,6]])  
>>> A.shape  
(3, 2)  
  
>>> B = np.array([7,8])    
>>> B.shape  
(2,)  

>>> np.dot(A, B)  
array([23, 53, 83])

神经网络内积

>>> X = np.array([1, 2])  
>>> X.shape  
(2,)  
  
>>> W = np.array([[1, 3, 5], [2, 4, 6]])  
>>> print(W)  
[[1 3 5]  
[2 4 6]]  
  
>>> W.shape
(2, 3)  
>>> Y = np.dot(X, W)  
>>> print(Y)
[ 5 11 17]

多层神经网络

$w^{(1)}_{12}$:上标 $(1)$ 表示第1层的权重,下标中的 1 表示后一层的第1个神经元,2 表示前一层的第2个神经元
权重右下角按照“后一层的索引号、前一层的索引号”的顺序排列

各层间信号传递的实现

例如:

$$a^{(1)}_1 = w^{(1)}_{11} x_1 + w^{(1)}_{12} x_2 + b^{(1)}_1$$

如果使用矩阵的乘法运算,则可以将第1层的加权和表示成下面的式
$$A^{(1)} = XW^{(1)} + B^{(1)}$$

其中,$A^{(1)}$、$X$、$W^{(1)}$、$B^{(1)}$ 如下所示:

$A^{(1)} = (a^{(1)}_1 \ a^{(1)}_2 \ a^{(1)}_3)$, $X = (x_1 \ x_2)$, $B^{(1)} = (b^{(1)}_1 \ b^{(1)}_2 \ b^{(1)}_3)$, $W^{(1)} = \begin{pmatrix} w^{(1)}_{11} & w^{(1)}_{21} & w^{(1)}_{31} \\ w^{(1)}_{12} & w^{(1)}_{22} & w^{(1)}_{32} \end{pmatrix}$

X = np.array([1.0, 0.5])
W1 = np.array([[0.1, 0.3, 0.5], [0.2, 0.4, 0.6]])
B1 = np.array([0.1, 0.2, 0.3])

print(W1.shape) # (2, 3)
print(X.shape) # (2,)
print(B1.shape) # (3,)  

A1 = np.dot(X, W1) + B1 # (2,) * (2, 3) + (3,) = (3,)

$W1$ 为 $2 \times 3$ 的数组,$X$ 为元素个数为 2 的一维数组,这里使用 sigmoid 作为激活函数

Z1 = sigmoid(A1)  
  
print(A1) # [0.3, 0.7, 1.1]  
print(Z1) # [0.57444252, 0.66818777, 0.75026011]

第一层的输出变成第二层的输入

W2 = np.array([[0.1, 0.4], [0.2, 0.5], [0.3, 0.6]])
B2 = np.array([0.1, 0.2])

print(Z1.shape) # (3,)
print(W2.shape) # (3, 2)
print(B2.shape) # (2,)

A2 = np.dot(Z1, W2) + B2 # (3,) * (3, 2) + (2,) = (2,)
Z2 = sigmoid(A2) # (2,)

激活函数

def identity_function(x):  
	"""Return ``x`` unchanged; used here as the output-layer activation."""
	return x  

# Weights and biases of the output layer (2 inputs -> 2 outputs).
W3 = np.array([[0.1, 0.3], [0.2, 0.4]])  
B3 = np.array([0.1, 0.2])  

# Weighted sum of the second hidden layer's output, passed through the
# identity activation.
A3 = np.dot(Z2, W3) + B3  
Y = identity_function(A3) # or simply: Y = A3

代码实现

import numpy as np

def sigmoid(x):
    """Return the element-wise sigmoid ``1 / (1 + exp(-x))`` of ``x``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator


def identity_function(x):
	"""Return ``x`` unchanged; used as the output-layer activation in ``forward``."""
	return x


def init_network():
    """Build the fixed parameters of the 3-layer example network.

    Returns:
        dict mapping ``'W1'``/``'b1'``, ``'W2'``/``'b2'``, ``'W3'``/``'b3'``
        to the weight matrix and bias vector of each layer
        (layer sizes 2 -> 3 -> 2 -> 2).
    """
    return {
        'W1': np.array([[0.1, 0.3, 0.5], [0.2, 0.4, 0.6]]),
        'b1': np.array([0.1, 0.2, 0.3]),
        'W2': np.array([[0.1, 0.4], [0.2, 0.5], [0.3, 0.6]]),
        'b2': np.array([0.1, 0.2]),
        'W3': np.array([[0.1, 0.3], [0.2, 0.4]]),
        'b3': np.array([0.1, 0.2]),
    }


def forward(network, x):
    """Run the forward pass of the 3-layer network.

    Args:
        network: parameter dict with keys ``'W1'``..``'W3'`` and
            ``'b1'``..``'b3'`` (as produced by ``init_network``).
        x: input array.

    Returns:
        The output of the last layer: two sigmoid hidden layers followed
        by an affine layer with the identity activation.
    """
    weights = (network['W1'], network['W2'], network['W3'])
    biases = (network['b1'], network['b2'], network['b3'])

    hidden1 = sigmoid(np.dot(x, weights[0]) + biases[0])
    hidden2 = sigmoid(np.dot(hidden1, weights[1]) + biases[1])
    return identity_function(np.dot(hidden2, weights[2]) + biases[2])


network = init_network()
x = np.array([1.0, 0.5])
y = forward(network, x)
print(y)

softmax 函数

$$y_k = \frac{\exp(a_k)}{\sum^{n}_{i = 1} \exp(a_i)}$$

>>> import numpy as np
>>> a = np.array([0.3, 2.9, 4.0])
>>> exp_a = np.exp(a)
>>> print(exp_a)
[ 1.34985881 18.17414537 54.59815003]
>>> sum_exp_a = np.sum(exp_a)
>>> print(sum_exp_a)
74.1221542101633
>>> y = exp_a / sum_exp_a
>>> print(y)
[0.01821127 0.24519181 0.73659691]

Softmax function

def softmax(a):
    """Return the softmax of ``a``: ``exp(a)`` divided by the sum of ``exp(a)``.

    This is the direct (naive) form with no overflow protection.
    """
    exponentials = np.exp(a)
    return exponentials / exponentials.sum()

$$\begin{aligned} y_k &= \frac{\exp(a_k)}{\sum^{n}_{i = 1} \exp(a_i)} = \frac{C\exp(a_k)}{C\sum^{n}_{i = 1}\exp(a_i)} \\ &= \frac{\exp(a_k + \log C)}{\sum^{n}_{i = 1} \exp(a_i + \log C)} \\ &= \frac{\exp(a_k + C')}{\sum^{n}_{i = 1} \exp(a_i + C')} \end{aligned}$$

>>> a = np.array([1010, 1000, 990])
>>> np.exp(a) / np.sum(np.exp(a))
<stdin>:1: RuntimeWarning: invalid value encountered in divide
array([nan, nan, nan])
>>> c = np.max(a)
>>> a - c
array([  0, -10, -20])
>>> np.exp(a - c) / np.sum(np.exp(a - c))
array([9.99954600e-01, 4.53978686e-05, 2.06106005e-09])
def softmax(a):
    """Numerically stable softmax along the last axis.

    The maximum is subtracted before exponentiating; this leaves the result
    unchanged mathematically but keeps ``exp`` from overflowing for large
    inputs such as ``[1010, 1000, 990]``.

    Args:
        a: 1-D array of scores, or an N-D array whose last axis holds the
            scores of each sample (e.g. a ``(batch, classes)`` matrix).

    Returns:
        Array of the same shape as ``a`` whose entries along the last axis
        are non-negative and sum to 1.
    """
    c = np.max(a, axis=-1, keepdims=True)
    exp_a = np.exp(a - c)  # overflow countermeasure
    sum_exp_a = np.sum(exp_a, axis=-1, keepdims=True)
    y = exp_a / sum_exp_a
    return y

Deep Learning

均方误差

mean squared error

$$E = \frac{1}{2} \sum_k (y_k - t_k)^2$$

import numpy as np

y = [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
t = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]

def mean_squared_error(y, t):
    """Return half the sum of squared differences between ``y`` and ``t``."""
    residual = y - t
    return 0.5 * np.sum(residual ** 2)

print(mean_squared_error(np.array(y), np.array(t)))

交叉熵误差

cross entropy error

$$E = - \sum_k t_k \log y_k$$

def cross_entropy_error(y, t):
    """Return the cross-entropy error ``-sum(t * log(y))``.

    A small constant is added to ``y`` before taking the logarithm so that
    ``log(0)`` (negative infinity) never occurs.
    """
    epsilon = 1e-7
    log_likelihood = t * np.log(y + epsilon)
    return -np.sum(log_likelihood)

mini-batch 学习

$$E = - \frac{1}{N} \sum_n \sum_k t_{nk} \log y_{nk}$$

mini-batch 版交叉熵误差

def cross_entropy_error(y, t):
    """Return the cross-entropy error averaged over a mini-batch.

    Args:
        y: network output, either 1-D (one sample) or 2-D with one row per
            sample.
        t: one-hot target with the same layout as ``y``.

    Returns:
        ``-sum(t * log(y + 1e-7))`` divided by the number of rows.
    """
    single_sample = y.ndim == 1
    if single_sample:
        # Promote a single sample to a batch of one row.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    n_samples = y.shape[0]
    total = np.sum(t * np.log(y + 1e-7))
    # Alternative when t holds class labels instead of one-hot vectors:
    # return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
    return -total / n_samples

导数

numerical differentiation

$$\frac{df(x)}{dx} = \lim_{h \rightarrow 0} \frac{f(x + h) - f(x)}{h}$$

def numerical_diff(f, x):
    """Approximate the derivative of ``f`` at ``x`` by a central difference.

    Uses a step of ``1e-4`` and returns ``(f(x + h) - f(x - h)) / (2 * h)``.
    """
    step = 1e-4
    forward_value = f(x + step)
    backward_value = f(x - step)
    return (forward_value - backward_value) / (2 * step)

偏导数

$$\frac{\partial f}{\partial x_0}, \frac{\partial f}{\partial x_1}$$

梯度

$$\left(\frac{\partial f}{\partial x_0}, \frac{\partial f}{\partial x_1}\right)$$

def numerical_gradient(f, x):
    """Approximate the gradient of ``f`` at ``x`` by central differences.

    Every element of ``x`` is perturbed in turn by ``+h`` and ``-h``.
    Elements are addressed through flat indexing, so ``x`` may have any
    number of dimensions (e.g. a 2-D weight matrix), not only one.

    Args:
        f: callable taking the array ``x`` and returning a scalar.
        x: NumPy array at which to evaluate the gradient. It is modified
            temporarily while probing and restored before returning.
            Assumed to have a floating-point dtype; with an integer dtype
            the ``h`` perturbation would be truncated.

    Returns:
        Array with the same shape as ``x`` holding the partial derivative
        of ``f`` with respect to each element.
    """
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)  # same shape as x

    for idx in range(x.size):
        tmp_val = x.flat[idx]

        # f(x + h)
        x.flat[idx] = tmp_val + h
        fxh1 = f(x)

        # f(x - h)
        x.flat[idx] = tmp_val - h
        fxh2 = f(x)

        grad.flat[idx] = (fxh1 - fxh2) / (2 * h)
        x.flat[idx] = tmp_val  # restore the original value

    return grad

梯度法

$$x_0 = x_0 - \eta \frac{\partial f}{\partial x_0}$$

$$x_1 = x_1 - \eta \frac{\partial f}{\partial x_1}$$

梯度下降法:

def gradient_descent(f, init_x, lr=0.01, step_num=100):
    """Minimise ``f`` by repeated gradient steps.

    Args:
        f: function to minimise, passed on to ``numerical_gradient``.
        init_x: starting point. The update uses ``-=``, so when this is a
            NumPy array it is modified in place and also returned.
        lr: learning rate (step size multiplier).
        step_num: number of update steps.

    Returns:
        The position after ``step_num`` updates.
    """
    position = init_x
    for _ in range(step_num):
        position -= lr * numerical_gradient(f, position)
    return position

神经网络的梯度

设神经网络的权重 $W$ 的形状为 $2 \times 3$,$L$ 表示损失函数

$$W = \begin{pmatrix} w_{11} & w_{12} & w_{13} \\ w_{21} & w_{22} & w_{23} \end{pmatrix}$$

梯度用 $\frac{\partial L}{\partial W}$ 表示

$$\frac{\partial L}{\partial W} = \begin{pmatrix} \frac{\partial L}{\partial w_{11}} & \frac{\partial L}{\partial w_{12}} & \frac{\partial L}{\partial w_{13}} \\ \frac{\partial L}{\partial w_{21}} & \frac{\partial L}{\partial w_{22}} & \frac{\partial L}{\partial w_{23}} \end{pmatrix}$$

简单层的实现

乘法层
class MulLayer:
    """Multiplication node of a computational graph."""

    def __init__(self):
        # Inputs of the most recent forward pass, needed by backward().
        self.x = None
        self.y = None

    def forward(self, x, y):
        """Remember both inputs and return their product."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Return ``(dx, dy)``: the upstream gradient times the *other* input."""
        grad_x = dout * self.y
        grad_y = dout * self.x
        return grad_x, grad_y
加法层
class AddLayer:
    """Addition node of a computational graph (keeps no state)."""

    def __init__(self):
        pass

    def forward(self, x, y):
        """Return the sum of the two inputs."""
        return x + y

    def backward(self, dout):
        """Return ``(dx, dy)``; addition passes the upstream gradient through unchanged."""
        grad_x = dout * 1
        grad_y = dout * 1
        return grad_x, grad_y

ReLU 层

class Relu:
    """ReLU activation layer with forward and backward passes."""

    def __init__(self):
        # Boolean array marking the inputs that were <= 0 in forward().
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every element ``<= 0`` replaced by 0."""
        nonpositive = (x <= 0)
        self.mask = nonpositive
        result = x.copy()
        result[nonpositive] = 0
        return result

    def backward(self, dout):
        """Zero the gradient where the forward input was ``<= 0``.

        Note: ``dout`` is modified in place and the same array is returned.
        """
        dout[self.mask] = 0
        return dout

误差反向传播法

$$\frac{\partial z}{\partial x} = \frac{\partial z}{\partial t} \frac{\partial t}{\partial x}$$

$y = \begin{cases} x & (x \gt 0) \\ 0 & (x \le 0) \end{cases}$ , $\frac{\partial y}{\partial x} = \begin{cases} 1 & (x \gt 0) \\ 0 & (x \le 0) \end{cases}$

$y = \frac{1}{1 + \exp(-x)}$ ;其中“/”节点表示 $y = \frac{1}{x}$,其导数为 $\frac{\partial y}{\partial x} = -\frac{1}{x^2} = -y^2$

$$\begin{matrix} x & \overset{\times(-1)}{\rightarrow} & -x & \overset{\exp}{\rightarrow} & \exp(-x) & \overset{+1}{\rightarrow} & 1 + \exp(-x) & \overset{/}{\rightarrow} & y \\ \frac{\partial L}{\partial y} y^2 \exp(-x) & \overset{\times(-1)}{\leftarrow} & -\frac{\partial L}{\partial y} y^2 \exp(-x) & \overset{\exp}{\leftarrow} & -\frac{\partial L}{\partial y} y^2 & \overset{+1}{\leftarrow} & -\frac{\partial L}{\partial y} y^2 & \overset{/}{\leftarrow} & \frac{\partial L}{\partial y} \end{matrix}$$

$$\begin{aligned} \frac{\partial L}{\partial y} y^2 \exp(-x) &= \frac{\partial L}{\partial y} \frac{1}{(1 + \exp(-x))^2} \exp(-x) \\ &= \frac{\partial L}{\partial y} \frac{1}{1 + \exp(-x)} \frac{\exp(-x)}{1 + \exp(-x)} \\ &= \frac{\partial L}{\partial y} y (1 - y) \end{aligned}$$

class Sigmoid:
    """Sigmoid activation layer with forward and backward passes."""

    def __init__(self):
        # Output of the most recent forward pass, reused by backward().
        self.out = None

    def forward(self, x):
        """Return ``1 / (1 + exp(-x))`` and cache it for the backward pass."""
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dout):
        """Return ``dout * (1 - y) * y`` where ``y`` is the cached forward output."""
        cached = self.out
        return dout * (1.0 - cached) * cached

$\frac{\partial L}{\partial X} = \frac{\partial L}{\partial Y} \cdot W^T$ , $\frac{\partial L}{\partial W} = X^T \cdot \frac{\partial L}{\partial Y}$

$W = \begin{pmatrix} w_{11} & w_{12} & w_{13} \\ w_{21} & w_{22} & w_{23} \end{pmatrix}$ , $W^T = \begin{pmatrix} w_{11} & w_{21} \\ w_{12} & w_{22} \\ w_{13} & w_{23} \end{pmatrix}$

$X = (x_0, x_1, \dots, x_n)$ , $\frac{\partial L}{\partial X} = \left(\frac{\partial L}{\partial x_0}, \frac{\partial L}{\partial x_1}, \dots, \frac{\partial L}{\partial x_n}\right)$

Affine 层

class Affine:
    """Affine (fully connected) layer computing ``x . W + b``."""

    def __init__(self, W, b):
        self.W, self.b = W, b
        self.x = None   # input of the last forward pass
        self.dW = None  # gradient w.r.t. W, set by backward()
        self.db = None  # gradient w.r.t. b, set by backward()

    def forward(self, x):
        """Cache ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        """Store ``dW`` and ``db`` and return the gradient w.r.t. the input.

        ``db`` sums ``dout`` over axis 0, so ``dout`` is expected to carry
        the batch on its first axis.
        """
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return np.dot(dout, self.W.T)

Softmax-with-Loss 层

class SoftmaxWithLoss:
    """Softmax followed by cross-entropy loss, as a single layer."""

    def __init__(self):
        self.loss = None  # loss value of the last forward pass
        self.y = None     # softmax output
        self.t = None     # supervised data (one-hot vector)

    def forward(self, x, t):
        """Apply softmax to ``x``, compare with target ``t`` and return the loss."""
        self.t = t
        probabilities = softmax(x)
        self.y = probabilities
        loss = cross_entropy_error(probabilities, self.t)
        self.loss = loss
        return loss

    def backward(self, dout=1):
        """Return ``(y - t) / batch_size``.

        The batch size is the length of the first axis of ``t``.
        Note: ``dout`` is accepted but not used in the computation.
        """
        n_samples = self.t.shape[0]
        return (self.y - self.t) / n_samples

Trick

SGD

$$W \leftarrow W - \eta \frac{\partial L}{\partial W}$$

class SGD:
    """Stochastic gradient descent: ``W <- W - lr * dL/dW``."""

    def __init__(self, lr=0.01):
        self.lr = lr  # learning rate

    def update(self, params, grads):
        """Update every entry of ``params`` in place using the matching entry of ``grads``."""
        for name in params:
            step = self.lr * grads[name]
            params[name] -= step

Momentum

$$v \leftarrow \alpha v - \eta \frac{\partial L}{\partial W}$$
$$W \leftarrow W + v$$

class Momentum:
    """Momentum SGD: ``v <- momentum * v - lr * grad``, then ``W <- W + v``."""

    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None  # per-parameter velocity, created on the first update

    def update(self, params, grads):
        """Update ``params`` in place using ``grads`` and the stored velocities."""
        if self.v is None:
            # Start every velocity at zero, shaped like its parameter.
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            velocity = self.momentum * self.v[name] - self.lr * grads[name]
            self.v[name] = velocity
            params[name] += velocity

AdaGrad

$$h \leftarrow h + \frac{\partial L}{\partial W} \odot \frac{\partial L}{\partial W}$$

$$W \leftarrow W - \eta \frac{1}{\sqrt{h}} \frac{\partial L}{\partial W}$$

class AdaGrad:
    """AdaGrad optimizer.

    Accumulates the element-wise squared gradients in ``h`` and scales each
    step by ``1 / sqrt(h)``.
    """

    def __init__(self, lr=0.01):
        self.lr = lr
        self.h = None  # per-parameter sum of squared gradients, created lazily

    def update(self, params, grads):
        """Update ``params`` in place using ``grads``.

        Args:
            params: dict mapping parameter names to NumPy arrays.
            grads: dict with the same keys holding the matching gradients.
        """
        if self.h is None:
            self.h = {key: np.zeros_like(val) for key, val in params.items()}

        # Iterate over the keys only: iterating over items() would yield
        # (key, value) tuples, which are not valid keys of h / grads / params.
        for key in params:
            self.h[key] += grads[key] * grads[key]
            # 1e-7 keeps the division defined while h is still zero.
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)

Batch Normalization

$$\mu_B \leftarrow \frac{1}{m} \sum_{i=1}^{m} x_i$$

$$\sigma_{B}^{2} \leftarrow \frac{1}{m} \sum^{m}_{i = 1}(x_i - \mu_{B})^2$$

$$\hat{x_i} \leftarrow \frac{x_i - \mu_B}{\sqrt{\sigma^2_B + \varepsilon}}$$

$$y_i \leftarrow \gamma \hat{x_i} + \beta$$

Dropout

class Dropout:
    """Dropout layer that randomly zeroes activations during training."""

    def __init__(self, dropout_ratio=0.5):
        self.dropout_ratio = dropout_ratio
        self.mask = None  # boolean keep-mask drawn in the last training forward pass

    def forward(self, x, train_flg=True):
        """Apply dropout to ``x``.

        In training mode a fresh random mask keeps the elements whose
        uniform sample exceeds ``dropout_ratio``. Otherwise every element
        is scaled by ``1 - dropout_ratio``.
        """
        if not train_flg:
            return x * (1.0 - self.dropout_ratio)

        self.mask = np.random.rand(*x.shape) > self.dropout_ratio
        return x * self.mask

    def backward(self, dout):
        """Pass the gradient only through the elements kept in forward()."""
        return dout * self.mask

卷积神经网络

假设输入大小为 $(H, W)$,滤波器大小为 $(FH, FW)$,输出大小为 $(OH, OW)$,填充为 $P$,步幅为 $S$。此时,输出大小可通过下式进行计算

$$OH = \frac{H + 2P - FH}{S} + 1, \quad OW = \frac{W + 2P - FW}{S} + 1$$

通道数为 $C$、高度为 $H$、长度为 $W$ 的数据的形状可以写成 $(C, H, W)$。滤波器也一样,要按 (channel, height, width) 的顺序书写。比如,通道数为 $C$、滤波器高度为 $FH$ (Filter Height)、长度为 $FW$ (Filter Width) 时,可以写成 $(C, FH, FW)$

$$(C, H, W) \ast (C, FH, FW) \rightarrow (1, OH, OW)$$

基于多个滤波器的卷积运算
$$(C, H, W) \ast (FN, C, FH, FW) \rightarrow (FN, OH, OW)$$

卷积运算的处理流
$$(C, H, W) \ast (FN, C, FH, FW) \rightarrow (FN, OH, OW) + (FN, 1, 1) \rightarrow (FN, OH, OW)$$

卷积运算的处理流(批处理)
$$(N, C, H, W) \ast (FN, C, FH, FW) \rightarrow (N, FN, OH, OW) + (FN, 1, 1) \rightarrow (N, FN, OH, OW)$$

class Convolution:
    """Convolution layer (forward pass only) implemented via ``im2col``."""

    def __init__(self, W, b, stride=1, pad=0):
        self.W, self.b = W, b
        self.stride, self.pad = stride, pad

    def forward(self, x):
        """Convolve the 4-D input ``x`` with the 4-D filters ``self.W``.

        ``self.W`` unpacks as ``(FN, C, FH, FW)`` and ``x`` as
        ``(N, C, H, W)``; the result is transposed to
        ``(N, FN, out_h, out_w)``.
        NOTE(review): ``im2col`` is defined outside this block; presumably
        it returns one row per output position - confirm.
        """
        n_filters, _, filter_h, filter_w = self.W.shape
        batch, _, height, width = x.shape
        out_h = int(1 + (height + 2*self.pad - filter_h) / self.stride)
        out_w = int(1 + (width + 2*self.pad - filter_w) / self.stride)

        patches = im2col(x, filter_h, filter_w, self.stride, self.pad)
        flat_filters = self.W.reshape(n_filters, -1).T  # one column per filter
        response = np.dot(patches, flat_filters) + self.b

        # Restore the spatial layout, then move the filter axis next to the batch axis.
        return response.reshape(batch, out_h, out_w, -1).transpose(0, 3, 1, 2)

池化层

class Pooling:
    """Max-pooling layer (forward pass only) implemented via ``im2col``."""

    def __init__(self, pool_h, pool_w, stride=1, pad=0):
        self.pool_h, self.pool_w = pool_h, pool_w
        self.stride, self.pad = stride, pad

    def forward(self, x):
        """Take the maximum over each pooling window of the 4-D input ``x``.

        ``x`` unpacks as ``(N, C, H, W)``; the result is transposed to
        ``(N, C, out_h, out_w)``.
        NOTE(review): the output size is computed without ``pad`` although
        ``pad`` is passed to ``im2col`` - confirm this is intended.
        """
        batch, channels, height, width = x.shape
        out_h = int(1 + (height - self.pool_h) / self.stride)
        out_w = int(1 + (width - self.pool_w) / self.stride)

        # (1) Expand the input, then give every pooling window its own row.
        window_size = self.pool_h*self.pool_w
        windows = im2col(x, self.pool_h, self.pool_w, self.stride, self.pad)
        windows = windows.reshape(-1, window_size)

        # (2) Maximum of each row.
        pooled = np.max(windows, axis=1)

        # (3) Reshape back and move the channel axis next to the batch axis.
        return pooled.reshape(batch, out_h, out_w, channels).transpose(0, 3, 1, 2)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值