Backpropagation Algorithm Implementation

This post describes an experiment that classifies handwritten digits with a three-layer neural network. The network uses a sigmoid activation function and is trained with the backpropagation algorithm. The experiment uses the MNIST dataset, and the parameters are logged in detail throughout training.

Data source: http://yann.lecun.com/exdb/mnist/

The following code implements a three-layer neural network whose activation function is the sigmoid (spelled sigmod in the code). The author ran it several times and kept running into vanishing sigmoid gradients, so the model's predictions were rather poor; readers are encouraged to try other activation functions. Working through this program should help deepen one's understanding of neural networks and the backpropagation algorithm.
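
One remark before the code, to make the vanishing-gradient problem mentioned above concrete: the derivative of the sigmoid, written in terms of the activation y, is y(1 - y), which peaks at 0.25 and is nearly zero once a unit saturates, so the error signal shrinks with every sigmoid layer it passes through. Below is a small self-contained sketch (not part of the original experiment) that illustrates the shrinkage:

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def d_sigmoid(y):
    # derivative of the sigmoid written in terms of the activation y = sigmoid(x)
    return y * (1 - y)

signal = 1.0
for layer in range(1, 5):
    y = sigmoid(5.0)          # a mildly saturated unit (pre-activation = 5)
    signal *= d_sigmoid(y)    # each sigmoid layer scales the gradient by at most 0.25
    print('gradient scale after layer', layer, ':', signal)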

import os
import numpy as np
import random
from mnist import MNIST
from sklearn.preprocessing import LabelBinarizer

mndata = MNIST('mnist/')
training_images, training_labels = mndata.load_training()

training_labels = LabelBinarizer().fit_transform(np.array(training_labels))
training_images = np.array(training_images)
training_images = training_images/255

testing_images, testing_labels = mndata.load_testing()

testing_labels = LabelBinarizer().fit_transform(np.array(testing_labels))
testing_images = np.array(testing_images)
testing_images = testing_images/255

def sigmod(x):
    # sigmoid activation: 1 / (1 + e^(-x))
    return 1/(1+np.exp(-x))

def linear_act(x):
    # identity activation (defined for experimentation, not used below)
    return x

def d_sigmod(y):
    # derivative of the sigmoid, expressed in terms of the activation y = sigmod(x)
    return y*(1-y)

def d_linear_act(x):
    # derivative of the identity activation is 1 everywhere
    return np.ones_like(x)
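
Since d_sigmod takes the activation value rather than the pre-activation, a quick numerical check can confirm that d_sigmod(sigmod(x)) matches the true derivative of sigmod at x. This check is an optional addition, not part of the original post:

x_check = np.array([-2.0, 0.0, 2.0])
eps = 1e-6
numeric = (sigmod(x_check + eps) - sigmod(x_check - eps)) / (2 * eps)
analytic = d_sigmod(sigmod(x_check))
print(np.allclose(numeric, analytic))   # expected: True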

class NerualNetwork():
    def __init__(self,layers,lr=0.1):
        self.layers = layers
        self.num_of_layers = np.shape(layers)[0]
        self.lr = lr
        # index 0 is a placeholder so that W[l] and b[l] line up with layer l (1-based)
        self.W = [np.nan]
        self.b = [np.nan]
        self.a_last = np.array([])
        for i in range(self.num_of_layers-1):
            # weights and biases are initialised uniformly in [-1, 1)
            self.W.append(np.random.random((layers[i+1],layers[i]))*2-1)
            self.b.append(np.random.random((layers[i+1],1))*2-1)
            
    def forword_propagation(self,X):
        # s[l] holds pre-activations and a[l] activations for layer l; index 0 is the input
        s = [np.nan]
        a = [X]
        for l in range(1,self.num_of_layers-1):
            s.append(np.dot(self.W[l],a[l-1])+self.b[l])
            a.append(sigmod(s[l]))
        # output layer (also sigmoid here)
        s.append(np.dot(self.W[self.num_of_layers-1],a[self.num_of_layers-2])+self.b[self.num_of_layers-1])
        a.append(sigmod(s[self.num_of_layers-1]))
        
        return s,a
    
    def predict(self,X):
        s,a = self.forword_propagation(X)
        prediction = a[-1]
        return prediction
    
    def back_propagation(self,s,a,Y):
        # gradient of the mean squared error with respect to the output activations
        d_a_last = -2*(Y-a[-1])/self.layers[-1]
        # delta of the output layer, built one sample (column) at a time
        d_s_last = np.dot(np.diag(d_sigmod(a[-1][:,0])), d_a_last[:,0].reshape(self.layers[-1],1))
        
        for m in range(1,np.shape(Y)[1]):
            d_s_last = np.append(d_s_last,
                                 np.dot(np.diag(d_sigmod(a[-1][:,m])), d_a_last[:,m].reshape(self.layers[-1],1)), 1)
        
        ds = [d_s_last]
        # backpropagate the error through the hidden layers, again one sample (column) at a time
        for i in list(reversed(range(self.num_of_layers-2))):            
            d_s_hide = np.dot(np.diag(d_sigmod(a[i+1][:,0])),
                              np.dot(self.W[i+2].T,ds[-1][:,0].reshape(self.layers[i+2],1)))
            
            for m in range(1,np.shape(Y)[1]):
                d_s_hide = np.append(d_s_hide,
                                     np.dot(np.diag(d_sigmod(a[i+1][:,m])),
                                            np.dot(self.W[i+2].T,ds[-1][:,m].reshape(self.layers[i+2],1))), 1)
            ds.append(d_s_hide)
        
        # placeholder at index 0 so that ds[l] lines up with layer l, matching W and b
        ds.append(np.nan)
        ds = list(reversed(ds))
        
        with open('ds.txt','a') as f_ds:
            for l in range(1,self.num_of_layers):
                f_ds.write('\nLayer '+str(l)+'\n')
                f_ds.write('\nds:'+str(np.shape(ds[l]))+'\n')
                f_ds.write(str(ds[l])+'\n')
        
        self.update_param(a,ds)
        
    def update_param(self,a,ds):
        # sum the per-sample gradients (outer products of deltas with previous-layer
        # activations), then apply the update averaged over the batch
        for i in range(1,self.num_of_layers):
            sum_of_gradient_w = np.dot(ds[i][:,0].reshape(self.layers[i],1),
                                       a[i-1][:,0].reshape(self.layers[i-1],1).T)
            sum_of_gradient_b = ds[i][:,0].reshape(self.layers[i],1)
            for m in range(1,np.shape(ds[i])[1]):
                sum_of_gradient_w += np.dot(ds[i][:,m].reshape(self.layers[i],1),
                                         a[i-1][:,m].reshape(self.layers[i-1],1).T)
                sum_of_gradient_b += ds[i][:,m].reshape(self.layers[i],1)
                
            self.W[i] -= self.lr*sum_of_gradient_w/(np.shape(ds[i])[1])
            self.b[i] -= self.lr*sum_of_gradient_b/(np.shape(ds[i])[1])
            
    def train(self,X,Y,epochs=10000, batch_size=500):
        if os.path.isfile('parameters.txt'):
            os.remove('parameters.txt')
        if os.path.isfile('activations.txt'):
            os.remove('activations.txt') 
        if os.path.isfile('costs.txt'):
            os.remove('costs.txt')
        if os.path.isfile('ds.txt'):
            os.remove('ds.txt')
            
        for i in range(epochs):
            sample_index = random.sample(list(range(np.shape(Y)[1])), batch_size)
            X_in_batch = X[:,sample_index]
            Y_in_batch = Y[:,sample_index]
            if i > 200:
                # note: this divides the learning rate by 10 on every iteration after 200,
                # quickly driving it towards zero; a single decay (if i == 200) may be intended
                self.lr /= 10
            self.train_in_batch(X_in_batch,Y_in_batch,i)
            
    def train_in_batch(self,X,Y,i):
        s,a = self.forword_propagation(X)
        
        if 1:  # always-on logging: dump parameters and activations every iteration
            with open('parameters.txt','a') as f_parameters:
                f_parameters.write('\n\nAt iteration:'+str(i)+'\n')
                for l in range(1,self.num_of_layers):
                    f_parameters.write('\nLayer '+str(l)+'\n')
                    f_parameters.write('\nW:'+str(np.shape(self.W[l]))+'\n')
                    f_parameters.write(str(self.W[l])+'\n')

                    f_parameters.write('\nb:'+str(np.shape(self.b[l]))+'\n')
                    f_parameters.write(str(self.b[l])+'\n')

            with open('activations.txt','a') as f_activations:
                f_activations.write('\n\nAt iteration:'+str(i)+'\n')
                for l in range(0,self.num_of_layers):
                    f_activations.write('\nLayer '+str(l)+'\n')
                    f_activations.write('\ns:'+str(np.shape(s[l]))+'\n')
                    f_activations.write(str(s[l])+'\n')
                    
                    f_activations.write('\na:'+str(np.shape(a[l]))+'\n')
                    f_activations.write(str(a[l])+'\n')
            
            with open('ds.txt','a') as f_ds:
                f_ds.write('\n\nAt iteration:'+str(i)+'\n')
                
        self.a_last = a[-1]
        
        # average squared error over the batch, summed over the output units
        cost = np.zeros((1,1))
        for j in range(np.shape(Y)[1]):
            residual = Y[:,j].reshape(self.layers[-1],1)-self.a_last[:,j].reshape(self.layers[-1],1)
            cost += np.dot(residual.T,residual)
                
        with open('costs.txt','a') as f_costs:
            f_costs.write(str(cost/(np.shape(Y)[1]))+'\n')
        self.back_propagation(s,a,Y)

# transpose so that each column is one sample (image / one-hot label)
training_X = training_images.T
training_Y = training_labels.T
testing_X = testing_images.T
testing_Y = testing_labels.T

# layer sizes: 784 inputs, hidden layers of 196 and 49 units, 10 outputs
nn = NerualNetwork(np.array([784,196,49,10]))
nn.train(training_X,training_Y,epochs=1000)
import matplotlib.pyplot as plt
%matplotlib inline

# each line of costs.txt is a printed 1x1 array such as "[[ 2.509]]";
# strip the brackets and whitespace before parsing the number
with open('costs.txt') as f:
    y = []
    for eachline in f:
        y.append(float(eachline.strip().strip('[]')))
        
x = range(len(y))

plt.figure()
plt.plot(x,y)
plt.title('MNIST: 3-layer Neural Network using Sigmoid Function')
plt.xlabel('steps')
plt.ylabel('cost')

[Figure: training cost vs. iteration]

testing_Y[:,list(range(5))]
array([[0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])
nn.predict(testing_X[:,list(range(5))])
array([[ 0.08852091,  0.07712739,  0.24558161,  0.24253024,  0.09615363],
       [ 0.06009189,  0.01842345,  0.02953039,  0.10981609,  0.12261026],
       [ 0.42194989,  0.26216405,  0.0929229 ,  0.26748772,  0.22785839],
       [ 0.06863829,  0.07286736,  0.17960174,  0.18293914,  0.0487555 ],
       [ 0.02216657,  0.12481555,  0.21624219,  0.04304522,  0.04131497],
       [ 0.03851181,  0.03991082,  0.0137592 ,  0.01167841,  0.0100367 ],
       [ 0.08333821,  0.33273348,  0.100765  ,  0.11475403,  0.23539435],
       [ 0.00238281,  0.00640189,  0.00788065,  0.02162292,  0.00926097],
       [ 0.03512258,  0.00634047,  0.00719783,  0.00774136,  0.00957267],
       [ 0.17760028,  0.27654505,  0.02500085,  0.11724239,  0.08195934]])
training_Y[:,list(range(5))]
array([[0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1]])
nn.predict(training_X[:,list(range(5))])
array([[ 0.14088619,  0.43447728,  0.00969862,  0.11855052,  0.3753252 ],
       [ 0.07504254,  0.18396886,  0.01375044,  0.02276849,  0.17391514],
       [ 0.18822948,  0.24832705,  0.48081013,  0.13560714,  0.13318023],
       [ 0.10294189,  0.28654818,  0.14425579,  0.14219287,  0.040433  ],
       [ 0.06517029,  0.08686016,  0.14041424,  0.38237368,  0.14986737],
       [ 0.0906671 ,  0.04954782,  0.01434389,  0.00715321,  0.01533208],
       [ 0.15485006,  0.04465222,  0.01622938,  0.18944488,  0.24251637],
       [ 0.10819804,  0.01239158,  0.03952334,  0.01231031,  0.00277187],
       [ 0.05063162,  0.01122012,  0.04046127,  0.01782585,  0.02276523],
       [ 0.34232939,  0.01950765,  0.18545528,  0.05277625,  0.03637404]])
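
The predicted columns above do not line up well with the one-hot labels, which is consistent with the vanishing-gradient remark at the top of the post. As an optional addition (not in the original post), overall accuracy can be estimated by comparing the argmax of each predicted column against the argmax of the corresponding one-hot label; the variable names here are illustrative:

test_predictions = nn.predict(testing_X)
test_accuracy = np.mean(np.argmax(test_predictions, axis=0) == np.argmax(testing_Y, axis=0))
print('test accuracy:', test_accuracy)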