Batch Normalization 层

最新推荐文章于 2024-11-24 21:26:39 发布
一米半
最新推荐文章于 2024-11-24 21:26:39 发布
阅读量1.2k
点赞数
CC 4.0 BY-SA版权
分类专栏： tensorflow
本文链接：https://blog.youkuaiyun.com/u013061183/article/details/80496430
tensorflow 专栏收录该内容
11 篇文章
订阅专栏
大致是对每层做归一化，能加快收敛速度。
它主要加在每一层的输出（output）和激活函数（activation）之间。
主要看这篇博客：https://www.jianshu.com/p/0312e04e4e83
原理看这篇：https://blog.youkuaiyun.com/hjimce/article/details/50866313
其中会用到滑动平均：可以看我之前的这篇博客。
一个样例代码：
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

'''
参考：https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/5-13-BN/
batch_normalization 简称BN，paper：《Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift》
'''

__author__ = 'Zhang Shuai'

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt


# ACTIVATION = tf.nn.relu # alternative: use relu for every layer
ACTIVATION = tf.nn.tanh # use tanh for every layer
N_LAYERS = 7            # 7 hidden layers in total
N_HIDDEN_UNITS = 30     # each hidden layer has 30 neurons

def fix_seed(seed=1):
    """Seed both NumPy and TensorFlow so that runs are reproducible.

    Args:
        seed: Integer seed handed to ``np.random.seed`` and
            ``tf.set_random_seed``. Defaults to 1.
    """
    # NumPy's global generator: makes the generated data identical each run.
    np.random.seed(seed=seed)
    # TensorFlow's graph-level seed.
    tf.set_random_seed(seed=seed)


def plot_his(inputs, inputs_norm):
    """Draw histograms of two sequences of arrays on a two-row subplot grid.

    The top row shows the arrays in ``inputs`` and the bottom row the arrays
    in ``inputs_norm``, one subplot per array. The figure is redrawn and the
    GUI event loop is given 0.01 s so the plot refreshes in interactive mode.

    Args:
        inputs: Sequence of arrays for the first row; each must provide
            ``ravel()``. The caller in this file passes the evaluated
            per-layer inputs of the network built without normalization.
        inputs_norm: Same, for the second row (network built with
            normalization).
    """
    for row, layer_values in enumerate((inputs, inputs_norm)):
        n_cols = len(layer_values)
        for col, values in enumerate(layer_values):
            plt.subplot(2, n_cols, row * n_cols + col + 1)
            plt.cla()
            # The first column gets the wide (-7, 10) range; every other
            # column is clipped to (-1, 1).
            the_range = (-7, 10) if col == 0 else (-1, 1)
            plt.hist(values.ravel(), bins=15, range=the_range, color='#FF5733')
            plt.yticks(())
            # Only the bottom row shows x tick labels (at the range limits).
            plt.xticks(the_range if row == 1 else ())
            axes = plt.gca()
            for side in ('right', 'top'):
                axes.spines[side].set_color('none')
        # Called after the inner loop, so the title is attached to whichever
        # axes is current at that point (the last subplot of the row).
        plt.title("%s normalizing" % ("Without" if row == 0 else "With"))
    plt.draw()
    plt.pause(0.01)


def built_net(xs, ys, norm, on_train=True):
    """Build a fully connected regression network, optionally batch-normalized.

    The graph has ``N_LAYERS`` hidden layers of ``N_HIDDEN_UNITS`` units each
    using ``ACTIVATION``, followed by a linear output layer with one unit. It
    is trained with gradient descent (learning rate 0.001) on the batch mean
    of the per-sample summed squared error.

    Args:
        xs: Input tensor. Its second dimension must be statically known
            because it is read to size the first weight matrix. The script in
            this file passes a float32 placeholder of shape ``[None, 1]``.
        ys: Target tensor that the prediction is subtracted from.
        norm: If true, batch-normalize the network input and the
            pre-activation of every hidden layer. The output layer is never
            normalized.
        on_train: Value cast to ``tf.bool`` and used as the ``tf.cond``
            predicate of every normalization step: when true, the statistics
            of the current batch are used and their moving averages are
            updated; when false, the stored moving averages are used.

    Returns:
        The list ``[train_op, cost, layers_inputs]``. ``layers_inputs`` holds
        the (possibly normalized) network input followed by the output of
        every hidden layer, in order.
    """

    def batch_norm(values, size):
        """Batch-normalize ``values`` over axis 0 with trainable scale/shift.

        ``size`` is the length of the scale (gamma) and shift (beta)
        variables that are created for this normalization step.
        """
        # Mean and variance over the batch axis (axes=[0]); for a
        # (batch_size, out_size) input the results have shape (out_size,).
        # For image tensors shaped [batch, height, width, kernels] one would
        # use axes=[0, 1, 2] instead, giving one mean/variance per kernel.
        fc_mean, fc_var = tf.nn.moments(values, axes=[0])
        scale = tf.Variable(tf.ones([size]))   # gamma: trainable, starts at 1
        shift = tf.Variable(tf.zeros([size]))  # beta (offset): trainable, starts at 0
        epsilon = 0.001  # small constant added to the variance

        # Moving average of the batch statistics. Passing num_updates=step
        # would make the effective decay min(decay, (1 + step) / (10 + step)),
        # i.e. smaller at the start of training.
        ema = tf.train.ExponentialMovingAverage(decay=0.5)

        def mean_var_with_update():
            ema_apply_op = ema.apply([fc_mean, fc_var])
            # tf.identity creates new ops inside the control-dependency
            # scope, so fetching them runs the moving-average update first.
            with tf.control_dependencies([ema_apply_op]):
                return tf.identity(fc_mean), tf.identity(fc_var)

        # tf.cond plays the role of if/else inside the graph. on_train may be
        # a plain Python bool, hence the cast to tf.bool.
        mean, var = tf.cond(
            tf.cast(on_train, tf.bool),
            mean_var_with_update,  # true: use batch stats and update the EMA
            lambda: (              # false: use the stored moving averages
                ema.average(fc_mean),
                ema.average(fc_var),
            ),
        )

        # Equivalent to:
        #   normalized = (values - mean) / tf.sqrt(var + epsilon)
        #   result = normalized * scale + shift
        return tf.nn.batch_normalization(values, mean, var,
                                         shift, scale, epsilon)

    def add_layer(inputs, in_size, out_size, activation_function=None, norm=False):
        """Add one dense layer; optionally normalize before the activation.

        Returns ``activation_function(inputs @ Weights + biases)``, or the
        raw (possibly normalized) pre-activation when ``activation_function``
        is ``None``.
        """
        Weights = tf.Variable(tf.random_normal([in_size, out_size],
                                               mean=0.0, stddev=1.0))
        # A non-zero initial bias works better in the plain network. With
        # batch normalization the trainable shift (beta) already acts as a
        # bias, so this term is redundant there (as noted in the BN paper).
        biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
        Wx_plus_b = tf.matmul(inputs, Weights) + biases

        if norm:
            Wx_plus_b = batch_norm(Wx_plus_b, out_size)

        if activation_function is None:
            return Wx_plus_b
        return activation_function(Wx_plus_b)

    fix_seed(1)

    if norm:
        # Also normalize the raw network input, using a single shared
        # scale/shift value (shape [1]).
        xs = batch_norm(xs, 1)

    layers_inputs = [xs]  # input of every layer, starting with the network input

    for _ in range(N_LAYERS):
        layer_input = layers_inputs[-1]
        in_size = layer_input.get_shape()[1].value
        layers_inputs.append(
            add_layer(layer_input, in_size, N_HIDDEN_UNITS, ACTIVATION, norm))

    # Size the output layer from the actual width of the last layer instead
    # of a hard-coded 30, so changing N_HIDDEN_UNITS / N_LAYERS keeps the
    # graph consistent.
    last_layer = layers_inputs[-1]
    prediction = add_layer(last_layer, last_layer.get_shape()[1].value, 1,
                           activation_function=None)
    cost = tf.reduce_mean(tf.reduce_sum(tf.square(ys - prediction),
                                        reduction_indices=[1]))

    train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
    return [train_op, cost, layers_inputs]


# --- Data: noisy quadratic y = x^2 - 5 + N(0, 8) sampled on [-7, 10] ---
fix_seed(1)
x_data = np.linspace(-7, 10, 2500).reshape(-1, 1)
np.random.shuffle(x_data)
noise = np.random.normal(0, 8, x_data.shape)
y_data = np.square(x_data) - 5 + noise

# Show the raw data first; in non-interactive mode this call blocks until
# the window is closed.
plt.scatter(x_data, y_data)
plt.show()

# --- Graph: the same placeholders feed both networks ---
xs = tf.placeholder(tf.float32, shape=[None, 1])
ys = tf.placeholder(tf.float32, shape=[None, 1])

train_op, cost, layers_inputs = built_net(xs, ys, norm=False)
train_op_norm, cost_norm, layers_inputs_norm = built_net(xs, ys, norm=True)

# Feed used whenever the whole data set is evaluated.
full_feed = {xs: x_data, ys: y_data}

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Cost on the full data set, recorded every `record_step` iterations.
    record_step = 5
    cost_his = []
    cost_his_norm = []

    plt.ion()
    plt.figure(figsize=(7, 3))
    for step in range(250):
        # Every 50 steps, redraw the per-layer input histograms.
        if step % 50 == 0:
            all_inputs, all_inputs_norm = sess.run(
                [layers_inputs, layers_inputs_norm], feed_dict=full_feed)
            plot_his(all_inputs, all_inputs_norm)

        # One training step for both networks on the next mini-batch of 10.
        start = step * 10
        batch = slice(start, start + 10)
        sess.run([train_op, train_op_norm],
                 feed_dict={xs: x_data[batch], ys: y_data[batch]})

        if step % record_step == 0:
            cost_his.append(sess.run(cost, feed_dict=full_feed))
            cost_his_norm.append(sess.run(cost_norm, feed_dict=full_feed))

    # --- Final comparison of the two cost curves ---
    plt.ioff()
    plt.figure()
    recorded_steps = np.arange(len(cost_his)) * record_step
    for history, label in ((cost_his, 'Without BN'), (cost_his_norm, 'With BN')):
        plt.plot(recorded_steps, np.array(history), label=label)
    plt.legend()
    plt.show()