Exponential Decay Learning Rate
# Exponentially decaying learning rate
# learning_rate = LEARNING_RATE_BASE * LEARNING_RATE_DECAY ** (global_step / LEARNING_RATE_STEP)
#
#--------------------------------------------------------------------
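# Worked example (using the constants defined just below): with base 0.1, decay 0.99 and step size 1,
# the learning rate after 100 global steps is 0.1 * 0.99 ** 100 ≈ 0.0366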
import tensorflow as tf

LEARNING_RATE_BASE = 0.1   # initial learning rate
LEARNING_RATE_DECAY = 0.99 # learning rate decay rate
LEARNING_RATE_STEP = 1     # number of batches between learning rate updates; usually total_samples / BATCH_SIZE
global_step = tf.Variable(0, trainable=False)  # counts how many batches have been run; not trainable
# Define the exponentially decaying learning rate
# staircase=True:  global_step / LEARNING_RATE_STEP is truncated to an integer, so the learning rate decays in a staircase pattern
# staircase=False: the learning rate follows a smooth decay curve
learning_rate = tf.train.exponential_decay(LEARNING_RATE_BASE, global_step, LEARNING_RATE_STEP, LEARNING_RATE_DECAY, staircase=True)
# Parameters to be optimized
w1 = tf.Variable(tf.random_normal([2, 1], stddev=1, seed=1))
y = tf.matmul(x, w1)
# x, y_ are the input/label placeholders and COST, PROFIT are per-unit loss weights; all four are assumed to be defined earlier in the full example
loss = tf.reduce_sum(tf.where(tf.greater(y, y_), (y - y_) * COST, (y_ - y) * PROFIT))
# Training step (back-propagation)
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
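To see the schedule without running the full training loop, the minimal sketch below just increments global_step by hand and prints the decayed learning rate (increment_step is a helper introduced here, not part of the original example).
increment_step = tf.assign_add(global_step, 1)  # stands in for "one batch has been trained"
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(5):
        print(sess.run([global_step, learning_rate]))  # 0.1, 0.099, 0.09801, ...
        sess.run(increment_step)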
Moving Average
Keeps a running average of each parameter's past values over a period of time, which improves the model's ability to generalize.
# Instantiate the moving-average class
MOVING_AVERAGE_DECAY = 0.99
ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
# The argument to ema.apply() is the update list; each time sess.run(ema_op) is executed, the moving average of every element in that list is refreshed
# In practice, tf.trainable_variables() is used to automatically collect all trainable parameters into the update list
ema_op = ema.apply(tf.trainable_variables())
with tf.Session() as sess:
    ……
    sess.run(ema_op)
    print(sess.run([w1, ema.average(w1)]))
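The shadow value is updated as shadow = decay * shadow + (1 - decay) * variable, and when a step counter is passed in, decay is capped at min(MOVING_AVERAGE_DECAY, (1 + global_step) / (10 + global_step)). A self-contained sketch (the scalar variable w is an assumption made for illustration):
import tensorflow as tf

w = tf.Variable(0.0)
global_step = tf.Variable(0, trainable=False)
MOVING_AVERAGE_DECAY = 0.99
ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
ema_op = ema.apply([w])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run([w, ema.average(w)]))   # [0.0, 0.0]
    sess.run(tf.assign(w, 1.0))
    sess.run(ema_op)
    # decay = min(0.99, (1 + 0) / (10 + 0)) = 0.1, so the shadow value becomes 0.1 * 0 + 0.9 * 1 = 0.9
    print(sess.run([w, ema.average(w)]))   # [1.0, 0.9]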
Regularization
Adds a weight-dependent penalty for every parameter w to the loss function. This introduces a model-complexity term that suppresses noise and reduces overfitting.
def get_weight(shape, regularizer):
    w = tf.Variable(tf.random_normal(shape), dtype=tf.float32)
    tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(regularizer)(w))
    return w

def get_bias(shape):
    b = tf.Variable(tf.constant(0.01, shape=shape))
    return b
x = tf.placeholder(tf.float32, shape=(None, 2))
y_= tf.placeholder(tf.float32, shape=(None, 1))
w1 = get_weight([2, 11], 0.01)
b1 = get_bias([11])
y1 = tf.nn.relu(tf.matmul(x, w1) + b1)
w2 = get_weight([11, 1], 0.01)
b2 = get_bias([1])
y = tf.matmul(y1, w2) + b2 # no activation on the output layer
loss_mse = tf.reduce_mean(tf.square(y - y_))
loss_total = loss_mse + tf.add_n(tf.get_collection('losses'))
# Training step without regularization
#train_step = tf.train.AdamOptimizer(0.0001).minimize(loss_mse)
# Training step with regularization
train_step = tf.train.AdamOptimizer(0.0001).minimize(loss_total)
(Figures: fitted curves with no regularization, with L2 regularization, and with L1 regularization.)
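To produce the L1 variant of the curves above, only the regularizer inside get_weight needs to change; a sketch (the helper name get_weight_l1 is introduced here for illustration):
def get_weight_l1(shape, regularizer):
    w = tf.Variable(tf.random_normal(shape), dtype=tf.float32)
    # the L1 penalty is proportional to the sum of |w|, which tends to drive weights to exactly zero (sparser model);
    # the L2 version above penalizes the squared magnitude instead
    tf.add_to_collection('losses', tf.contrib.layers.l1_regularizer(regularizer)(w))
    return w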
General Template
def get_weight(shape, regularizer):
    w = tf.Variable(tf.random_normal(shape), dtype=tf.float32)
    tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(regularizer)(w))
    return w

def get_bias(shape):
    b = tf.Variable(tf.constant(0.01, shape=shape))
    return b
REGULARIZER = 0.01
def forward(x, regularizer):
    w1 = get_weight([2, 11], regularizer)
    b1 = get_bias([11])
    y1 = tf.nn.relu(tf.matmul(x, w1) + b1)
    w2 = get_weight([11, 1], regularizer)
    b2 = get_bias([1])
    y = tf.matmul(y1, w2) + b2
    return y
LEARNING_RATE_BASE = 0.001  # initial learning rate
LEARNING_RATE_DECAY = 0.99  # learning rate decay rate
LEARNING_RATE_STEP = 1      # number of batches between learning rate updates; usually total_samples / BATCH_SIZE
def backward():
    x = tf.placeholder(tf.float32, shape=(None, 2))
    y_ = tf.placeholder(tf.float32, shape=(None, 1))
    y = forward(x, REGULARIZER)
    global_step = tf.Variable(0, trainable=False)
    #############################################
    # type 1: mean squared error between y and y_
    loss = tf.reduce_mean(tf.square(y - y_))
    # type 2: cross entropy
    ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.argmax(y_, 1))
    loss = tf.reduce_mean(ce)
    # type 3: add regularization (error between y and y_ + the penalties collected in 'losses')
    loss_total = loss + tf.add_n(tf.get_collection('losses'))
    # Exponentially decaying learning rate
    learning_rate = tf.train.exponential_decay(LEARNING_RATE_BASE,
                                               global_step,
                                               LEARNING_RATE_STEP,
                                               LEARNING_RATE_DECAY,
                                               staircase=True)
    # minimize loss_total; the optimizer increments global_step on every update
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss_total, global_step=global_step)
    # Moving average: the following three lines
    MOVING_AVERAGE_DECAY = 0.99
    ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    ema_op = ema.apply(tf.trainable_variables())
    with tf.control_dependencies([train_step, ema_op]):
        train_op = tf.no_op(name='train')
    with tf.Session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        for i in range(10000):
            # run train_op so that both the gradient update and the moving-average update are applied;
            # X_batch, Y_batch stand for one batch of training data and labels (assumed to be provided by the caller)
            sess.run(train_op, feed_dict={x: X_batch, y_: Y_batch})
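A typical entry point for the template above, assuming it is saved as a standalone script:
if __name__ == '__main__':
    backward()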