First, the problematic source code:
from tensorflow.examples.tutorials.mnist import input_data
import os
import tensorflow as tf
path = os.path.join("d:\\", "机器学习数据集")
mnist = input_data.read_data_sets(path, one_hot=True)
# input placeholders
X = tf.placeholder("float", [None, 28, 28, 1])
y = tf.placeholder("float", [None, 10])
# first convolutional layer
W1 = tf.Variable(tf.truncated_normal([5, 5, 1, 32]))
b1 = tf.Variable(tf.constant(0.1, shape=[32]))
conv1 = tf.nn.relu(tf.nn.conv2d(X, W1, strides=[1, 1, 1, 1], padding="SAME") + b1)
# first pooling layer
pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
# second convolutional layer
W2 = tf.Variable(tf.truncated_normal([5, 5, 32, 64]))
b2 = tf.Variable(tf.constant(0.1, shape=[64]))
conv2 = tf.nn.relu(tf.nn.conv2d(pool1, W2, strides=[1, 1, 1, 1], padding="SAME") + b2)
# second pooling layer
pool2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
# fully connected layer
pool2_flat = tf.reshape(pool2, [-1, 7*7*64])
W_fc = tf.Variable(tf.truncated_normal([7*7*64, 1024]))
b_fc = tf.Variable(tf.constant(0.1, shape=[1024]))
h_fc = tf.nn.relu(tf.matmul(pool2_flat, W_fc) + b_fc)
# dropout layer
keep_prob = tf.placeholder("float")
h_fc_drop = tf.nn.dropout(h_fc, keep_prob)
# softmax layer
W_fc2 = tf.Variable(tf.truncated_normal([1024, 10]))
b_fc2 = tf.Variable(tf.constant(0.1, shape=[10]))
y_hat = tf.nn.softmax(tf.matmul(h_fc_drop, W_fc2) + b_fc2)
cross_entropy = -tf.reduce_sum(y*tf.log(y_hat))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_hat,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
sess = tf.Session()
sess.run(tf.initialize_all_variables())
for i in range(20000):
    batch = mnist.train.next_batch(50)
    if i%100 == 0:
        train_accuracy = accuracy.eval(session=sess, feed_dict={X: batch[0].reshape([-1, 28, 28, 1]), y: batch[1], keep_prob: 1.0})
        print("step %d, training accuracy %g"%(i, train_accuracy))
        print(sess.run(W1), sess.run(b1))
    sess.run(train_step, feed_dict={X: batch[0].reshape([-1,28,28,1]), y: batch[1], keep_prob: 0.5})
print("test accuracy %g"%accuracy.eval(session=sess, feed_dict={X: mnist.test.images.reshape([-1, 28, 28, 1]), y: mnist.test.labels, keep_prob: 1.0}))
Running this, every weight matrix W ends up filled with NaN.
The cause is the combination of a cross-entropy loss with a prediction y_hat taken straight from softmax. The logits fed into the softmax can contain some very large negative elements alongside very large positive ones, in which case the softmax outputs exactly 0 for some classes. Those zeros then reach tf.log inside the cross-entropy, producing -inf (and 0 * -inf = NaN), and once NaN enters the gradients the entire weight matrix becomes NaN. The small example below shows the mechanism.
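A minimal, self-contained reproduction of the failure (hypothetical values, assuming TF 1.x):

import tensorflow as tf

y_hat = tf.constant([1.0, 0.0])   # softmax saturated: probability 0 for one class
y = tf.constant([1.0, 0.0])       # one-hot label
loss = -tf.reduce_sum(y * tf.log(y_hat))
with tf.Session() as sess:
    print(sess.run(tf.log(y_hat)))  # [  0. -inf]
    print(sess.run(loss))           # nan, because 0 * (-inf) = nan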
There are several ways to fix it:
1. Constrain the initial values of the weights W and biases b
Pass an explicit standard deviation when generating each weight W. With stddev=0.1, tf.truncated_normal produces values no larger than 0.2 and no smaller than -0.2 (samples beyond two standard deviations are redrawn), which keeps the logits small enough to avoid this problem. A quick check of that range is sketched below.
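A quick verification snippet (assuming TF 1.x; the shape is the one used for W1 above):

import tensorflow as tf

w = tf.truncated_normal([5, 5, 1, 32], stddev=0.1)
with tf.Session() as sess:
    vals = sess.run(w)
    # truncated_normal redraws samples beyond 2 standard deviations,
    # so every value lies within (-0.2, 0.2)
    print(vals.min(), vals.max())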
The modified code:
# -*- coding:utf-8 -*-
from tensorflow.examples.tutorials.mnist import input_data
import os
import tensorflow as tf
path = os.path.join("d:\\", "机器学习数据集")
mnist = input_data.read_data_sets(path, one_hot=True)
# input placeholders
X = tf.placeholder("float", [None, 28, 28, 1])
y = tf.placeholder("float", [None, 10])
# first convolutional layer
W1 = tf.Variable(tf.truncated_normal([5, 5, 1, 32], stddev=0.1))
b1 = tf.Variable(tf.constant(0.1, shape=[32]))
conv1 = tf.nn.relu(tf.nn.conv2d(X, W1, strides=[1, 1, 1, 1], padding="SAME") + b1)
# first pooling layer
pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
# second convolutional layer
W2 = tf.Variable(tf.truncated_normal([5, 5, 32, 64], stddev=0.1))
b2 = tf.Variable(tf.constant(0.1, shape=[64]))
conv2 = tf.nn.relu(tf.nn.conv2d(pool1, W2, strides=[1, 1, 1, 1], padding="SAME") + b2)
# second pooling layer
pool2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
# fully connected layer
pool2_flat = tf.reshape(pool2, [-1, 7*7*64])
W_fc = tf.Variable(tf.truncated_normal([7*7*64, 1024], stddev=0.1))
b_fc = tf.Variable(tf.constant(0.1, shape=[1024]))
h_fc = tf.nn.relu(tf.matmul(pool2_flat, W_fc) + b_fc)
# dropout layer
keep_prob = tf.placeholder("float")
h_fc_drop = tf.nn.dropout(h_fc, keep_prob)
# softmax layer
W_fc2 = tf.Variable(tf.truncated_normal([1024, 10], stddev=0.1))
b_fc2 = tf.Variable(tf.constant(0.1, shape=[10]))
y_hat = tf.nn.softmax(tf.matmul(h_fc_drop, W_fc2) + b_fc2)
cross_entropy = -tf.reduce_sum(y*tf.log(y_hat))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_hat,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
sess = tf.Session()
sess.run(tf.initialize_all_variables())
for i in range(20000):
    batch = mnist.train.next_batch(50)
    if i%100 == 0:
        train_accuracy = accuracy.eval(session=sess, feed_dict={X: batch[0].reshape([-1, 28, 28, 1]), y: batch[1], keep_prob: 1.0})
        print("step %d, training accuracy %g"%(i, train_accuracy))
    sess.run(train_step, feed_dict={X: batch[0].reshape([-1,28,28,1]), y: batch[1], keep_prob: 0.5})
print("test accuracy %g"%accuracy.eval(session=sess, feed_dict={X: mnist.test.images.reshape([-1, 28, 28, 1]), y: mnist.test.labels, keep_prob: 1.0}))
2. Clip the values fed into log in the cross-entropy
Change the loss function from
-tf.reduce_sum(y*tf.log(y_hat))
to
-tf.reduce_sum(y*tf.log(tf.clip_by_value(y_hat, 1e-8, 1.0)))
This bounds the input to log: any 0 in y_hat is replaced by a tiny positive number, and anything above 1.0 is truncated to 1.0, which avoids the error. The full code is not repeated here; it is the same as above with only the loss function changed. A small numeric check is sketched below.
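A small numeric check of what the clip does (hypothetical probabilities, assuming TF 1.x):

import tensorflow as tf

y_hat = tf.constant([0.0, 0.3, 0.7])
clipped = tf.clip_by_value(y_hat, 1e-8, 1.0)
with tf.Session() as sess:
    # the 0 becomes 1e-8, so log no longer produces -inf
    print(sess.run(tf.log(clipped)))  # roughly [-18.42, -1.20, -0.36]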
3. Use tf.nn.softmax_cross_entropy_with_logits_v2
TensorFlow anticipates this situation and provides tf.nn.softmax_cross_entropy_with_logits_v2: feed it the tensor from before the softmax (the logits) and it computes the cross-entropy in a numerically stable way.
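A minimal sketch of how the loss section above could be rewritten this way (assuming TF 1.x and the variable names from the code above):

# logits: the pre-softmax scores
logits = tf.matmul(h_fc_drop, W_fc2) + b_fc2
y_hat = tf.nn.softmax(logits)  # still available for computing accuracy
# the op applies softmax internally, so no explicit log of a possibly-zero probability
cross_entropy = tf.reduce_sum(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)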