The cost function is "cross-entropy". Cross-entropy originated in information theory, in the technique of information compression coding, but it has since evolved into an important tool in fields ranging from game theory to machine learning. It is defined as:

H_{y'}(y) = -Σ_i y'_i * log(y_i)
Here y is our predicted probability distribution, and y' is the true distribution (the one-hot vector we feed in). A rough way to understand it: cross-entropy measures how inefficient our predictions are at describing the truth. A fuller explanation of cross-entropy is beyond the scope of this tutorial, but it is well worth understanding properly.
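A minimal NumPy sketch of this formula, with made-up distributions (not from the tutorial):

import numpy as np

y_true = np.array([0.0, 1.0, 0.0])   # one-hot label: class 1
y_pred = np.array([0.1, 0.8, 0.1])   # predicted distribution
# H_{y'}(y) = -sum_i y'_i * log(y_i); only the true-class term survives
print(-np.sum(y_true * np.log(y_pred)))   # -log(0.8) ≈ 0.223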
To compute the cross-entropy, we first need to add a new placeholder for feeding in the correct answers:
y_ = tf.placeholder("float", [None,10])
We can then compute the cross-entropy with:
cross_entropy = -tf.reduce_sum(y_*tf.log(y))
First, tf.log computes the logarithm of each element of y. Next, each element of y_ is multiplied by the corresponding element of tf.log(y). Finally, tf.reduce_sum adds up all the elements of the tensor. (Note that the cross-entropy here does not measure just a single prediction/truth pair; it is the sum of the cross-entropies over all 100 images. The prediction performance on 100 data points describes our model's performance better than a single data point does.)
Note: the code above computes the sum of the cross-entropies; normally we take the mean over the samples instead.
#here y is the value after softmax
cross_entropy = -tf.reduce_mean(tf.reduce_sum(y_*tf.log(y),reduction_indices=[1]))
The example instead uses:
#here logits=y is the prediction *before* softmax
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
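As a sanity check, here is a small sketch with made-up logits and labels (not from the example, assuming the TF 1.x API used throughout these notes) showing that the built-in op matches the manual formula:

import tensorflow as tf

logits = tf.constant([[1.0, 2.0, 3.0], [3.0, 1.0, 0.0]])  # made-up pre-softmax scores
labels = tf.constant([[0.0, 0.0, 1.0], [1.0, 0.0, 0.0]])  # one-hot truths

# manual: per-sample cross-entropy, then the mean over the batch
manual = tf.reduce_mean(-tf.reduce_sum(labels * tf.log(tf.nn.softmax(logits)), reduction_indices=[1]))
# built-in: expects the raw logits and applies softmax internally
builtin = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))

with tf.Session() as sess:
    print(sess.run([manual, builtin]))  # the two values agree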
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
A binary classification example (TensorFlow):
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
Load the data; here we do not need to prepend a column of 1's to the X matrix.
def load_data():
    datafile = 'data/ex2data1.txt'
    #!head $datafile
    cols = np.loadtxt(datafile, delimiter=',', usecols=(0,1,2), unpack=True)  # read in comma-separated data
    # Form the usual "X" matrix and "y" vector
    X = np.transpose(np.array(cols[:-1]))
    y = np.transpose(np.array(cols[-1:]))
    m = y.size  # number of training examples
    # Insert the usual column of 1's into the "X" matrix
    # X = np.insert(X, 0, 1, axis=1)
    return X, y  # X is (100,2), y is (100,1)
The to_categorical function converts the classes in y to one-hot encodings.
With two classes [0,1], class 0 becomes [1,0] and class 1 becomes [0,1].
With three classes [0,1,2], they become [1,0,0], [0,1,0], and [0,0,1].
def to_categorical(y, num_classes=None):
    """Converts a class vector (integers) to binary class matrix.
    E.g. for use with categorical_crossentropy.
    # Arguments
        y: class vector to be converted into a matrix
            (integers from 0 to num_classes).
        num_classes: total number of classes.
    # Returns
        A binary matrix representation of the input.
    """
    y = np.array(y, dtype='int').ravel()
    if not num_classes:
        num_classes = np.max(y) + 1
    n = y.shape[0]
    categorical = np.zeros((n, num_classes))
    categorical[np.arange(n), y] = 1
    return categorical
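A quick usage check with made-up labels:

print(to_categorical([0, 1, 1, 0], num_classes=2))
# [[1. 0.]
#  [0. 1.]
#  [0. 1.]
#  [1. 0.]]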
Shuffle: randomly select num samples from the dataset. A print(idx) has been added here for monitoring.
Note that data_shuffle starts out as a list of np arrays; np.asarray converts it back into a proper array.
def next_batch(num, data):
    """
    Return a total of `num` samples from the array `data`.
    """
    idx = np.arange(0, len(data))        # get all possible indexes
    np.random.shuffle(idx)               # shuffle indexes
    idx = idx[0:num]                     # use only `num` random indexes
    print(idx)
    data_shuffle = [data[i] for i in idx]    # get list of `num` random samples
    data_shuffle = np.asarray(data_shuffle)  # get back numpy array
    return data_shuffle
Build the model and initialize it:
train_X, train_y = load_data()
print(train_X.shape) #(100, 2)
print(train_y.shape) #(100, 1)
x = tf.placeholder("float", shape=[None, 2])
y_ = tf.placeholder("float", shape=[None, 2])
W = tf.Variable(tf.zeros([2, 2]))
b = tf.Variable(tf.zeros([2]))
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
The logits argument should be the value before softmax, e.g. [[1,2,3]].
The cross-entropy used here:
logits = tf.matmul(x, W) + b
y = tf.nn.softmax(logits)
#old-API version: cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_))
#note: pass the un-softmaxed logits here, not y, or softmax gets applied twice
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_))
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
# train_step.run(feed_dict={x: train_X, y_: train_y})
# print(sess.run(W), sess.run(b))
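A quick numeric illustration of the note above: passing already-softmaxed values as logits effectively applies softmax twice, which flattens the distribution. The softmax helper below is my own, for illustration only:

def softmax(v):                    # illustration-only helper
    e = np.exp(v - np.max(v))
    return e / e.sum()

z = np.array([1.0, 2.0, 3.0])      # made-up logits
print(softmax(z))                  # ≈ [0.09 0.24 0.67]
print(softmax(softmax(z)))         # ≈ [0.25 0.30 0.45] -- flattened, wrong input to the loss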
Feed the model 10 random samples at a time, looping 1000 times.
However, this use of next_batch is definitely wrong: the two calls shuffle independently, so batch_X and batch_y no longer line up (see the corrected sketch after the results below).
for i in range(1000):
    batch_X = next_batch(10, train_X)
    batch_y = next_batch(10, train_y)
    train_step.run(feed_dict={x: batch_X, y_: to_categorical(batch_y, num_classes=2)})
    print(i, sess.run(W), sess.run(b))
correct_prediction = tf.equal(tf.argmax(y_, 1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
print(accuracy.eval(feed_dict={x: train_X, y_: to_categorical(train_y)}))
Output from one run:
[64 45 31 21 84 24 53 78 14 52]
[99 54 32 13 57 96 59 7 28 43]
999 [[-0.08833836 0.08833836]
[-0.76764715 0.76764715]] [-0.005 0.005]
0.6
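The 0.6 accuracy above reflects the misaligned batches. A minimal fix sketch (my own, not from the original notes; next_batch_pair is a hypothetical helper name): draw one set of indices and use it for both X and y so the pairs stay aligned:

def next_batch_pair(num, X, y):
    """Return `num` aligned random (X, y) samples."""
    idx = np.random.choice(len(X), num, replace=False)  # one shared index draw
    return X[idx], y[idx]

for i in range(1000):
    batch_X, batch_y = next_batch_pair(10, train_X, train_y)
    train_step.run(feed_dict={x: batch_X, y_: to_categorical(batch_y, num_classes=2)})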
Binary classification (sklearn), which I don't fully understand yet:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
def load_data():
    datafile = 'data/ex2data1.txt'
    #!head $datafile
    cols = np.loadtxt(datafile, delimiter=',', usecols=(0,1,2), unpack=True)  # read in comma-separated data
    # Form the usual "X" matrix and "y" vector
    X = np.transpose(np.array(cols[:-1]))
    # ravel() and flatten() both reduce a multi-dimensional array to 1-D
    # numpy.flatten() returns a copy; modifying it does not affect the original data
    # numpy.ravel() returns a view; modifying it modifies the original data
    print(np.array(cols[-1:]).shape)  # (1,100)
    y = np.transpose(np.array(cols[-1:])).ravel()  # (100,1) -> (100,)
    m = y.size  # number of training examples
    # Insert the usual column of 1's into the "X" matrix
    # X = np.insert(X, 0, 1, axis=1)
    return X, y
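A quick illustration of the ravel()/flatten() difference described in the comments, on a made-up array:

a = np.array([[1, 2], [3, 4]])
v = a.ravel()     # view: shares memory with a (when a is contiguous)
f = a.flatten()   # copy: independent of a
v[0] = 99
f[1] = 77
print(a)          # [[99  2] [ 3  4]] -- the view write changed a; the copy write did not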
def plotData(X, y):
    # y is passed in explicitly; columns 0 and 1 are plotted because
    # the column of 1's was never inserted into X
    pos = np.array([X[i] for i in range(X.shape[0]) if y[i] == 1])
    neg = np.array([X[i] for i in range(X.shape[0]) if y[i] == 0])
    plt.figure(figsize=(10,6))
    plt.plot(pos[:,0], pos[:,1], 'k+', label='Admitted')
    plt.plot(neg[:,0], neg[:,1], 'yo', label='Not admitted')
    plt.xlabel('Exam 1 score')
    plt.ylabel('Exam 2 score')
    plt.legend()
    plt.grid(True)
X, y = load_data()
print(X)
print(y)
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X, y)
print(clf.coef_)
xx, yy = np.mgrid[np.min(X[:, 0]):np.max(X[:, 0]):2., np.min(X[:, 1]):np.max(X[:, 1]):2.]
grid = np.c_[xx.ravel(), yy.ravel()]
probs = clf.predict_proba(grid)[:, 1].reshape(xx.shape)
f, ax = plt.subplots(figsize=(8, 6))
contour = ax.contourf(xx, yy, probs, 25, cmap="RdBu",
vmin=0, vmax=1)
ax_c = f.colorbar(contour)
ax_c.set_label("$P(y = 1)$")
ax_c.set_ticks([0, .25, .5, .75, 1])
ax.scatter(X[:,0], X[:, 1], c=y[:], s=50,
cmap="RdBu", vmin=-.2, vmax=1.2,
edgecolor="white", linewidth=1)
ax.set(aspect="equal",
xlim=(30, 90), ylim=(30, 90),
xlabel="$X_1$", ylabel="$X_2$")
plt.show()