The softmax exercise is fairly straightforward: it follows the same workflow as the earlier SVM exercise, only the loss formula is different.
Two files are involved:
The first is softmax.ipynb in the root directory.
The second is cs231n/classifiers/softmax.py.
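Before going into the notebook, here is the only formula that actually changes: for an example with scores f = X[i].dot(W), the softmax (cross-entropy) loss is L_i = -log( exp(f[y_i]) / sum_j exp(f[j]) ), plus the usual L2 regularization on W. Below is a minimal NumPy sketch of that formula for a single example (the function name and toy numbers are illustrative only, not part of the assignment code):

import numpy as np

def softmax_loss_single(scores, correct_class):
    # Subtract the max before exponentiating: softmax is invariant to this
    # shift, and it keeps exp() from overflowing for large scores.
    shifted = scores - np.max(scores)
    probs = np.exp(shifted) / np.sum(np.exp(shifted))
    return -np.log(probs[correct_class])

# Ten equal scores give each class probability 0.1, so the loss is -log(0.1) ~ 2.30,
# which is exactly the sanity check used later in the notebook.
print(softmax_loss_single(np.zeros(10), correct_class=3))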
softmax.ipynb
Softmax exercise
Complete and hand in this completed worksheet (including its outputs and any supporting code outside of the worksheet) with your assignment submission. For more details see the assignments page on the course website.
This exercise is analogous to the SVM exercise. You will:
implement a fully-vectorized loss function for the Softmax classifier
implement the fully-vectorized expression for its analytic gradient
check your implementation with numerical gradient
use a validation set to tune the learning rate and regularization strength
optimize the loss function with SGD
visualize the final learned weights
In [1]:
import random
import numpy as np
from cs231n.data_utils import load_CIFAR10
import matplotlib.pyplot as plt
from __future__ import print_function
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
In [2]:
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000, num_dev=500):
    """
    Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
    it for the linear classifier. These are the same steps as we used for the
    SVM, but condensed to a single function.
    """
    # Load the raw CIFAR-10 data
    cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # subsample the data
    mask = list(range(num_training, num_training + num_validation))
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = list(range(num_training))
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = list(range(num_test))
    X_test = X_test[mask]
    y_test = y_test[mask]
    mask = np.random.choice(num_training, num_dev, replace=False)
    X_dev = X_train[mask]
    y_dev = y_train[mask]

    # Preprocessing: reshape the image data into rows
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_val = np.reshape(X_val, (X_val.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))
    X_dev = np.reshape(X_dev, (X_dev.shape[0], -1))

    # Normalize the data: subtract the mean image
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image
    X_dev -= mean_image

    # add bias dimension and transform into columns
    X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
    X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))])
    X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])
    X_dev = np.hstack([X_dev, np.ones((X_dev.shape[0], 1))])

    return X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev
# Invoke the above function to get our data.
X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev = get_CIFAR10_data()
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
print('dev data shape: ', X_dev.shape)
print('dev labels shape: ', y_dev.shape)
Train data shape: (49000, 3073)
Train labels shape: (49000,)
Validation data shape: (1000, 3073)
Validation labels shape: (1000,)
Test data shape: (1000, 3073)
Test labels shape: (1000,)
dev data shape: (500, 3073)
dev labels shape: (500,)
Softmax Classifier
Your code for this section will all be written inside cs231n/classifiers/softmax.py.
In [6]:
# First implement the naive softmax loss function with nested loops.
# Open the file cs231n/classifiers/softmax.py and implement the
# softmax_loss_naive function.
from cs231n.classifiers.softmax import softmax_loss_naive
import time
# Generate a random softmax weight matrix and use it to compute the loss.
W = np.random.randn(3073, 10) * 0.0001
loss, grad = softmax_loss_naive(W, X_dev, y_dev, 0.0)
# As a rough sanity check, our loss should be something close to -log(0.1).
print('loss: %f' % loss)
print('sanity check: %f' % (-np.log(0.1)))
loss: 2.339974
sanity check: 2.302585
Inline Question 1:
Why do we expect our loss to be close to -log(0.1)? Explain briefly.
Your answer: With small random weights the ten class scores are all close to zero, so the softmax assigns roughly equal probability 1/10 to each class, and the per-example cross-entropy loss is about -log(0.1) ≈ 2.30.
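If you want a quick numerical confirmation, the following snippet can be run in the same notebook (it assumes the cell above has already executed, so softmax_loss_naive, X_dev and y_dev are in scope):

# With W = 0 every score is 0, every class gets probability exactly 1/10,
# and the unregularized loss is exactly -log(0.1); the random-W loss above
# is just a slightly noisy version of this.
zero_loss, _ = softmax_loss_naive(np.zeros((3073, 10)), X_dev, y_dev, 0.0)
print(zero_loss, -np.log(0.1))  # both print 2.302585...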
In [7]:
# Complete the implementation of softmax_loss_naive and implement a (naive)
# version of the gradient that uses nested loops.
loss, grad = softmax_loss_naive(W, X_dev, y_dev, 0.0)
# As we did for the SVM, use numeric gradient checking as a debugging tool.
# The numeric gradient should be close to the analytic gradient.
from cs231n.gradient_check import grad_check_sparse
f = lambda w: softmax_loss_naive(w, X_dev, y_dev, 0.0)[0]
grad_numerical = grad_check_sparse(f, W, grad, 10)
# similar to SVM case, do another gradient check with regularization
loss, grad = softmax_loss_naive(W, X_dev, y_dev, 5e1)
f = lambda w: softmax_loss_naive(w, X_dev, y_dev, 5e1)[0]
grad_numerical = grad_check_sparse(f, W, grad, 10)
numerical: 1.464275 analytic: 1.464275, relative error: 2.040045e-08
numerical: -1.310010 analytic: -1.310010, relative error: 1.243980e-09
numerical: -1.426120 analytic: -1.426120, relative error: 1.817633e-08
numerical: -0.903849 analytic: -0.903849, relative error: 3.389652e-08
numerical: -2.001461 analytic: -2.001461, relative error: 2.635176e-08
numerical: -0.564966 analytic: -0.564966, relative error: 2.155435e-08
numerical: -0.136658 analytic: -0.136658, relative error: 3.808970e-07
numerical: 1.498520 analytic: 1.498520, relative error: 7.586528e-09
numerical: -0.507835 analytic: -0.507835, relative error: 3.374378e-08
numerical: -3.199066 analytic: -3.199066, relative error: 3.025549e-08
numerical: 1.881794 analytic: 1.881794, relative error: 1.405336e-08
numerical: 1.455125 analytic: 1.455125, relative error: 2.393593e-09
numerical: -0.752963 analytic: -0.752963, relative error: 1.049488e-07
numerical: 2.752752 analytic: 2.752752, relative error: 7.774718e-09
numerical: 0.168792 analytic: 0.168792, relative error: 6.825204e-08
numerical: 2.559652 analytic: 2.559652, relative error: 1.527773e-08
numerical: -0.288527 analytic: -0.288527, relative error: 6.532290e-08
numerical: -0.136916 analytic: -0.136916, relative error: 3.876003e-08
numerical: 3.747826 analytic: 3.747826, relative error: 1.589165e-08
numerical: 1.112363 analytic: 1.112363, relative error: 1.513127e-08
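For reference, grad_check_sparse samples a few random coordinates and compares the analytic gradient against a centered finite difference. Here is a self-contained sketch of that idea (this is not the course utility; the helper name is made up for illustration):

import numpy as np

def numeric_grad_at(f, W, index, h=1e-5):
    # Centered difference (f(W + h*e) - f(W - h*e)) / (2h) at one coordinate.
    W_plus = W.copy();  W_plus[index] += h
    W_minus = W.copy(); W_minus[index] -= h
    return (f(W_plus) - f(W_minus)) / (2.0 * h)

# Usage in the notebook (after the cell above):
# ix = (np.random.randint(W.shape[0]), np.random.randint(W.shape[1]))
# num = numeric_grad_at(lambda w: softmax_loss_naive(w, X_dev, y_dev, 0.0)[0], W, ix)
# rel_err = abs(num - grad[ix]) / (abs(num) + abs(grad[ix]))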
In [8]:
# Now that we have a naive implementation of the softmax loss function and its gradient,
# implement a vectorized version in softmax_loss_vectorized.
# The two versions should compute the same results, but the vectorized version should be
# much faster.
tic = time.time()
loss_naive, grad_naive = softmax_loss_naive(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('naive loss: %e computed in %fs' % (loss_naive, toc - tic))
from cs231n.classifiers.softmax import softmax_loss_vectorized
tic = time.time()
loss_vectorized, grad_vectorized = softmax_loss_vectorized(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('vectorized loss: %e computed in %fs' % (loss_vectorized, toc - tic))
# As we did for the SVM, we use the Frobenius norm to compare the two versions
# of the gradient.
grad_difference = np.linalg.norm(grad_naive - grad_vectorized, ord='fro')
print('Loss difference: %f' % np.abs(loss_naive - loss_vectorized))
print('Gradient difference: %f' % grad_difference)
naive loss: 2.339974e+00 computed in 0.064048s
vectorized loss: 2.339974e+00 computed in 0.003671s
Loss difference: 0.000000
Gradient difference: 0.000000
In [10]:
# Use the validation set to tune hyperparameters (regularization strength and
# learning rate). You should experiment with different ranges for the learning
# rates and regularization strengths; if you are careful you should be able to
# get a classification accuracy of over 0.35 on the validation set.
from cs231n.classifiers import Softmax
results = {}
best_val = -1
best_softmax = None
learning_rates = [1e-7, 5e-7]
regularization_strengths = [2.5e4, 5e4]
################################################################################
# TODO: #
# Use the validation set to set the learning rate and regularization strength. #
# This should be identical to the validation that you did for the SVM; save #
# the best trained softmax classifier in best_softmax.                         #
################################################################################
for reg in regularization_strengths:
    for lr in learning_rates:
        softmax = Softmax()
        loss_hist = softmax.train(X_train, y_train, lr, reg, num_iters=1500)
        y_train_pred = softmax.predict(X_train)
        train_accuracy = np.mean(y_train == y_train_pred)
        y_val_pred = softmax.predict(X_val)
        val_accuracy = np.mean(y_val == y_val_pred)
        if val_accuracy > best_val:
            best_val = val_accuracy
            best_softmax = softmax
        results[(lr, reg)] = train_accuracy, val_accuracy
################################################################################
# END OF YOUR CODE #
################################################################################
# Print out results.
for lr, reg in sorted(results):
    train_accuracy, val_accuracy = results[(lr, reg)]
    print('lr %e reg %e train accuracy: %f val accuracy: %f' % (
        lr, reg, train_accuracy, val_accuracy))
print('best validation accuracy achieved during cross-validation: %f' % best_val)
lr 1.000000e-07 reg 2.500000e+04 train accuracy: 0.351776 val accuracy: 0.363000
lr 1.000000e-07 reg 5.000000e+04 train accuracy: 0.331980 val accuracy: 0.345000
lr 5.000000e-07 reg 2.500000e+04 train accuracy: 0.347939 val accuracy: 0.355000
lr 5.000000e-07 reg 5.000000e+04 train accuracy: 0.322000 val accuracy: 0.333000
best validation accuracy achieved during cross-validation: 0.363000
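Validation accuracy peaks at the smallest learning rate and regularization strength in the grid, which hints that a wider or finer search could do slightly better. A hedged sketch of generating random candidates on a log scale around that corner (the ranges below are guesses, not values from this run); the candidates would then be fed through the same tuning loop as above:

import numpy as np

np.random.seed(0)  # reproducible candidates, for illustration only
learning_rates = 10 ** np.random.uniform(-7.5, -6.0, size=5)
regularization_strengths = 10 ** np.random.uniform(3.5, 4.8, size=5)
print(sorted(learning_rates))
print(sorted(regularization_strengths))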
In [11]:
# evaluate on test set
# Evaluate the best softmax on test set
y_test_pred = best_softmax.predict(X_test)
test_accuracy = np.mean(y_test == y_test_pred)
print('softmax on raw pixels final test set accuracy: %f' % (test_accuracy, ))
softmax on raw pixels final test set accuracy: 0.351000
In [12]:
# Visualize the learned weights for each class
w = best_softmax.W[:-1,:] # strip out the bias
w = w.reshape(32, 32, 3, 10)
w_min, w_max = np.min(w), np.max(w)
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
for i in range(10):
    plt.subplot(2, 5, i + 1)
    # Rescale the weights to be between 0 and 255
    wimg = 255.0 * (w[:, :, :, i].squeeze() - w_min) / (w_max - w_min)
    plt.imshow(wimg.astype('uint8'))
    plt.axis('off')
    plt.title(classes[i])
cs231n/classifiers/softmax.py
import numpy as np
from random import shuffle
from past.builtins import xrange
def softmax_loss_naive(W, X, y, reg):
    """
    Softmax loss function, naive implementation (with loops)

    Inputs have dimension D, there are C classes, and we operate on minibatches
    of N examples.

    Inputs:
    - W: A numpy array of shape (D, C) containing weights.
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c means
      that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss as single float
    - gradient with respect to weights W; an array of same shape as W
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)

    #############################################################################
    # TODO: Compute the softmax loss and its gradient using explicit loops.     #
    # Store the loss in loss and the gradient in dW. If you are not careful     #
    # here, it is easy to run into numeric instability. Don't forget the        #
    # regularization!                                                           #
    #############################################################################
    dW_each = np.zeros_like(W)
    num_train, dim = X.shape
    num_class = W.shape[1]
    f = X.dot(W)  # scores, N by C
    # For numeric stability: subtract each row's maximum before exponentiating.
    # Softmax is invariant to this shift, and it keeps exp() from overflowing.
    f_max = np.reshape(np.max(f, axis=1), (num_train, 1))
    prob = np.exp(f - f_max) / np.sum(np.exp(f - f_max), axis=1, keepdims=True)  # N by C
    y_trueClass = np.zeros_like(prob)
    y_trueClass[np.arange(num_train), y] = 1.0  # one-hot labels, N by C
    for i in xrange(num_train):
        for j in xrange(num_class):
            # Loss: L = -(1/N) * sum_i log(prob[i, y[i]]) + 0.5 * reg * sum(W * W)
            loss += -(y_trueClass[i, j] * np.log(prob[i, j]))
            # Gradient: dL/dW[:, j] = -(1/N) * sum_i (1{j == y[i]} - prob[i, j]) * X[i, :] + reg * W[:, j]
            dW_each[:, j] = -(y_trueClass[i, j] - prob[i, j]) * X[i, :]
        dW += dW_each  # accumulate this example's gradient over all classes
    loss /= num_train
    loss += 0.5 * reg * np.sum(W * W)
    dW /= num_train
    dW += reg * W
    #############################################################################
    #                          END OF YOUR CODE                                 #
    #############################################################################

    return loss, dW
def softmax_loss_vectorized(W, X, y, reg):
    """
    Softmax loss function, vectorized version.

    Inputs and outputs are the same as softmax_loss_naive.
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)

    #############################################################################
    # TODO: Compute the softmax loss and its gradient using no explicit loops.  #
    # Store the loss in loss and the gradient in dW. If you are not careful     #
    # here, it is easy to run into numeric instability. Don't forget the        #
    # regularization!                                                           #
    #############################################################################
    num_train, dim = X.shape
    f = X.dot(W)  # scores, (N, C)
    f_max = np.reshape(np.max(f, axis=1), (num_train, 1))  # (N, 1), for numeric stability
    prob = np.exp(f - f_max) / np.sum(np.exp(f - f_max), axis=1, keepdims=True)
    y_trueClass = np.zeros_like(prob)
    y_trueClass[range(num_train), y] = 1.0  # one-hot labels, N by C
    loss += -np.sum(y_trueClass * np.log(prob)) / num_train + 0.5 * reg * np.sum(W * W)
    dW += -np.dot(X.T, y_trueClass - prob) / num_train + reg * W
    #############################################################################
    #                          END OF YOUR CODE                                 #
    #############################################################################

    return loss, dW
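The max-subtraction used in both implementations deserves a standalone demonstration: softmax is unchanged when a constant is subtracted from every score, but without the shift the exponentials overflow for large scores. A tiny sketch with toy numbers (not part of softmax.py):

import numpy as np

scores = np.array([1000.0, 1001.0, 1002.0])

# Naive softmax overflows: exp(1000) is inf, so the result is all nan.
naive = np.exp(scores) / np.sum(np.exp(scores))

# Shifted softmax is mathematically identical but numerically safe.
shifted = scores - np.max(scores)
stable = np.exp(shifted) / np.sum(np.exp(shifted))

print(naive)   # [nan nan nan], with a RuntimeWarning about overflow
print(stable)  # [0.09003057 0.24472847 0.66524096]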