1. KNN
Three implementations of kNN
import numpy as np


class KNearestNeighbor(object):
    """ a kNN classifier with L2 distance """

    def __init__(self):
        pass

    def train(self, X, y):
        """
        Train the classifier. For k-nearest neighbors this is just
        memorizing the training data.

        Inputs:
        - X: A numpy array of shape (num_train, D) containing the training data
          consisting of num_train samples each of dimension D.
        - y: A numpy array of shape (N,) containing the training labels, where
          y[i] is the label for X[i].
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X, k=1, num_loops=0):
        """
        Predict labels for test data using this classifier.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data consisting
          of num_test samples each of dimension D.
        - k: The number of nearest neighbors that vote for the predicted labels.
        - num_loops: Determines which implementation to use to compute distances
          between training points and testing points.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for
          the test data, where y[i] is the predicted label for the test point X[i].
        """
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        elif num_loops == 2:
            dists = self.compute_distances_two_loops(X)
        else:
            raise ValueError('Invalid value %d for num_loops' % num_loops)

        return self.predict_labels(dists, k=k)

    def compute_distances_two_loops(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using a nested loop over both the training data and the
        test data.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data.

        Returns:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          is the Euclidean distance between the ith test point and the jth
          training point.
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                # L2 distance between the ith test point and the jth training
                # point, without looping over the feature dimension.
                # dists[i, j] = np.sqrt(np.sum(np.square(X[i, :] - self.X_train[j, :])))
                dists[i, j] = np.sqrt(np.dot(X[i] - self.X_train[j],
                                             X[i] - self.X_train[j]))
        return dists

    def compute_distances_one_loop(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using a single loop over the test data.

        Input / Output: Same as compute_distances_two_loops
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            # L2 distance between the ith test point and all training points,
            # using broadcasting over the rows of X_train.
            dists[i, :] = np.sqrt(np.sum(np.square(X[i] - self.X_train), axis=1))
        return dists

    def compute_distances_no_loops(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using no explicit loops.

        Input / Output: Same as compute_distances_two_loops
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        # Formulate the squared L2 distance as one matrix multiplication plus
        # two broadcast sums: ||x - t||^2 = ||x||^2 + ||t||^2 - 2 * x . t
        matrix_1 = (np.sum(np.square(self.X_train), axis=1).reshape(num_train, 1)
                    @ np.ones((1, num_test))).T      # ||t||^2 for every training point
        matrix_2 = (np.ones((num_train, 1))
                    @ np.sum(np.square(X), axis=1).reshape(-1, num_test)).T  # ||x||^2 for every test point
        merge = -2 * (self.X_train @ X.T).T          # cross term -2 * x . t
        dists = np.sqrt(matrix_1 + matrix_2 + merge)
        return dists

    def predict_labels(self, dists, k=1):
        """
        Given a matrix of distances between test points and training points,
        predict a label for each test point.

        Inputs:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          gives the distance between the ith test point and the jth training point.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for
          the test data, where y[i] is the predicted label for the test point X[i].
        """
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            # argsort returns the indices that would sort the array in ascending
            # order, so the first k indices are the k nearest neighbors.
            distance = np.argsort(dists[i])[:k]
            closest_y = self.y_train[distance]
            # Most common label among the k neighbors; np.argmax on np.bincount
            # breaks ties by choosing the smaller label.
            y_pred[i] = np.argmax(np.bincount(closest_y))
        return y_pred
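The no-loop version works because the squared L2 distance expands into terms that can each be computed for all pairs at once:

$$\lVert x_i - t_j \rVert^2 = \lVert x_i \rVert^2 + \lVert t_j \rVert^2 - 2\, x_i \cdot t_j$$

so matrix_2 holds the test-point norms, matrix_1 the training-point norms, and merge the cross term. The explicit tiling with np.ones can also be replaced by plain broadcasting; a minimal equivalent sketch using the same variables:

    dists = np.sqrt(np.sum(X ** 2, axis=1)[:, None]                # ||x||^2 as a column vector
                    + np.sum(self.X_train ** 2, axis=1)[None, :]   # ||t||^2 as a row vector
                    - 2 * X @ self.X_train.T)                      # cross term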
Two loop version took 22.063287 seconds
One loop version took 80.193520 seconds
No loop version took 0.386157 seconds
This is the timing comparison of the three implementations; the advantage of the fully vectorized version is clear.
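For reference, timings like these can be reproduced with a small wrapper around time.time() (a sketch; the names classifier and X_test are assumed to come from the notebook):

    import time

    def time_function(f, *args):
        # Return how many seconds it takes to call f with the given args.
        tic = time.time()
        f(*args)
        toc = time.time()
        return toc - tic

    two_loop_time = time_function(classifier.compute_distances_two_loops, X_test)
    one_loop_time = time_function(classifier.compute_distances_one_loop, X_test)
    no_loop_time = time_function(classifier.compute_distances_no_loops, X_test)
    print('Two loop version took %f seconds' % two_loop_time)
    print('One loop version took %f seconds' % one_loop_time)
    print('No loop version took %f seconds' % no_loop_time)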
Cross-validation implementation
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

# Split the training data into num_folds folds; X_train_folds[i] and
# y_train_folds[i] hold the data and labels of the ith fold.
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)

# A dictionary holding the accuracies for different values of k that we find
# when running cross-validation. After running cross-validation,
# k_to_accuracies[k] should be a list of length num_folds giving the different
# accuracy values that we found when using that value of k.
k_to_accuracies = {}

# Perform k-fold cross-validation: for each k, train on all folds but one and
# evaluate on the held-out fold, num_folds times.
for k in k_choices:
    accuracies = np.zeros(num_folds)
    # My original attempt, which kept raising errors (see the note below):
    # for num in range(num_folds):
    #     x_train = X_train_folds[num]
    #     y_test = y_train_folds[num]
    #     x_train = []
    #     y_train = []
    #     to_train = [x for x in np.arange(num_folds) if x != num]
    #     for x in to_train:
    #         x_train.append(X_train_folds[x])
    #         y_train.append(y_train_folds[x])
    #     classifier.train(x_train, y_train)
    #     y_test_pred = classifier.predict_labels(x_test, k=k)
    #     num_correct = np.sum(y_test_pred == y_test)
    #     accuracy = float(num_correct) / num_test
    #     accuracies[k][num] = accuracy
    for fold in range(num_folds):
        temp_X = X_train_folds[:]
        temp_y = y_train_folds[:]
        X_validate_fold = temp_X.pop(fold)
        y_validate_fold = temp_y.pop(fold)
        temp_X = np.array([y for x in temp_X for y in x])
        temp_y = np.array([y for x in temp_y for y in x])
        classifier.train(temp_X, temp_y)
        y_test_pred = classifier.predict(X_validate_fold, k=k)
        num_correct = np.sum(y_test_pred == y_validate_fold)
        # Divide by the size of the validation fold, not by num_test.
        accuracy = float(num_correct) / y_validate_fold.shape[0]
        accuracies[fold] = accuracy
    k_to_accuracies[k] = accuracies

# Print out the computed accuracies
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))

My own version (the commented-out loop above) kept throwing errors, so in the end I followed someone else's implementation; if anyone can point out what went wrong, I'd appreciate it.
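On a second look, the commented-out attempt appears to fail for a few separate reasons: x_train and y_train are immediately overwritten with empty lists (clobbering the global y_train in the process), the per-fold arrays are passed to classifier.train as Python lists without being stacked into a single array, x_test is never defined (and predict_labels expects a distance matrix rather than raw data), and accuracies is a 1-D array, so accuracies[k][num] is invalid indexing. A minimal sketch of the inner loop using np.concatenate, with the same variables as above:

    for fold in range(num_folds):
        # Hold out one fold for validation, stack the remaining folds for training.
        X_val_fold = X_train_folds[fold]
        y_val_fold = y_train_folds[fold]
        X_tr = np.concatenate(X_train_folds[:fold] + X_train_folds[fold + 1:])
        y_tr = np.concatenate(y_train_folds[:fold] + y_train_folds[fold + 1:])

        classifier.train(X_tr, y_tr)
        y_val_pred = classifier.predict(X_val_fold, k=k)
        accuracies[fold] = np.mean(y_val_pred == y_val_fold)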
Reference
【实验小结】cs231n assignment1 knn 部分

Two implementations of SVM
import numpy as np


def svm_loss_naive(W, X, y, reg):
    """
    Structured SVM loss function, naive implementation (with loops).

    Inputs have dimension D, there are C classes, and we operate on minibatches
    of N examples.

    Inputs:
    - W: A numpy array of shape (D, C) containing weights.
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c means
      that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss as single float
    - gradient with respect to weights W; an array of same shape as W
    """
    dW = np.zeros(W.shape)  # initialize the gradient as zero, shape [D, C]

    # compute the loss and the gradient
    num_classes = W.shape[1]
    num_train = X.shape[0]
    loss = 0.0
    for i in range(num_train):
        scores = X[i].dot(W)  # [1, D] * [D, C]
        correct_class_score = scores[y[i]]
        for j in range(num_classes):
            if j == y[i]:
                continue
            margin = scores[j] - correct_class_score + 1  # note delta = 1
            if margin > 0:
                loss += margin
                # Each positive margin pulls the correct-class weights down
                # and pushes the jth class weights up by X[i].
                dW[:, y[i]] -= X[i, :].T
                dW[:, j] += X[i, :].T

    # Right now the loss is a sum over all training examples, but we want it
    # to be an average instead so we divide by num_train.
    loss /= num_train
    dW /= num_train

    # Add regularization to the loss and the gradient.
    loss += reg * np.sum(W * W)
    dW += reg * W

    return loss, dW


def svm_loss_vectorized(W, X, y, reg):
    """
    Structured SVM loss function, vectorized implementation.

    Inputs and outputs are the same as svm_loss_naive.
    """
    loss = 0.0
    dW = np.zeros(W.shape)  # initialize the gradient as zero

    num_classes = W.shape[1]
    num_train = X.shape[0]

    scores = X.dot(W)  # [N, D] * [D, C]
    correct_class_score = scores[np.arange(num_train), y]
    correct_class_score = np.reshape(correct_class_score, (num_train, -1))
    margin = scores - correct_class_score + 1
    margin = np.maximum(margin, 0)
    margin[np.arange(num_train), y] = 0
    # margin is now an [N, C] array that can be indexed directly.
    loss += np.sum(margin)
    loss /= num_train
    loss += reg * np.sum(W ** 2)

    # Gradient: each positive margin contributes +X[i] to column j and -X[i]
    # to the correct-class column, so reuse margin as an indicator matrix.
    margin[margin > 0] = 1
    row_sum = np.sum(margin, axis=1)
    margin[np.arange(num_train), y] -= row_sum
    dW = 1.0 / num_train * np.dot(X.T, margin) + reg * W

    return loss, dW
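The indicator trick used for dW in svm_loss_vectorized follows directly from differentiating the per-example hinge loss; a short derivation sketch in the notation of the code (delta = 1, scores s = W^T x_i):

$$L_i = \sum_{j \neq y_i} \max\bigl(0,\; s_j - s_{y_i} + 1\bigr)$$

$$\frac{\partial L_i}{\partial w_j} = \mathbb{1}\bigl(s_j - s_{y_i} + 1 > 0\bigr)\, x_i \;\; (j \neq y_i), \qquad \frac{\partial L_i}{\partial w_{y_i}} = -\Bigl(\sum_{j \neq y_i} \mathbb{1}\bigl(s_j - s_{y_i} + 1 > 0\bigr)\Bigr)\, x_i$$

Setting the positive margins to 1 builds exactly this indicator matrix, subtracting the row sums in the y-th column handles the correct-class term, and X.T @ margin sums the per-example contributions.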
Reference tutorial
cs231n assignment1 关于svm_loss_vectorized中代码的梯度部分

linear_classifier implementation
class LinearClassifier(object):

    def __init__(self):
        self.W = None

    def train(self, X, y, learning_rate=1e-3, reg=1e-5, num_iters=100,
              batch_size=200, verbose=False):
        """
        Train this linear classifier using stochastic gradient descent.

        Inputs:
        - X: A numpy array of shape (N, D) containing training data; there are N
          training samples each of dimension D.
        - y: A numpy array of shape (N,) containing training labels; y[i] = c
          means that X[i] has label 0 <= c < C for C classes.
        - learning_rate: (float) learning rate for optimization.
        - reg: (float) regularization strength.
        - num_iters: (integer) number of steps to take when optimizing
        - batch_size: (integer) number of training examples to use at each step.
        - verbose: (boolean) If true, print progress during optimization.

        Outputs:
        A list containing the value of the loss function at each training iteration.
        """
        num_train, dim = X.shape
        num_classes = np.max(y) + 1  # assume y takes values 0...K-1 where K is number of classes
        if self.W is None:
            # lazily initialize W
            self.W = 0.001 * np.random.randn(dim, num_classes)

        # Run stochastic gradient descent to optimize W
        loss_history = []
        for it in range(num_iters):
            # Sample a minibatch of batch_size examples and their labels.
            # Sampling with replacement is faster than sampling without.
            bt = np.random.choice(num_train, batch_size)
            X_batch = X[bt]
            y_batch = y[bt]

            # evaluate loss and gradient
            loss, grad = self.loss(X_batch, y_batch, reg)
            loss_history.append(loss)

            # perform parameter update
            self.W = self.W - learning_rate * grad

            if verbose and it % 100 == 0:
                print('iteration %d / %d: loss %f' % (it, num_iters, loss))

        return loss_history

    def predict(self, X):
        """
        Use the trained weights of this linear classifier to predict labels for
        data points.

        Inputs:
        - X: A numpy array of shape (N, D) containing training data; there are N
          training samples each of dimension D.

        Returns:
        - y_pred: Predicted labels for the data in X. y_pred is a 1-dimensional
          array of length N, and each element is an integer giving the predicted
          class.
        """
        # The predicted class is the one with the highest score.
        y_pred = np.argmax(X @ self.W, axis=1)
        return y_pred
Training and hyperparameter tuning
# Use the validation set to tune hyperparameters (regularization strength and
# learning rate). You should experiment with different ranges for the learning
# rates and regularization strengths; if you are careful you should be able to
# get a classification accuracy of about 0.4 on the validation set.
learning_rates = [1e-7, 5e-5, 1e-3, 4e-6, 1e-6, 3e-7]
regularization_strengths = [2.5e4, 5e4, 3e3, 5e3]

# results is a dictionary mapping tuples of the form
# (learning_rate, regularization_strength) to tuples of the form
# (training_accuracy, validation_accuracy). The accuracy is simply the fraction
# of data points that are correctly classified.
results = {}
best_val = -1    # The highest validation accuracy that we have seen so far.
best_svm = None  # The LinearSVM object that achieved the highest validation accuracy.

# For each combination of hyperparameters, train a LinearSVM on the training
# set, record its training and validation accuracy, and keep the best model.
for lr in learning_rates:
    for rs in regularization_strengths:
        svm = LinearSVM()
        svm.train(X_train, y_train, learning_rate=lr, reg=rs, num_iters=2000)
        y_train_pred = svm.predict(X_train)
        train_acc = np.mean(y_train == y_train_pred)
        y_val_pred = svm.predict(X_val)
        val_acc = np.mean(y_val == y_val_pred)
        results[(lr, rs)] = (train_acc, val_acc)
        if best_val < val_acc:
            best_val = val_acc
            best_svm = svm

# Print out results.
for lr, reg in sorted(results):
    train_accuracy, val_accuracy = results[(lr, reg)]
    print('lr %e reg %e train accuracy: %f val accuracy: %f' % (
        lr, reg, train_accuracy, val_accuracy))

print('best validation accuracy achieved during cross-validation: %f' % best_val)
# P.S. my best validation accuracy was 0.393000, which falls just short of 0.4.
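After the search, best_svm would typically be evaluated once on the test set (a sketch, assuming X_test and y_test from the notebook):

    y_test_pred = best_svm.predict(X_test)
    test_accuracy = np.mean(y_test == y_test_pred)
    print('linear SVM on raw pixels final test set accuracy: %f' % test_accuracy)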
Naive and vectorized softmax implementations
import numpy as np


def softmax_loss_naive(W, X, y, reg):
    """
    Softmax loss function, naive implementation (with loops)

    Inputs have dimension D, there are C classes, and we operate on minibatches
    of N examples.

    Inputs:
    - W: A numpy array of shape (D, C) containing weights.
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c means
      that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss as single float
    - gradient with respect to weights W; an array of same shape as W
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)

    num_train = X.shape[0]
    num_class = W.shape[1]
    for i in range(num_train):
        f_i = X[i] @ W      # scores for example i, shape (C,)
        f_i -= np.max(f_i)  # shift the scores for numeric stability
        f_i_sum = np.sum(np.exp(f_i))
        p = lambda k: np.exp(f_i[k]) / f_i_sum  # softmax probability of class k
        loss -= np.log(p(y[i]))
        for k in range(num_class):
            p_k = p(k)
            # dL/dw_k = (p_k - 1{k == y_i}) * x_i, one (D,) column of the gradient
            dW[:, k] += (p_k - (k == y[i])) * X[i]

    loss /= num_train
    loss += 0.5 * reg * np.sum(W * W)
    dW /= num_train
    dW += reg * W  # gradient of 0.5 * reg * sum(W^2)

    return loss, dW


def softmax_loss_vectorized(W, X, y, reg):
    """
    Softmax loss function, vectorized version.

    Inputs and outputs are the same as softmax_loss_naive.
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)

    num_train = X.shape[0]
    num_class = W.shape[1]

    f = X @ W                              # scores, shape [N, C]
    f -= np.max(f, axis=1, keepdims=True)  # shift each row for numeric stability
    f_sum = np.sum(np.exp(f), axis=1, keepdims=True)
    p = np.exp(f) / f_sum                  # softmax probabilities, shape [N, C]
    loss += np.sum(-np.log(p[np.arange(num_train), y]))

    # dL/dW = X^T (p - Y), where Y is the one-hot matrix of the true labels.
    tmp = np.zeros(p.shape)
    tmp[np.arange(num_train), y] = 1
    dW = X.T @ (p - tmp)

    loss /= num_train
    loss += 0.5 * reg * np.sum(W * W)
    dW /= num_train
    dW += reg * W

    return loss, dW
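Both versions rely on the standard softmax / cross-entropy gradient, which is also what the reference article below derives; a short sketch in the notation of the code:

$$L_i = -\log p_{y_i}, \qquad p_k = \frac{e^{f_k}}{\sum_j e^{f_j}}, \qquad \frac{\partial L_i}{\partial f_k} = p_k - \mathbb{1}(k = y_i)$$

With f = W^T x_i, the chain rule gives

$$\frac{\partial L_i}{\partial w_k} = \bigl(p_k - \mathbb{1}(k = y_i)\bigr)\, x_i$$

which is exactly the (p_k - (k == y[i])) * X[i] update in the naive loop and the X.T @ (p - tmp) product in the vectorized version.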
Choosing the best learning rate and regularization strength
Same approach as for the SVM.
for lr in learning_rates:
    for reg in regularization_strengths:
        soft = Softmax()
        soft.train(X_train, y_train, learning_rate=lr, reg=reg, num_iters=2000)
        y_train_pred = soft.predict(X_train)
        train_acc = np.mean(y_train == y_train_pred)
        y_val_pred = soft.predict(X_val)
        val_acc = np.mean(y_val == y_val_pred)
        results[(lr, reg)] = (train_acc, val_acc)
        if best_val < val_acc:
            best_val = val_acc
            best_softmax = soft
Reference article
简单易懂的softmax交叉熵损失函数求导 - CSDN博客
