knn.py # coding=gbk # Run some setup code for this notebook. from __future__ import print_function import sys sys.path.append('/home/wyx/my_train') import random import numpy as np from cs231n.data_utils import load_CIFAR10 import matplotlib.pyplot as plt import os # This is a bit of magic to make matplotlib figures appear inline in the notebook # rather than in a new window. plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots plt.rcParams['image.interpolation'] = 'nearest' plt.rcParams['image.cmap'] = 'gray' # Some more magic so that the notebook will reload external python modules; # see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython cifar10_dir = 'cs231n//datasets//cifar-10-batches-py' # Cleaning up variables to prevent loading data multiple times (which may cause memory issue) try: del X_train, y_train del X_test, y_test print('Clear previously loaded data.') except: pass X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir) # As a sanity check, we print out the size of the training and test data. print('Training data shape: ', X_train.shape) print('Training labels shape: ', y_train.shape) print('Test data shape: ', X_test.shape) print('Test labels shape: ', y_test.shape) ''' Training data shape: (50000, 32, 32, 3) Training labels shape: (50000,) Test data shape: (10000, 32, 32, 3) Test labels shape: (10000,) ''' # Visualize some examples from the dataset. # We show a few examples of training images from each class. 
# Grid of sample images: one column per class, one row per drawn sample.
# A class's position in this list is the label value it is matched against
# in y_train.
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
num_classes = len(classes)
samples_per_class = 7
for column, class_name in enumerate(classes):
    # Indices of all training images carrying this label, then a random
    # subset of them without repetition.
    candidates = np.flatnonzero(y_train == column)
    chosen = np.random.choice(candidates, samples_per_class, replace=False)
    for row, sample_idx in enumerate(chosen):
        # Subplot positions are 1-based and numbered row by row.
        plt.subplot(samples_per_class, num_classes, row * num_classes + column + 1)
        plt.imshow(X_train[sample_idx].astype('uint8'))
        plt.axis('off')
        if row == 0:
            # Only the top image of each column carries the class name.
            plt.title(class_name)
# plt.show()

# Subsample the data for more efficient code execution in this exercise:
# keep the first num_training training rows and the first num_test test rows.
num_training = 5000
mask = list(range(num_training))
X_train, y_train = X_train[mask], y_train[mask]

num_test = 500
mask = list(range(num_test))
X_test, y_test = X_test[mask], y_test[mask]

# Reshape the image data into rows (one flattened image per row).
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
print(X_train.shape, X_test.shape)

from cs231n.classifiers import KNearestNeighbor

# Create a kNN classifier instance and hand it the training data.
# Per the exercise notes, training a kNN classifier is a no-op: the
# classifier simply remembers the data and does no further processing.
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)

# Distance matrix between the test rows and the training rows, computed with
# the two-loop implementation from cs231n/classifiers/k_nearest_neighbor.py.
dists = classifier.compute_distances_two_loops(X_test)
print(dists.shape)

# The string literal below is a disabled part of the exercise (distance-matrix
# visualisation, k=1 / k=5 accuracy, and timing of the three distance
# implementations). It is never executed.
'''
# We can visualize the distance matrix: each row is a single test example and
# its distances to training examples
plt.imshow(dists, interpolation='none')
plt.show()

# Now implement the function predict_labels and run the code below:
# We use k = 1 (which is Nearest Neighbor).
y_test_pred = classifier.predict_labels(dists, k=1)

# Compute and print the fraction of correctly predicted examples
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
#Got 139 / 500 correct => accuracy: 0.278000

y_test_pred = classifier.predict_labels(dists, k=5)
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
#Got 139 / 500 correct => accuracy: 0.278000

# Now lets speed up distance matrix computation by using partial vectorization
# with one loop. Implement the function compute_distances_one_loop and run the
# code below:
dists_one = classifier.compute_distances_one_loop(X_test)

# To ensure that our vectorized implementation is correct, we make sure that it
# agrees with the naive implementation. There are many ways to decide whether
# two matrices are similar; one of the simplest is the Frobenius norm. In case
# you haven't seen it before, the Frobenius norm of two matrices is the square
# root of the squared sum of differences of all elements; in other words, reshape
# the matrices into vectors and compute the Euclidean distance between them.
difference = np.linalg.norm(dists - dists_one, ord='fro')
print('Difference was: %f' % (difference,))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')

dists_zero = classifier.compute_distances_no_loops(X_test)

# To ensure that our vectorized implementation is correct, we make sure that it
# agrees with the naive implementation. There are many ways to decide whether
# two matrices are similar; one of the simplest is the Frobenius norm. In case
# you haven't seen it before, the Frobenius norm of two matrices is the square
# root of the squared sum of differences of all elements; in other words, reshape
# the matrices into vectors and compute the Euclidean distance between them.
difference = np.linalg.norm(dists - dists_zero, ord='fro')
print('Difference was: %f' % (difference,))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')

# Let's compare how fast the implementations are
def time_function(f, *args):
    """
    Call a function f with args and return the time (in seconds) that it took to execute.
    """
    import time
    tic = time.time()
    f(*args)
    toc = time.time()
    return toc - tic

two_loop_time = time_function(classifier.compute_distances_two_loops, X_test)
print('Two loop version took %f seconds' % two_loop_time)

one_loop_time = time_function(classifier.compute_distances_one_loop, X_test)
print('One loop version took %f seconds' % one_loop_time)

no_loop_time = time_function(classifier.compute_distances_no_loops, X_test)
print('No loop version took %f seconds' % no_loop_time)

# you should see significantly faster performance with the fully vectorized implementation
# Good! The distance matrices are the same
# Two loop version took 43.605151 seconds
# One loop version took 71.236836 seconds
# No loop version took 0.178438 seconds
'''

# Cross-validation settings: number of folds and the candidate values of k.
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

# Filled in by the fold-splitting step that follows.
X_train_folds = []
y_train_folds = []

# TODO:
# Split up the training data into folds. After splitting, X_train_folds and
# y_train_folds should each be lists of length num_folds, where
# y_train_folds[i] is the label vector for the points in X_train_folds[i].
# Hint: Look up the numpy array_split function.
# Split the training data (and only the training data; the test set is not
# involved) into num_folds consecutive folds. X_train_folds and y_train_folds
# are lists of length num_folds, and y_train_folds[i] holds the labels for the
# rows in X_train_folds[i].
# np.array_split is used rather than np.split so that a sample count that is
# not an exact multiple of num_folds yields slightly uneven folds instead of
# raising ValueError. Both arrays are cut at the same positions along axis 0,
# so rows and labels stay aligned.
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)

# A dictionary holding the accuracies for different values of k that we find
# when running cross-validation. After running cross-validation,
# k_to_accuracies[k] should be a list of length num_folds giving the different
# accuracy values that we found when using that value of k.
k_to_accuracies = {}

# TODO:
# Perform k-fold cross validation to find the best value of k. For each
# possible value of k, run the k-nearest-neighbor algorithm num_folds times,
# where in each case you use all but one of the folds as training data and the
# last fold as a validation set. Store the accuracies for all fold and all
# values of k in the k_to_accuracies dictionary.
# Use k-fold cross-validation to find the value of k that suits this data
# best. For every candidate k in k_choices the classifier is evaluated
# num_folds times, so k_to_accuracies[k] ends up as a list of num_folds
# accuracies.
#
# Training vs. validation data: the training folds are given to the
# classifier together with their labels, while the labels of the held-out
# validation fold are used only to score the predictions.
for hyper_k in k_choices:
    k_to_accuracies[hyper_k] = []

for i in range(num_folds):
    # Fold i is the validation set for this round.
    temp_test_x = X_train_folds[i]
    temp_test_y = y_train_folds[i]

    # Every other fold forms the training set. The fold lists are only
    # sliced, never mutated, so each round sees all num_folds folds.
    temp_train_x = np.concatenate(X_train_folds[:i] + X_train_folds[i + 1:])
    temp_train_y = np.concatenate(y_train_folds[:i] + y_train_folds[i + 1:])

    classifier.train(temp_train_x, temp_train_y)

    # The distance matrix does not depend on k, so it is computed once per
    # fold and reused for every candidate k.
    dists_no_loop = classifier.compute_distances_no_loops(temp_test_x)
    for hyper_k in k_choices:
        temp_y_test_pred = classifier.predict_labels(dists_no_loop, k=hyper_k)

        # Fraction of correctly predicted validation examples.
        num_correct = np.sum(temp_y_test_pred == temp_test_y)
        accuracy = float(num_correct) / temp_test_x.shape[0]
        k_to_accuracies[hyper_k].append(accuracy)

# Print out the computed accuracies
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))

# Start a fresh figure: pyplot draws into the current axes, and earlier
# plotting code in this script may have left one active.
plt.figure()

# plot the raw observations
for k in k_choices:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)

# plot the trend line with error bars that correspond to standard deviation;
# the x values and the statistics are derived from the same sorted key list
# so that they always line up.
sorted_ks = sorted(k_to_accuracies)
accuracies_mean = np.array([np.mean(k_to_accuracies[k]) for k in sorted_ks])
accuracies_std = np.array([np.std(k_to_accuracies[k]) for k in sorted_ks])
plt.errorbar(sorted_ks, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()

# Based on the cross-validation results above, choose the best value for k,
# retrain the classifier using all the training data, and test it on the test
# data. You should be able to get above 28% accuracy on the test data.
best_k = 10

classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
y_test_pred = classifier.predict(X_test, k=best_k)

# Compute and display the accuracy
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
CS231n assignment1 KNN(二)
最新推荐文章于 2023-05-12 10:48:27 发布