CS231n assignment1 KNN（二）

最新推荐文章于 2023-05-12 10:48:27 发布
WYXHAHAHA123
阅读量240
点赞数
CC 4.0 BY-SA版权
分类专栏： cs231n
本文链接：https://blog.youkuaiyun.com/WYXHAHAHA123/article/details/89045087
cs231n 专栏收录该内容
4 篇文章
订阅专栏
knn.py

# coding=gbk
# Run some setup code for this notebook.
from __future__ import print_function
import sys

sys.path.append('/home/wyx/my_train')
import random
import numpy as np
from cs231n.data_utils import load_CIFAR10
import matplotlib.pyplot as plt
import os

# Default appearance for every matplotlib figure created by this script.
# (The IPython notebook magics mentioned in the original assignment text are
# not present in this script version.)
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),     # default size of plots
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

# Relative path of the CIFAR-10 python batches handed to load_CIFAR10.
cifar10_dir = 'cs231n//datasets//cifar-10-batches-py'

# Drop any previously loaded arrays before loading again, so the data is not
# kept in memory twice when this code is re-run in the same session.
try:
    del X_train, y_train
    del X_test, y_test
    print('Clear previously loaded data.')
except NameError:
    # The names are not bound yet (first run): nothing to clear. Only
    # NameError is caught so that any other failure is not silently hidden.
    pass

X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

# As a sanity check, print the shapes of the training and test data.
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', y_train.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
# The string below is the output recorded by the author for the prints above.
'''
Training data shape:  (50000, 32, 32, 3)
Training labels shape:  (50000,)
Test data shape:  (10000, 32, 32, 3)
Test labels shape:  (10000,)
'''

# Draw a grid of example training images: one column per class and
# samples_per_class randomly drawn examples (rows) for each class.
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
num_classes = len(classes)
samples_per_class = 7
for class_id, class_name in enumerate(classes):
    # Positions of all training examples whose label equals this class id.
    candidates = np.flatnonzero(y_train == class_id)
    # Pick distinct examples of this class (sampling without replacement).
    chosen = np.random.choice(candidates, samples_per_class, replace=False)
    for row, sample_idx in enumerate(chosen):
        # Grid cell for (row, class column); the "+ 1" makes the index 1-based.
        position = row * num_classes + class_id + 1
        plt.subplot(samples_per_class, num_classes, position)
        plt.imshow(X_train[sample_idx].astype('uint8'))
        plt.axis('off')
        if row == 0:
            # Only the top cell of each column carries the class name.
            plt.title(class_name)
# The figure is not shown at this point; the call below is left disabled.
# plt.show()

# Work on a subset so the exercise runs faster: keep only the first
# num_training training examples and the first num_test test examples.
num_training = 5000
num_test = 500

mask = np.arange(num_training)
X_train, y_train = X_train[mask], y_train[mask]

mask = np.arange(num_test)
X_test, y_test = X_test[mask], y_test[mask]

# Flatten each image into a single row, giving 2-D arrays with one example
# per row.
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
print(X_train.shape, X_test.shape)

from cs231n.classifiers import KNearestNeighbor

# Create a kNN classifier instance and hand it the (subsampled, flattened)
# training data.
# Per the assignment text, training a kNN classifier is a no-op: the
# classifier simply remembers the data and does no further processing.
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)

# compute_distances_two_loops is implemented by the student in
# cs231n/classifiers/k_nearest_neighbor.py.

# Compute the distances between the test examples and the stored training
# examples with the two-loop implementation and print the result's shape.
# NOTE(review): in the active code `dists` is only used for this shape print;
# its other uses are inside the disabled string block that follows. The
# author's recorded timing for the two-loop version is about 43 seconds.
dists = classifier.compute_distances_two_loops(X_test)
print(dists.shape)

'''
# We can visualize the distance matrix: each row is a single test example and
# its distances to training examples
plt.imshow(dists, interpolation='none')
plt.show()

# Now implement the function predict_labels and run the code below:
# We use k = 1 (which is Nearest Neighbor).
y_test_pred = classifier.predict_labels(dists, k=1)

# Compute and print the fraction of correctly predicted examples
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

#Got 139 / 500 correct => accuracy: 0.278000


y_test_pred = classifier.predict_labels(dists, k=5)
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
#Got 139 / 500 correct => accuracy: 0.278000



# Now lets speed up distance matrix computation by using partial vectorization
# with one loop. Implement the function compute_distances_one_loop and run the
# code below:
dists_one = classifier.compute_distances_one_loop(X_test)

# To ensure that our vectorized implementation is correct, we make sure that it
# agrees with the naive implementation. There are many ways to decide whether
# two matrices are similar; one of the simplest is the Frobenius norm. In case
# you haven't seen it before, the Frobenius norm of two matrices is the square
# root of the squared sum of differences of all elements; in other words, reshape
# the matrices into vectors and compute the Euclidean distance between them.
difference = np.linalg.norm(dists - dists_one, ord='fro')
print('Difference was: %f' % (difference,))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')

dists_zero = classifier.compute_distances_no_loops(X_test)

# To ensure that our vectorized implementation is correct, we make sure that it
# agrees with the naive implementation. There are many ways to decide whether
# two matrices are similar; one of the simplest is the Frobenius norm. In case
# you haven't seen it before, the Frobenius norm of two matrices is the square
# root of the squared sum of differences of all elements; in other words, reshape
# the matrices into vectors and compute the Euclidean distance between them.
difference = np.linalg.norm(dists - dists_zero, ord='fro')
print('Difference was: %f' % (difference,))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')


# Let's compare how fast the implementations are
def time_function(f, *args):
    """
    Call a function f with args and return the time (in seconds) that it took to execute.
    """
    import time
    tic = time.time()
    f(*args)
    toc = time.time()
    return toc - tic


two_loop_time = time_function(classifier.compute_distances_two_loops, X_test)
print('Two loop version took %f seconds' % two_loop_time)

one_loop_time = time_function(classifier.compute_distances_one_loop, X_test)
print('One loop version took %f seconds' % one_loop_time)

no_loop_time = time_function(classifier.compute_distances_no_loops, X_test)
print('No loop version took %f seconds' % no_loop_time)

# you should see significantly faster performance with the fully vectorized implementation

# Good! The distance matrices are the same
# Two loop version took 43.605151 seconds
# One loop version took 71.236836 seconds
# No loop version took 0.178438 seconds
'''

num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

################################################################################
# TODO:                                                                        #
# Split up the training data into folds. After splitting, X_train_folds and    #
# y_train_folds should each be lists of length num_folds, where                #
# y_train_folds[i] is the label vector for the points in X_train_folds[i].     #
# Hint: Look up the numpy array_split function.                                #
################################################################################
# Split the training set into num_folds consecutive folds. Only the training
# data is split; the test data takes no part in cross-validation.
#   X_train_folds[i] -> rows of X_train belonging to fold i
#   y_train_folds[i] -> labels of the rows in X_train_folds[i]
# np.array_split gives the same result as np.split when the number of
# samples is a multiple of num_folds, and (unlike np.split) does not raise
# when it is not.
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)
################################################################################
#                                 END OF YOUR CODE                             #
################################################################################

# A dictionary holding the accuracies for different values of k that we find
# when running cross-validation. After running cross-validation,
# k_to_accuracies[k] should be a list of length num_folds giving the different
# accuracy values that we found when using that value of k.
k_to_accuracies = {k_value: [] for k_value in k_choices}

################################################################################
# TODO:                                                                        #
# Perform k-fold cross validation to find the best value of k. For each        #
# possible value of k, run the k-nearest-neighbor algorithm num_folds times,   #
# where in each case you use all but one of the folds as training data and the #
# last fold as a validation set. Store the accuracies for all fold and all     #
# values of k in the k_to_accuracies dictionary.                               #
################################################################################
# Cross-validation picks the kNN hyper-parameter k that suits this data best.
# Every candidate k in k_choices is evaluated once per fold, so each
# k_to_accuracies[k] ends up with num_folds accuracy values.
#
# Training vs. validation data: the labels of the training folds are given to
# the classifier, whereas the labels of the held-out validation fold are used
# only to score the predictions.

for fold in range(num_folds):
    # Fold `fold` is held out as the validation set for this round.
    val_x = X_train_folds[fold]
    val_y = y_train_folds[fold]

    # Train on every other fold. The fold lists themselves are left
    # untouched, so each round sees all num_folds folds and exactly one of
    # them is excluded from the training data.
    train_x = np.concatenate(
        [part for j, part in enumerate(X_train_folds) if j != fold])
    train_y = np.concatenate(
        [part for j, part in enumerate(y_train_folds) if j != fold])

    classifier.train(train_x, train_y)
    # The distance matrix does not depend on k: compute it once per fold and
    # reuse it for every candidate k.
    fold_dists = classifier.compute_distances_no_loops(val_x)

    for hyper_k in k_choices:
        val_pred = classifier.predict_labels(fold_dists, k=hyper_k)

        # Fraction of correctly predicted validation examples.
        num_correct = np.sum(val_pred == val_y)
        accuracy = float(num_correct) / val_x.shape[0]
        k_to_accuracies[hyper_k].append(accuracy)

################################################################################
#                                 END OF YOUR CODE                             #
################################################################################

# Print out the computed accuracies, grouped by k in ascending order.
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))

# Draw on a new figure. Without this the plot would go into whatever axes is
# current, i.e. the last cell of the sample-image grid created earlier in the
# script (that figure is never shown or closed before this point).
plt.figure()

# Use the sorted dictionary keys as the single source of x values so that the
# scatter points, the means and the error bars always line up, even if
# k_choices is not sorted.
sorted_ks = sorted(k_to_accuracies)

# plot the raw observations: one point per fold for every k
for k in sorted_ks:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)

# plot the trend line with error bars that correspond to standard deviation
accuracies_mean = np.array([np.mean(k_to_accuracies[k]) for k in sorted_ks])
accuracies_std = np.array([np.std(k_to_accuracies[k]) for k in sorted_ks])
plt.errorbar(sorted_ks, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()

# Final evaluation: take the k chosen from the cross-validation results,
# retrain on the whole (subsampled) training set and score on the test set.
# The assignment text states that above 28% test accuracy should be reachable.
best_k = 10

classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
y_test_pred = classifier.predict(X_test, k=best_k)

# Count the matching predictions and report them as a fraction of num_test.
matches = (y_test_pred == y_test)
num_correct = np.sum(matches)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))