k_nearest_neighbor.py

# coding=gbk
import numpy as np


class KNearestNeighbor(object):
    """ a kNN classifier with L2 distance """

    def __init__(self):
        pass

    def train(self, X, y):
        """
        Train the classifier. For k-nearest neighbors this is just
        memorizing the training data.
        kNN training simply memorizes all of the training data;
        there are no learnable parameters in kNN.

        Inputs:
        - X: A numpy array of shape (num_train, D) containing the training data
          consisting of num_train samples each of dimension D.
        - y: A numpy array of shape (N,) containing the training labels, where
          y[i] is the label for X[i].
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X, k=1, num_loops=0):
        """
        Predict labels for test data using this classifier.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data consisting
          of num_test samples each of dimension D.
        - k: The number of nearest neighbors that vote for the predicted labels.
        - num_loops: Determines which implementation to use to compute distances
          between training points and testing points.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
          test data, where y[i] is the predicted label for the test point X[i].
        """
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        elif num_loops == 2:
            dists = self.compute_distances_two_loops(X)
        else:
            raise ValueError('Invalid value %d for num_loops' % num_loops)

        return self.predict_labels(dists, k=k)

    def compute_distances_two_loops(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using a nested loop over both the training data and the
        test data.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data.

        Returns:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          is the Euclidean distance between the ith test point and the jth training
          point, i.e. row i, column j of dists holds the Euclidean distance between
          the ith test sample and the jth training sample.
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                #####################################################################
                # TODO:                                                             #
                # Compute the l2 distance between the ith test point and the jth    #
                # training point, and store the result in dists[i, j]. You should   #
                # not use a loop over dimension.                                     #
                #####################################################################
                # training sample: self.X_train[j, :], shape (D,)
                # test sample:     X[i, :], shape (D,)
                dists[i, j] = np.sqrt(np.sum((self.X_train[j, :] - X[i, :]) ** 2))
                #####################################################################
                #                        END OF YOUR CODE                           #
                #####################################################################
        return dists

    def compute_distances_one_loop(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using a single loop over the test data.

        Input / Output: Same as compute_distances_two_loops
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            #######################################################################
            # TODO:                                                               #
            # Compute the l2 distance between the ith test point and all training #
            # points, and store the result in dists[i, :].                        #
            #######################################################################
            all_train_dist = self.X_train - X[i, :]
            # Broadcasting makes all_train_dist a (num_train, D) array whose
            # elements are train_sample_d - test_sample_d.
            all_train_dist = all_train_dist ** 2
            all_train_dist = np.sum(all_train_dist, axis=1)
            dists[i, :] = np.sqrt(all_train_dist)
            #######################################################################
            #                        END OF YOUR CODE                             #
            #######################################################################
        return dists

    def compute_distances_no_loops(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using no explicit loops.

        Input / Output: Same as compute_distances_two_loops
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        #########################################################################
        # TODO:                                                                 #
        # Compute the l2 distance between all test points and all training     #
        # points without using any explicit loops, and store the result in     #
        # dists.                                                                #
        #                                                                       #
        # You should implement this function using only basic array operations; #
        # in particular you should not use functions from scipy.               #
        #                                                                       #
        # HINT: Try to formulate the l2 distance using matrix multiplication   #
        #       and two broadcast sums.                                        #
        #########################################################################
        '''
        Turn the L2 distance between two vectors into a matrix-multiplication
        problem. For one test vector x and one training vector y, the squared
        distance
            (x1-y1)**2 + (x2-y2)**2 + ... + (xD-yD)**2
        expands term by term into
            x1**2 - 2*x1*y1 + y1**2 + x2**2 - 2*x2*y2 + y2**2 + ...
        '''
        train_square = self.X_train ** 2
        train_square = np.sum(train_square, axis=1)
        # Squared norm of every training sample, shape (num_train,)
        train_square = np.expand_dims(train_square, axis=0)  # shape (1, num_train)
        train_square = np.tile(train_square, (num_test, 1))  # shape (num_test, num_train)
        # np.tile repeats the whole array as a unit along the requested axes:
        # self.X_train has shape (num_train, num_dim); squaring it and summing
        # over axis=1 gives (num_train,), and tiling gives (num_test, num_train).

        test_square = X ** 2
        test_square = np.sum(test_square, axis=1)
        test_square = np.expand_dims(test_square, axis=1)
        test_square = np.tile(test_square, (1, num_train))

        cross_item = np.dot(X, np.transpose(self.X_train))
        # X has shape (num_test, num_dim) and np.transpose(self.X_train) has
        # shape (num_dim, num_train), so cross_item has shape (num_test, num_train).

        dists = train_square + test_square - 2 * cross_item
        dists = np.sqrt(dists)
        #########################################################################
        #                        END OF YOUR CODE                               #
        #########################################################################
        return dists

    def predict_labels(self, dists, k=1):
        """
        Given a matrix of distances between test points and training points,
        predict a label for each test point.

        Inputs:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          gives the distance between the ith test point and the jth training point.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
          test data, where y[i] is the predicted label for the test point X[i].
        """
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            # A list of length k storing the labels of the k nearest neighbors to
            # the ith test point.
            closest_y = []
            #########################################################################
            # TODO:                                                                 #
            # Use the distance matrix to find the k nearest neighbors of the ith    #
            # testing point, and use self.y_train to find the labels of these       #
            # neighbors. Store these labels in closest_y.                           #
            # Hint: Look up the function numpy.argsort.                             #
            #########################################################################
            arg = np.argsort(dists[i, :])
            # np.argsort sorts in ascending order by default; arg has shape
            # (num_train,) and arg[n] is the index in the original array of the
            # n-th smallest distance.
            # Note: out = self.y_train[np.where(arg < k)] would be wrong, because
            # that condition filters on the training-sample index rather than on
            # the distance rank, so it does not pick the k nearest neighbors.
            out = self.y_train[arg]
            out = out[0:k]  # labels of the k training samples with the smallest distances
            closest_y = out.tolist()
            #########################################################################
            # TODO:                                                                 #
            # Now that you have found the labels of the k nearest neighbors, you    #
            # need to find the most common label in the list closest_y of labels.   #
            # Store this label in y_pred[i]. Break ties by choosing the smaller     #
            # label.                                                                #
            #########################################################################
            # This reduces to finding the most frequent value in a list.
            # print('closest_y', closest_y)
            set_pred = sorted(list(set(closest_y)))
            # Convert the list to a set to get its distinct labels, then back to a
            # list sorted in ascending order; the sorting guarantees that when
            # several labels are tied for the highest count, the smaller label wins.
            count = [closest_y.count(value) for value in set_pred]
            # For each distinct label, count how often it appears in closest_y.
            y_pred[i] = set_pred[count.index(max(count))]
            # print('y_pred[i]', y_pred[i])
            # Take the maximum count and its position in count; count and set_pred
            # have the same length, so this picks the most common neighbor label.
            #########################################################################
            #                          END OF YOUR CODE                             #
            #########################################################################
        return y_pred
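The HINT in compute_distances_no_loops mentions two broadcast sums, while the version above materializes the squared-norm terms with np.tile. As a rough sketch of the same expansion ||x - y||**2 = ||x||**2 + ||y||**2 - 2*x.y, broadcasting can replace the tiling; the standalone helper below, its name, and the np.maximum guard against round-off are illustrative additions, not part of the assignment template.

import numpy as np

def l2_distances_broadcast(X_train, X):
    """Pairwise L2 distances via ||x - y||**2 = ||x||**2 + ||y||**2 - 2*x.y,
    using broadcasting instead of np.tile.
    X_train: (num_train, D), X: (num_test, D) -> returns (num_test, num_train)."""
    test_square = np.sum(X ** 2, axis=1, keepdims=True)   # (num_test, 1)
    train_square = np.sum(X_train ** 2, axis=1)           # (num_train,)
    cross_item = X.dot(X_train.T)                         # (num_test, num_train)
    # (num_test, 1) + (num_train,) broadcasts to (num_test, num_train)
    sq_dists = test_square + train_square - 2 * cross_item
    # Clamp tiny negative values caused by floating-point round-off before sqrt
    return np.sqrt(np.maximum(sq_dists, 0))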
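The set/count voting in predict_labels can also be written with numpy. The sketch below assumes the labels are small nonnegative integers (true for CIFAR-10, but an assumption here); np.argmax returns the first maximum among ties, so ties are automatically broken toward the smaller label, as the TODO requires. The helper name vote_one is illustrative only.

import numpy as np

def vote_one(dist_row, y_train, k):
    """Predict one test label by majority vote among its k nearest neighbors.
    Assumes y_train holds nonnegative integer class labels."""
    closest_y = y_train[np.argsort(dist_row)[:k]]   # labels of the k nearest neighbors
    counts = np.bincount(closest_y)                 # counts[c] = votes for class c
    # np.argmax returns the first (i.e. smallest) index among tied maxima,
    # which breaks ties toward the smaller label.
    return np.argmax(counts)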
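Finally, a minimal usage sketch of the class with made-up toy data (the shapes, random seed, and k=5 are arbitrary choices for illustration, not values from the assignment):

import numpy as np

# Toy stand-ins for CIFAR-10-shaped data: 50 training and 10 test vectors of
# dimension 3072, with integer labels in [0, 10).
rng = np.random.default_rng(0)
X_train = rng.standard_normal((50, 3072))
y_train = rng.integers(0, 10, size=50)
X_test = rng.standard_normal((10, 3072))

classifier = KNearestNeighbor()
classifier.train(X_train, y_train)

# The three distance implementations should agree up to floating-point error.
d2 = classifier.compute_distances_two_loops(X_test)
d1 = classifier.compute_distances_one_loop(X_test)
d0 = classifier.compute_distances_no_loops(X_test)
print(np.allclose(d2, d1), np.allclose(d2, d0))   # expected: True True

# Predict with the fully vectorized distances and k = 5 neighbors.
y_pred = classifier.predict(X_test, k=5, num_loops=0)
print(y_pred.shape)   # (10,)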