from numpy import *
import operator
def createDataSet():
    """Build the small hard-coded sample set used to exercise the classifier.

    Returns:
        A ``(group, labels)`` tuple where ``group`` is a 4x2 array holding one
        two-feature sample per row and ``labels`` is a list holding the class
        label (``'A'`` or ``'B'``) of the corresponding row.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
def classify0(inX, dataSet, labels, k):
    """Classify a point by majority vote among its k nearest neighbours.

    Args:
        inX: The unlabelled point to classify (one value per feature).
        dataSet: The labelled samples, one row per sample and one column per
            feature. An ndarray or anything ``asarray`` can convert to a 2-D
            array (e.g. a nested list).
        labels: Sequence of class labels; ``labels[i]`` is the label of row
            ``i`` of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the ``k`` rows of ``dataSet``
        closest to ``inX`` by Euclidean distance. When several labels tie,
        the one that was encountered first (i.e. nearest) wins.

    Raises:
        IndexError: If ``k`` is less than 1 (no votes are cast) or larger
            than the number of rows / labels available.
    """
    dataSet = asarray(dataSet)

    # Broadcasting subtracts every row of dataSet from inX, so there is no
    # need to materialise a tiled copy of inX first.
    diffMat = asarray(inX) - dataSet

    # Euclidean distance: sum the squared differences across the feature
    # axis (axis=1 collapses the columns, leaving one value per row).
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5

    # Row indices ordered from the nearest sample to the farthest.
    sortedDistIndices = distances.argsort()

    # Tally the labels of the k nearest rows.
    classCount = {}
    for i in range(k):
        voteLabel = labels[sortedDistIndices[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # Sort by vote count, highest first. The sort is stable, so tied labels
    # keep the order in which they were first seen.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]