使用k-近邻算法快速判定她是不是你喜欢的类型?
问题描述
假设你的朋友经常上约会网站寻找约会对象。他在挑选约会对象时主要看重三点:“每年飞行的旅程数”、“玩游戏所耗时间百分比”、“每个月看书的数目”。你这位阅人无数的朋友已经约会过很多对象,并把她们分为三类:“她是我喜欢的类型”、“一般喜欢”、“她不是我喜欢的类型”。经过无数次约会之后,你的朋友已经心累了,他想知道:能否只输入某人的“每年飞行的旅程数”、“玩游戏所耗时间百分比”、“每个月看书的数目”这三项数据,就能判断她是不是他喜欢的类型呢?
from numpy import *
import operator
import matplotlib.pyplot as plt
def classify0(inx, dataset, labels, k):
    """Classify one sample by majority vote of its k nearest neighbours.

    Args:
        inx: Feature vector of the sample to classify.
        dataset: Training samples, one row per sample, with as many
            columns as ``inx`` has features.
        labels: Class labels; ``labels[i]`` belongs to row ``i`` of
            ``dataset``.
        k: Number of nearest neighbours that take part in the vote.  A
            value larger than the number of training samples is clamped
            to that number instead of failing.

    Returns:
        The label occurring most often among the k nearest neighbours
        (Euclidean distance).  On a tie, the tied label that was met
        first while walking from the nearest neighbour outwards wins.

    Raises:
        IndexError: If no neighbour can vote (empty ``dataset`` or
            ``k < 1``).
    """
    dataset = asarray(dataset)
    datasetsize = dataset.shape[0]  # number of training samples

    # Never ask for more neighbours than there are training samples.
    if k > datasetsize:
        k = datasetsize

    # Broadcasting subtracts inx from every row; no explicit tiling needed.
    diffmat = dataset - inx
    distances = (diffmat ** 2).sum(axis=1) ** 0.5
    sorteddistances = distances.argsort()  # sample indices, nearest first

    # Count the labels of the k nearest samples.
    classcount = {}
    for i in range(k):
        votelabel = labels[sorteddistances[i]]
        classcount[votelabel] = classcount.get(votelabel, 0) + 1

    # The sort is stable, so equal counts keep their insertion order.
    sortedclasscount = sorted(
        classcount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedclasscount[0][0]
def file2matrix(filename):
    """Load a tab-separated dating data file into features and labels.

    Every non-blank line is split on tabs: the first three fields are the
    numeric features and the last field is the textual class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnmat, classLabelVector)`` where ``returnmat`` is a
        float array with one row of three features per data line and
        ``classLabelVector`` is the list of integer class codes
        (``didntLike`` -> 1, ``smallDoses`` -> 2, ``largeDoses`` -> 3).

    Raises:
        KeyError: If a line ends with a label that is not one of the three
            known names.
        ValueError: If a feature field is not a number or a line has fewer
            than three feature fields.
    """
    # Textual labels used in the data file, mapped to integer class codes.
    labels = {'didntLike': 1, 'smallDoses': 2, 'largeDoses': 3}

    rows = []
    classLabelVector = []
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            listFromline = line.split('\t')
            rows.append([float(value) for value in listFromline[0:3]])
            classLabelVector.append(labels[listFromline[-1]])

    returnmat = zeros((len(rows), 3))
    if rows:
        returnmat[:] = rows
    return returnmat, classLabelVector
def autoNorm(dataset):
    """Rescale every feature column of ``dataset`` to the range [0, 1].

    Each column is normalised independently as
    ``(value - column_min) / (column_max - column_min)`` so that features
    with large raw values do not dominate the distance computation.

    Args:
        dataset: Two-dimensional array-like, one row per sample and one
            column per feature.

    Returns:
        A tuple ``(normdataset, ranges, minvals)``:
        ``normdataset`` is the rescaled float array, ``ranges`` holds the
        per-column ``max - min`` and ``minvals`` the per-column minimum.
        A column whose values are all equal gets a range of 1.0 so that
        it normalises to 0 instead of dividing by zero; the same
        ``ranges``/``minvals`` can be used to rescale new samples.
    """
    dataset = asarray(dataset, dtype=float)

    # axis=0 yields one minimum/maximum per feature column; a global
    # minimum/maximum would mix the scales of unrelated features.
    minvals = dataset.min(axis=0)
    maxvals = dataset.max(axis=0)
    ranges = maxvals - minvals

    # Constant columns have zero range; divide by 1 instead.
    ranges = where(ranges == 0, 1.0, ranges)

    # Broadcasting applies the per-column shift and scale to every row.
    normdataset = (dataset - minvals) / ranges
    return normdataset, ranges, minvals
def dataClassTest(horatio=0.1, filename="datingTestSet.txt", k=3):
    """Estimate the classifier's error rate with a hold-out test.

    The data file is loaded and normalised; the first ``horatio`` fraction
    of the rows is classified against the remaining rows, and the share of
    wrong predictions is printed.

    Args:
        horatio: Fraction of the rows used as test samples (default 0.1,
            i.e. 10% for testing and 90% for training).
        filename: Data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        The error rate as a float, or ``None`` when the data set is too
        small to yield a single test sample.
    """
    datamat, datalabel = file2matrix(filename)
    normat, ranges, minvals = autoNorm(datamat)
    m = normat.shape[0]
    numTestVecs = int(m * horatio)

    # Without test samples the error rate is undefined; dividing by the
    # test count below would raise ZeroDivisionError.
    if numTestVecs <= 0:
        print("No test samples available: data set too small for the hold-out ratio")
        return None

    errcount = 0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are test samples, the rest is training data.
        classfierresult = classify0(
            normat[i, :],
            normat[numTestVecs:m, :],
            datalabel[numTestVecs:m],
            k,
        )
        if classfierresult != datalabel[i]:
            errcount += 1

    errorrate = errcount / float(numTestVecs)
    print("The total error rate is : %f" % errorrate)
    return errorrate
得到错误率之后,就可以应用上面的算法编写一个函数:输入一个新对象的数据,然后判断她是否是你喜欢的类型。
def classfyperson():
    """Ask for one person's three features and print the predicted category.

    The three values are read from standard input, the training data is
    loaded from ``datingTestSet.txt`` and normalised, the new sample is
    rescaled with the same minimum and range, and the 3-nearest-neighbour
    prediction is printed as a category name.
    """
    # Prompts are asked in this order; each answer must parse as a float.
    gaming_pct = float(input('percentage of time spent playing video games?'))
    flier_miles = float(input("frequent filer miles earned per year?"))
    ice_cream_liters = float(input("liters of ice cream consumed per year?"))

    features, targets = file2matrix("datingTestSet.txt")
    scaled_features, spans, lows = autoNorm(features)

    # Assemble the sample and rescale it like the training data.
    candidate = array([flier_miles, gaming_pct, ice_cream_liters])
    scaled_candidate = (candidate - lows) / spans
    predicted = classify0(scaled_candidate, scaled_features, targets, 3)

    # Class codes start at 1, list indices at 0.
    categories = ['didntLike', 'smallDoses', 'largeDoses']
    print("you will probably like this person: ", categories[predicted - 1])
运行一下: