Python3.0 + 机器学习实战-第二章knn例子

最新推荐文章于 2025-05-14 11:48:37 发布

魔术师_

最新推荐文章于 2025-05-14 11:48:37 发布

阅读量673

点赞数 1

CC 4.0 BY-SA版权

分类专栏：机器学习

本文链接：https://blog.youkuaiyun.com/maymay_/article/details/80459617

机器学习专栏收录该内容

10 篇文章

订阅专栏

本文介绍了一种利用K-近邻算法帮助快速判断某人是否符合个人喜好的方法。通过分析约会对象的多项指标如旅行频率、游戏时间占比及阅读量等，该算法能有效预测此人是否符合你的理想型。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

使用k-近邻算法快速判定她是不是你喜欢的类型？
问题描述
比如你的朋友经常上约会网站寻找自己的约会对象。他选定约会对象的时候主要看重三点：“每年飞行的旅程数”、“玩游戏所耗时间百分比”、“每个月看书的数目”。你阅人无数的朋友已经约会过很多个对象了，并且把这些对象分为三类：“她是我喜欢的类型”、“一般喜欢”、“她不是我喜欢的类型”。经过无数次的约会之后，你的朋友心已经很累了，他想知道：能否只输入某人的“每年飞行的旅程数”、“玩游戏所耗时间百分比”、“每个月看书的数目”这三项数据，就能判断她是不是他喜欢的类型呢？

from numpy import * 
import operator
import matplotlib.pyplot as plt
def classify0(inx, dataset, labels, k):
    """Classify ``inx`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inx: Feature vector of the new sample to classify.
        dataset: Array of known samples, one row per sample.
        labels: Sequence holding the class label of each row of ``dataset``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the ``k`` rows closest to
        ``inx`` (Euclidean distance). On a tie, the label that was met
        first while walking from the nearest neighbour outwards wins.
    """
    n_samples = dataset.shape[0]

    # Repeat the query once per known sample so the subtraction is row-wise.
    deltas = tile(inx, (n_samples, 1)) - dataset
    distances = (deltas ** 2).sum(axis=1) ** 0.5

    # Row indices ordered from the closest sample to the farthest one.
    nearest_first = distances.argsort()

    votes = {}
    for rank in range(k):
        neighbour_label = labels[nearest_first[rank]]
        if neighbour_label in votes:
            votes[neighbour_label] += 1
        else:
            votes[neighbour_label] = 1

    # Stable descending sort keeps first-seen order between equal counts.
    ranking = list(votes.items())
    ranking.sort(key=operator.itemgetter(1), reverse=True)
    winner = ranking[0]
    return winner[0]

def file2matrix(filename):
    """Load a tab-separated sample file into a feature matrix and labels.

    Each non-blank line must hold at least three numeric feature fields
    followed by a textual class label as its last field. Blank lines are
    skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnmat, classLabelVector)`` where ``returnmat`` is an
        ``(n, 3)`` float array with the first three fields of every line and
        ``classLabelVector`` is a list of ``n`` integer codes
        (``didntLike`` -> 1, ``smallDoses`` -> 2, ``largeDoses`` -> 3).

    Raises:
        KeyError: If a line ends with a label that is not one of the three
            known names.
        ValueError: If a feature field cannot be parsed as a float.
    """
    # Textual labels used in the data file and their integer codes.
    label_codes = {'didntLike': 1, 'smallDoses': 2, 'largeDoses': 3}

    # The context manager guarantees the file is closed even on errors.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnmat = zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        # Convert explicitly instead of relying on string-to-float casting.
        returnmat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(label_codes[fields[-1]])
    return returnmat, classLabelVector

def autoNorm(dataset):
    """Rescale every feature (column) of ``dataset`` to the range [0, 1].

    Each column is normalised independently as
    ``(value - column_min) / (column_max - column_min)`` so that features
    with large magnitudes do not dominate distance computations.

    Args:
        dataset: 2-D array-like of samples, one row per sample.

    Returns:
        A tuple ``(normdataset, ranges, minvals)``:
        ``normdataset`` is the rescaled array with the same shape as
        ``dataset``; ``ranges`` is the per-column divisor that was used
        (``max - min``, or 1 for a constant column); ``minvals`` is the
        per-column minimum. A new sample can be normalised the same way with
        ``(sample - minvals) / ranges``.
    """
    dataset = asarray(dataset, dtype=float)

    # axis=0 gives one min/max per feature column rather than a single
    # global scalar over the whole matrix.
    minvals = dataset.min(axis=0)
    maxvals = dataset.max(axis=0)
    ranges = maxvals - minvals

    # A constant column has zero width; divide by 1 so it maps to 0
    # instead of producing NaN/inf.
    ranges = where(ranges == 0, 1.0, ranges)

    # Broadcasting applies the per-column shift and scale to every row.
    normdataset = (dataset - minvals) / ranges
    return normdataset, ranges, minvals

def dataClassTest(filename="datingTestSet.txt", horatio=0.1, k=3):
    """Estimate the classifier's error rate on a hold-out slice of the data.

    The samples are loaded with ``file2matrix`` and normalised with
    ``autoNorm``. The first ``horatio`` fraction of the rows is used as the
    test set and the remaining rows as the reference set for ``classify0``.
    The split is positional, not random.

    Args:
        filename: Path of the sample file to evaluate on.
        horatio: Fraction of rows (taken from the top) held out for testing.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        The fraction of hold-out samples that were misclassified. The same
        value is also printed.

    Raises:
        ValueError: If the hold-out set would be empty, which would make the
            error rate undefined.
    """
    datamat, datalabel = file2matrix(filename)
    normat, _, _ = autoNorm(datamat)
    m = normat.shape[0]
    numTestVecs = int(m * horatio)
    if numTestVecs == 0:
        raise ValueError(
            "hold-out set is empty: %d samples with ratio %r" % (m, horatio)
        )

    errcount = 0
    for i in range(numTestVecs):
        # Rows [numTestVecs, m) are the reference set; row i is the query.
        classfierresult = classify0(
            normat[i, :], normat[numTestVecs:m, :], datalabel[numTestVecs:m], k
        )
        if classfierresult != datalabel[i]:
            errcount += 1

    error_rate = errcount / float(numTestVecs)
    print("The total error rate is : %f" % error_rate)
    return error_rate

得到错误率之后，就可以应用上面的算法再写一个函数：输入一个新对象的数据，然后判断此人是否是你喜欢的类型。

def classfyperson():
    """Ask for one person's three features and print the predicted class.

    Reads three numbers from standard input, loads and normalises the
    samples in ``datingTestSet.txt``, scales the new sample with the same
    minimum and range, classifies it with the 3 nearest neighbours and
    prints the name of the predicted class.
    """
    class_names = ['didntLike', 'smallDoses', 'largeDoses']

    game_share = float(input('percentage of time spent playing video games?'))
    flier_miles = float(input("frequent filer miles earned per year?"))
    ice_cream_liters = float(input("liters of ice cream consumed per year?"))

    features, targets = file2matrix("datingTestSet.txt")
    scaled_features, span, lowest = autoNorm(features)

    # The sample is assembled miles first, then game share, then ice cream.
    sample = array([flier_miles, game_share, ice_cream_liters])
    scaled_sample = (sample - lowest) / span
    predicted_code = classify0(scaled_sample, scaled_features, targets, 3)

    # Class codes start at 1, list positions at 0.
    print("you will probably like this person: " , class_names[predicted_code - 1])