分别实现了最基本的3个DEMO
1.给你若干个带有标签的二维点作为训练集,给定一系列的二维随机点,看其通过训练集,可以被分为哪一类
2.给你N个人的飞行里程数,玩游戏消耗时间百分比和每周消耗冰激凌的公升数,来判断妹子对这个人的兴趣如何。
3.识别二进制文件所对应的0-9的数字是多少。
# -*- coding: utf-8 -*-
"""
照葫芦画瓢完成于2017.4.14 20:21
算法名称 : K近邻算法
参数:
测试集数据(维度 1 * N)
训练集数据(维度 M * N)即有M个训练集,其每个训练集的维度跟测试集的每个子测试集维度一样为1*N
标签(维度1 * M)为每一个训练集所对应的标签,共M个训练集
输出:
测试集中每个测试集所对应的分类
算法整体思路:
1.对于每个测试集的子测试集跟训练集进行比对,计算子测试集向量和训练集所有向量的差的平方的和,开根号获取其对应的距离矩阵
例如一个子测试集为一个1*3的矩阵[x,y,z] 训练集为3*3的分别是[x1,y1,z1] [x2,y2,z2] [x3,y3,z3]
则距离分别为 dis1 = sqrt((x - x1)**2 + (y - y1)**2 + (z - z1)**2) dis2 = sqrt((x - x2)**2 + (y - y2)**2 + (z - z2)**2) dis3 = sqrt((x - x3)**2 + (y - y3)**2 + (z - z3)**2)
2.按照距离矩阵的升序排序,返回对应的下标。
例如dis2 > dis1 > dis3
则返回下标为 3 1 2
3.选取K个,获取其对应的标签,然后进行累加,降序排序获取出现最多的标签次数。
例如 返回下标为3 1 4 7 5 6 2
那么我们取K = 4,则我们去看labels[3] labels[1] labels[4] labels[7]所对应的标签,并对其对应的标签进行累加
比如labels[3] = labels[7] ='A' labels[1] = 'B' labels[4] ='C'
则我们获取的标签次数并按降序排序则为2 1 1
发现A出现次数最多,则将其测试集分类为A,算法结束
作者:
zzt941006
"""
from numpy import * #导入科学计算包
import operator #导入运算符模块
#导入以上两个是为了能够在K近邻算法中执行相关包中的函数
#import sys
#sys.path.append('c:\Python27\kNN.py') # 这个例子针对 windows 用户来说的
from os import listdir#列出给定文件目录名
def createDataSet():
    """Build the tiny hard-coded training set used by the first demo.

    Returns:
        A tuple ``(group, labels)`` where ``group`` is a 4x2 array of points
        and ``labels`` is the list of class names, one per row of ``group``.
    """
    # Each entry pairs a 2-D point with its class label.
    samples = [
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    ]
    group = array([point for point, _ in samples])
    labels = [tag for _, tag in samples]
    return group, labels
def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        inX: The sample to classify; it is subtracted from every row of
            ``dataSet``, so it must broadcast against a single row.
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence holding the label of each row of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` rows closest to
        ``inX`` (Euclidean distance). On a tie the label that ``sorted``
        (a stable sort) encounters first in the vote dictionary wins.

    Raises:
        IndexError: If ``k`` is larger than the number of rows, or if no
            votes were collected (``k`` <= 0).
    """
    # Broadcasting subtracts every training row from inX without building
    # an explicit tiled copy of the input vector.
    diffMat = inX - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # one squared distance per row
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()  # row indices, nearest first

    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # items() exists on both Python 2 and Python 3 dictionaries, whereas
    # iteritems() is Python 2 only.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and labels.

    Every non-blank line is split on tab characters. The first three fields
    are stored as one row of the feature matrix and the last field is read
    as an integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an array with one row of
        three floats per data line, and the list of integer labels in the
        same order.

    Raises:
        ValueError: If a field cannot be converted to a number.
    """
    # The with-block closes the file even when parsing fails. Blank lines
    # (for example a trailing empty line) are skipped rather than parsed.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        # Convert explicitly instead of relying on string-to-float coercion
        # during array assignment.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` with min-max normalisation.

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; its entries are
    mapped to 0 instead of being divided by zero.

    Args:
        dataSet: 2-D array with one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised array,
        the per-column range (max - min, zero for constant columns) and the
        per-column minimum.
    """
    minVals = dataSet.min(0)   # minimum of each column
    maxVals = dataSet.max(0)   # maximum of each column
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they yield 0 rather than NaN/inf.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column values to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
def datingClassTest():
    """Measure the kNN error rate on the dating data with a hold-out split.

    Reads ``datingTestSet2.txt``, normalises it, uses the first 10% of the
    rows as test samples and the remaining rows as training data, classifies
    each test sample with k = 3 and prints every prediction followed by the
    overall error rate and the raw error count.

    Raises:
        ZeroDivisionError: If the file has so few rows that the test split
            is empty.
    """
    hoRatio = 0.1  # fraction of the rows held out for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    # Rows [0, numTestVecs) are the test set; rows [numTestVecs, m) train.
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :],
                                     normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        # A single parenthesised argument prints identically on Python 2
        # and Python 3.
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
def classifyPerson():
    """Interactively classify one person against the dating data set.

    Loads and normalises ``datingTestSet2.txt``, asks for three numeric
    values on standard input, scales them with the training minimum and
    range, classifies the result with k = 3 and prints the matching entry
    of the result list.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    # Load the data once, before prompting, so a missing file is reported
    # before the user has typed anything.
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # raw_input only exists on Python 2; Python 3 calls it input.
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    percentTats = float(readLine("percentage of time spent playing video games?"))
    ffMiles = float(readLine("frequent flier miles earned per year?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))

    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # Labels are 1-based, hence the -1. Joining with a space reproduces the
    # separator the Python 2 print statement inserted between its operands.
    print(" ".join(["You will probably like this person: ", resultList[classifierResult - 1]]))
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 vector.

    The first 32 characters of each of the first 32 lines are converted to
    integers and stored row after row: line ``i``, column ``j`` ends up at
    index ``32 * i + j``.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        An array of shape ``(1, 1024)`` with the values read.

    Raises:
        IndexError: If a line has fewer than 32 characters (including a
            file with fewer than 32 lines).
        ValueError: If a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # The with-block guarantees the file is closed, also on errors.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
def handwritingClassTest():
    """Measure the kNN error rate on the handwritten-digit text images.

    Every file in ``trainingDigits`` becomes one 1024-value training row;
    every file in ``testDigits`` is classified against them with k = 3.
    The true digit is taken from the file name: the text before the first
    ``_`` of the part before the first ``.``. Each prediction is printed,
    followed by the number of errors and the error rate.

    Raises:
        ValueError: If a file name does not start with an integer followed
            by ``_`` or ``.``.
        ZeroDivisionError: If ``testDigits`` contains no files.
    """
    hwLabels = []
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))  # one flattened image per row
    for i, fileNameStr in enumerate(trainingFileList):
        fileStr = fileNameStr.split('.')[0]       # drop the extension
        classNumStr = int(fileStr.split('_')[0])  # digit the file shows
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)

    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        # Vote among the 3 training images closest to this test image.
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        # A single parenthesised argument prints identically on Python 2
        # and Python 3.
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))