核心思想:前k个最相似数据中出现次数最多的类别,作为新数据的类别。
核心函数:计算距离函数,投票函数
# -*- coding: utf-8 -*-
import random
import sys
from collections import Counter
from operator import itemgetter
# Python 2 only: force the implicit str/unicode conversion codec to UTF-8.
# reload() and sys.setdefaultencoding() do not exist on Python 3 (whose
# default encoding is already UTF-8), so the hack is skipped there instead of
# crashing the module with a NameError at import time.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding("utf-8")
def createTriangle():
    """Generate the three integer angles of a random triangle plus its class.

    Two angles are drawn with random.randint(1, 179); the pair is redrawn
    until the remaining angle (180 minus the other two) is positive.

    Returns:
        A tuple (angleA, angleB, angleC, label). The label is 'R' when the
        largest angle is below 90 (acute), 'Z' when it equals 90 (right) and
        'D' otherwise (obtuse).
    """
    while True:
        first = random.randint(1, 179)
        second = random.randint(1, 179)
        third = 180 - first - second
        if third > 0:
            break

    largest = max(first, second, third)
    if largest > 90:
        kind = 'D'  # obtuse triangle
    elif largest == 90:
        kind = 'Z'  # right triangle
    else:
        kind = 'R'  # acute triangle
    return first, second, third, kind
def createDataSet(n, filename):
    """Write n randomly generated, labelled triangles to a text file.

    Every line holds the four values returned by createTriangle() (three
    angles, then the class label) separated by tabs and terminated by a
    newline. The file is opened in 'w' mode, so existing content is replaced.

    Args:
        n: number of samples (lines) to write.
        filename: path of the file to create.
    """
    # The context manager closes the file even if generation or writing fails.
    with open(filename, 'w') as f:
        for _ in range(n):
            f.write('\t'.join(str(field) for field in createTriangle()) + '\n')
def loadFile2Matrix(filename):
    """Load a tab-separated data set file into label and feature lists.

    Each non-blank line is split on tabs; the last column is taken as the
    class label and every preceding column is converted with int().

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple (labelMatrix, dataMatrix): the list of labels and the list of
        integer feature rows, in file order.

    Raises:
        ValueError: if a feature column cannot be converted to int.
    """
    labelMatrix = []
    dataMatrix = []
    # The context manager guarantees the handle is released, which the
    # previous open()/readlines() version never did.
    with open(filename, 'r') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing empty line); they would
            # otherwise produce a sample with an empty label and no features.
            if not line.strip():
                continue
            # Drop the line terminator, including a stray '\r' from CRLF files.
            fields = line.rstrip('\r\n').split('\t')
            labelMatrix.append(fields[-1])  # last column is the class label
            dataMatrix.append([int(value) for value in fields[:-1]])
    return labelMatrix, dataMatrix
def calcDistance(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    The components of vecA drive the iteration: vecB is indexed at every
    position of vecA, so any extra trailing components of vecB are ignored.
    """
    squaredTotal = sum(
        (component - vecB[position]) ** 2
        for position, component in enumerate(vecA)
    )
    return squaredTotal ** 0.5
def vote(candidateList):
    """Return the most frequent label among the candidates, as a string.

    When several labels share the highest count, the one that appears first
    in candidateList wins, so the result does not depend on dictionary
    ordering.

    Args:
        candidateList: iterable of hashable labels.

    Returns:
        str() of the winning label.

    Raises:
        ValueError: if candidateList is empty.
    """
    candidates = list(candidateList)
    if not candidates:
        raise ValueError('vote() needs at least one candidate label')
    tally = Counter(candidates)
    # max() returns the first maximal element it meets, and we iterate the
    # candidates in their original order, which makes the tie-break explicit.
    winner = max(candidates, key=tally.__getitem__)
    return str(winner)
def kNN(testVec, dataMatrix, labelMatrix, k):
    """Classify testVec by majority vote among its k nearest training vectors.

    Args:
        testVec: the vector to classify.
        dataMatrix: list of training vectors.
        labelMatrix: list of labels, labelMatrix[i] belonging to dataMatrix[i].
        k: number of nearest neighbours that take part in the vote. If k
            exceeds the number of training vectors, all of them vote.

    Returns:
        The predicted label, as returned by vote().

    Raises:
        ValueError: if k is smaller than 1 or the two training lists differ
            in length.
    """
    if k < 1:
        raise ValueError('k must be at least 1')
    if len(dataMatrix) != len(labelMatrix):
        raise ValueError('dataMatrix and labelMatrix must have the same length')

    # Keep one (distance, label) pair per training vector. Using the distance
    # as a dictionary key, as before, made equidistant vectors overwrite each
    # other and silently dropped neighbours from the vote.
    neighbours = [
        (calcDistance(testVec, vec), label)
        for vec, label in zip(dataMatrix, labelMatrix)
    ]
    # Sort by distance only; the stable sort keeps data set order among ties.
    neighbours.sort(key=itemgetter(0))

    # The labels of the k closest vectors decide the class of the new vector.
    candidateList = [label for _, label in neighbours[:k]]
    return vote(candidateList)
def testClassifier(testDataMatrix, testLabelMatrix, dataMatrix, labelMatrix, k):
errorCount = 0
for i in range(len(testDataMatrix)):
vec = testDataMatrix[i]
classifyLabel = kNN(vec, dataMatrix, labelMatrix, k)
# 当根据kNN算法预测的类别,与测试数据的真实类别不一致时,错分个数增加1
if not classifyLabel == testLabelMatrix[i]:
errorCount += 1
# 分类准确率
pricision = 1 - float(errorCount)/len(testDataMatrix)
print 'The pricision of this kNN classifier is:', pricision
if __name__ == '__main__':
    # Generate a fresh training set (10000 samples) and test set (1000
    # samples), load both back from disk and score a 5-nearest-neighbour
    # classifier on the test set.
    trainFile, testFile = 'dataSet.txt', 'testDataSet.txt'
    createDataSet(10000, trainFile)
    createDataSet(1000, testFile)
    labelMatrix, dataMatrix = loadFile2Matrix(trainFile)
    testLabelMatrix, testDataMatrix = loadFile2Matrix(testFile)
    testClassifier(testDataMatrix, testLabelMatrix, dataMatrix, labelMatrix, k=5)
455

被折叠的 条评论
为什么被折叠?



