# 代码摘自<<机器学习实战>>,略有修改
# KNN
import numpy as np
import operator
# 读取数据
def fileMatrix(path):
    """Parse a tab-separated data file into a feature matrix and label list.

    Each non-empty line is expected to hold three numeric feature columns
    followed by an integer class label.

    Args:
        path: path to the tab-separated data file.

    Returns:
        (returnMat, labelVector): an (n, 3) float ndarray of features and a
        list of n int labels, in file order.
    """
    # 'with' guarantees the handle is closed; the original leaked it.
    with open(path) as f:
        arraylines = f.readlines()
    returnMat = np.zeros((len(arraylines), 3))
    labelVector = []
    for index, line in enumerate(arraylines):
        listLine = line.strip().split('\t')
        # numpy coerces the string slices to float on assignment.
        returnMat[index, :] = listLine[0:3]
        labelVector.append(int(listLine[-1]))
    return returnMat, labelVector
# Data normalization
def autoNorm(dataset):
    """Min-max scale every column of `dataset` into [0, 1].

    Args:
        dataset: 2-D numeric ndarray (rows = samples, columns = features).

    Returns:
        A NEW float ndarray of the same shape; each column is rescaled so
        its minimum maps to 0 and its maximum to 1. A constant column maps
        to all zeros.
    """
    minVals = dataset.min(0)  # per-column minima
    maxVals = dataset.max(0)  # per-column maxima
    ranges = maxVals - minVals
    # Guard constant columns: the original divided by zero there (nan/inf).
    ranges = np.where(ranges == 0, 1, ranges)
    # Vectorized broadcast replaces the per-column Python loop, and the
    # result is a fresh array: the original mutated its argument in place
    # (silently truncating when the input dtype was integer).
    return (dataset - minVals) / ranges
# Distance computation + majority vote
def classify(inx, datasets, labels, k):
    """Predict the label of `inx` by majority vote of its k nearest neighbors.

    Args:
        inx: 1-D feature vector to classify.
        datasets: (n, d) training feature matrix.
        labels: sequence of n training labels (indexable by position).
        k: number of nearest neighbors consulted.

    Returns:
        The label with the most votes among the k closest training rows;
        ties resolve to the label encountered first (stable sort).
    """
    # Euclidean distance from inx to every training row; broadcasting
    # subtracts inx from each row without building a tiled copy.
    deltas = datasets - inx
    dists = ((deltas ** 2).sum(axis=1)) ** 0.5
    nearest = dists.argsort()[:k]  # indices of the k smallest distances
    votes = {}
    for idx in nearest:
        label = labels[idx]
        votes[label] = votes.get(label, 0) + 1
    # Sort vote counts descending; Python's sort is stable, so equal counts
    # keep dict insertion order, matching the original tie behavior.
    ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
# 调用函数
# 主函数,测试算法正确率
def dataClassTest(path='G:\\DataSet\\Ch02\\datingTestSet2.txt', ratio=0.25, k=3):
    """Estimate KNN accuracy on a random train/test split of a data file.

    Args:
        path: tab-separated data file (3 features + int label per line).
              Defaults to the original hard-coded location, so existing
              no-argument callers are unaffected.
        ratio: fraction of samples held out as the test set.
        k: number of neighbors passed to classify().

    Returns:
        The accuracy percentage (float), which is also printed. The
        original discarded this value.
    """
    dataMat, dataLabel = fileMatrix(path)
    dataLabel = np.array(dataLabel)  # enables indexing by an index array
    # NOTE(review): normalizing before the split leaks test-set min/max
    # into training statistics; kept as-is to preserve the book's behavior.
    stdMat = autoNorm(dataMat)
    m = stdMat.shape[0]
    numTest = int(m * ratio)  # size of the held-out test set
    testIdx = np.random.choice(range(m), size=numTest, replace=False)
    trainIdx = np.delete(np.arange(m), testIdx)  # remaining rows train
    correct = 0
    for i in testIdx:
        result = classify(stdMat[i, :], stdMat[trainIdx], dataLabel[trainIdx], k)
        if result == dataLabel[i]:
            correct += 1
    accuracy = correct * 100 / numTest
    # Bug fix: the original "%2f" is a minimum-field-width spec, not
    # two-decimal formatting; "%.2f" was intended. "collect" -> "correct".
    print("The correct rate is %.2f" % accuracy + "%")
    return accuracy
# NOTE: KNN is inefficient in practice, since every test vector must be
# compared against the entire training set; kd-tree variants of KNN
# effectively mitigate this cost.