"""k-nearest-neighbours (kNN) classifier applied to the dating-site data set.

Some thoughts on kNN:

1. If all features are meant to carry the same weight, they must be
   normalised; otherwise the result is dominated by whichever feature has
   the largest numeric scale and the error becomes very large. If the
   features should NOT be equally important, I think explicit weights are
   needed -- but what those weights should be is a question in itself; can
   they be learned in advance?
2. How to choose k? According to Li Hang's "Statistical Learning Methods":
   a small k means predicting from training instances in a small
   neighbourhood, so only very similar data gets good results and the
   estimation error grows (a situation similar to over-fitting); a large k
   means predicting from training instances in a larger neighbourhood, so
   dissimilar instances also take part in the prediction, which increases
   the error rate. The best k is then chosen by cross-validation. I will
   add more once I have written up some experiments.
"""
import operator

import numpy as np


# kNN algorithm
def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        inX: feature vector of the sample to classify (length d).
        dataSet: training samples, one row per sample (n rows, d columns).
        labels: sequence of n class labels; labels[i] belongs to dataSet[i].
        k: number of nearest neighbours that vote.

    Returns:
        The label with the most votes among the k nearest rows. On a tie,
        the label that was encountered first (i.e. has the closer
        neighbour) wins.

    Raises:
        IndexError: if k is 0 or larger than the number of training rows.
    """
    # Broadcasting subtracts inX from every row; no need to tile it first.
    diffMat = np.asarray(inX) - np.asarray(dataSet)
    # Euclidean distance from inX to each training row.
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()

    classCount = {}
    for neighbour in sortedDistIndicies[:k] if k <= len(sortedDistIndicies) \
            else [sortedDistIndicies[i] for i in range(k)]:
        voteIlabel = labels[neighbour]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # sorted() is stable, so equal vote counts keep insertion order.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


# Load the data and pre-process it
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Each non-blank line must hold at least three numeric feature columns
    followed by an integer class label in the last column.

    Args:
        filename: path of the text file to read.

    Returns:
        (returnMat, classLabelVector): an (n, 3) float array holding the
        first three columns, and a list of n int labels taken from the
        last column.
    """
    rows = []
    classLabelVector = []
    # The context manager guarantees the handle is closed, even on error.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            listFromLine = line.split('\t')
            rows.append([float(value) for value in listFromLine[0:3]])
            classLabelVector.append(int(listFromLine[-1]))
    # reshape keeps the (n, 3) shape even when the file has no data rows.
    returnMat = np.array(rows, dtype=float).reshape(-1, 3)
    return returnMat, classLabelVector


# Plotting
def data_image():
    """Scatter-plot feature columns 1 and 2 of 'datingTestSet2.txt'.

    Points are coloured by their class label. Opens a matplotlib window.
    """
    # Imported lazily so the classifier itself does not need matplotlib.
    import matplotlib.pyplot as plt

    fig = plt.figure()
    # 111 = a 1x1 grid, first subplot (223 would be a 2x2 grid, third subplot).
    ax = fig.add_subplot(111)
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    # One colour value per point: a fixed two-colour list does not match the
    # number of points and is rejected by matplotlib.
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
               c=np.asarray(datingLabels))
    plt.show()


# Normalisation: the features are considered equally important, so their
# value ranges have to be brought onto the same scale.
def autoNorm(dataSet):
    """Min-max scale every column of dataSet.

    Args:
        dataSet: 2-D numpy array, one row per sample.

    Returns:
        (normDataSet, ranges, minVals): the scaled data, the per-column
        range (max - min) and the per-column minimum. A column whose range
        is 0 (constant feature) is scaled to all zeros instead of NaN; the
        returned ranges still contains the true 0 for that column.
    """
    # Axis 0 reduces over rows, giving one value per column.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Avoid 0/0 for constant columns; the numerator is 0 there anyway.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column values to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals


# Test harness for the dating-site classifier
def datingClassTest():
    """Hold-out evaluation of classify0 on 'datingTestSet2.txt'.

    The first 10% of the normalised rows are used as test samples and the
    remaining rows as training data, with k = 3. Prints every prediction
    next to the true label, then the overall error rate.
    """
    hoRatio = 0.1
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Number of samples (length of the first dimension).
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classfilerResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classfilerResult, datingLabels[i]))
        if classfilerResult != datingLabels[i]:
            errorCount += 1.0
    print(" the total error rate is: %f" % (errorCount / float(numTestVecs)))


if __name__ == '__main__':
    datingDataMat, DatingLabels = file2matrix('datingTestSet2.txt')
    normDataSet, ranges, minVals = autoNorm(datingDataMat)
    print(normDataSet)
    datingClassTest()