机器学习实战笔记七_Python3_python 取消批量批注-CSDN博客

本文介绍了一个基于k近邻算法的手写数字识别系统实现过程。该系统包括从图像文件中读取数字并将其转换为向量的功能，以及使用训练数据集进行分类的方法。通过对测试数据集的准确率评估，验证了该识别系统的有效性。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

程序清单2-6 手写数字识别系统的测试代码

伪代码

[python]view plain copy
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    The first 32 lines of ``filename`` are read, and the first 32 characters
    of each line are converted with ``int`` and stored row after row.

    Args:
        filename: Path of the text file to read.

    Returns:
        A NumPy array of shape (1, 1024) holding the converted values.

    Raises:
        IndexError: If the file has fewer than 32 lines or a line is too short.
        ValueError: If one of the characters cannot be converted by ``int``.
    """
    returnVect = zeros((1, 1024))
    # Use a context manager so the file is closed even if parsing raises.
    with open(filename) as fr:
        for i in range(32):
            # Read exactly one line (one image row) per iteration.
            lineStr = fr.readline()
            for j in range(32):
                # Row i occupies slots 32*i .. 32*i+31 of the flat vector.
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
  
def handwritingClassTest():
    """Evaluate the k-NN digit classifier on the files in 'testDigits'.

    Every file in 'trainingDigits' is loaded with ``img2vector`` into one row
    of the training matrix. Its class label is the integer before the
    underscore in the file name (e.g. '0_17.txt' -> 0). Each file in
    'testDigits' is then classified with ``classify0`` (k=3) and compared
    with the label taken from its own file name. The per-sample results, the
    number of errors and the error rate are printed; nothing is returned.

    Raises:
        ZeroDivisionError: If 'testDigits' contains no files.
    """
    hwLabels = []
    # listdir returns the names of the files inside the directory.
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    # One image per row of the training matrix.
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        # '0_17.txt' -> '0_17' (text before the first '.').
        fileStr = fileNameStr.split('.')[0]
        # '0_17' -> 0 (the digit the image represents).
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)

    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        # Same file-name parsing as for the training set.
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        # k-nearest-neighbour prediction for this image.
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        # Keep the message in one literal: a backslash continuation inside
        # the string breaks on trailing whitespace and would otherwise embed
        # the next line's indentation in the output.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))

完整代码

[python]view plain copy
#批量注释、批量取消注释 Ctrl+/  
# from __future__ import print_function  
from  numpy import *  
from os import listdir  
import operator#运算符模块  
import matplotlib.pyplot as plt  
def createDataSet():
    """Build the four-point toy data set used to try out the classifier.

    Returns:
        A tuple ``(group, labels)``: a 4x2 NumPy array of points and the list
        of their class labels ('A' or 'B') in matching order.
    """
    samples = [
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    ]
    points = array([point for point, _ in samples])
    tags = [tag for _, tag in samples]
    return points, tags


# Module-level sample data, created once when the file is executed.
group, labels = createDataSet()
  
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Args:
        inX: The sample to classify; a sequence or array that broadcasts
            against one row of ``dataSet``.
        dataSet: 2-D NumPy array of training samples, one sample per row.
        labels: Labels of the training samples; ``labels[i]`` belongs to
            row ``i`` of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the k rows closest to ``inX``
        (Euclidean distance). On a tie, the tied label that received its
        first vote earliest (i.e. from the closer neighbour) is returned.

    Raises:
        IndexError: If ``k`` is larger than the number of rows, or ``k < 1``.
    """
    # Broadcasting subtracts inX from every training row at once.
    diffMat = dataSet - asarray(inX)
    # Sum the squared differences per row (axis=1), then take the root.
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    # Row indices ordered from the nearest to the farthest sample.
    sortedDistIndicies = distances.argsort()

    # Vote count per label, e.g. {'B': 2, 'A': 1}.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Pick the label with the highest count. Returning the last dictionary
    # key instead would ignore the counts entirely. sorted() is stable, so
    # tied labels keep their insertion order.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
  
# group,labels = createDataSet()  
# a=classify0([0,0], group,labels,3)  
# print(a)  
  
#自己根据Python3 改正后的函数  
# def file2matrix(filename): # 将数据分离为样本数据与标签  
#     #open a file, default: 'r'ead  
#     fr = open(filename)  
#     #一次读取所有行  
#     arrayOLines = fr.readlines()  
#     #得到行数  
#     numberOfLines = len(arrayOLines)  
#     #1000*3 zeros matrix,row-1000, column-3  
#     returnMat = zeros((numberOfLines,3))  
#     #声明  
#     classLabelVector = []  
#     classLabelVector_Value = []  
#     index = 0  
#     #逐行扫描  
#     for line in arrayOLines:  
#         #strip函数会删除头和尾的字符，中间的不会删除  
#         line = line.strip()  
#         #删除‘\t’字符，仅剩下数据，供使用  
#         listFromLine = line.split('\t')  
#         #得到前三列数据，即飞行时间，游戏，冰激凌  
#         returnMat[index, :] = listFromLine[0:3]  
#         #得到largeDoses，smallDoses，didntLike的label  
#         classLabelVector.append(listFromLine[-1])      #无法将largeDoses，smallDoses，didntLike  
#                                                        #转换为int。基于这个思想，我们在这里将得到的行矩阵建立  
#                                                        #一个数值矩阵与之对应，暂时这样处理，不合适再继续修改  
#         if classLabelVector[index] == 'largeDoses':  
#             classLabelVector_Value.append(3)  
#         elif classLabelVector[index] == 'smallDoses':  
#             classLabelVector_Value.append(2)  
#         else:  
#             classLabelVector_Value.append(1)  
#         index += 1  
#     return returnMat, classLabelVector_Value  
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and labels.

    Each line must hold at least three numeric fields followed by an integer
    class label in the last field.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an (n_lines, 3) NumPy array
        with the first three fields of every line, and a list with the last
        field of every line converted by ``int``.

    Raises:
        ValueError: If a field cannot be converted to a number.
    """
    # Read the file once and close it right away.
    with open(filename) as fr:
        arrayOLines = fr.readlines()

    returnMat = zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        # Drop surrounding whitespace/newline, then split on tabs.
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; it is mapped to
    0 instead of producing NaN through 0/0.

    Args:
        dataSet: 2-D NumPy array, one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the rescaled array, the
        per-column ``max - min`` and the per-column minimum.
    """
    # Column-wise extrema (axis 0).
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they become 0 rather than NaN; the
    # returned ``ranges`` keeps the true (zero) value.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column shift and scale to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
  
  
def datingClassTest():
    """Hold out the first 10% of the dating data and print the k-NN error rate.

    The data is loaded with ``file2matrix`` and normalised with ``autoNorm``.
    The first ``numTestVecs`` rows are classified with ``classify0`` (k=3)
    against the remaining rows. Each prediction, the error rate and the
    error count are printed; nothing is returned.
    """
    hoRatio = 0.10  # fraction of the rows held out for testing
    # NOTE(review): file2matrix in this file converts the last column with
    # int(); confirm that 'datingTestSet.txt' really holds numeric labels
    # (the author's note mentions datingTestSet2.txt as the numeric variant).
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)

    # Rows after the held-out head form the training set for every query.
    trainingRows = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        expected = datingLabels[i]
        classifierResult = classify0(normMat[i, :], trainingRows, trainingLabels, 3)
        # Show the prediction next to the true label.
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, expected))
        if classifierResult != expected:
            errorCount += 1.0
    print("the total error rate is: %f"% (errorCount / float(numTestVecs)))
    print(errorCount)
  
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    The first 32 lines of ``filename`` are read, and the first 32 characters
    of each line are converted with ``int`` and stored row after row.

    Args:
        filename: Path of the text file to read.

    Returns:
        A NumPy array of shape (1, 1024) holding the converted values.

    Raises:
        IndexError: If the file has fewer than 32 lines or a line is too short.
        ValueError: If one of the characters cannot be converted by ``int``.
    """
    returnVect = zeros((1, 1024))
    # Use a context manager so the file is closed even if parsing raises.
    with open(filename) as fr:
        for i in range(32):
            # Read exactly one line (one image row) per iteration.
            lineStr = fr.readline()
            for j in range(32):
                # Row i occupies slots 32*i .. 32*i+31 of the flat vector.
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
  
def handwritingClassTest():
    """Evaluate the k-NN digit classifier on the files in 'testDigits'.

    Every file in 'trainingDigits' is loaded with ``img2vector`` into one row
    of the training matrix. Its class label is the integer before the
    underscore in the file name (e.g. '0_17.txt' -> 0). Each file in
    'testDigits' is then classified with ``classify0`` (k=3) and compared
    with the label taken from its own file name. The per-sample results, the
    number of errors and the error rate are printed; nothing is returned.

    Raises:
        ZeroDivisionError: If 'testDigits' contains no files.
    """
    hwLabels = []
    # listdir returns the names of the files inside the directory.
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    # One image per row of the training matrix.
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        # '0_17.txt' -> '0_17' (text before the first '.').
        fileStr = fileNameStr.split('.')[0]
        # '0_17' -> 0 (the digit the image represents).
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)

    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        # Same file-name parsing as for the training set.
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        # k-nearest-neighbour prediction for this image.
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        # Keep the message in one literal: a backslash continuation inside
        # the string breaks on trailing whitespace and would otherwise embed
        # the next line's indentation in the output.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))

测试

[python]view plain copy
handwritingClassTest()  

完成！

##########################################

转自：https://blog.csdn.net/shunquanlan9446/article/details/79779403