Performance Test of Machine Learning Classifiers
1. Data
- Read from text files
- Features are clinical indicators measured for horses
- The label says whether the case is a colic condition
- Only a subset of the available features is used in this test
- Data is kept as plain Python lists (a parsing sketch follows this list)
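For reference, a minimal sketch of how one row is parsed, under the layout the full listing below assumes: 22 tab-separated numeric columns per row, with the ±1 class label in column 21 (the row values here are made up):

```python
# A made-up example row: 21 feature values plus a -1.0 label, tab-separated.
line = "\t".join(["1.0"] * 21 + ["-1.0"])
cols = line.strip().split("\t")
features = [float(x) for x in cols[:20]]  # only the first 20 features are used
label = float(cols[21])                   # column 21 holds the +/-1 class label
print(len(features), label)               # -> 20 -1.0
```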
2. Learners
- k-Nearest Neighbors
- Decision tree
- Naive Bayes
- Logistic regression
- Support vector machine
- Ensemble learning (AdaBoost)
- choiceModel = 'ALL' runs every learner above; a single name runs just that one (see the sketch below)
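The switch is a chain of independent if blocks, one per learner, which is exactly the pattern the full listing below uses; a minimal sketch:

```python
choiceModel = 'ALL'  # or one of: 'KNN', 'Tree', 'Bayes', 'Logistic', 'SVM', 'Adaboost'

if choiceModel == 'KNN' or choiceModel == 'ALL':
    print("train and evaluate kNN here")
if choiceModel == 'Tree' or choiceModel == 'ALL':
    print("train and evaluate the decision tree here")
# ...one independent block per learner, so 'ALL' runs every one of them
```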
3. Results
- The classification error rate of each learner (computed as sketched below)
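Each error rate is the fraction of misclassified test samples. A minimal self-contained sketch of the computation every section of the listing repeats (the helper name is made up for illustration):

```python
def error_rate(predictions, labels):
    # Fraction of positions where prediction and true label disagree.
    errors = sum(1.0 for p, y in zip(predictions, labels) if p != y)
    return errors / float(len(labels))

print(error_rate([1, -1, 1, 1], [1, 1, 1, -1]))  # -> 0.5
```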

4. Full code
- All classifier implementations are taken from the Chinese edition of Machine Learning in Action
- The training and test sets are horseColicTraining2.txt and horseColicTest2.txt
import kNN
import trees, treePlotter
import bayes
import logRegres
import svmMLiA
import adaboost
from numpy import *
import copy
import matplotlib.pyplot as plt
import random
'''
Read the data from text files.
Features are clinical indicators measured for horses;
the label says whether the case is a colic condition.
Only a subset of the available features is used in this test,
and the data is kept as plain Python lists.
'''
frTrain = open('horseColicTraining2.txt')
frTest = open('horseColicTest2.txt')
trainingSet = []
trainingLabels = []
testingSet = []
testingLabels = []
featureLabels = []
# Each row holds 22 tab-separated columns: 21 features plus a +/-1
# class label in column 21; only the first 20 features are used here.
featureNum = 20
for line in frTrain.readlines():
    currLine = line.strip().split('\t')
    lineArr = []
    for i in range(featureNum):
        lineArr.append(float(currLine[i]))
    trainingSet.append(lineArr)
    trainingLabels.append(float(currLine[21]))
for line in frTest.readlines():
    currLine = line.strip().split('\t')
    lineArr = []
    for i in range(featureNum):
        lineArr.append(float(currLine[i]))
    testingSet.append(lineArr)
    testingLabels.append(float(currLine[21]))
# Name each feature by its column index; the decision tree needs these.
for i in range(featureNum):
    featureLabels.append('%d' % i)
trainLen = len(trainingLabels)
testLen = len(testingLabels)
print('trainLen:', trainLen)
print('testLen:', testLen)
choiceModel = 'ALL'  # or 'KNN', 'Tree', 'Bayes', 'Logistic', 'SVM', 'Adaboost'
# Per-learner error rates, filled in by the sections below.
errorRateKNN = 0.0
errorRateTree = 0.0
errorRateBayes = 0.0
errorRateLogistic = 0.0
errorRateSVM = 0.0
errorRateAdaboost = 0.0
if choiceModel == 'KNN' or choiceModel == 'ALL':
    print("----- kNN -----")
    # Scale features to [0, 1] before measuring distances.
    normTrain, ranges0, minVals0 = kNN.autoNorm(array(trainingSet))
    normTest, ranges1, minVals1 = kNN.autoNorm(array(testingSet))
    errorCount = 0.0
    numTestVecs = testLen
    for i in range(numTestVecs):
        # Majority vote among the 10 nearest training points.
        classifierResult = kNN.classify0(normTest[i], normTrain, trainingLabels, 10)
        if classifierResult != testingLabels[i]:
            errorCount += 1.0
    errorRateKNN = errorCount / float(numTestVecs)
    print("\nkNN error rate is: %f" % errorRateKNN, "\nerrorCount: ", errorCount)
if choiceModel == 'Tree' or choiceModel == 'ALL':
    print("----- Tree -----")
    # Append the label to each training row, as trees.createTree expects.
    dataSet = copy.deepcopy(trainingSet)
    for i in range(trainLen):
        dataSet[i].append(trainingLabels[i])
    shan = trees.calcShannonEnt(dataSet)
    print("Shannon entropy:", shan)
    # createTree consumes the label list in the MLiA implementation,
    # so pass a copy to keep featureLabels intact for classification.
    mytree = trees.createTree(dataSet, featureLabels[:])
    numTestVecs = testLen
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = trees.classify(mytree, featureLabels, testingSet[i])
        if classifierResult != testingLabels[i]:
            errorCount += 1.0
    errorRateTree = errorCount / float(numTestVecs)
    print("\nTree error rate is: %f" % errorRateTree, "\nerrorCount: ", errorCount)
if choiceModel == 'Bayes' or choiceModel == 'ALL':
    print("----- Bayes -----")
    # Reuse MLiA's text classifier by treating each distinct feature value
    # as a "word" and each sample as a set-of-words vector.
    myList = bayes.createVocabList(trainingSet)
    trainMat = []
    for pl in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(myList, pl))
    # trainNB0 expects 0/1 class labels, so map -1 to 0 for training.
    nbLabels = [1 if lb == 1.0 else 0 for lb in trainingLabels]
    p0V, p1V, pAb = bayes.trainNB0(trainMat, nbLabels)
    numTestVecs = testLen
    errorCount = 0.0
    for i in range(numTestVecs):
        thisTesting = array(bayes.setOfWords2Vec(myList, testingSet[i]))
        classifierResult = bayes.classifyNB(thisTesting, p0V, p1V, pAb)
        # Map the 0/1 prediction back to the file's -1/+1 labels.
        if classifierResult == 0:
            classifierResult = -1
        if classifierResult != testingLabels[i]:
            errorCount += 1.0
    errorRateBayes = errorCount / float(numTestVecs)
    print("\nBayes error rate is: %f" % errorRateBayes, "\nerrorCount: ", errorCount)
if choiceModel == 'Logistic' or choiceModel == 'ALL':
    print("----- Logistic -----")
    # stocGradAscent1 assumes 0/1 class labels, so map -1 to 0 for training.
    lrLabels = [1.0 if lb == 1.0 else 0.0 for lb in trainingLabels]
    trainWeights = logRegres.stocGradAscent1(array(trainingSet), lrLabels, 500)
    numTestVecs = testLen
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = logRegres.classifyVector(array(testingSet[i]), trainWeights)
        # classifyVector returns 0/1; map 0 back to -1 to match the labels.
        if classifierResult == 0.0:
            classifierResult = -1.0
        if classifierResult != testingLabels[i]:
            errorCount += 1.0
    errorRateLogistic = errorCount / float(numTestVecs)
    print("\nLogistic error rate is: %f" % errorRateLogistic, "\nerrorCount: ", errorCount)
if choiceModel == 'SVM' or choiceModel == 'ALL':
    print("----- SVM -----")
    k1 = 1.3  # RBF kernel width
    # Platt SMO: C = 200, tolerance = 0.0001, up to 5000 iterations.
    b, alphas = svmMLiA.smoP(trainingSet, trainingLabels, 200, 0.0001, 5000, ('rbf', k1))
    datMat = mat(trainingSet)
    labelMat = mat(trainingLabels).transpose()
    # Only the support vectors (alpha > 0) matter for prediction.
    svInd = nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]
    labelSV = labelMat[svInd]
    m, n = shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = svmMLiA.kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(trainingLabels[i]):
            errorCount += 1
    print("SVM: the training error rate is: %f" % (float(errorCount) / m))
    errorCount = 0
    datMat = mat(testingSet)
    labelMat = mat(testingLabels).transpose()
    m, n = shape(datMat)
    for i in range(m):
        kernelEval = svmMLiA.kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(testingLabels[i]):
            errorCount += 1
    errorRateSVM = float(errorCount) / m
    print("\nSVM error rate is: %f" % errorRateSVM, "\nerrorCount: ", errorCount)
if choiceModel == 'Adaboost' or choiceModel == 'ALL':
    print("----- Adaboost -----")
    # Train an ensemble of 20 decision-stump weak classifiers.
    classiA, aggClass = adaboost.adaBoostTrainDS(trainingSet, trainingLabels, 20)
    prediction = adaboost.adaClassify(testingSet, classiA)
    # Count the positions where the prediction disagrees with the label.
    err = mat(ones((testLen, 1)))
    errorCount = err[prediction != mat(testingLabels).T].sum()
    errorRateAdaboost = float(errorCount) / testLen
    print("\nAdaboost error rate is: %f" % errorRateAdaboost, "\nerrorCount: ", errorCount)
def randomcolor():
    colorArr = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F']
    color = ""
    for i in range(6):
        color += colorArr[random.randint(0, 14)]
    return "#" + color
if choiceModel == 'ALL':
    # Bar chart comparing the error rates of all six learners,
    # using the randomcolor helper above for per-bar colors.
    name_list = ['KNN', 'Tree', 'Bayes', 'Logistic', 'SVM', 'Adaboost']
    num_list = [errorRateKNN, errorRateTree, errorRateBayes, errorRateLogistic, errorRateSVM, errorRateAdaboost]
    plt.bar(range(len(num_list)), num_list, color=[randomcolor() for _ in num_list], tick_label=name_list)
    plt.show()