《机器学习实战》学习（二）——决策树（DT）

最新推荐文章于 2025-04-16 11:25:57 发布

不系之舟913

最新推荐文章于 2025-04-16 11:25:57 发布

阅读量3.1k

点赞数 1

分类专栏：机器学习 Python编程文章标签： python 机器学习决策树

机器学习同时被 2 个专栏收录

20 篇文章

订阅专栏

Python编程

5 篇文章

订阅专栏

1、决策树简述

决策树学习是一种逼近离散值目标函数的方法，在这种方法中学习到的函数被表示为一棵决策树。周志华老师的《机器学习》一书中有专门的一章对决策树进行了讲述，并对ID3算法之后的改进算法也做了相应的介绍。决策树容易导致过拟合现象，书中介绍了预剪枝和后剪枝等相关的处理方法。决策树依赖训练集，可以把由训练集生成的树结构序列化存到文件中，下次使用时可以很快进行加载。
一个牛人对决策树的总结，我觉得很有道理，所以原文放在这里，总结如下：
据我了解，决策树是最简单，也是曾经最常用的分类方法了。决策树基于树理论实现数据分类，个人感觉就是数据结构中的B+树。决策树是一个预测模型，他代表的是对象属性与对象值之间的一种映射关系。决策树计算复杂度不高、输出结果易于理解、对中间值缺失不敏感、可以处理不相关特征数据。其比KNN好的是可以了解数据的内在含义。但其缺点是容易产生过度匹配的问题，且构建很耗时。决策树还有一个问题就是，如果不绘制树结构，分类细节很难明白。所以，生成决策树，然后再绘制决策树，最后再分类，才能更好的了解数据的分类过程。
决策树的核心是树的分裂。到底该选择什么来决定树的分叉是决策树构建的基础。最好的方法是利用信息熵实现。熵这个概念很头疼，很容易让人迷糊，简单来说就是信息的复杂程度。信息越多，熵越高。所以决策树的核心是通过计算信息熵划分数据集。
来源于：http://www.cnblogs.com/zhizhan/p/4432943.html

2、ID3生成一个决策树python代码注释

# -*- coding: utf-8 -*-
"""
@brief 计算给定数据集的信息熵
@param dataSet 数据集
@return 香农熵
"""
import operator
from math import log
def calcShannonEnt(dataSet):
    """Return the Shannon entropy (base 2) of the class labels in *dataSet*.

    Each row of *dataSet* is a sequence whose last element is the class
    label. An empty data set yields 0.0.
    """
    total = len(dataSet)
    # Tally how often each class label (last column) occurs.
    tally = {}
    for row in dataSet:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1
    # Accumulate -p * log2(p) over the observed labels.
    entropy = 0.0
    for count in tally.values():
        p = float(count) / total
        entropy -= p * log(p, 2)
    return entropy

"""
@brief 创建临时测试集
@param 
@return dataSet  返回一个测试数据集
@return labels 返回数据集的标签
"""
def createDataSet():
    """Return a small hard-coded sample data set and its feature labels.

    Returns:
        A ``(dataSet, labels)`` pair. ``dataSet`` is a list of rows
        ``[feature0, feature1, class_label]`` and ``labels`` names the
        two feature columns. Fresh lists are built on every call.
    """
    labels = ['no surfacing', 'flippers']
    samples = (
        (1, 1, 'yes'),
        (1, 1, 'yes'),
        (1, 0, 'no'),
        (0, 1, 'no'),
        (0, 1, 'no'),
    )
    dataSet = [list(sample) for sample in samples]
    return dataSet, labels

"""
@brief 划分数据集 按照给定的特征划分数据集
@param[in] dataSet 待划分的数据集
@param[in] axis  划分数据集的特征
@param[in] value 需要返回的特征的值
@return retDataSet 返回划分后的数据集
"""
def splitDataSet(dataSet, axis, value):
    """Select the rows whose column *axis* equals *value*, dropping that column.

    Args:
        dataSet: list of rows (each row a list) to filter.
        axis: index of the feature column to test.
        value: feature value a row must have to be kept.

    Returns:
        A new list of new rows; the input rows are not modified.
    """
    subset = []
    for row in dataSet:
        if not (row[axis] == value):
            continue
        # Keep everything before and after the split column.
        trimmed = row[:axis]
        trimmed.extend(row[axis + 1:])
        subset.append(trimmed)
    return subset

"""
@brief 遍历整个数据集，循环计算香农熵和选择划分函数，找到最好的划分方式。
@param[in] dataSet 整个特征集 待选择的集
@return bestFeature 划分数据集最好的划分特征列的索引值
"""  
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature column with the largest information gain.

    For every feature column the data set is partitioned by each distinct
    value; the weighted entropy of the partitions is subtracted from the
    entropy of the whole set to obtain the gain.

    Returns:
        The winning column index, or -1 when no column has a strictly
        positive gain.
    """
    featureCount = len(dataSet[0]) - 1  # last column is the class label
    baseline = calcShannonEnt(dataSet)
    winner = -1
    topGain = 0.0
    for column in range(featureCount):
        distinct = set(row[column] for row in dataSet)
        # Entropy remaining after splitting on this column, weighted by
        # the share of rows that falls into each partition.
        conditional = 0.0
        for candidate in distinct:
            part = splitDataSet(dataSet, column, candidate)
            weight = len(part) / float(len(dataSet))
            conditional += weight * calcShannonEnt(part)
        gain = baseline - conditional
        if gain > topGain:
            topGain, winner = gain, column
    return winner

"""
@brief 计算一个特征数据列表中 出现次数最多的特征值以及次数
@param[in] 特征值列表
@return 返回次数最多的特征值
例如：[1,1,0,1,1]数据列表 返回 1
"""
def majorityCnt(classList):
    """Return the value that occurs most often in *classList*.

    Ties are resolved in favour of the value seen first. For example,
    ``[1, 1, 0, 1, 1]`` gives ``1``.
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # Order the (value, count) pairs by count, largest first; the sort is
    # stable, so equal counts keep first-seen order.
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    topEntry = ranked[0]
    return topEntry[0]

"""
@brief 递归创建一棵树
@param[in] dataSet 数据集
@param[in] labels 标签数据
@return myTree 返回树结构  使用字典类型存储树结构
"""
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Args:
        dataSet: list of rows; the last element of each row is the class
            label, the preceding elements are feature values.
        labels: names of the feature columns, in column order. The list is
            NOT modified (the split feature is removed from a private copy).

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    # Every row has the same class: nothing left to split.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column is left: all features are used up, so vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    # -1 means no feature yields a positive information gain. Indexing with
    # -1 would split on the class-label column itself, so fall back to a
    # majority vote instead.
    if bestFeat < 0:
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Work on a copy so the caller's label list stays usable afterwards
    # (e.g. for passing to classify).
    subLabels = list(labels)
    del subLabels[bestFeat]
    uniqueVals = set(example[bestFeat] for example in dataSet)
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

"""
@brief 对未知特征在创建的决策树上进行分类
@param[in] inputTree
@param[in] featLabels
@param[in] testVec
@return classLabel 返回识别的结果
"""
def classify(inputTree, featLabels, testVec):
    """Classify *testVec* by walking the decision tree *inputTree*.

    Args:
        inputTree: nested dict of the form
            ``{feature_name: {feature_value: subtree_or_label, ...}}``.
        featLabels: feature names in the column order used by *testVec*.
        testVec: feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that is reached.

    Raises:
        ValueError: if a feature name of the tree is missing from
            *featLabels*, or if the sample's value for a feature has no
            branch in the tree.
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    featValue = testVec[featIndex]
    for key, subtree in secondDict.items():
        if featValue == key:
            if isinstance(subtree, dict):
                return classify(subtree, featLabels, testVec)
            return subtree
    # No branch matched: fail loudly instead of hitting an unbound local.
    raise ValueError(
        "no branch for value %r of feature %r" % (featValue, firstStr))

"""
@brief 存储构建的决策树
"""
def storeTree(inputTree, filename):
    """Serialize *inputTree* to the file *filename* using pickle.

    The file is opened in binary write mode and is closed even if
    pickling fails.
    """
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)

"""
@brief 读取文本存储的决策树
"""
def grabTree(filename):
    """Load and return a decision tree previously pickled to *filename*.

    The file is opened in binary read mode and closed before returning.
    """
    import pickle
    # NOTE: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)

3、使用Matplotlib绘制决策树python代码

import matplotlib.pyplot as plt
# Annotation-box styles for decision nodes and leaf nodes.
# boxstyle "sawtooth" gives the box a jagged outline; fc is the grey level
# used to fill the box.
decisionNode = {"boxstyle": "sawtooth", "fc": "0.8"}
leafNode = {"boxstyle": "round4", "fc": "0.8"}
# Arrow style used for the connector between two nodes.
arrow_args = {"arrowstyle": "<-"}
"""
@brief 绘制节点
@param[in] nodeTxt 节点显示文本
@param[in] centerPt 起点位置
@param[in] parentPt 终点位置
@param[in] nodeType 节点风格
"""
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one annotated node on the axes stored in ``createPlot.ax1``.

    Args:
        nodeTxt: text shown inside the node box.
        centerPt: position of the text box (passed as ``xytext``).
        parentPt: position the arrow is attached to (passed as ``xy``).
        nodeType: bbox style dict for the node box.

    Both points are interpreted in axes-fraction coordinates.
    """
    annotation = dict(
        xy=parentPt,
        xycoords='axes fraction',
        xytext=centerPt,
        textcoords='axes fraction',
        va="center",
        ha="center",
        bbox=nodeType,
        arrowprops=arrow_args,
    )
    createPlot.ax1.annotate(nodeTxt, **annotation)
"""
@brief 修改createPlot函数
"""

def createPlot(inTree):
    """Render the decision tree *inTree* in figure 1 and show it.

    Creates a frameless, tick-less axes (kept in ``createPlot.ax1``),
    initialises the layout state that ``plotTree`` keeps in its function
    attributes, draws the tree from the top centre and calls ``plt.show()``.
    """
    figure = plt.figure(1, facecolor='white')
    figure.clf()
    createPlot.ax1 = plt.subplot(111, frameon=False, xticks=[], yticks=[])

    # Layout state shared with plotTree: overall width (leaf count) and
    # depth of the tree, plus the running x/y drawing offsets.
    leafTotal = float(getNumLeafs(inTree))
    plotTree.totalW = leafTotal
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / leafTotal
    plotTree.yOff = 1.0

    plotTree(inTree, (0.5, 1.0), '')
    plt.show()

"""
@brief 获取叶节点的数目 采用递归深度优先遍历算法获得树的叶子节点数目
@param[in] myTree 输入字典存储的树结构
@return numLeafs 返回叶子节点数目
"""
def getNumLeafs(myTree):
    """Return the number of leaf nodes in the dict-encoded tree *myTree*.

    *myTree* has the form ``{feature_name: {value: subtree_or_label}}``;
    every non-dict child counts as one leaf, dict children are counted
    recursively.
    """
    rootLabel = list(myTree.keys())[0]
    branches = myTree[rootLabel]
    return sum(
        getNumLeafs(child) if isinstance(child, dict) else 1
        for child in branches.values()
    )

"""
@brief 获得树的层数 采用递归深度遍历算法获得树深度
@param[in] myTree 输入字典存储的树结构
@return maxDepth 返回最大层深度
"""
def getTreeDepth(myTree):
    """Return the depth (number of decision levels) of the tree *myTree*.

    A leaf child contributes depth 1; a dict child contributes 1 plus its
    own depth. A root with no branches has depth 0.
    """
    rootLabel = list(myTree.keys())[0]
    branches = myTree[rootLabel]
    branchDepths = [
        1 + getTreeDepth(child) if isinstance(child, dict) else 1
        for child in branches.values()
    ]
    return max(branchDepths, default=0)

"""
@brief 在子父节点位置中间显示一个文本信息
@param[in] cntrPt 起点坐标 子节点坐标
@param[in] parentPt 结束坐标 父节点坐标
@param[in] 在中间位置显示的字符
"""
def plotMidText(cntrPt, parentPt, txtString):
    """Write *txtString* halfway between a child node and its parent.

    Args:
        cntrPt: (x, y) of the child node.
        parentPt: (x, y) of the parent node.
        txtString: text to place at the midpoint, drawn on
            ``createPlot.ax1``.
    """
    midpoint = (
        (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0],
        (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1],
    )
    createPlot.ax1.text(midpoint[0], midpoint[1], txtString)

"""
@brief 绘制树
@param[in] myTree
@param[in] parentPt
@param[in] nodeTxt
"""
def plotTree(myTree,parentPt,nodeTxt):
    """Recursively draw the subtree *myTree* below the node at *parentPt*.

    Layout state lives in function attributes that createPlot initialises
    before the first call: ``plotTree.totalW`` (leaf count of the whole
    tree), ``plotTree.totalD`` (depth of the whole tree), and the running
    offsets ``plotTree.xOff`` / ``plotTree.yOff``.

    Args:
        myTree: dict-encoded tree ``{feature_name: {value: subtree_or_label}}``.
        parentPt: (x, y) of the parent node in axes-fraction coordinates.
        nodeTxt: text for the edge from the parent to this subtree's root
            (the feature value that leads here).
    """
    numLeafs = getNumLeafs(myTree)
    #depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    # Leaves are placed 1/totalW apart starting right of xOff, so this x is
    # the horizontal middle of the span this subtree's leaves will occupy.
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW,plotTree.yOff)
    # Edge label, then the decision node itself.
    plotMidText(cntrPt,parentPt,nodeTxt)
    plotNode(firstStr,cntrPt,parentPt,decisionNode)
    secondDict = myTree[firstStr]
    # Step one level down before drawing the children.
    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
    for key in secondDict.keys():
        if isinstance(secondDict[key],dict) == True :
            # Inner node: recurse, labelling the edge with the branch value.
            plotTree(secondDict[key],cntrPt,str(key))
        else:
            # Leaf: advance to the next horizontal slot, then draw the leaf
            # and the label of the edge leading to it.
            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
            plotNode(secondDict[key],(plotTree.xOff,plotTree.yOff),cntrPt,leafNode)
            plotMidText((plotTree.xOff,plotTree.yOff),cntrPt,str(key))
    # Restore the vertical offset so the caller's remaining siblings are
    # drawn on the caller's level.
    plotTree.yOff = plotTree.yOff +1.0/plotTree.totalD

机器学习实战书中例子，构建的决策树绘制如下：

# Example: build and plot a decision tree for the contact-lens data set.
# NOTE(review): presumably the two listings above are saved as trees.py and
# treePlotter.py and imported before this runs — confirm.
# Each line of lenses.txt is split on tabs into one row of values.
with open('lenses.txt') as fr:
    lenses = [inst.strip().split('\t') for inst in fr]
lensesLabels = ['age','prescript','astigmatic','tearRate']
lensesTree = trees.createTree(lenses,lensesLabels)
treePlotter.createPlot(lensesTree)

这里写图片描述

4、决策树总结

通过本章的实践，代码大部分同书中代码一样，实现id3算法构建的决策树。明显可以看出，决策树逻辑结构简单，非常直观的表达出预测种类。也对python语言熟悉了很多，特别是对字典数据类型的使用。确实解决了大部分编程问题，使得编程更为简单。后续将对决策树具体进行应用。应用实例主要是针对周老师《机器学习》书中决策树章节的课后习题进行练习。更加深入的理解决策树。