I have always used tools like R and SAS for machine learning. When faced with large datasets, however, these tools often fall short: just reading the file in can exhaust the machine's memory. Being able to implement the algorithms yourself matters, because it gives you flexibility. In text mining, for example, you usually work with sparse matrices; R does offer sparse-matrix operations, but they do not always fit your needs. So I chose to learn how to implement machine learning algorithms in Python by working through 《机器学习实战》 (Machine Learning in Action).
The code below follows the book closely. Using the same approach I also implemented a C++ version of the ID3 algorithm, which I will post later.
import math
dataset = [
("青年", "否", "否", "一般", "否")
,("青年", "否", "否", "好", "否")
,("青年", "是", "否", "好", "是")
,("青年", "是", "是", "一般", "是")
,("青年", "否", "否", "一般", "否")
,("中年", "否", "否", "一般", "否")
,("中年", "否", "否", "好", "否")
,("中年", "是", "是", "好", "是")
,("中年", "否", "是", "非常好", "是")
,("中年", "否", "是", "非常好", "是")
,("老年", "否", "是", "非常好", "是")
,("老年", "否", "是", "好", "是")
,("老年", "是", "否", "好", "是")
,("老年", "是", "否", "非常好", "是")
,("老年", "否", "否", "一般", "否")
]
labels=['var1','var2','var3','var4']
data=[[1,1,'yes'],
[1,1,'yes'],
[1,0,'no'],
[0,1,'no'],
[0,1,'no']]
la=['no surfacing','flippes']
#Compute the information entropy; target is the column index of the class label
def entropy(dataset,target=-1):
    instance_num=len(dataset)
    tar_num={}
    entro=0.0
    for t in dataset:
        tar_num[t[target]]=tar_num.setdefault(t[target],0)+1
    pl=[n/instance_num for n in tar_num.values()]
    for p in pl:
        entro-=p*math.log2(p)
    return entro
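# Quick sanity check (my own, not from the book): the `dataset` defined above has
# 9 "是" and 6 "否" class labels, so entropy(dataset) should be about
# -(9/15)*log2(9/15) - (6/15)*log2(6/15) ≈ 0.971.
#print(entropy(dataset))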
###Split the dataset: keep the rows where feature f_index equals value, then drop that feature column
def split_dataset(dataset,f_index,value):
    sub_set=[]
    for row in dataset:
        row=list(row)
        if row[f_index]==value:
            # use del, not remove(): remove() deletes the first occurrence of the
            # value, which may be a different column holding the same value
            del row[f_index]
            sub_set.append(row)
    return sub_set
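# Example usage (my own illustration): keep only the rows of `dataset` whose first
# feature is "青年" and drop that column; this should return 5 four-element rows.
#print(split_dataset(dataset,0,"青年"))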
###Compute the information gain of the feature at feature_index
def info_gain(dataset,feature_index):
    entropy_d=entropy(dataset)
    long_dataset=len(dataset)
    feature_count={}
    sub_entropy=0.0
    for row in dataset:
        feature_count[row[feature_index]]=feature_count.setdefault(row[feature_index],0)+1
    for k,v in feature_count.items():
        feature_p=v/long_dataset
        subset=split_dataset(dataset,feature_index,k)
        sub_entropy+=feature_p*entropy(subset)
    ig=entropy_d-sub_entropy
    return ig
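# Example (my own calculation, using the `dataset` above): feature 2 splits the
# 15 rows into 6 "是" rows (all positive) and 9 "否" rows (3 positive, 6 negative),
# so info_gain(dataset,2) should come out to about 0.971 - (9/15)*0.918 ≈ 0.420.
#print(info_gain(dataset,2))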
##Find the feature with the largest information gain
def find_bestfeature(dataset,label):
    max_ig=0.0
    best_feature,best_index=label[0],0
    for label_index,la in enumerate(label):
        ig=info_gain(dataset,label_index)
        if ig>max_ig:
            max_ig=ig
            best_feature=la
            best_index=label_index
    return best_feature,best_index
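# Example (my own check, using the `dataset` and `labels` above): feature 2 ('var3')
# has the largest gain (≈0.420 versus ≈0.083, 0.324 and 0.363 for the others), so
# find_bestfeature(dataset,labels) should return ('var3', 2).
#print(find_bestfeature(dataset,labels))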
###Return the most frequent class value
def max_count(dataset):
    if len(dataset[0])==1:
        class_count={}
        for row in dataset:
            class_count[row[0]]=class_count.setdefault(row[0],0)+1
        sorted_count=sorted(class_count.items(),key=lambda x:x[1],reverse=True)
        return sorted_count[0][0]
    else:
        return None
###Recursively build the decision tree (create_tree)
def create_tree(dataset,label):
    class_list=[x[-1] for x in dataset]
    #all instances share the same class: return it as a leaf
    if class_list.count(class_list[0])==len(class_list):
        return class_list[0]
    #no features left (only the class column remains): return the majority class
    if len(dataset[0])==1:
        return max_count(dataset)
    best_feature,index=find_bestfeature(dataset,label)
    tree={best_feature:{}}
    feature_list=set(x[index] for x in dataset)
    label.remove(best_feature)
    for ft in feature_list:
        sublabel=label[:]
        subset=split_dataset(dataset,index,ft)
        tree[best_feature][ft]=create_tree(subset,sublabel)
    return tree
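# Example (my own run-through, using the `dataset` above): 'var3' is chosen at the
# root (every row with 'var3' == "是" is positive), and the "否" branch splits again
# on 'var2', giving roughly
#   {'var3': {'是': '是', '否': {'var2': {'是': '是', '否': '否'}}}}.
# Note that create_tree mutates the label list, so pass a copy:
#print(create_tree(list(dataset),labels[:]))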
def classfy(inputTree,featlabels,testVec):
    #the key at each level of the nested dict is the feature to test next
    firstStr=list(inputTree.keys())[0]
    secondDict=inputTree[firstStr]
    featIndex=featlabels.index(firstStr)
    classLabel=None
    for key in secondDict.keys():
        if testVec[featIndex]==key:
            if type(secondDict[key]).__name__=='dict':
                classLabel=classfy(secondDict[key],featlabels,testVec)
            else:
                classLabel=secondDict[key]
    return classLabel
mytree=create_tree(data,la)
test=classfy(mytree,['no surfacing','flippes'],[0,1])
print(test)
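# With the small `data`/`la` example the tree should come out as
# {'no surfacing': {0: 'no', 1: {'flippes': {0: 'no', 1: 'yes'}}}} (my own check),
# and the test vector [0,1] falls into the 'no surfacing' == 0 branch, so this prints 'no'.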