python实现ID3

本文介绍了一个基于Python的决策树算法实现过程,包括计算信息增益、构建决策树及评估预测准确性等内容。通过具体实例展示了如何从训练数据集中构建决策树模型,并应用于测试数据集进行预测。
# -*- coding: utf-8 -*-
#计算各个属性各个值的熵
import numpy as np
def H(tdata):
    """Return the *negated* Shannon entropy of the last (class) column.

    Note: the sign is inverted relative to the textbook definition
    (this returns sum(p * log2(p)), not -sum(p * log2(p)));
    tiaojiandi() negates its weighted sum again, so the two cancel.
    """
    n = tdata.shape[1] - 1
    # .ix was deprecated in pandas 0.20 and removed in 1.0; use .iloc
    # for positional column access.
    C = tdata.iloc[:, n]
    result = 0
    counts = list(C.value_counts())
    for i in range(len(counts)):
        p = counts[i] / len(C)          # relative frequency of one class
        result = result + p * np.log2(p)
    print('H')
    return result
        
#计算各个属性的条件熵
def tiaojiandi(dataset,T):
    """Conditional entropy of the class column given attribute T.

    Partitions the dataset on each distinct value of T and accumulates
    the size-weighted (negated) entropy of each partition; the final
    negation undoes H()'s inverted sign, so the result is non-negative.
    """
    total = len(dataset)
    acc = 0
    for value in dataset[T].unique():
        subset = dataset[dataset[T] == value]
        weight = len(subset) / total
        acc = acc + weight * H(subset)
    print('tiaojiandi')
    return -acc


#计算最大信息增益的属性
def maxgain(dataset):
    """Return the feature with the largest information gain.

    Since the dataset's own entropy is a constant, maximizing gain is
    the same as minimizing the conditional entropy, so the feature with
    the smallest tiaojiandi() value is chosen. Assumes the last column
    is the class label and all others are candidate features.
    """
    feature_cols = list(dataset.columns[:-1])
    entropies = [tiaojiandi(dataset, col) for col in feature_cols]
    print('maxgain')
    best = int(np.argmin(entropies))
    return feature_cols[best]


#获得属性后,拆分数据集
def split(dataset, feature, value):
    """Return the rows where dataset[feature] == value, with that column dropped.

    The .copy() makes the subset an independent frame: the original code
    called `del` on a boolean-indexed slice, which mutates (or warns
    about mutating) a view of `dataset` — pandas' chained-assignment
    pitfall (SettingWithCopyWarning).
    """
    newdata = dataset[dataset[feature]==value].copy()
    del newdata[feature]
    print('split')
    return newdata

#若属性为空时,结果多的为终结点
def classfiy(C):
    """Majority vote: return the most frequent class label in C as a string.

    Fix: the original did value_counts().sort_index() and took index[-1],
    which returns the *largest* label value, not the most frequent one —
    contradicting the stated intent (leaf = majority class when no
    features remain). value_counts() sorts by frequency descending, so
    index[0] is the mode.
    """
    counts = C.value_counts()
    print('classfiy')
    return str(counts.index[0])

#创建决策树
def decision_tree(dataset):
    """Recursively build an ID3 decision tree from `dataset`.

    Assumes the last column is the class label and every other column is
    a categorical feature. Returns either a leaf (a class label) or a
    nested dict {feature: {value: subtree_or_leaf}}.
    """
    n = dataset.shape[1] - 1
    features = list(dataset.columns[0:n])
    # .ix was removed in pandas 1.0; .iloc gives positional access.
    C = list(dataset.iloc[:, n])
    # All samples share one class: return it as a leaf.
    if C.count(C[0]) == len(C):
        return C[0]
    # No features left to split on: majority-vote leaf.
    if len(features) == 0:
        return classfiy(dataset.iloc[:, n])
    feature = maxgain(dataset)
    tree = {feature: {}}
    for value in dataset[feature].unique():
        print('ok')
        newdata = split(dataset, feature, value)
        tree[feature][value] = decision_tree(newdata)
    return tree



# NOTE(review): pandas is imported mid-file; conventionally imports belong at the top.
import pandas as pd
# Build the tree from a hard-coded local CSV path; the last column is
# treated as the class label by decision_tree().
train = pd.read_csv(r'E:\Python\machine learning\own\decision_tree\train.csv')
tree = decision_tree(train) 

#预测结果
def predict(tree,test):
    """Predict a class label for every row of `test` with the given tree.

    tree: nested dict {feature: {value: subtree_or_leaf}} or a bare leaf.
    Returns the list of predicted labels (also printed, as before).

    Fixes: (1) the original reassigned the `tree` parameter while walking,
    so every row after the first was given the first row's leaf;
    (2) .ix was removed from pandas; (3) the hard-coded 0:4 feature slice
    is generalized to all columns except the last (assumed to be the
    label, matching the training convention — for a 4-feature test set
    this is identical to the original 0:4).
    """
    result = []
    n_features = test.shape[1] - 1
    for i in range(len(test)):
        row = test.iloc[i, 0:n_features].to_dict()
        node = tree                      # restart at the root for each row
        while isinstance(node, dict):
            key = list(node.keys())[0]   # the feature this node splits on
            node = node[key][row[key]]
        result.append(node)
    print(result)
    return result

#计算准确率
def pinggu(tree, test):
    """Return the tree's accuracy on `test` (fraction of correct rows).

    Side effect: stores the predictions in a new 'result' column of the
    caller's `test` frame, then compares it against the 'Play' column.
    """
    predictions = predict(tree, test)
    test['result'] = predictions
    correct = test[test['Play'] == test['result']]
    return len(correct) / len(test)

# Evaluate on the held-out test CSV; pinggu() also adds a 'result'
# column to `test` as a side effect.
test = pd.read_csv(r'E:\Python\machine learning\own\decision_tree\test.csv')
# NOTE(review): 'accuary' is a typo for 'accuracy' (name kept unchanged
# in case anything external references it).
accuary = pinggu(tree,test)


   

 

转载于:https://www.cnblogs.com/chenyaling/p/7234997.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值