import pandas as pd
import numpy as np
from math import log
def entropy(ele):
    # ele: a list of class labels
    # probability of each distinct value in the list
    probs = [ele.count(i) / len(ele) for i in set(ele)]
    # information entropy in bits
    ent = -sum([prob * log(prob, 2) for prob in probs])
    return ent
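
# Quick sanity check on entropy() with made-up labels (illustrative only, not
# taken from example_data.csv): a 50/50 binary list carries exactly 1 bit of
# entropy, while a pure list carries 0 bits.
assert abs(entropy(['yes', 'no', 'yes', 'no']) - 1.0) < 1e-12
assert entropy(['yes', 'yes', 'yes']) == 0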

# Split function
def df_split(df, col):
    '''
    df: the training data to split
    col: the feature used to split the data
    '''
    # distinct values of the splitting feature
    unique_col_val = df[col].unique()
    # dictionary holding one sub-DataFrame per feature value
    res_dict = {elem: pd.DataFrame() for elem in unique_col_val}
    # split the data according to the feature value
    for key in res_dict.keys():
        res_dict[key] = df[df[col] == key]
    # return the dictionary of sub-DataFrames keyed by feature value
    return res_dict
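
# Illustrative use of df_split() on a tiny made-up frame (hypothetical columns,
# not necessarily the example_data.csv schema): splitting on 'outlook' yields
# one sub-DataFrame per distinct value.
_demo = pd.DataFrame({'outlook': ['sunny', 'rain', 'sunny'],
                      'play': ['no', 'yes', 'yes']})
_demo_split = df_split(_demo, 'outlook')
assert set(_demo_split.keys()) == {'sunny', 'rain'}
assert len(_demo_split['sunny']) == 2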

# Choose the optimal feature
def choose_best_feature(df, label):
    '''
    df: the training data to split
    label: the name of the label column
    max_value: the largest information gain
    best_feature: the optimal feature
    max_splited: the data dictionary produced by splitting on the optimal feature
    '''
    # empirical entropy H(D) of the labels
    entropy_D = entropy(df[label].tolist())
    # candidate features: every column except the label
    cols = [col for col in df.columns if col not in [label]]
    max_value, best_feature = -999, None
    max_splited = None
    # iterate over the features and split on each one in turn
    for col in cols:
        splited_set = df_split(df, col)
        entropy_DA = 0
        for subset_col, subset in splited_set.items():
            # label entropy H(Di) of the subset
            entropy_Di = entropy(subset[label].tolist())
            # empirical conditional entropy H(D|A) of the current feature
            entropy_DA += len(subset) / len(df) * entropy_Di
        # information gain g(D, A) = H(D) - H(D|A) of the current feature
        info_gain = entropy_D - entropy_DA
        if info_gain > max_value:
            max_value, best_feature = info_gain, col
            max_splited = splited_set
    return max_value, best_feature, max_splited
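
# Illustrative check of choose_best_feature() on made-up data (not
# example_data.csv): 'outlook' predicts 'play' perfectly here while 'windy' is
# uninformative, so 'outlook' should be chosen with an information gain of 1 bit.
_toy = pd.DataFrame({'outlook': ['sunny', 'sunny', 'rain', 'rain'],
                     'windy': ['false', 'true', 'false', 'true'],
                     'play': ['no', 'no', 'yes', 'yes']})
_gain, _feat, _ = choose_best_feature(_toy, 'play')
assert _feat == 'outlook' and abs(_gain - 1.0) < 1e-12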

class ID3Tree:
    # node class of the decision tree
    class TreeNode:
        # define a tree node
        def __init__(self, name):
            self.name = name
            self.connections = {}
        # define an edge from this node to a child node
        def connect(self, label, node):
            self.connections[label] = node

    def __init__(self, df, label):
        self.columns = df.columns
        self.df = df
        self.label = label
        self.root = self.TreeNode("Root")

    # entry point for building the tree
    def construct_tree(self):
        self.construct(self.root, "", self.df, self.columns)

    # recursive construction of the decision tree
    def construct(self, parent_node, parent_label, subset_df, columns):
        # choose the optimal feature
        max_value, best_feature, max_splited = choose_best_feature(subset_df[columns], self.label)
        if not best_feature:
            # no features left to split on: create a leaf labelled with the majority class
            node = self.TreeNode(subset_df[self.label].mode()[0])
            parent_node.connect(parent_label, node)
            return
        # attach a child node for the optimal feature
        node = self.TreeNode(best_feature)
        parent_node.connect(parent_label, node)
        # use A - {Ai} as the new feature set
        new_columns = [col for col in columns if col != best_feature]
        # recursively build the subtree for each split
        for splited_value, splited_data in max_splited.items():
            self.construct(node, splited_value, splited_data, new_columns)
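
# Optional helper, not part of the original ID3Tree class: pretty-print a built
# tree by walking TreeNode.connections recursively.
def print_tree(node, depth=0):
    for edge_label, child in node.connections.items():
        print('    ' * depth + '({})-> {}'.format(edge_label, child.name))
        print_tree(child, depth + 1)
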
df=pd.read_csv(r'监督学习单模型\6.决策树\example_data.csv')
id3_tree=ID3Tree(df,'play')
id3_tree.construct_tree()
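
# Illustrative: print every decision edge of the trained tree, starting from the
# synthetic "Root" node (its single child is the first splitting feature).
print_tree(id3_tree.root)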