import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import graphviz
def calEnt(dataSet):
    """Compute the Shannon entropy (base 2) of a data set's label column.

    The label column is the last column of ``dataSet``.

    :param dataSet: pandas DataFrame whose last column holds the class labels
    :return: entropy of the label distribution
    """
    row_count = dataSet.shape[0]
    label_counts = dataSet.iloc[:, -1].value_counts()
    probabilities = label_counts / row_count
    # H = sum(-p * log2(p)) over the label values that actually occur.
    return (-probabilities * np.log2(probabilities)).sum()
def createDataSet():
    """Build the small demo data set used by this script.

    :return: DataFrame with the feature columns 'no surfacing' and 'flippers'
             followed by the label column 'fish' as the last column
    """
    return pd.DataFrame({
        'no surfacing': [1, 1, 1, 0, 0],
        'flippers': [1, 1, 0, 1, 1],
        'fish': ['yes', 'yes', 'no', 'no', 'no'],
    })
def bestSplit(dataSet):
    """Pick the feature column with the largest information gain.

    Every column except the last one (the label) is a candidate. The gain of a
    column is the entropy of ``dataSet`` minus the size-weighted entropy of the
    subsets obtained by grouping rows on each distinct value of that column.

    :param dataSet: pandas DataFrame whose last column holds the class labels
    :return: positional index of the best column, or -1 when no column has a
             gain strictly greater than 0
    """
    total_rows = dataSet.shape[0]
    parent_entropy = calEnt(dataSet)
    best_axis, best_gain = -1, 0
    for col_idx in range(dataSet.shape[1] - 1):
        column = dataSet.iloc[:, col_idx]
        weighted_entropy = 0
        for level in column.value_counts().index:
            subset = dataSet[column == level]
            # Weight each subset's entropy by its share of the rows.
            weighted_entropy += (subset.shape[0] / total_rows) * calEnt(subset)
        gain = parent_entropy - weighted_entropy
        if gain > best_gain:
            best_axis, best_gain = col_idx, gain
    return best_axis
def mySplit(dataSet, axis, value):
    """Select the rows where one column equals ``value`` and drop that column.

    :params dataSet: source pandas DataFrame
            axis: positional index of the column to split on
            value: attribute value the column must equal
    :return: the matching rows of ``dataSet`` without the split column
    """
    split_col = dataSet.columns[axis]
    matching_rows = dataSet[split_col] == value
    return dataSet.loc[matching_rows, :].drop(columns=split_col)
def createTree(dataSet):
    """Recursively build an information-gain (ID3-style) decision tree.

    The tree is a nested dict of the form
    ``{feature_name: {feature_value: subtree_or_label, ...}}``; a leaf is a
    bare class label.

    :param dataSet: non-empty pandas DataFrame whose last column holds the
                    class labels and whose other columns are features
    :return: a class label when the node is a leaf, otherwise the nested dict
    """
    classList = dataSet.iloc[:, -1].value_counts()
    # value_counts() sorts by frequency, so position 0 is the majority class.
    # Access it positionally: the index holds the label values themselves, so
    # classList[0] would be a *label* lookup (wrong or KeyError for int labels).
    if classList.iloc[0] == dataSet.shape[0] or dataSet.shape[1] == 1:
        # All rows share one label, or no feature columns are left.
        return classList.index[0]
    axis = bestSplit(dataSet)
    if axis == -1:
        # No feature has positive information gain. Indexing with -1 would
        # split on the label column itself, so emit a majority-class leaf.
        return classList.index[0]
    bestFeat = dataSet.columns[axis]
    myTree = {bestFeat: {}}
    for value in set(dataSet.iloc[:, axis]):
        myTree[bestFeat][value] = createTree(mySplit(dataSet, axis, value))
    return myTree
def classify(inputTree, labels, testVec):
    """Classify one sample by walking a nested-dict decision tree.

    :params inputTree: tree of the form ``{feature_name: {value: subtree_or_label}}``
            labels: list of feature names; the position of a name in this list
                    is the position of that feature's value in ``testVec``
            testVec: feature values of the sample (list, tuple or pandas
                     Series), ordered like ``labels``
    :return classLabel: the class label stored at the leaf that was reached
    :raises ValueError: if the sample's value for a tested feature has no
                        branch in the tree
    """
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featIndex = labels.index(firstStr)
    # A Series is usually indexed by column name, so an integer key must be
    # resolved by position explicitly rather than via deprecated fallback.
    if isinstance(testVec, pd.Series):
        featValue = testVec.iloc[featIndex]
    else:
        featValue = testVec[featIndex]
    if featValue not in secondDict:
        raise ValueError(
            f"no branch for value {featValue!r} of feature {firstStr!r}"
        )
    branch = secondDict[featValue]
    if isinstance(branch, dict):
        # Internal node: keep descending with the same sample.
        return classify(branch, labels, testVec)
    return branch
def acc_classify(train, test):
    """Train a tree on ``train``, predict every row of ``test``, print accuracy.

    :params train: training DataFrame; last column is the class label
            test: test DataFrame with the same column order as ``train``
                  (features first, true label last)
    :return test: a copy of ``test`` with the predictions appended as the
                  column 'predic'; the caller's frame is left untouched
    """
    inputTree = createTree(train)
    labels = list(train.columns)
    # Work on a copy: adding a column to the caller's frame mutates shared
    # state, warns when ``test`` is a slice of another frame, and would make a
    # second call treat the old prediction column as the label.
    test = test.copy()
    # Plain tuples of feature values (label column excluded), so classify()
    # can index them by position.
    result = [
        classify(inputTree, labels, row)
        for row in test.iloc[:, :-1].itertuples(index=False, name=None)
    ]
    test['predic'] = result
    # After the append, the last column is the prediction and the one before
    # it is the true label.
    acc = (test.iloc[:, -1] == test.iloc[:, -2]).mean()
    print(f'模型预测准确率为{acc}')
    return test
# Demo run: train on the full toy data set and evaluate on its first three rows.
dataSet = createDataSet()
train = dataSet
# Use an independent copy rather than a slice of dataSet, so the evaluation
# frame can be modified without pandas' SettingWithCopyWarning.
test = dataSet.iloc[:3, :].copy()
acc_classify(train, test)
'''
dataSet = createDataSet()
# Features
Xtrain = dataSet.iloc[:,:-1]
# Labels
Ytrain = dataSet.iloc[:,-1]
labels = Ytrain.unique().tolist()
# Convert the text labels to integer codes
Ytrain = Ytrain.apply(lambda x : labels.index(x))
# Fit and draw the tree model
clf = DecisionTreeClassifier()
clf = clf.fit(Xtrain, Ytrain)
tree.export_graphviz(clf)
dot_data = tree.export_graphviz(clf, out_file=None)
graphviz.Source(dot_data)
# Draw the graph again with feature/class names and colours
dot_data = tree.export_graphviz(
clf, out_file=None,
feature_names=['no surfacing','flippers'],
class_names=['fish','not fish'],
filled=True,
rounded=True,
special_characters=True
)
graphviz.Source(dot_data)
# Use the render method to generate the graph file
graph = graphviz.Source(dot_data)
graph.render("fish")
'''