Decision Trees in Machine Learning
Complete example code
import operator
from math import log

def calc_shannon_entropy(dataset):
    """Compute the Shannon entropy of the class labels in the dataset."""
    dataset_length = len(dataset)
    label_count = {}
    for feat_vec in dataset:
        current_label = feat_vec[-1]  # the class label is the last column
        label_count[current_label] = label_count.get(current_label, 0) + 1
    entropy = 0.0
    for key in label_count:
        prob = label_count[key] / float(dataset_length)
        entropy -= prob * log(prob, 2)
    return entropy
def create_dataset():
    """Return the toy dataset (two features plus a class label) and the feature names."""
    dataset = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
    labels = ['No surfacing', 'Flippers']
    return dataset, labels
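As a quick sanity check on the two pieces above: the toy dataset has two 'yes' and three 'no' labels, so its entropy should be -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971. A minimal sketch, not part of the original listing, assuming the two functions above are in scope:

# Hand-computed expectation: 0.5288 + 0.4422 ≈ 0.9710
data, feature_names = create_dataset()
print(calc_shannon_entropy(data))  # expected ≈ 0.9710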
def split_dataset(dataset, axis, value):
    """Return the samples whose feature at index `axis` equals `value`,
    with that feature column removed."""
    return_dataset = []
    for feat_vec in dataset:
        if feat_vec[axis] == value:
            reduced_feat_vec = feat_vec[:axis]
            reduced_feat_vec.extend(feat_vec[axis + 1:])
            return_dataset.append(reduced_feat_vec)
    return return_dataset
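For example, splitting on feature 0 with value 1 keeps the first three samples and strips that column. A usage sketch under the same assumptions as above:

data, feature_names = create_dataset()
print(split_dataset(data, 0, 1))  # expected [[1, 'yes'], [1, 'yes'], [0, 'no']]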
def best_feature_to_split(dataset):
    """Return the index of the feature with the highest information gain."""
    base_entropy = calc_shannon_entropy(dataset)  # entropy before any split
    best_gain = 0.0
    best_feature = -1
    dataset_length = len(dataset)
    feature_num = len(dataset[0]) - 1  # the last column is the class label, not a feature
    for axis in range(feature_num):
        feature_value_list = [feat_vec[axis] for feat_vec in dataset]
        unique_feat_values = set(feature_value_list)
        new_entropy = 0.0
        for feat_value in unique_feat_values:
            split_out_dataset = split_dataset(dataset, axis, feat_value)
            split_prob = len(split_out_dataset) / float(dataset_length)
            new_entropy += split_prob * calc_shannon_entropy(split_out_dataset)
        info_gain = base_entropy - new_entropy  # entropy reduction from splitting on this feature
        if info_gain > best_gain:
            best_gain = info_gain
            best_feature = axis
    return best_feature
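On the toy dataset the gains can be checked by hand: splitting on feature 0 yields the branches [yes, yes, no] and [no, no], so the weighted entropy is 3/5 * 0.918 + 2/5 * 0 ≈ 0.551 and the gain is 0.971 - 0.551 ≈ 0.420; feature 1 yields 4/5 * 1.0 = 0.800 and a gain of only ≈ 0.171, so feature 0 ('No surfacing') wins. A sketch of that expectation:

data, feature_names = create_dataset()
print(best_feature_to_split(data))  # expected 0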
def majority_cnt(class_list):
    """Return the most common class label in class_list (majority vote)."""
    labels_count = {}
    for current_label in class_list:
        labels_count[current_label] = labels_count.get(current_label, 0) + 1
    sorted_counts = sorted(labels_count.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_counts[0][0]
def create_tree(dataset, labels):
    """Recursively build the decision tree as nested dictionaries.
    Note: this mutates labels, so callers should pass in a copy."""
    class_list = [feat_vec[-1] for feat_vec in dataset]
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]  # all samples share one class: leaf node
    if len(dataset[0]) == 1:
        return majority_cnt(class_list)  # no features left: fall back to majority vote
    best_feat_index = best_feature_to_split(dataset)
    best_feat_label = labels[best_feat_index]
    mytree = {best_feat_label: {}}
    del labels[best_feat_index]
    best_feat_values = [feat_vec[best_feat_index] for feat_vec in dataset]
    unique_best_feat_values = set(best_feat_values)
    for best_feat_value in unique_best_feat_values:
        sublabels = labels[:]  # copy, so sibling branches keep their own label list
        mytree[best_feat_label][best_feat_value] = create_tree(
            split_dataset(dataset, best_feat_index, best_feat_value), sublabels)
    return mytree
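Tracing the recursion on the toy dataset by hand: the root splits on 'No surfacing'; the 0 branch is pure 'no', and the 1 branch splits again on 'Flippers'. A sketch of the expected result, worth verifying by running it:

data, feature_names = create_dataset()
tree = create_tree(data, feature_names[:])  # pass a copy; create_tree mutates the list
print(tree)
# expected: {'No surfacing': {0: 'no', 1: {'Flippers': {0: 'no', 1: 'yes'}}}}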
def classify(input_tree, labels, feat_vec):
    """Walk the tree with feat_vec and return the predicted class label."""
    first_str = list(input_tree.keys())[0]  # the feature tested at this node
    sec_dict = input_tree[first_str]
    feat_index = labels.index(first_str)
    class_label = None  # stays None if feat_vec holds a feature value the tree never saw
    for key in sec_dict:
        if key == feat_vec[feat_index]:
            if isinstance(sec_dict[key], dict):
                class_label = classify(sec_dict[key], labels, feat_vec)  # internal node: recurse
            else:
                class_label = sec_dict[key]  # leaf node: this is the class
    return class_label
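Classification is then a lookup along one root-to-leaf path. A usage sketch under the same assumptions:

data, feature_names = create_dataset()
tree = create_tree(data, feature_names[:])
print(classify(tree, feature_names, [1, 0]))  # expected 'no'
print(classify(tree, feature_names, [1, 1]))  # expected 'yes'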
def main():
    dataset, labels = create_dataset()
    print(dataset, '\n', 'labels =', labels)
    entropy = calc_shannon_entropy(dataset)
    print('entropy =', entropy)
    split_result = split_dataset(dataset, 0, 1)
    print('split_dataset(dataset, 0, 1) =', split_result)
    best_feature = best_feature_to_split(dataset)
    print('best feature =', best_feature)
    mytree = create_tree(dataset, labels[:])  # pass a copy: create_tree mutates labels
    print('mytree =', mytree)
    class_label = classify(mytree, labels, [1, 1])
    print('[1, 1] is classified as: %s' % class_label)

if __name__ == "__main__":
    main()
