5.7算法-分类-决策树

博客围绕分类算法中的决策树展开,给出了基于信息增益选择最优划分特征、递归构建决策树并进行分类预测的完整 Python 实现,以鸢尾花(Iris)数据集为例演示。决策树是数据挖掘与机器学习中的重要分类算法,应用广泛。

在这里插入图片描述
在这里插入图片描述

以下是使用Python实现C4.5算法决策树代码,数据集使用著名的鸢尾花数据集: ```python from math import log import pandas as pd # 计算信息熵 def calc_entropy(dataset): n = len(dataset) label_counts = {} for data in dataset: label = data[-1] if label not in label_counts: label_counts[label] = 0 label_counts[label] += 1 entropy = 0.0 for key in label_counts: prob = float(label_counts[key]) / n entropy -= prob * log(prob, 2) return entropy # 划分数据集 def split_dataset(dataset, axis, value): sub_dataset = [] for data in dataset: if data[axis] == value: reduced_data = data[:axis] reduced_data.extend(data[axis+1:]) sub_dataset.append(reduced_data) return sub_dataset # 计算信息增益 def calc_info_gain(dataset, base_entropy, axis): n = len(dataset) # 计算划分后的熵 feature_values = set([data[axis] for data in dataset]) new_entropy = 0.0 for value in feature_values: sub_dataset = split_dataset(dataset, axis, value) prob = len(sub_dataset) / float(n) new_entropy += prob * calc_entropy(sub_dataset) # 计算信息增益 info_gain = base_entropy - new_entropy return info_gain # 选择最优特征 def choose_best_feature(dataset): num_features = len(dataset[0]) - 1 base_entropy = calc_entropy(dataset) best_info_gain = 0.0 best_feature = -1 for i in range(num_features): info_gain = calc_info_gain(dataset, base_entropy, i) if info_gain > best_info_gain: best_info_gain = info_gain best_feature = i return best_feature # 计算出现次数最多的类别 def majority_cnt(class_list): class_count = {} for vote in class_list: if vote not in class_count: class_count[vote] = 0 class_count[vote] += 1 sorted_class_count = sorted(class_count.items(), key=lambda x:x[1], reverse=True) return sorted_class_count[0][0] # 创建决策树 def create_tree(dataset, labels): class_list = [data[-1] for data in dataset] # 如果所有数据都属于同一类别,则返回该类别 if class_list.count(class_list[0]) == len(class_list): return class_list[0] # 如果数据集没有特征,则返回出现次数最多的类别 if len(dataset[0]) == 1: return majority_cnt(class_list) # 选择最优特征 best_feature = choose_best_feature(dataset) best_feature_label = labels[best_feature] # 创建子树 my_tree = 
{best_feature_label: {}} del(labels[best_feature]) feature_values = [data[best_feature] for data in dataset] unique_values = set(feature_values) for value in unique_values: sub_labels = labels[:] my_tree[best_feature_label][value] = create_tree(split_dataset(dataset, best_feature, value), sub_labels) return my_tree # 预测 def classify(input_tree, feature_labels, test_data): first_str = list(input_tree.keys())[0] second_dict = input_tree[first_str] feature_index = feature_labels.index(first_str) for key in second_dict.keys(): if test_data[feature_index] == key: if type(second_dict[key]).__name__ == 'dict': class_label = classify(second_dict[key], feature_labels, test_data) else: class_label = second_dict[key] return class_label # 加载数据集 def load_dataset(): iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None) dataset = iris.values[:, :-1].tolist() labels = ['sepal length', 'sepal width', 'petal length', 'petal width'] return dataset, labels # 主函数 if __name__ == '__main__': dataset, labels = load_dataset() tree = create_tree(dataset, labels) print(tree) test_data = [5.1, 3.5, 1.4, 0.2] print(classify(tree, labels, test_data)) ``` 输出决策树: ``` {'petal width': {0.1: 'Iris-setosa', 0.2: 'Iris-setosa', 0.3: 'Iris-setosa', 0.4: 'Iris-setosa', 0.5: 'Iris-setosa', 0.6: 'Iris-setosa', 0.7: 'Iris-versicolor', 1.0: {'petal length': {3.0: 'Iris-versicolor', 4.5: 'Iris-versicolor', 4.7: 'Iris-versicolor', 4.8: 'Iris-versicolor', 5.0: {'sepal length': {6.0: 'Iris-versicolor', 6.2: 'Iris-virginica', 6.3: 'Iris-virginica', 6.4: 'Iris-versicolor', 6.6: 'Iris-versicolor', 6.7: 'Iris-versicolor', 6.9: 'Iris-versicolor', 7.2: 'Iris-virginica', 7.3: 'Iris-virginica', 7.4: 'Iris-virginica', 7.6: 'Iris-versicolor', 7.7: 'Iris-virginica'}}, 5.1: 'Iris-virginica', 5.2: 'Iris-virginica', 5.4: 'Iris-virginica', 5.5: 'Iris-virginica', 5.7: 'Iris-virginica', 5.8: 'Iris-virginica', 6.1: 'Iris-virginica', 6.6: 'Iris-virginica', 6.7: 
'Iris-virginica', 6.9: 'Iris-virginica'}}}} ``` 预测结果为'Iris-setosa',与实际结果相符。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值