This version uses pixel-binarized MNIST; the key point is to understand how the subtrees are constructed recursively.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Binarize one image: every non-zero pixel becomes 1, every zero pixel stays 0
def binaryzation(image):
    cv_img = []
    for i in image:
        if i > 0:
            cv_img.append(1)
        else:
            cv_img.append(0)
    return np.array(cv_img)

def binaryzation_features(train_set):
    features = []
    for img in train_set:
        img = binaryzation(img)
        features.append(img)
    features = np.array(features)
    features = features.reshape(-1, 784)
    return features
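
# The per-pixel loop above is easy to follow but slow. A minimal vectorized
# sketch (not used below) that produces the same 0/1 feature matrix with a
# single numpy comparison:
#
#   features = (np.asarray(train_set) > 0).astype(int).reshape(-1, 784)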
class Tree(object):
    def __init__(self, node_type, Class=None, feature=None):
        self.node_type = node_type  # marks this node as a leaf or an internal node
        # Class label: the prediction at a leaf, the majority-class fallback at an internal node
        self.Class = Class
        # Index of the feature this node splits on (draw the tree to see it:
        # it is the optimal feature chosen for this node during training)
        self.feature = feature
        # Maps one value of the optimal feature to the subtree built for that value (recursive)
        self.dict = {}

    def add_tree(self, val, tree):
        self.dict[val] = tree

    def predict(self, features):  # features is a single test sample
        if self.node_type == 'leaf':
            return self.Class
        # Recurse into the subtree matching this sample's value of the split
        # feature; fall back to this node's majority class if the value never
        # occurred in training (a plain dict lookup would raise KeyError)
        tree = self.dict.get(features[self.feature])
        if tree is None:
            return self.Class
        return tree.predict(features)
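
# A hand-built two-node sketch of how prediction walks the tree (illustrative
# only; the labels 3 and 7 are arbitrary):
#
#   root = Tree('internal', Class=7, feature=0)
#   root.add_tree(0, Tree('leaf', Class=7))
#   root.add_tree(1, Tree('leaf', Class=3))
#   root.predict(np.array([1, 0]))  # inspects feature 0, follows key 1 -> 3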
def calc_ent(x):
    '''
    Compute the entropy of x, with log base 2.
    x is an np.array.
    '''
    x_value_list = set([x[i] for i in range(x.shape[0])])  # deduplicate
    ent = 0.0
    for x_value in x_value_list:  # iterate over the distinct values
        p = float(x[x == x_value].shape[0]) / x.shape[0]
        logp = np.log2(p)
        ent -= p * logp
    return ent
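
# Quick sanity check: two classes with probability 0.5 each give exactly one
# bit of entropy, since -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1:
#
#   calc_ent(np.array([0, 0, 1, 1]))  # -> 1.0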
def calc_condition_ent(x, y):
    '''
    Compute the empirical conditional entropy H(y|x).
    x and y are both np.array.
    '''
    x_value_list = set([x[i] for i in range(x.shape[0])])  # deduplicate
    ent = 0.0
    for x_value in x_value_list:  # iterate over the distinct values of feature x
        sub_y = y[x == x_value]  # the labels y of the samples sharing this value of x
        temp_ent = calc_ent(sub_y)
        ent += (float(sub_y.shape[0]) / y.shape[0]) * temp_ent
    return ent
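
# Worked example: for x = [0, 0, 1, 1] and y = [0, 1, 1, 1], the x = 0 half has
# labels [0, 1] (entropy 1.0) and the x = 1 half has labels [1, 1] (entropy 0),
# so H(y|x) = 0.5*1.0 + 0.5*0 = 0.5:
#
#   calc_condition_ent(np.array([0, 0, 1, 1]), np.array([0, 1, 1, 1]))  # -> 0.5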
def train(train_set, train_label, features, epsilon):
    '''
    features: list of the feature indices still available at this node;
    the chosen feature is removed from it as the subtrees are built.
    '''
    # Step 1: if every instance in train_set belongs to the same class Ck
    label_set = set(train_label)
    if len(label_set) == 1:
        return Tree('leaf', Class=label_set.pop())

    # Find the majority class at this node. The list holds one tuple
    # (class index, count of that class) per class, and max() compares
    # the tuples by their second item.
    max_class, max_len = max(
        [(i, len(list(filter(lambda x: x == i, train_label)))) for i in range(total_class)],
        key=lambda x: x[1])

    # Step 2: if features is empty
    if len(features) == 0:
        return Tree('leaf', Class=max_class)

    # Step 3: compute the information gain of each remaining feature
    max_feature = 0  # feature with the largest information gain
    max_gain = 0  # the largest information gain
    D = train_label
    HD = calc_ent(D)
    for feature in features:  # search for the optimal feature
        A = np.array(train_set[:, feature].flat)  # .flat returns an iterator over the column
        gain = HD - calc_condition_ent(A, D)  # information gain of the current feature
        if gain > max_gain:
            max_gain, max_feature = gain, feature

    # Step 4: stop if the best gain is below the threshold
    if max_gain < epsilon:
        return Tree('leaf', Class=max_class)

    # Step 5: split into non-empty subsets on the optimal feature.
    # The internal node also stores max_class, so prediction can fall back
    # to it when it meets a feature value that never occurred in training.
    sub_features = list(filter(lambda x: x != max_feature, features))
    tree = Tree('internal', Class=max_class, feature=max_feature)

    # Build one subtree per observed value of the optimal feature
    feature_col = np.array(train_set[:, max_feature].flat)
    feature_value_list = set([feature_col[i] for i in range(feature_col.shape[0])])
    for feature_value in feature_value_list:  # iterate over the values of the optimal feature
        index = []  # row indices of the samples carrying this value
        for i in range(len(train_label)):
            if train_set[i][max_feature] == feature_value:
                index.append(i)
        sub_train_set = train_set[index]
        sub_train_label = train_label[index]
        # Recursively build the subtree
        sub_tree = train(sub_train_set, sub_train_label, sub_features, epsilon)
        tree.add_tree(feature_value, sub_tree)
    return tree
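
# A toy end-to-end check (a sketch; the arrays are made up). Feature 0
# separates the two classes perfectly, so train() splits on it once and
# produces two pure leaves:
#
#   X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
#   y = np.array([0, 0, 1, 1])
#   toy_tree = train(X, y, [0, 1], 0.1)
#   predict(X, toy_tree)  # -> array([0, 0, 1, 1])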
def predict(test_set, tree):
    result = []
    for features in test_set:  # iterate over the test samples
        temp_predict = tree.predict(features)
        result.append(temp_predict)
    return np.array(result)

total_class = 10  # ten digit classes, 0-9
if __name__ == '__main__':
    print('Start reading data:')
    time1 = time.time()

    raw_data = pd.read_csv('data/train.csv', header=0)
    data = raw_data.values
    imgs = data[:, 1:]
    labels = data[:, 0]

    # Binarize the images
    features = binaryzation_features(imgs)
    # print(features.shape)  # (42000, 784)

    # 2/3 of the data for training, 1/3 for testing
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.33, random_state=11111)
    # print(train_labels.shape)  # (28140,)
    time2 = time.time()
    print('read data cost %f seconds' % (time2 - time1))

    print('Starting training:')
    tree = train(train_features, train_labels, [i for i in range(784)], 0.1)
    time3 = time.time()
    print('training cost %f seconds' % (time3 - time2))

    print('Starting predicting:')
    test_predict = predict(test_features, tree)
    time4 = time.time()
    print('predicting cost %f seconds' % (time4 - time3))

    accuracy = np.sum(test_labels == test_predict.reshape(len(test_labels))) / len(test_labels)
    print('The accuracy is %f!' % accuracy)
'''
output:
Start reading data:
read data cost 16.684554 seconds
Starting training:
training cost 106.074897 seconds
Starting predicting:
predicting cost 0.081568 seconds
The accuracy is 0.862987!
'''
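
# For a rough cross-check, the same experiment can be run with scikit-learn's
# built-in tree (a sketch, assuming the variables from the main block above;
# the score will differ from the hand-rolled ID3 because sklearn grows a
# binary CART tree rather than one multiway split per feature value):
#
#   from sklearn.tree import DecisionTreeClassifier
#   clf = DecisionTreeClassifier(criterion='entropy')
#   clf.fit(train_features, train_labels)
#   print(clf.score(test_features, test_labels))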