This version uses pixel-binarized MNIST; the key point is to understand how the subtrees are constructed recursively.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Binarize one image: every non-zero pixel becomes 1, every zero pixel stays 0
def binaryzation(image):
    cv_img = []
    for i in image:
        if i > 0:
            cv_img.append(1)
        else:
            cv_img.append(0)
    return np.array(cv_img)

def binaryzation_features(train_set):
    features = []
    for img in train_set:
        img = binaryzation(img)
        features.append(img)
    features = np.array(features)
    features = features.reshape(-1, 784)
    return features
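
# The per-pixel loop above is easy to follow but slow. A minimal vectorized
# sketch (not used below) that produces the same 0/1 feature matrix with a
# single numpy comparison:
#
#   features = (np.asarray(train_set) > 0).astype(int).reshape(-1, 784)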
class Tree(object):
    def __init__(self, node_type, Class=None, feature=None):
        self.node_type = node_type  # marks this node as a leaf or an internal node
        # Class label: the prediction at a leaf, the majority-class fallback at an internal node
        self.Class = Class
        # Index of the feature this node splits on (draw the tree to see it:
        # it is the optimal feature chosen for this node during training)
        self.feature = feature
        # Maps one value of the optimal feature to the subtree built for that value (recursive)
        self.dict = {}

    def add_tree(self, val, tree):
        self.dict[val] = tree

    def predict(self, features):  # features is a single test sample
        if self.node_type == 'leaf':
            return self.Class
        # Recurse into the subtree matching this sample's value of the split
        # feature; fall back to this node's majority class if the value never
        # occurred in training (a plain dict lookup would raise KeyError)
        tree = self.dict.get(features[self.feature])
        if tree is None:
            return self.Class
        return tree.predict(features)
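
# A hand-built two-node sketch of how prediction walks the tree (illustrative
# only; the labels 3 and 7 are arbitrary):
#
#   root = Tree('internal', Class=7, feature=0)
#   root.add_tree(0, Tree('leaf', Class=7))
#   root.add_tree(1, Tree('leaf', Class=3))
#   root.predict(np.array([1, 0]))  # inspects feature 0, follows key 1 -> 3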
def calc_ent(x):
    '''
    Compute the entropy of x, with log base 2.
    x is an np.array.
    '''
    x_value_list = set([x[i] for i in range(x.shape[0])])  # deduplicate
    ent = 0.0
    for x_value in x_value_list:  # iterate over the distinct values
        p = float(x[x == x_value].shape[0]) / x.shape[0]
        logp = np.log2(p)
        ent -= p * logp
    return ent
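
# Quick sanity check: two classes with probability 0.5 each give exactly one
# bit of entropy, since -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1:
#
#   calc_ent(np.array([0, 0, 1, 1]))  # -> 1.0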
def calc_condition_ent(x, y):
    '''
    Compute the empirical conditional entropy H(y|x).
    x and y are both np.array.
    '''
    x_value_list = set([x[i] for i in range(x.shape[0])])  # deduplicate
    ent = 0.0
    for x_value in x_value_list:  # iterate over the distinct values of feature x
        sub_y = y[x == x_value]  # the labels y of the samples sharing this value of x
        temp_ent = calc_ent(sub_y)
        ent += (float(sub_y.shape[0]) / y.shape[0]) * temp_ent
    return ent
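
# Worked example: for x = [0, 0, 1, 1] and y = [0, 1, 1, 1], the x = 0 half has
# labels [0, 1] (entropy 1.0) and the x = 1 half has labels [1, 1] (entropy 0),
# so H(y|x) = 0.5*1.0 + 0.5*0 = 0.5:
#
#   calc_condition_ent(np.array([0, 0, 1, 1]), np.array([0, 1, 1, 1]))  # -> 0.5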
def train(train_set, train_label, features, epsilon):
    '''
    features: list of the feature indices still available at this node;
    the chosen feature is removed from it as the subtrees are built.
    '''
    # Step 1: if every instance in train_set belongs to the same class Ck
    label_set = set(train_label)
    if len(label_set) == 1:
        return Tree('leaf', Class=label_set.pop())

    # Find the majority class at this node. The list holds one tuple
    # (class index, count of that class) per class, and max() compares
    # the tuples by their second item.
    max_class, max_len = max(
        [(i, len(list(filter(lambda x: x == i, train_label)))) for i in range(total_class)],
        key=lambda x: x[1])

    # Step 2: if features is empty
    if len(features) == 0:
        return Tree('leaf', Class=max_class)

    # Step 3: compute the information gain of each remaining feature
    max_feature = 0  # feature with the largest information gain
    max_gain = 0  # the largest information gain
    D = train_label
    HD = calc_ent(D)
    for feature in features:  # search for the optimal feature
        A = np.array(train_set[:, feature].flat)  # .flat returns an iterator over the column
        gain = HD - calc_condition_ent(A, D)  # information gain of the current feature
        if gain > max_gain:
            max_gain, max_feature = gain, feature

    # Step 4: stop if the best gain is below the threshold
    if max_gain < epsilon:
        return Tree('leaf', Class=max_class)

    # Step 5: split into non-empty subsets on the optimal feature.
    # The internal node also stores max_class, so prediction can fall back
    # to it when it meets a feature value that never occurred in training.
    sub_features = list(filter(lambda x: x != max_feature, features))
    tree = Tree('internal', Class=max_class, feature=max_feature)

    # Build one subtree per observed value of the optimal feature
    feature_col = np.array(train_set[:, max_feature].flat)
    feature_value_list = set([feature_col[i] for i in range(feature_col.shape[0])])
    for feature_value in feature_value_list:  # iterate over the values of the optimal feature
        index = []  # row indices of the samples carrying this value
        for i in range(len(train_label)):
            if train_set[i][max_feature] == feature_value:
                index.append(i)
        sub_train_set = train_set[index]
        sub_train_label = train_label[index]
        # Recursively build the subtree
        sub_tree = train(sub_train_set, sub_train_label, sub_features, epsilon)
        tree.add_tree(feature_value, sub_tree)
    return tree
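
# A toy end-to-end check (a sketch; the arrays are made up). Feature 0
# separates the two classes perfectly, so train() splits on it once and
# produces two pure leaves:
#
#   X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
#   y = np.array([0, 0, 1, 1])
#   toy_tree = train(X, y, [0, 1], 0.1)
#   predict(X, toy_tree)  # -> array([0, 0, 1, 1])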
def predict(test_set, tree):
    result = []
    for features in test_set:  # iterate over the test samples
        temp_predict = tree.predict(features)
        result.append(temp_predict)
    return np.array(result)

total_class = 10  # ten digit classes, 0-9
if __name__ == '__main__':
    print('Start reading data:')
    time1 = time.time()

    raw_data = pd.read_csv('data/train.csv', header=0)
    data = raw_data.values
    imgs = data[:, 1:]
    labels = data[:, 0]

    # Binarize the images
    features = binaryzation_features(imgs)
    # print(features.shape)  # (42000, 784)

    # 2/3 of the data for training, 1/3 for testing
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.33, random_state=11111)
    # print(train_labels.shape)  # (28140,)
    time2 = time.time()
    print('read data cost %f seconds' % (time2 - time1))

    print('Starting training:')
    tree = train(train_features, train_labels, [i for i in range(784)], 0.1)
    time3 = time.time()
    print('training cost %f seconds' % (time3 - time2))

    print('Starting predicting:')
    test_predict = predict(test_features, tree)
    time4 = time.time()
    print('predicting cost %f seconds' % (time4 - time3))

    accuracy = np.sum(test_labels == test_predict.reshape(len(test_labels))) / len(test_labels)
    print('The accuracy is %f!' % accuracy)
'''
output:
Start reading data:
read data cost 16.684554 seconds
Starting training:
training cost 106.074897 seconds
Starting predicting:
predicting cost 0.081568 seconds
The accuracy is 0.862987!
'''
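
# For a rough cross-check, the same experiment can be run with scikit-learn's
# built-in tree (a sketch, assuming the variables from the main block above;
# the score will differ from the hand-rolled ID3 because sklearn grows a
# binary CART tree rather than one multiway split per feature value):
#
#   from sklearn.tree import DecisionTreeClassifier
#   clf = DecisionTreeClassifier(criterion='entropy')
#   clf.fit(train_features, train_labels)
#   print(clf.score(test_features, test_labels))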