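C4.5算法使用信息增益率(gain ratio)代替ID3的信息增益来选择特征,克服了信息增益在选择特征时偏向于取值个数较多的特征的不足。信息增益率的定义如下: $\mathrm{GainRatio}(D, A) = \dfrac{\mathrm{Gain}(D, A)}{\mathrm{SplitInfo}(D, A)}$,其中 $\mathrm{SplitInfo}(D, A) = -\sum_{v=1}^{V} \dfrac{|D_v|}{|D|} \log_2 \dfrac{|D_v|}{|D|}$,$D_v$ 为特征 $A$ 取第 $v$ 个值时对应的样本子集。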
# -*- coding: utf-8 -*-
from numpy import *
import math
import copy
import cPickle as pickle
class C45DTree(object):
def __init__(self):
    """Create an instance with an empty tree, data set and label list."""
    self.tree = {}  # the generated tree; empty until something fills it
    self.dataSet = []  # data set (replaced wholesale by loadDataSet)
    self.labels = []  # label set (replaced wholesale by loadDataSet)
# 数据导入函数
def loadDataSet(self, path, labels):
    """Load tab-separated records from a file into this instance.

    The file is read in binary mode and cut into lines.  Every line that
    is not empty or whitespace-only is split on tab characters into a list
    of fields.  The resulting list of rows replaces ``self.dataSet`` and
    ``labels`` is stored as-is (not copied) in ``self.labels``.

    Args:
        path: Path of the data file to read.
        labels: Label collection to associate with the loaded records.
            NOTE(review): presumably one name per column; this method
            does not check that -- confirm against callers.

    Returns:
        None.  The results are stored on the instance.

    Raises:
        IOError / OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(path, "rb") as fp:
        content = fp.read()

    # The content comes from a binary-mode read, so the separator is spelled
    # as a bytes literal: identical to "\t" on Python 2, and it avoids a
    # TypeError on Python 3 (where the fields are then bytes objects).
    self.dataSet = [
        row.split(b"\t")
        for row in content.splitlines()
        if row.strip()  # skip lines that are empty or only whitespace
    ]
    self.labels = labels
# 执行决策树函数
def train