The most basic difference between C4.5, the ID3 algorithm discussed earlier, and the CART algorithm introduced later is the criterion used to choose the splitting attribute. ID3 splits on information gain, which has a known flaw: it is biased toward attributes whose branches are very pure, i.e. attributes with many distinct values. A personal ID field is the extreme case: conditioning on it drives the conditional entropy to 0, so its information gain is maximal, yet the split is meaningless and guarantees overfitting. Take care, then, to select sensible features when building and training the model.
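A quick worked check of that claim (the numbers match the 7-sample dataset used below): with 4 positive and 3 negative samples, H(D) = -(4/7)·log2(4/7) - (3/7)·log2(3/7) ≈ 0.985. Splitting on a unique ID yields seven pure one-sample branches, so the conditional entropy is 0 and the information gain is the full 0.985, the largest any attribute can achieve, even though the split generalizes to nothing.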
Compared with ID3, C4.5 has the following advantages:
1. It selects the splitting attribute by information gain ratio instead of raw information gain.
2. It handles missing attribute values to some extent.
3. It proposes a way to split on continuous-valued attributes (see the sketch after this list).
4. It introduces the concept of pruning (not actually implemented here).
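For item 3, the usual C4.5 approach is to binarize a continuous attribute: sort the values, consider a cut point between each pair of adjacent distinct values, and keep the threshold with the best information gain. A minimal self-contained sketch of that idea (entropy and best_threshold are illustrative helper names, not part of the script below):

import math

def entropy(labels):
    # base-2 entropy of a label sequence
    n = len(labels)
    counts = {}
    for y in labels:
        counts[y] = counts.get(y, 0) + 1
    return -sum(c / n * math.log(c / n, 2) for c in counts.values())

def best_threshold(values, labels):
    # candidate cuts: midpoints between adjacent distinct sorted values
    pairs = sorted(zip(values, labels))
    base = entropy(labels)
    best_gain, best_t = 0.0, None
    for j in range(1, len(pairs)):
        if pairs[j - 1][0] == pairs[j][0]:
            continue
        t = (pairs[j - 1][0] + pairs[j][0]) / 2
        left = [y for x, y in pairs if x <= t]
        right = [y for x, y in pairs if x > t]
        cond = (len(left) * entropy(left) + len(right) * entropy(right)) / len(pairs)
        if base - cond > best_gain:
            best_gain, best_t = base - cond, t
    return best_t, best_gain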
2.5.1 Information gain ratio
We know that information gain favors features with many distinct values; using the information gain ratio corrects this bias.
Definition: the information gain ratio GainRatio(D, A) of feature A with respect to training set D is its information gain Gain(D, A) divided by the split information of A, that is, the entropy of D taken over the values of A rather than over the class labels:

GainRatio(D, A) = Gain(D, A) / H_A(D),  where  H_A(D) = -Σ_i (|D_i| / |D|) · log2(|D_i| / |D|)

and D_1, …, D_n are the subsets of D induced by the n distinct values of A.
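As a quick check against the toy dataset below: outlook takes the values sunny, overcast, and rain 2, 2, and 3 times out of 7 samples, so H_outlook(D) = -(2/7)·log2(2/7) - (2/7)·log2(2/7) - (3/7)·log2(3/7) ≈ 1.557, and the gain ratio of outlook is its information gain divided by 1.557. A many-valued attribute such as an ID would have a very large H_A(D), which is exactly what shrinks its gain ratio.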
Code implementation
# _*_ coding:utf-8 _*_
import numpy as np
import math
import operator
def createDataSet():
    """
    outlook-> 0: sunny | 1: overcast | 2: rain
    temperature-> 0: hot | 1: mild | 2: cool
    humidity-> 0: high | 1: normal
    windy-> 0: false | 1: true
    """
    dataSet = np.array([[0, 0, 0, 0, 'N'],
                        [0, 0, 0, 1, 'N'],
                        [1, 0, 0, 0, 'Y'],
                        [2, 1, 0, 0, 'Y'],
                        [2, 2, 1, 0, 'Y'],
                        [2, 2, 1, 1, 'N'],
                        [1, 2, 1, 1, 'Y']])
    labels = ['outlook', 'temperature', 'humidity', 'windy']
    return dataSet, labels
def createTestSet():
    """
    outlook-> 0: sunny | 1: overcast | 2: rain
    temperature-> 0: hot | 1: mild | 2: cool
    humidity-> 0: high | 1: normal
    windy-> 0: false | 1: true
    """
    testSet = np.array([[0, 1, 0, 0],
                        [0, 2, 1, 0],
                        [2, 1, 1, 0],
                        [0, 1, 1, 1],
                        [1, 1, 0, 1],
                        [1, 0, 1, 0],
                        [2, 1, 0, 1]])
    return testSet
# empirical entropy H(D) of the class-label column, in bits (base-2 log)
def cal_ent(data_set):
    num = len(data_set)
    label_count = {}
    for label in data_set[:, -1]:
        label_count[label] = label_count.get(label, 0) + 1
    ret = 0.0
    for v in label_count.values():
        prob = v / num
        ret -= prob * math.log(prob, 2)
    return ret
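As a sanity check (an addition, not part of the original script), the same quantity can be computed with numpy in a few vectorized lines; on the seven training labels (4 'Y', 3 'N') it evaluates to roughly 0.985:

def cal_ent_np(dataset):
    # vectorized equivalent of cal_ent: count each class, then sum -p * log2(p)
    _, counts = np.unique(dataset[:, -1], return_counts=True)
    probs = counts / counts.sum()
    return float(-(probs * np.log2(probs)).sum())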
# split information (intrinsic value) of feature i: the entropy of the
# feature's own value distribution, used as the denominator of the gain ratio
def cal_split_info(dataset, i):
    samples = [sample[i] for sample in dataset]
    num = len(dataset)
    value_count = {}
    for s in samples:
        value_count[s] = value_count.get(s, 0) + 1
    split_info = 0.0
    for v in value_count.values():
        split_info -= (v / num) * math.log(v / num, 2)   # base 2, matching cal_ent
    return split_info
# pick the feature with the highest information gain ratio
def chooseBestFeature(data_set):
    numfeatures = len(data_set[0]) - 1
    base_entropy = cal_ent(data_set)
    best_gain_ratio = 0.0
    best_index = -1
    for i in range(numfeatures):
        values = set(example[i] for example in data_set)
        new_entropy = 0.0
        for val in values:
            subDataset = split(data_set, i, val)
            prob = len(subDataset) / float(len(data_set))
            new_entropy += prob * cal_ent(subDataset)
        info_gain = base_entropy - new_entropy
        split_info = cal_split_info(data_set, i)
        if split_info == 0:   # the feature takes a single value: splitting on it is useless
            continue
        gain_ratio = info_gain / split_info
        if gain_ratio > best_gain_ratio:
            best_gain_ratio = gain_ratio
            best_index = i
    return best_index
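A side note on fidelity to Quinlan's C4.5 (not implemented above): the full algorithm compares gain ratios only among attributes whose information gain is at least the average over all attributes, which prevents an attribute with a tiny split information from winning on the ratio alone. For this small dataset the simpler loop above, with its split_info == 0 guard, is sufficient.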
# rows of dataset whose i-th feature equals value, with that column removed
def split(dataset, i, value):
    retDataset = []
    for featVec in dataset:
        if featVec[i] == value:
            reduceFeatVec = list(featVec[:i])     # columns before i
            reduceFeatVec.extend(featVec[i+1:])   # columns after i
            retDataset.append(reduceFeatVec)
    return np.array(retDataset)
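One gotcha worth a quick check: createDataSet mixes integers and strings in a single numpy array, so numpy stores every cell as text, and the match value passed to split must therefore be a string:

dataSet, labels = createDataSet()
print(split(dataSet, 0, '0'))   # '0', not 0: keeps the two sunny rows and drops the outlook column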
# majority vote over the remaining class labels, used when no features are left
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortdic = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortdic[0][0]   # the most frequent class label
def createTree(dataset, labels, featLabels):
    classList = [example[-1] for example in dataset]
    # all samples share one class: return it as a leaf
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # only the class column is left: fall back to a majority vote
    if len(dataset[0]) == 1:
        return majorityCnt(classList)
    best_index = chooseBestFeature(dataset)
    bestLabel = labels[best_index]
    featLabels.append(bestLabel)
    mytree = {bestLabel: {}}
    # drop the used feature from a fresh list, so sibling branches are unaffected
    sub_labels = labels[:best_index] + labels[best_index+1:]
    featValues = [example[best_index] for example in dataset]
    for value in set(featValues):
        mytree[bestLabel][value] = createTree(split(dataset, best_index, value),
                                              sub_labels, featLabels)
    return mytree
if __name__ == '__main__':
    data_set, labels = createDataSet()
    featLabels = []
    mytree = createTree(data_set, labels, featLabels)
    print(mytree)
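createTestSet is defined above but never used. To round the script off, here is a minimal classification sketch (classify is an addition, not part of the original code; it assumes the tree keys are the string values stored by the training array, hence the str() conversion, and the function should sit above the __main__ block):

def classify(tree, labels, testVec):
    # walk the nested dict: each internal node is {feature_name: {value: subtree_or_leaf}}
    node_label = next(iter(tree))
    feat_index = labels.index(node_label)            # index in the original feature order
    branch = tree[node_label].get(str(testVec[feat_index]))
    if isinstance(branch, dict):
        return classify(branch, labels, testVec)
    return branch   # a class label, or None for a feature value unseen during training

# then, at the end of the __main__ block:
# for row in createTestSet():
#     print(row, '->', classify(mytree, ['outlook', 'temperature', 'humidity', 'windy'], row))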
Sticking with something may be hard, but seeing it through is definitely cool! ^_^