相对ID3,改为判断ratio
for value in uniqueVals: #计算每种划分方式的信息熵
subdataset=splitdataset(dataset,i,value)
p=len(subdataset)/float(len(dataset))
newEnt+=p*jisuanEnt(subdataset)
IV=IV-p*log(p,2)
infoGain=baseEnt-newEnt
if (IV == 0): # fix the overflow bug
continue
infoGain_ratio = infoGain / IV #这个feature的infoGain_ratio
或者
for value in uniqueVals:
subDataSet = splitDataSet(dataSet, i, value)
prob = len(subDataSet) / float(len(dataSet))
newEntropy += prob *calcShannonEntOfFeature(subDataSet, -1) #calc conditional entropy
infoGain = baseEntropy - newEntropy
iv = calcShannonEntOfFeature(dataSet, i)
if(iv == 0): #value of the feature is all same,infoGain and iv all equal 0, skip the feature
continue
infoGainRate = infoGain / iv
# calc shannon entropy of label or feature
def calcShannonEntOfFeature(dataSet, feat):
    """Return the Shannon entropy (base 2) of column `feat` of dataSet.

    dataSet: list of example vectors (lists/tuples).
    feat: column index whose value distribution is measured;
          -1 selects the last column (the class label).
    Returns 0.0 for an empty dataSet or a single-valued column.
    """
    numEntries = len(dataSet)
    labelCounts = {}
    for feaVec in dataSet:
        currentLabel = feaVec[feat]
        # idiomatic counting: one lookup instead of membership test + increment
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

被折叠的评论
为什么被折叠?



