# -*- coding: utf-8 -*-
"""
Created on Tue Sep 08 10:10:15 2015
@author: Administrator
"""
import operator #为了能够处理叶子节点的类标签仍然不唯一的情况。
from math import log
#计算训练数据集的香农熵
def calcShannonEnt(dataSet):
numEntries = len(dataSet)
labelCounts = {}
for featVec in dataSet:
currentLabel = featVec[-1]
if currentLabel not in labelCounts.keys():
labelCounts[currentLabel] = 0 #这里不能为1,因为后一语句是一定执行的。
labelCounts[currentLabel] += 1
shannonEnt = 0.0
for key in labelCounts:
prob = float(labelCounts[key])/numEntries
shannonEnt -= prob * log(prob, 2)
return shannonEnt
#创建训练数据集
def createDataSet():
dataSet = [[1, 1, 'yes'],
[1, 1, 'yes'],
[1, 0, 'no'],
[0, 1, 'no'],
[0, 1, 'no']]
labels = ['no surfacing',