Decision Trees
http://mayuxiang.sinaapp.com/?p=155
http://www.engr.uvic.ca/~seng474/
#!/usr/bin/python
#coding:utf-8
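# Sample user data: each row appears to be
# [referrer, country, read FAQ (yes/no), pages viewed, service signed up for];
# the last column is the class label the tree will predict.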
my_data=[['slashdot','USA','yes',18,'None'],
         ['google','France','yes',23,'Premium'],
         ['digg','USA','yes',24,'Basic'],
         ['kiwitobes','France','yes',23,'Basic'],
         ['google','UK','no',21,'Premium'],
         ['(direct)','New Zealand','no',12,'None'],
         ['(direct)','UK','no',21,'Basic'],
         ['google','USA','no',24,'Premium'],
         ['slashdot','France','yes',19,'None'],
         ['digg','USA','no',18,'None'],
         ['google','UK','no',18,'None'],
         ['kiwitobes','UK','no',19,'None'],
         ['digg','New Zealand','yes',12,'Basic'],
         ['slashdot','UK','no',21,'None'],
         ['google','UK','yes',18,'Basic'],
         ['kiwitobes','France','yes',19,'Basic']]
class decisionnode:
    def __init__(self,col=-1,value=None,results=None,tb=None,fb=None):
        self.col=col          # index of the column this node tests
        self.value=value      # value the column is compared against
        self.results=results  # class counts; set only for leaf nodes
        self.tb=tb            # branch followed when the test is true
        self.fb=fb            # branch followed when the test is false
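# For illustration, a leaf only carries class counts while an internal node
# carries a test plus two branches (the values below are made up):
#   leaf_yes = decisionnode(results={'Basic': 4})
#   leaf_no  = decisionnode(results={'None': 3})
#   node = decisionnode(col=0, value='google', tb=leaf_yes, fb=leaf_no)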
# Divide a set of rows on a specific column; handles both numeric and
# nominal (non-numeric) values.
def divideset(rows,column,value):
    # Build a function that decides whether a row belongs to the first
    # group (True) or the second group (False)
    if isinstance(value,int) or isinstance(value,float):
        # Numeric values are split with "greater than or equal to"
        split_function=lambda row:row[column]>=value
    else:
        # Nominal values are split on equality
        split_function=lambda row:row[column]==value
    # Collect the rows into the two sets and return them
    set1=[row for row in rows if split_function(row)]
    set2=[row for row in rows if not split_function(row)]
    return (set1,set2)
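# Example: divideset(my_data,2,'yes') splits on the 'read FAQ' column, putting
# the 8 'yes' rows into set1 and the 8 'no' rows into set2.  A numeric value
# uses the >= test instead, e.g. divideset(my_data,3,20) puts the rows with 20
# or more pages viewed into set1.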
# Count how often each result (the last column, e.g. 'None', 'Basic',
# 'Premium') appears in a set of rows
def uniquecounts(rows):
    classcounts = {}
    for row in rows:
        # only the last column, the class column, matters here
        theclass = row[-1]
        if theclass not in classcounts: classcounts[theclass]=0
        classcounts[theclass]+=1
    return classcounts
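# Example: uniquecounts(my_data) counts 7 'None', 3 'Premium' and 6 'Basic'
# among the 16 sample rows.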
# Entropy: H = -sum over classes x of p(x)*log2(p(x)).  Lower entropy means a
# purer (less mixed) set, so a split that lowers it is a better split.
def entropy(rows):
    from math import log
    log2=lambda x:log(x)/log(2)
    results=uniquecounts(rows)  # counts of each class in this set
    # Now calculate the entropy
    ent=0.0
    for r in results.keys():
        p=float(results[r])/len(rows)
        ent=ent-p*log2(p)
    return ent
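# Worked example: my_data has class counts 7 'None', 3 'Premium', 6 'Basic'
# out of 16 rows, so
#   entropy(my_data) = -(7/16)*log2(7/16) - (3/16)*log2(3/16) - (6/16)*log2(6/16)
# which is about 1.505.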
# treepredict.divideset(treepredict.my_data,2,'yes')
def buildtree(rows,scoref=entropy):
    if len(rows)==0: return decisionnode()
    # Set up some variables to track the best split found so far
    lowest_impurity = scoref(rows)
    best_split = None
    best_sets = None
    column_count = len(rows[0])-1   # the last column is the class, not a feature
    for col in range(0,column_count):
        # Generate the list of different values in this column
        column_values = {}
        for row in rows: column_values[row[col]] = 1
        # Now try dividing the rows up on each value in this column
        for value in column_values.keys():
            (set1,set2) = divideset(rows,col,value)
            # Weighted average impurity of the two subsets
            exp_impurity = float(len(set1))/len(rows)*scoref(set1) + float(len(set2))/len(rows)*scoref(set2)
            if exp_impurity < lowest_impurity and len(set1)>0 and len(set2)>0:
                lowest_impurity = exp_impurity
                best_split = (col,value)
                best_sets = (set1,set2)
    # If some split reduces the impurity, recurse into the two subsets;
    # otherwise this node becomes a leaf holding the class counts
    if lowest_impurity < scoref(rows):
        trueBranch = buildtree(best_sets[0],scoref)
        falseBranch = buildtree(best_sets[1],scoref)
        return decisionnode(col=best_split[0],value=best_split[1],
                            tb=trueBranch, fb=falseBranch)
    else:
        return decisionnode(results=uniquecounts(rows))
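# buildtree takes the impurity measure as the scoref parameter, so entropy is
# not the only choice.  A minimal sketch of the Gini impurity (defined here
# only for illustration; it is not used elsewhere in this listing) that can be
# passed in the same way:
def giniimpurity(rows):
    # Probability that two randomly drawn rows have different classes:
    # 1 - sum over classes of p(class)^2
    total = len(rows)
    if total == 0: return 0
    counts = uniquecounts(rows)
    imp = 1.0
    for c in counts:
        p = float(counts[c])/total
        imp -= p*p
    return imp
# Usage: buildtree(my_data, scoref=giniimpurity)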
# Print the tree as indented text: a leaf shows its class counts, an internal
# node shows "column:value?" followed by its true (T->) and false (F->) branches
def printtree(tree, indent=' '):
    if tree.results is not None:
        # Leaf node
        print tree.results
    else:
        # Print the test for this node
        print str(tree.col) + ':' + str(tree.value) + '?'
        # Print the branches, indenting one level further each time
        print indent+'T->',
        printtree(tree.tb, indent+' ')
        print indent+'F->',
        printtree(tree.fb, indent+' ')
#print divideset(my_data,2,'yes')
print uniquecounts(my_data)
print entropy(my_data)
tree = buildtree(my_data)
printtree(tree)
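# To actually use the tree for prediction, walk a new observation down from
# the root, applying the same >= / == tests that divideset used.  A minimal
# sketch of such a classify helper (not defined in the listing above):
def classify(observation, tree):
    if tree.results is not None:
        # Reached a leaf: return the class counts stored there
        return tree.results
    v = observation[tree.col]
    if isinstance(v, int) or isinstance(v, float):
        branch = tree.tb if v >= tree.value else tree.fb
    else:
        branch = tree.tb if v == tree.value else tree.fb
    return classify(observation, branch)

# Example: the observation has the same first four columns as the training
# rows (referrer, country, read FAQ, pages viewed); the printed dict is the
# class counts of the leaf it ends up in.
print classify(['(direct)', 'USA', 'yes', 5], tree)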