I have always used tools like R and SAS for machine learning. When faced with large datasets, however, these tools often fall short: just reading the file in can exhaust the machine's memory. Being able to implement the algorithms yourself matters, because it gives you flexibility. In text mining, for example, you usually work with sparse matrices; R does offer sparse-matrix operations, but they do not always fit your needs. So I chose to learn how to implement machine learning algorithms in Python by working through 《机器学习实战》 (Machine Learning in Action).
The code below follows the book closely. Using the same approach I also implemented a C++ version of the ID3 algorithm, which I will post later.
import math
dataset = [
("青年", "否", "否", "一般", "否")
,("青年", "否", "否", "好", "否")
,("青年", "是", "否", "好", "是")
,("青年", "是", "是", "一般", "是")
,("青年", "否", "否", "一般", "否")
,("中年", "否", "否", "一般", "否")
,("中年", "否", "否", "好", "否")
,("中年", "是", "是", "好", "是")
,("中年", "否", "是", "非常好", "是")
,("中年", "否", "是", "非常好", "是")
,("老年", "否", "是", "非常好", "是")
,("老年", "否", "是", "好", "是")
,("老年", "是", "否", "好", "是")
,("老年", "是", "否", "非常好", "是")
,("老年", "否", "否", "一般", "否")
]
labels=['var1','var2','var3','var4']
data=[[1,1,'yes'],
[1,1,'yes'],
[1,0,'no'],
[0,1,'no'],
[0,1,'no']]
la=['no surfacing','flippes']
#Compute the information entropy; target is the column index of the class label
def entropy(dataset,target=-1):
    instance_num=len(dataset)
    tar_num={}
    entro=0.0
    for t in dataset:
        tar_num[t[target]]=tar_num.setdefault(t[target],0)+1
    pl=[n/instance_num for n in tar_num.values()]
    for p in pl:
        entro-=p*math.log2(p)
    return entro
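# Quick sanity check (my own, not from the book): the `dataset` defined above has
# 9 "是" and 6 "否" class labels, so entropy(dataset) should be about
# -(9/15)*log2(9/15) - (6/15)*log2(6/15) ≈ 0.971.
#print(entropy(dataset))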
###Split the dataset: keep the rows where feature f_index equals value, then drop that feature column
def split_dataset(dataset,f_index,value):
    sub_set=[]
    for row in dataset:
        row=list(row)
        if row[f_index]==value:
            # use del, not remove(): remove() deletes the first occurrence of the
            # value, which may be a different column holding the same value
            del row[f_index]
            sub_set.append(row)
    return sub_set
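# Example usage (my own illustration): keep only the rows of `dataset` whose first
# feature is "青年" and drop that column; this should return 5 four-element rows.
#print(split_dataset(dataset,0,"青年"))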
###Compute the information gain of the feature at feature_index
def info_gain(dataset,feature_index):
    entropy_d=entropy(dataset)
    long_dataset=len(dataset)
    feature_count={}
    sub_entropy=0.0
    for row in dataset:
        feature_count[row[feature_index]]=feature_count.setdefault(row[feature_index],0)+1
    for k,v in feature_count.items():
        feature_p=v/long_dataset
        subset=split_dataset(dataset,feature_index,k)
        sub_entropy+=feature_p*entropy(subset)
    ig=entropy_d-sub_entropy
    return ig
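# Example (my own calculation, using the `dataset` above): feature 2 splits the
# 15 rows into 6 "是" rows (all positive) and 9 "否" rows (3 positive, 6 negative),
# so info_gain(dataset,2) should come out to about 0.971 - (9/15)*0.918 ≈ 0.420.
#print(info_gain(dataset,2))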
##Find the feature with the largest information gain
def find_bestfeature(dataset,label):
    max_ig=0.0
    best_feature,best_index=label[0],0
    for label_index,la in enumerate(label):
        ig=info_gain(dataset,label_index)
        if ig>max_ig:
            max_ig=ig
            best_feature=la
            best_index=label_index
    return best_feature,best_index
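# Example (my own check, using the `dataset` and `labels` above): feature 2 ('var3')
# has the largest gain (≈0.420 versus ≈0.083, 0.324 and 0.363 for the others), so
# find_bestfeature(dataset,labels) should return ('var3', 2).
#print(find_bestfeature(dataset,labels))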
###Return the most frequent class value
def max_count(dataset):
    if len(dataset[0])==1:
        class_count={}
        for row in dataset:
            class_count[row[0]]=class_count.setdefault(row[0],0)+1
        sorted_count=sorted(class_count.items(),key=lambda x:x[1],reverse=True)
        return sorted_count[0][0]
    else:
        return None
###Recursively build the decision tree (create_tree)
def create_tree(dataset,label):
    class_list=[x[-1] for x in dataset]
    #all instances share the same class: return it as a leaf
    if class_list.count(class_list[0])==len(class_list):
        return class_list[0]
    #no features left (only the class column remains): return the majority class
    if len(dataset[0])==1:
        return max_count(dataset)
    best_feature,index=find_bestfeature(dataset,label)
    tree={best_feature:{}}
    feature_list=set(x[index] for x in dataset)
    label.remove(best_feature)
    for ft in feature_list:
        sublabel=label[:]
        subset=split_dataset(dataset,index,ft)
        tree[best_feature][ft]=create_tree(subset,sublabel)
    return tree
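# Example (my own run-through, using the `dataset` above): 'var3' is chosen at the
# root (every row with 'var3' == "是" is positive), and the "否" branch splits again
# on 'var2', giving roughly
#   {'var3': {'是': '是', '否': {'var2': {'是': '是', '否': '否'}}}}.
# Note that create_tree mutates the label list, so pass a copy:
#print(create_tree(list(dataset),labels[:]))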
def classfy(inputTree,featlabels,testVec):
    #the key at each level of the nested dict is the feature to test next
    firstStr=list(inputTree.keys())[0]
    secondDict=inputTree[firstStr]
    featIndex=featlabels.index(firstStr)
    classLabel=None
    for key in secondDict.keys():
        if testVec[featIndex]==key:
            if type(secondDict[key]).__name__=='dict':
                classLabel=classfy(secondDict[key],featlabels,testVec)
            else:
                classLabel=secondDict[key]
    return classLabel
mytree=create_tree(data,la)
test=classfy(mytree,['no surfacing','flippes'],[0,1])
print(test)
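# With the small `data`/`la` example the tree should come out as
# {'no surfacing': {0: 'no', 1: {'flippes': {0: 'no', 1: 'yes'}}}} (my own check),
# and the test vector [0,1] falls into the 'no surfacing' == 0 branch, so this prints 'no'.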