from numpy import *
# Build the vocabulary list
def creatvocablist(dataset):
    """Return every distinct token in *dataset*, in first-seen order.

    Args:
        dataset: An iterable of samples, each sample being an iterable of
            hashable tokens.

    Returns:
        A list containing each distinct token exactly once. Tokens appear in
        the order in which they are first encountered, so the result (and
        therefore the column layout of vectors built from it) is the same on
        every run instead of depending on set iteration order.
    """
    seen = set()
    vocab = []
    for line in dataset:
        for token in line:
            # The set gives O(1) duplicate detection; the list keeps order.
            if token not in seen:
                seen.add(token)
                vocab.append(token)
    return vocab
# Convert an input sample into a fixed-length count vector
def wordtovector(lineset, vocablist):
    """Count how often each vocabulary token occurs in *lineset*.

    Args:
        lineset: An iterable of tokens describing one sample.
        vocablist: The vocabulary as a list; position ``k`` of the result
            corresponds to ``vocablist[k]``. Tokens must be hashable, which
            always holds for vocabularies produced by ``creatvocablist``.

    Returns:
        A list of ints with ``len(vocablist)`` entries. Tokens of *lineset*
        that are not in the vocabulary are ignored.
    """
    # Build the token -> column lookup once instead of scanning the
    # vocabulary twice (membership test + index) for every token.
    # setdefault keeps the first position of a repeated vocabulary entry,
    # matching what list.index would return.
    column_of = {}
    for column, word in enumerate(vocablist):
        column_of.setdefault(word, column)

    vector = [0] * len(vocablist)
    for token in lineset:
        column = column_of.get(token)
        if column is not None:
            vector[column] += 1
    return vector
# Train the naive Bayes model
def trainbyes(traindata, labels):
    """Estimate smoothed log-likelihoods and the positive-class prior.

    A label equal to ``1`` marks the positive class; every other label value
    is treated as the negative class.

    Args:
        traindata: A non-empty sequence of equal-length numeric count
            vectors, one per training sample.
        labels: A sequence with exactly one label per training vector.

    Returns:
        A tuple ``(p_1, p_i, p1)`` where ``p_1`` and ``p_i`` are NumPy arrays
        holding the per-column log-probabilities for the positive and the
        negative class, and ``p1`` is the fraction of samples labelled ``1``.

    Raises:
        ValueError: If *traindata* is empty or not a 2-D table, or if
            *labels* does not have one entry per training vector.
    """
    features = asarray(traindata, dtype=float)
    if features.ndim != 2 or features.shape[0] == 0:
        raise ValueError(
            "traindata must be a non-empty sequence of equal-length vectors"
        )

    # Compare element by element in Python so any sequence type is accepted
    # and non-numeric labels simply count as "not 1".
    is_positive = asarray([lab == 1 for lab in labels])
    if is_positive.shape != (features.shape[0],):
        raise ValueError(
            "labels must contain exactly one entry per training vector"
        )

    positive_rows = features[is_positive]
    negative_rows = features[~is_positive]
    p1 = positive_rows.shape[0] / features.shape[0]

    # Smoothing: every column count starts at 1 and every denominator at 2.0,
    # so no probability is zero and log() stays finite. The denominator grows
    # by one per sample of the class (not by the number of tokens).
    p_1 = log((positive_rows.sum(axis=0) + 1.0) / (positive_rows.shape[0] + 2.0))
    p_i = log((negative_rows.sum(axis=0) + 1.0) / (negative_rows.shape[0] + 2.0))
    return p_1, p_i, p1
# 测试算法
def testbyes(testVector,p_1,p_i,p1):
y1=sum(p_1*testVector)+log(p1)
yi=sum(p_i*testVector)+log(1-p1)
if y1>yi:
return 1
else:
return -1
if __name__ == "__main__":
    # Toy training set: every sample is a two-element list holding a number
    # and a size letter; the labels are +1 / -1, one per sample.
    samples = [
        [1, 'S'], [1, 'M'], [1, 'M'], [1, 'S'], [1, 'S'],
        [2, 'S'], [2, 'M'], [2, 'M'], [2, 'L'], [2, 'L'], [2, 'L'],
        [3, 'M'], [3, 'M'], [3, 'L'], [3, 'L'],
    ]
    sample_labels = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]

    # Build the vocabulary and turn every sample into a count vector.
    vocabulary = creatvocablist(samples)
    print(vocabulary)
    train_vectors = [wordtovector(sample, vocabulary) for sample in samples]
    print(train_vectors)

    # Fit the model and show the learned parameters.
    log_like_pos, log_like_neg, prior_pos = trainbyes(train_vectors, sample_labels)
    print(log_like_pos, log_like_neg, prior_pos)

    # Classify one query sample.
    query = [2, 'M']
    query_vector = wordtovector(query, vocabulary)
    print(query_vector)
    predicted = testbyes(query_vector, log_like_pos, log_like_neg, prior_pos)
    print(predicted)
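# This program is adapted from the naive Bayes application in "Machine Learning
# in Action", with some modifications and adjustments. For the underlying
# theory, see the naive Bayes chapter of Li Hang's machine learning book.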