Bayes' theorem:
from numpy import array, ones, log
import random
Create the dataset and the class labels
def loadData():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 0 = normal post, 1 = abusive post
    return postingList, classVec
Build a vocabulary list from the dataset above. The input is the postingList dataset; the output is a vocabulary list with no duplicate words.
def createVocaList(dataset):
    vocabset = set([])
    for document in dataset:
        vocabset = vocabset | set(document)  # set union removes duplicate words
    return list(vocabset)
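A quick check of the vocabulary builder (a minimal sketch, reusing loadData from above):

listOPosts, listclass = loadData()
myVocabList = createVocaList(listOPosts)
print(myVocabList)  # every word from the six posts, each listed exactly once (set order is arbitrary)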
Map a single sample onto the vocabulary list, recording which vocabulary words appear in that sample
def setOfWords2Vec(vocabList, inputset):
    returnVec = [0] * len(vocabList)  # zero vector with the same length as the vocabulary
    for word in inputset:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1  # mark 1 for each vocabulary word that appears
        else:
            print("the word:{0} is not in my vocabulary".format(word))
    return returnVec
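A short demonstration of the mapping on the toy dataset (a sketch, reusing the functions above):

listOPosts, listclass = loadData()
myVocabList = createVocaList(listOPosts)
vec = setOfWords2Vec(myVocabList, listOPosts[0])
print(sum(vec))  # 7: all seven distinct words of the first post are in the vocabulary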
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)  # number of training documents; 6 here
    numWords = len(trainMatrix[0])   # length of each word vector, i.e. the vocabulary size
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # prior probability of the abusive class
    p0Num = ones(numWords)  # initialize word counts to 1 so no conditional probability is 0
    p1Num = ones(numWords)
    p0Denom = 2.0  # initialize the denominators to 2 for the same reason
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:  # accumulate word statistics for class 1
            p1Num += trainMatrix[i]  # each word present in this document adds 1 to its class-1 count
            p1Denom += sum(trainMatrix[i])  # the class-1 word total grows by this document's word count
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = p1Num / p1Denom
    p1Vect = [log(x) for x in p1Vect]  # p1Vect holds the log conditional probabilities log P(wi|c=1)
    p0Vect = p0Num / p0Denom
    p0Vect = [log(x) for x in p0Vect]  # p0Vect holds log P(wi|c=0)
    return p0Vect, p1Vect, pAbusive
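A quick look at the training output on the toy data (a sketch using the functions above):

listOPosts, listclass = loadData()
myVocabList = createVocaList(listOPosts)
trainMat = [setOfWords2Vec(myVocabList, post) for post in listOPosts]
p0V, p1V, pAb = trainNB0(array(trainMat), array(listclass))
print(pAb)  # 0.5: three of the six training posts are labeled abusive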
Classify a sample with the trained Naive Bayes model
def classifyNB(vec2classify, p0Vec, p1Vec, pclass1):
    p1 = sum(vec2classify * p1Vec) + log(pclass1)  # log posterior (up to a shared constant) for class 1
    p0 = sum(vec2classify * p0Vec) + log(1.0 - pclass1)  # log posterior for class 0
    if p1 > p0:
        return 1
    else:
        return 0
Here p1 and p0 stand for ln[p(w1|c=1)p(w2|c=1)...p(wn|c=1)*p(c=1)] and ln[p(w1|c=0)p(w2|c=0)...p(wn|c=0)*p(c=0)]. The logarithm is taken because the product p(w1|c=1)p(w2|c=1)...p(wn|c=1) multiplies many factors smaller than 1, and the result would otherwise underflow to zero.
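A tiny illustration of the underflow problem, independent of the classifier (the numbers are made up for demonstration):

probs = [0.01] * 200
product = 1.0
for p in probs:
    product *= p  # 0.01**200 = 1e-400 underflows to 0.0 in double precision
logSum = sum(log(p) for p in probs)  # summing the logs stays finite
print(product, logSum)  # prints 0.0 and roughly -921.03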
Test the classification model
def testingNB():
    listOPosts, listclass = loadData()  # load the dataset and the class labels
    myVocabList = createVocaList(listOPosts)  # build the vocabulary
    trainMat = []
    for postinDoc in listOPosts:  # map every sample onto the vocabulary
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listclass))  # estimate the conditional probabilities
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print("testEntry classified as:{0}".format(classifyNB(thisDoc, p0V, p1V, pAb)))
    testEntry = ['stupid', 'garbage', 'quit']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print("testEntry classified as:{0}".format(classifyNB(thisDoc, p0V, p1V, pAb)))
Spam filtering test with Naive Bayes
def textParse(bigstring):
    import re  # regular expression module
    listOfTokens = re.split(r'\W+', bigstring)  # split on runs of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]  # keep only tokens longer than two characters, lowercased
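For example (the sample sentence is only for illustration):

print(textParse('This book is the best book on Python I have ever laid eyes upon.'))
# ['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'laid', 'eyes', 'upon']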
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):  # load 25 spam emails and 25 ham emails
        wordList = textParse(open('spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocaList(docList)
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):  # randomly hold out ten emails as the test set
        randIndex = int(random.uniform(0, len(trainingSet)))  # pick a random index; that document joins the test set
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]  # remove the chosen document from the training set
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print("the error rate is {0}".format(float(errorCount) / len(testSet)))