from sklearn.naive_bayes import MultinomialNB,GaussianNB
import numpy as np
# MultinomialNB: multinomial naive Bayes
# GaussianNB: Gaussian naive Bayes
import re
def creatVocabList(wordList):
    """Build the vocabulary of distinct tokens found across all documents.

    Args:
        wordList: Iterable of documents, each an iterable of hashable,
            mutually orderable tokens (e.g. all strings).

    Returns:
        A list containing every distinct token exactly once, in sorted
        order. Sorting keeps the position of each token stable between
        runs; the iteration order of a set of strings is not.
    """
    # set().union(*docs) merges every document in a single call and yields
    # an empty set when there are no documents at all.
    return sorted(set().union(*wordList))
def textParse(bigString):
    """Split raw text into lowercase tokens longer than two characters.

    Args:
        bigString: The text to tokenize.

    Returns:
        A list of lowercased tokens, in order of appearance, keeping only
        those whose length is greater than 2.
    """
    # Raw string so the regex escape reaches `re` untouched; `\W+` consumes a
    # whole run of separators at once instead of emitting empty tokens
    # between adjacent separators.
    tokens = re.split(r'\W+', bigString)
    return [token.lower() for token in tokens if len(token) > 2]
wordList = []   # token list of every loaded document
classList = []  # class label per document: 1 for files under spam\, 0 for files under health\
# Load the files numbered 1..25 from each folder, alternating spam / health so
# that wordList[k] and classList[k] always describe the same document.
for i in range(1, 26):
    # Raw strings keep the Windows backslashes literal (a plain '\U...' is an
    # invalid unicode escape); `with` closes each file as soon as it is read.
    with open(r'C:\Users\Administrator\Desktop\emailEnglish\spam\%d.txt' % i) as spam_file:
        wordList_s = textParse(spam_file.read())
    wordList.append(wordList_s)
    classList.append(1)
    with open(r'C:\Users\Administrator\Desktop\emailEnglish\health\%d.txt' % i) as health_file:
        wordList_h = textParse(health_file.read())
    wordList.append(wordList_h)
    classList.append(0)
def setOfWords2Vec(vocabList, words):
    """Encode a document as a binary set-of-words vector over a vocabulary.

    Args:
        vocabList: Sequence of vocabulary tokens; position i of the result
            corresponds to vocabList[i].
        words: Iterable of tokens making up the document to encode.

    Returns:
        A list of len(vocabList) integers where entry i is 1 if vocabList[i]
        occurs in ``words`` and 0 otherwise. Tokens that are not in the
        vocabulary are ignored.
    """
    # Build the token -> column lookup once instead of scanning the
    # vocabulary for every word; setdefault keeps the first position of a
    # repeated token, matching list.index().
    column = {}
    for idx, term in enumerate(vocabList):
        column.setdefault(term, idx)
    wordVec = [0] * len(vocabList)
    for word in words:
        idx = column.get(word)
        if idx is not None:
            wordVec[idx] = 1
    return wordVec


trainMat = []
wordLists = creatVocabList(wordList)  # vocabulary built from all loaded documents
for words in wordList:
    # Vocabulary first, document second, so every row has one column per
    # vocabulary token.
    trainMat.append(setOfWords2Vec(wordLists, words))
m1 = MultinomialNB()
m1 = m1.fit(trainMat, classList)
# Raw string: in a plain literal '\2' is an octal escape and '\U' an invalid
# unicode escape, so the path would not name the intended file.
with open(r'C:\Users\Administrator\Desktop\emailEnglish\health\2.txt') as sample_file:
    x = textParse(sample_file.read())
x = np.array(setOfWords2Vec(wordLists, x))
# predict() expects a 2-D array, so present the single sample as one row.
print(m1.predict(x.reshape(1, -1)))