# File: email.py
import numpy as np
import random
import re
"""
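Description: split one large string into a list of lowercase word tokens
Parameters:
bigString - the input string
Returns:
list of tokens, lowercased; tokens of two characters or fewer are dropped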
"""
def textParse(bigString):
    """Split a large string into a list of lowercase word tokens.

    Parameters:
        bigString - the string to tokenize
    Returns:
        list of tokens, lowercased; tokens of two characters or fewer
        are dropped
    """
    # Break the text on every run of non-word characters.
    pieces = re.split(r'\W+', bigString)
    tokens = []
    for piece in pieces:
        # Skip very short tokens (this also discards the empty strings
        # that re.split yields at the edges of the text).
        if len(piece) > 2:
            tokens.append(piece.lower())
    return tokens
"""
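Description: build a vocabulary of the unique tokens found across the tokenized sample documents
Parameters:
dataSet - the collection of tokenized sample documents
Returns:
vocabSet - the vocabulary, returned as a list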
"""
def createVocabList(dataSet):
    """Collect the unique tokens of all documents into a vocabulary list.

    Parameters:
        dataSet - iterable of tokenized documents (each an iterable of tokens)
    Returns:
        list holding every distinct token once; the order comes from set
        iteration and is therefore unspecified
    """
    vocabulary = set()
    for tokens in dataSet:
        # Merge this document's tokens into the running vocabulary.
        vocabulary.update(tokens)
    return list(vocabulary)
"""
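Description: vectorize inputSet against the vocabulary vocabList; each element of the vector is 1 or 0
Parameters:
vocabList - the vocabulary
inputSet - the tokenized document
Returns:
returnVec - the document vector (set-of-words model)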
"""
def setOfWords2Vec(vocabList, inputSet):
    """Convert a tokenized document into a set-of-words (presence) vector.

    Parameters:
        vocabList - the vocabulary, a list of words
        inputSet - the tokens of the document to vectorize
    Returns:
        returnVec - list with one entry per vocabulary word: 1 if the word
                    occurs in inputSet, otherwise 0. Tokens that are not in
                    the vocabulary are ignored.
    """
    # Build the word -> position lookup once so each token costs one dict
    # lookup instead of two linear scans of vocabList. setdefault keeps the
    # FIRST position of a repeated word, matching list.index().
    positions = {}
    for position, word in enumerate(vocabList):
        positions.setdefault(word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = positions.get(word)
        if position is not None:
            returnVec[position] = 1
    return returnVec
"""
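Description: build a bag-of-words vector for inputSet against the vocabulary vocabList
Parameters:
vocabList - the vocabulary
inputSet - the tokenized document (list of tokens)
Returns:
returnVec - the document vector (bag-of-words model, per-word occurrence counts)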
"""
def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a tokenized document into a bag-of-words (count) vector.

    Parameters:
        vocabList - the vocabulary, a list of words
        inputSet - the tokens of the document to vectorize
    Returns:
        returnVec - list with one entry per vocabulary word holding the
                    number of times that word occurs in inputSet. Tokens
                    that are not in the vocabulary are ignored.
    """
    # Build the word -> position lookup once so each token costs one dict
    # lookup instead of two linear scans of vocabList. setdefault keeps the
    # FIRST position of a repeated word, matching list.index().
    positions = {}
    for position, word in enumerate(vocabList):
        positions.setdefault(word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = positions.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec
"""
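Description: training function for the naive Bayes classifier
Parameters:
trainMatrix - matrix of training documents, i.e. the returnVec vectors produced by setOfWords2Vec, one per row
trainCategory - vector of training class labels
Returns:
p0Vect - array of log conditional word probabilities for class 0
p1Vect - array of log conditional word probabilities for class 1
pAbusive - probability that a document belongs to class 1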
"""
def trainNB0(trainMatrix, trainCategory):
    """Train the naive Bayes classifier from document vectors and labels.

    Parameters:
        trainMatrix - training documents, one word vector per row
        trainCategory - class label of each row; rows labelled 1 are
                        counted for class 1, every other row for class 0
    Returns:
        p0Vect - array of log conditional word probabilities for class 0
        p1Vect - array of log conditional word probabilities for class 1
        pAbusive - fraction of training documents labelled 1
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    # Counts start at 1 and denominators at 2.0 so that a word never seen
    # in a class still gets a non-zero probability (log(0) is avoided).
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0

    for i, docVector in enumerate(trainMatrix):
        if trainCategory[i] == 1:
            p1Num += docVector
            p1Denom += sum(docVector)
        else:
            p0Num += docVector
            p0Denom += sum(docVector)

    # Logs are returned so the classifier can add terms instead of
    # multiplying many small probabilities.
    p1Vect = np.log(p1Num / p1Denom)
    # Fix: class-0 counts are normalised by the class-0 total; the original
    # divided by p1Denom here.
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
"""
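Description: classification function for the naive Bayes classifier
Parameters:
vec2Classify - word vector of the document to classify
p0Vec - array of log conditional word probabilities for class 0
p1Vec - array of log conditional word probabilities for class 1
pClass1 - prior probability that a document belongs to class 1
Returns:
0 - the document is assigned to class 0
1 - the document is assigned to class 1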
"""
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Assign a word vector to class 1 or class 0 by comparing log scores.

    Parameters:
        vec2Classify - word vector of the document to classify
        p0Vec - per-word array for class 0 (added to a log prior, so it is
                expected to hold log probabilities)
        p1Vec - per-word array for class 1 (same expectation)
        pClass1 - prior probability of class 1
    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0
    """
    logPrior1 = np.log(pClass1)
    logPrior0 = np.log(1.0 - pClass1)
    # Each score is the sum of the element-wise product of the word vector
    # and the class array, plus the class's log prior.
    score1 = sum(vec2Classify * p1Vec) + logPrior1
    score0 = sum(vec2Classify * p0Vec) + logPrior0
    return 1 if score1 > score0 else 0
"""
Description: test the naive Bayes classifier on the spam/ham e-mail files
Parameters:
none
Returns:
none
"""
def spamTest():
    """Train and evaluate the naive Bayes spam filter on one random split.

    Reads email/spam/1.txt .. 25.txt (label 1) and email/ham/1.txt .. 25.txt
    (label 0), holds out 10 randomly chosen documents for testing, trains on
    the remaining ones using set-of-words vectors, then prints every
    misclassified document followed by the error rate.

    Parameters:
        none
    Returns:
        none (results are printed)
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Context managers close each file once it has been read.
        with open('email/spam/%d.txt' % i, 'r') as spamFile:
            docList.append(textParse(spamFile.read()))
        classList.append(1)
        with open('email/ham/%d.txt' % i, 'r') as hamFile:
            docList.append(textParse(hamFile.read()))
        classList.append(0)
    vocabList = createVocabList(docList)

    # Both lists hold indices into docList; every document starts in the
    # training set and 10 are moved to the test set.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # randrange always yields a valid index; random.uniform's upper
        # end-point is documented as possibly included, so truncating its
        # result is not a safe way to draw an index.
        pick = random.randrange(len(trainingSet))
        testSet.append(trainingSet.pop(pick))

    trainMatrix = [setOfWords2Vec(vocabList, docList[docIndex])
                   for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(np.array(trainMatrix), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = np.array(setOfWords2Vec(vocabList, docList[docIndex]))
        if classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print('错误向量', docList[docIndex])
    print("错误率:%.2f%%" % (float(errorCount) / len(testSet) * 100))
# File: main.py
# Entry point: run the spam-filter experiment once.
# NOTE(review): this import only resolves to the project's email.py when that
# file comes before the standard-library `email` package on sys.path (e.g. when
# the script is run from the project directory) - consider renaming the module.
from email import spamTest

if __name__ == '__main__':
    spamTest()