朴素贝叶斯法是基于贝叶斯定理与特征条件独立假设的分类方法。
算法的核心思想是比较各类别后验概率的大小,将后验概率最大的类别判定为样本所属类别。
下面是公式推导:由贝叶斯定理 P(Y=c_k|X=x) = P(X=x|Y=c_k)P(Y=c_k)/P(X=x),再结合特征条件独立假设 P(X=x|Y=c_k) = ∏_i P(X^(i)=x^(i)|Y=c_k);由于分母 P(X=x) 对所有类别相同,只需比较分子的大小。(注:原文此处的详细推导在转换中丢失。)
下面是朴素贝叶斯的 Python 代码实现:
import numpy as np
from functools import reduce
def loadDataSet():
    """
    Create the toy experiment data set.

    :return: postingList - tokenized posts (list of word lists)
             classVec - class labels: 1 = abusive, 0 = not abusive
                        (the original docstring wrongly said 1/2)
    """
    # NOTE: 'probelms' is a deliberate part of the original sample data;
    # do not "fix" it, or downstream vocabularies/tests change.
    postingList = [['my', 'dog', 'has', 'flea', 'probelms', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive post, 0 = not abusive
    return postingList, classVec
def setOfWords2Vec(vocabList, inputSet):
    """
    Vectorize a tokenized document against a vocabulary (set-of-words model).

    :param vocabList: vocabulary, a list of unique words
    :param inputSet: tokenized document (list of words)
    :return: list of 0/1 flags, one per vocabulary word (1 = word present)
    """
    docVector = [0] * len(vocabList)  # one slot per vocabulary word
    for token in inputSet:
        try:
            # EAFP: .index raises ValueError for out-of-vocabulary words
            docVector[vocabList.index(token)] = 1
        except ValueError:
            print("the word:%s is not in my Vocabulary!" % token)
    return docVector
def createVocabList(dataSet):
    """
    Build the vocabulary: every distinct word across all documents.

    :param dataSet: list of tokenized documents
    :return: list of unique words (iteration order of the set, unspecified)
    """
    uniqueWords = set()
    for document in dataSet:
        uniqueWords.update(document)  # accumulate the union in place
    return list(uniqueWords)
def trainNB0(trainMatrix, trainCategory):
    """
    Train the naive Bayes classifier with Laplace smoothing.

    :param trainMatrix: training document word vectors (list of 0/1 lists)
    :param trainCategory: class labels, 1 = abusive, 0 = not abusive
                          (the original docstring had p0/p1 swapped)
    :return: p0Vect - LOG conditional word probabilities for class 0 (not abusive)
             p1Vect - LOG conditional word probabilities for class 1 (abusive)
             pAbusive - prior probability that a document is abusive
    """
    numTrainDocs = len(trainMatrix)   # number of training documents
    numWords = len(trainMatrix[0])    # vocabulary size
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: counts start at 1 and denominators at 2 so an
    # unseen word never produces a zero probability (and log never sees 0).
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:  # accumulate word counts for the abusive class
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:                      # accumulate word counts for the benign class
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # BUG FIX: return log-probabilities. classifyNB() computes
    # sum(vec * pVec) + np.log(prior), which is only valid in log space;
    # the original returned raw ratios, so the comparison was wrong.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """
    Classify a document vector with the trained naive Bayes model.

    :param vec2Classify: 0/1 word vector of the document to classify
    :param p0Vec: log conditional word probabilities for class 0 (not abusive)
    :param p1Vec: log conditional word probabilities for class 1 (abusive)
    :param pClass1: prior probability of class 1 (abusive)
    :return: 1 if classified as abusive, 0 otherwise
             (the original docstring had the param/return labels swapped)
    """
    # Work in log space to avoid numeric underflow: log(A*B) = log A + log B.
    # (Removed dead commented-out reduce() variant, which also had a
    # copy-paste bug: its p2 used p1Vec instead of p0Vec.)
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    print("p1:", p1)
    print("p0:", p0)  # BUG FIX: label was "p2:" but the value printed is p0
    if p1 > p0:
        return 1
    else:
        return 0
def testingNB():
    """
    End-to-end demo: train on the toy data set and classify two sample posts.

    :return: None
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    # Vectorize every training post against the vocabulary.
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(trainMat, listClasses)
    # Classify the two demo posts in the same order as the original script.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        if classifyNB(thisDoc, p0V, p1V, pAb):
            print(testEntry, "属于侮辱类")
        else:
            print(testEntry, "不属于侮辱类")
# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
    testingNB()
运行结果如下(原文此处的输出截图/文本在转换中丢失)。