# 贝叶斯分类是一类分类算法的总称,这类算法均以贝叶斯定理为基础,故统称为贝叶斯分类。朴素贝叶斯分类器是贝叶斯分类器中最简单、最常用的一种,朴素贝叶斯算法至今仍是最流行的十大数据挖掘算法之一。该算法简单易懂、学习效率高,在某些分类问题上能够与神经网络和决策树相媲美。
import numpy as np
def load_data_set():
    """Return the toy training corpus used by the demo.

    Returns:
        A ``(posting_list, class_vec)`` tuple. ``posting_list`` is a list of
        six tokenized posts (each a list of words) and ``class_vec`` is the
        parallel list of labels: 1 marks an abusive/insulting post, 0 a
        normal one.
    """
    # Each entry pairs a tokenized post with its label
    # (1 = abusive/insulting text, 0 = not).
    labelled_posts = [
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    ]
    posting_list = [words for words, _ in labelled_posts]
    class_vec = [label for _, label in labelled_posts]
    return posting_list, class_vec
def createVocabList(postingList):
    """Build the vocabulary: every distinct word across all documents.

    Args:
        postingList: Iterable of documents, each an iterable of hashable
            tokens (words).

    Returns:
        A list containing each distinct token exactly once. When the tokens
        are mutually orderable (e.g. all strings) the list is sorted, so the
        same corpus always produces the same vocabulary order.
    """
    vocabulary = set()
    for posting in postingList:
        vocabulary.update(posting)
    try:
        # Sorting pins the column order of every feature vector built from
        # this vocabulary; raw set iteration order for strings can change
        # between interpreter runs because of hash randomization.
        return sorted(vocabulary)
    except TypeError:
        # Tokens of mixed, non-comparable types: fall back to set order.
        return list(vocabulary)
def setOfWord2Vec(vocablist, inputset):
    """Convert a document into a binary (set-of-words) presence vector.

    Args:
        vocablist: Sequence of known words; position defines the vector
            column for each word.
        inputset: Iterable of words from the document to encode.

    Returns:
        A list of ``len(vocablist)`` ints where entry ``i`` is 1 if
        ``vocablist[i]`` occurs in ``inputset`` and 0 otherwise. Words that
        are not in the vocabulary are reported on stdout and skipped.
    """
    # Build the word -> column lookup once instead of scanning the list for
    # every token. setdefault keeps the FIRST position of a repeated word,
    # matching what list.index() would return.
    column_of = {}
    for column, word in enumerate(vocablist):
        column_of.setdefault(word, column)

    result = [0] * len(vocablist)
    for word in inputset:
        column = column_of.get(word)
        if column is None:
            print(word, "is not in vocablist")
        else:
            result[column] = 1
    return result
# Naive Bayes bag-of-words model: how often a word occurs may itself carry
# information, so occurrences are counted instead of merely flagged.
def bagOfWord2Vec(vocabList, inpuSet):
    """Convert a document into a word-count (bag-of-words) vector.

    Args:
        vocabList: Sequence of known words; position defines the vector
            column for each word.
        inpuSet: Iterable of words from the document to encode.

    Returns:
        A list of ``len(vocabList)`` ints where entry ``i`` is the number of
        times ``vocabList[i]`` occurs in ``inpuSet``. Words that are not in
        the vocabulary are reported on stdout and skipped.
    """
    # Build the word -> column lookup once instead of scanning the list for
    # every token. setdefault keeps the FIRST position of a repeated word,
    # matching what list.index() would return.
    column_of = {}
    for column, word in enumerate(vocabList):
        column_of.setdefault(word, column)

    result = [0] * len(vocabList)
    for word in inpuSet:
        column = column_of.get(word)
        if column is None:
            print(word, "is not in vocablist")
        else:
            result[column] += 1
    return result
def trainNB(trainMatrix, classVec):
    """Estimate naive Bayes parameters for a two-class problem.

    Args:
        trainMatrix: 2-D array-like with one row per document and one column
            per vocabulary word (presence flags or counts).
        classVec: 1-D array-like of labels, one per row of ``trainMatrix``.
            A label equal to 1 puts the row in class 1; any other value puts
            it in class 0.

    Returns:
        A tuple ``(p0, p1, pc)``:
        ``p0`` / ``p1`` are arrays holding the log of the smoothed per-word
        conditional probability for class 0 / class 1, and ``pc`` is
        ``sum(classVec) / number_of_documents`` (the class-1 prior when
        labels are 0/1).

    Raises:
        IndexError: If ``trainMatrix`` contains no documents.
    """
    train = np.asarray(trainMatrix)
    labels = np.asarray(classVec)

    numPost = len(train)
    if numPost == 0:
        raise IndexError("trainMatrix is empty: at least one document is required")

    # Builtin float: the np.float alias was removed in NumPy 1.24.
    pc = np.sum(labels) / float(numPost)

    in_class1 = labels == 1
    rows1 = train[in_class1]
    rows0 = train[~in_class1]

    # Smoothing: every word count starts at 1 and every class total at 2.0,
    # so a word never seen in a class does not yield log(0).
    num1Vec = 1.0 + rows1.sum(axis=0)
    num0Vec = 1.0 + rows0.sum(axis=0)
    num1 = 2.0 + rows1.sum()
    num0 = 2.0 + rows0.sum()

    # Work in log space so the classifier can add terms instead of
    # multiplying many small probabilities.
    p1 = np.log(num1Vec / num1)
    p0 = np.log(num0Vec / num0)
    return p0, p1, pc
def classifier(vec2classifier, p0Vec, p1Vec, pc):
    """Classify a word vector by comparing the two class log-scores.

    Args:
        vec2classifier: Word vector of the document to classify.
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pc: Prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0. Both scores are printed before returning.
    """
    log_likelihood0 = np.sum(vec2classifier * p0Vec)
    log_likelihood1 = np.sum(vec2classifier * p1Vec)
    score0 = log_likelihood0 + np.log(1.0 - pc)
    score1 = log_likelihood1 + np.log(pc)
    print(score0, score1)
    return 1 if score0 < score1 else 0
def testingNB():
    """Train on the toy corpus and print predictions for two sample posts.

    Builds the vocabulary and set-of-words training matrix from
    ``load_data_set()``, fits the model with ``trainNB``, then classifies
    two hard-coded posts and prints each predicted label.
    """
    posts, labels = load_data_set()
    vocab = createVocabList(posts)
    train_rows = [setOfWord2Vec(vocab, post) for post in posts]
    log_p0, log_p1, prior = trainNB(np.array(train_rows), np.array(labels))

    samples = [
        ("test0 :", ['love', 'my', 'dalmation']),
        ("test1 :", ['stupid', 'garbage']),
    ]
    # Encode both samples first, then classify them in order.
    encoded = [
        (tag, np.array(setOfWord2Vec(vocab, words))) for tag, words in samples
    ]
    for tag, vector in encoded:
        print(tag, classifier(vector, log_p0, log_p1, prior))
# Script entry point: runs only when the file is executed directly.
if __name__ == "__main__":
    # The demo call is currently disabled; uncomment the next line to run it.
    #testingNB()
    pass