# numpy手写朴素贝叶斯 — a naive Bayes classifier written by hand with NumPy
import numpy as np
# 1. Build the word-vector matrix (构建词向量矩阵)
def loadDataSet():
    """Return the built-in toy corpus of tokenized posts and their labels.

    Returns:
        A ``(postingList, classVec)`` pair. ``postingList`` is a list of six
        posts, each a list of word tokens; ``classVec`` is the parallel list
        of labels, where 1 marks an abusive post and 0 a normal one.
    """
    # Keep every post next to its label so the two can never drift apart.
    labelled_posts = [
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    ]
    postingList = [tokens for tokens, _ in labelled_posts]
    classVec = [label for _, label in labelled_posts]
    return postingList, classVec
def createVocabList(dataSet):
    """Collect the unique words that occur anywhere in ``dataSet``.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable,
            mutually comparable tokens (e.g. lists of strings).

    Returns:
        A sorted list of the distinct tokens. Sorting makes the position of
        every word stable from run to run; iterating a plain set of strings
        can yield a different order in each interpreter session, which would
        silently reshuffle the feature columns built from this vocabulary.
    """
    # One n-ary union replaces the document-by-document union loop and also
    # handles an empty data set (the result is an empty vocabulary).
    return sorted(set().union(*dataSet))
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a binary presence vector over ``vocabList``.

    Args:
        vocabList: Sequence of vocabulary words; position ``i`` of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of words that make up the document.

    Returns:
        A list of ``len(vocabList)`` ints, 1 where the vocabulary word occurs
        in ``inputSet`` and 0 elsewhere. Words missing from the vocabulary
        are reported on stdout and otherwise ignored.
    """
    # Build the word -> column map once instead of scanning the vocabulary
    # twice (``in`` and ``.index``) for every input word. ``setdefault``
    # keeps the first position of a repeated word, like ``list.index``.
    column_of = {}
    for column, term in enumerate(vocabList):
        column_of.setdefault(term, column)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        column = column_of.get(word)
        if column is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[column] = 1
    return returnVec
def cal_prob(trainMat, labelMat):
    """Estimate naive Bayes parameters from word vectors and 0/1 labels.

    Args:
        trainMat: Sequence of equal-length numeric word vectors, one row per
            training document.
        labelMat: Sequence of class labels, one per row of ``trainMat``.
            Rows labelled 1 feed ``pA``; rows labelled 0 feed ``pB``.

    Returns:
        A tuple ``(pA, pB, p1)``:

        * ``pA`` -- array of shape ``(1, n_words)`` holding the smoothed
          per-word probabilities for class 1.
        * ``pB`` -- the same for class 0.
        * ``p1`` -- one minus the fraction of labels equal to 0.

    Raises:
        ZeroDivisionError: If ``labelMat`` is empty.
    """
    samples = np.asarray(trainMat, dtype=float)
    labels = np.asarray(labelMat)

    p0 = np.count_nonzero(labels == 0) / len(labels)
    p1 = 1 - p0

    def _smoothed(rows):
        # Smoothed ratio: (per-word count + 1) / (total word count + 2).
        # The grouping must be explicit; without the parentheses operator
        # precedence evaluates ``counts + (1 / total) + 2``, which is not a
        # probability at all.
        word_counts = rows.sum(axis=0, keepdims=True)
        return (word_counts + 1) / (rows.sum() + 2)

    pA = _smoothed(samples[labels == 1])
    pB = _smoothed(samples[labels == 0])
    return pA, pB, p1
def classify(classvec, pA, pB, p1):
    """Predict the class (0 or 1) of one or more word vectors.

    Args:
        classvec: A single word vector, or several stacked vectors; it is
            reshaped to ``(-1, n_words)``.
        pA: Per-word probabilities for class 1, shaped ``(1, n_words)`` as
            returned by ``cal_prob``.
        pB: Per-word probabilities for class 0, same shape.
        p1: Prior probability of class 1.

    Returns:
        For a single vector, the int 0 or 1. For several vectors, a 1-D
        integer array with one prediction per row. Ties resolve to class 1.
    """
    n_words = len(pA[0])
    features = np.asarray(classvec, dtype=float).reshape(-1, n_words)
    log_pA = np.log(np.asarray(pA, dtype=float)).reshape(-1)
    log_pB = np.log(np.asarray(pB, dtype=float)).reshape(-1)

    # Weight the log-probabilities by the word indicators rather than taking
    # the log of (indicator * probability): the latter is log(0) = -inf for
    # every absent word and makes both scores meaningless.
    # pA belongs with the class-1 prior and pB with the class-0 prior.
    s1 = features.dot(log_pA) + np.log(p1)
    s0 = features.dot(log_pB) + np.log(1 - p1)

    predictions = np.where(s0 > s1, 0, 1)
    if predictions.size == 1:
        return int(predictions[0])
    return predictions
if __name__ == '__main__':
    # Demo: train on the toy corpus, then classify its second post.
    postingList, classVec = loadDataSet()
    wordset = createVocabList(postingList)

    # One binary word vector per training post.
    trainmat = [setOfWords2Vec(wordset, post) for post in postingList]

    test = setOfWords2Vec(wordset, postingList[1])
    pA, pB, p1 = cal_prob(trainmat, classVec)
    print(classify(test, pA, pB, p1))