Looking back over Naive Bayes today, the biggest step forward for me was learning one of its most important uses: text classification. Below is code copied from two experts, kept here as a memory aid:
#!/usr/bin/python
# coding:utf-8
from numpy import *

def loadDataSet():
    # Toy dataset: six tokenized posts plus a label for each
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak',
                    'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive post, 0 = normal post
    return postingList, classVec

def createVocabList(dataSet):  # build the list of unique words seen in the dataset
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union in each document's words
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):  # turn one document into a 0/1 presence vector over the vocabulary
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):  # estimate per-class word probabilities and the class prior
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # prior probability of class 1
    p0Num = ones(numWords)  # initialize counts to 1 and denominators to 2
    p1Num = ones(numWords)  # (Laplace smoothing, so an unseen word can't zero the product)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vec = log(p1Num / p1Denom)  # log-probabilities, to avoid underflow when many terms are multiplied
    p0Vec = log(p0Num / p0Denom)
    return p0Vec, p1Vec, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):  # pick the class with the larger log-posterior
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)  # sum the word log-likelihoods, then add the log prior
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def testNB():  # end-to-end test of the classifier
    listOPosts, listClasses = loadDataSet()    # load the dataset
    myVocabList = createVocabList(listOPosts)  # build the vocabulary
    trainMat = []
    for postinDoc in listOPosts:  # vectorize every training document
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0v, p1v, pAb = trainNB0(array(trainMat), listClasses)  # p0v and p1v are the trained parameter vectors
    testEntry = ['stupid']  # test input
    testMatrix = array(setOfWords2Vec(myVocabList, testEntry))  # vectorize the test input
    return classifyNB(testMatrix, p0v, p1v, pAb)  # classify it

print(testNB())  # run the test; prints 1 (abusive)
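One thing worth noting: setOfWords2Vec only records whether a word appears, not how often (the set-of-words model). A small variant worth remembering (my sketch, not part of the copied code) is the bag-of-words model, which counts every occurrence and pairs naturally with the frequency counting in trainNB0:

def bagOfWords2Vec(vocabList, inputSet):  # like setOfWords2Vec, but counts occurrences instead of marking presence
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1  # += 1 instead of = 1
    return returnVec

Swapping it in only requires replacing the setOfWords2Vec calls in testNB.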
The following is an implementation using sklearn; the corpus is the news posts from fetch_20newsgroups:
# coding=utf-8
from sklearn.datasets import fetch_20newsgroups  # news corpus loader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer  # text feature vectorization
from sklearn.naive_bayes import MultinomialNB  # the Naive Bayes model
from sklearn.metrics import classification_report, accuracy_score

# 1. Fetch the data
news = fetch_20newsgroups(subset='all')
print(len(news.data))  # number of documents: 18846

# 2. Preprocessing: train/test split, then text feature vectorization
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25, random_state=33)  # hold out a random 25% of the samples as the test set
# print(y_train[0:100])  # inspect the labels

# Text feature vectorization
vec = CountVectorizer()
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)
print(X_train.shape)  # (number of samples, vocabulary size) -- the feature space is very high-dimensional

# 3. Train with Naive Bayes
mnb = MultinomialNB()  # initialize Naive Bayes with the default configuration
mnb.fit(X_train, y_train)  # estimate the model parameters from the training data
y_predict = mnb.predict(X_test)  # predict on the test set

# 4. Get the results report
print('Accuracy: %.2f' % accuracy_score(y_test, y_predict))
print('The Accuracy of Naive Bayes Classifier is:', mnb.score(X_test, y_test))
print(classification_report(y_test, y_predict, target_names=news.target_names))
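As a follow-up experiment (my addition, not from the original author), it is worth trying TfidfVectorizer in place of CountVectorizer; it down-weights words that appear in almost every document, which often helps on news text. A minimal self-contained sketch:

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF weighting instead of raw counts
from sklearn.naive_bayes import MultinomialNB

news = fetch_20newsgroups(subset='all')
X_train_raw, X_test_raw, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25, random_state=33)  # same split as above
tfidf = TfidfVectorizer()
mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(tfidf.fit_transform(X_train_raw), y_train)  # vectorize, then train
print(mnb_tfidf.score(tfidf.transform(X_test_raw), y_test))  # test-set accuracy, for comparison with the counts version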
In my view the most important thing is not the implementation but understanding how it works; only once you understand the principle can you write good code that fits your own problem.
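To spell that principle out (the standard Naive Bayes formulation, matching what classifyNB computes): assuming the words w_1 ... w_n of a document are conditionally independent given the class c, the classifier picks the class maximizing

    P(c) * P(w_1|c) * P(w_2|c) * ... * P(w_n|c)

or equivalently, in log space,

    log P(c) + log P(w_1|c) + ... + log P(w_n|c)

which is exactly the sum plus log-prior inside classifyNB. The ones()/2.0 initialization in trainNB0 is the Laplace smoothing this code uses: it estimates P(w|c) as (count of w in class c + 1) / (total words in class c + 2), so a word never seen in a class contributes a small log-probability instead of log 0.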