贝叶斯分类
两组文本(一组属于a类,一组属于b类)
朴素贝叶斯分类器(NBC)通过计算样本属于各个类别的概率,并选取概率最大的类别来进行分类
1.相关模块导入
import numpy as np
import re
import operator as op
2.数据获取
def str_strlist(String):
    """Split a string into a list of lower-cased word tokens.

    Args:
        String: The text to tokenize.

    Returns:
        A list with every maximal run of word characters (``\\w``) that is
        longer than two characters, lower-cased, in order of appearance.
    """
    # Extracting the runs of word characters directly yields the same
    # non-empty tokens as splitting on every non-word character.
    return [tok.lower() for tok in re.findall(r"\w+", String) if len(tok) > 2]
def _read_tokens(path):
    """Read one text file as GBK (undecodable bytes dropped) and tokenize it."""
    with open(path, "rb") as handle:
        text = handle.read().decode("gbk", "ignore")
    return str_strlist(text)


def open_test():
    """Load the 25 class-a and 25 class-b sample texts.

    Files are read from ``算法数据/贝叶斯/a/<i>.txt`` and
    ``算法数据/贝叶斯/b/<i>.txt`` for i = 1..25, relative to the current
    working directory.

    Returns:
        (dataSet, classList): ``dataSet`` is a list of token lists, one per
        document; ``classList`` holds the matching labels (1 for class a,
        0 for class b). Documents alternate a, b, a, b, ...

    Raises:
        OSError: If one of the expected files cannot be opened.
    """
    import os

    # Build the paths from components: in a plain string literal "\a" and
    # "\b" are the BEL and backspace control characters, not a backslash
    # followed by a letter, so the hand-written Windows paths were wrong.
    base = os.path.join("算法数据", "贝叶斯")
    # NOTE: to use the e-mail corpus instead, swap the "a"/"b" components
    # for ("email", "spam") and ("email", "ham").
    dataSet = []
    classList = []
    for i in range(1, 26):
        file_name = "%d.txt" % i
        # Document belonging to class a
        dataSet.append(_read_tokens(os.path.join(base, "a", file_name)))
        classList.append(1)
        # Document belonging to class b
        dataSet.append(_read_tokens(os.path.join(base, "b", file_name)))
        classList.append(0)
    return dataSet, classList
1.测试open_test()函数
# Smoke test for open_test(): load the documents and print the token lists
dataSet,classList=open_test()
print(dataSet)
print(classList)
2.测试结果(部分显示)
[['codeine', '15mg', …, 'only'], ['peter', 'with', …, 'eugene']]
[1, 0, …]
3.词向量转化
def wordSet(dataSet):
    """Merge the token lists of all documents into one vocabulary.

    Args:
        dataSet: Iterable of token lists, one per document.

    Returns:
        A list of the distinct tokens across all documents, sorted so that
        the position of every word is the same on every run (plain set
        iteration order depends on per-process string hashing).
    """
    vocabulary = set()
    for wordData in dataSet:
        vocabulary.update(wordData)  # merge and de-duplicate
    return sorted(vocabulary)
def wordCount(wordList, word0):
    """Turn one document into a binary (set-of-words) vector.

    Args:
        wordList: Vocabulary list; element i of the result refers to
            ``wordList[i]``.
        word0: Iterable with the tokens of a single document.

    Returns:
        A list of ``len(wordList)`` ints holding 1 where the vocabulary word
        occurs in ``word0`` and 0 elsewhere. Tokens that are not in the
        vocabulary are ignored.
    """
    # Index the vocabulary once instead of scanning the list for every
    # token; setdefault keeps the first position of a repeated word, which
    # is what list.index() would report.
    position = {}
    for idx, word in enumerate(wordList):
        position.setdefault(word, idx)

    wordVec = [0] * len(wordList)
    for word1 in word0:
        idx = position.get(word1)
        if idx is not None:
            wordVec[idx] = 1
    return wordVec
1.测试wordSet()、wordCount()函数
# Smoke test for wordSet()/wordCount(): build the vocabulary from the loaded
# documents, then vectorize the document at index 1
dataSet,classList=open_test()
wordList=wordSet(dataSet)
wordVec=wordCount(wordList, dataSet[1])
print(wordList) ## vocabulary / word set
print(wordVec) ## 词向量
2.测试结果
['can', 'bags', 'sent', 'express', 'school', …, 'assigning', 'inform', 'income', 'decision', 'mathematics']
[0, 0, 1, 1, 0, …,0, 0, 1, 0, 0]
4.贝叶斯分类器
def drill_bayes(traindata, trainClass):
    """Train a two-class naive Bayes model on word vectors.

    Args:
        traindata: Sequence of equal-length numeric word vectors, one per
            document.
        trainClass: Sequence of labels aligned with ``traindata``. A label
            equal to 1 counts as class a; any other value counts as class b.

    Returns:
        (pa, pb, p): ``pa`` and ``pb`` are arrays with the log of the
        smoothed per-word frequency within class a and class b, and ``p`` is
        the fraction of training documents labelled 1.

    Raises:
        ValueError: If ``traindata`` is empty or its length differs from
            ``trainClass``.
    """
    numtxt = len(traindata)
    if numtxt == 0:
        raise ValueError("traindata must contain at least one document")
    if len(trainClass) != numtxt:
        raise ValueError("traindata and trainClass must have the same length")

    vectors = np.asarray(traindata, dtype=float)
    is_a = np.asarray(trainClass) == 1
    # Prior of class a, counted with the same "== 1" rule used below.
    p = float(np.count_nonzero(is_a)) / numtxt

    # Counts start at 1 and denominators at 2.0 so that a word never seen in
    # a class does not produce log(0).
    paNum = 1.0 + vectors[is_a].sum(axis=0)
    pbNum = 1.0 + vectors[~is_a].sum(axis=0)
    paDenom = 2.0 + vectors[is_a].sum()
    pbDenom = 2.0 + vectors[~is_a].sum()

    # Work in log space so the classifier can add instead of multiply.
    pa = np.log(paNum / paDenom)
    pb = np.log(pbNum / pbDenom)
    return pa, pb, p
def class_bayes(data_txt, pa, pb, p):
    """Classify one word vector with a trained naive Bayes model.

    Args:
        data_txt: Word vector of the document (array or list of numbers).
        pa: Per-word log-probabilities for class a.
        pb: Per-word log-probabilities for class b.
        p: Prior probability of class a; ``1 - p`` is used for class b.

    Returns:
        1 if the log-score of class a is strictly greater than that of
        class b, otherwise 0 (ties go to class b).
    """
    features = np.asarray(data_txt)
    # A prior of exactly 0 or 1 gives log(0) = -inf, which still compares
    # correctly below; only the divide-by-zero warning is silenced.
    with np.errstate(divide="ignore"):
        log_prior_a = np.log(p)
        log_prior_b = np.log(1.0 - p)
    p1 = np.dot(features, pa) + log_prior_a
    p0 = np.dot(features, pb) + log_prior_b
    return 1 if p1 > p0 else 0
5.训练及测试
import random

## Load the documents and build the vocabulary
dataSet, classList = open_test()
wordList = wordSet(dataSet)

## Hold out random documents BEFORE training, so that the error rate is
## measured on documents the model has never seen
trainingSet = list(range(len(dataSet)))
testSet = random.sample(trainingSet, min(10, len(trainingSet)))
heldOut = set(testSet)
trainingSet = [docIndex for docIndex in trainingSet if docIndex not in heldOut]

## Train the model on the remaining documents
trainMat = [wordCount(wordList, dataSet[docIndex]) for docIndex in trainingSet]
trainClasses = [classList[docIndex] for docIndex in trainingSet]
pa, pb, p = drill_bayes(trainMat, trainClasses)

## Test the model on the held-out documents
errorCount = 0
for docIndex in testSet:
    wordVector = wordCount(wordList, dataSet[docIndex])
    if class_bayes(np.array(wordVector), pa, pb, p) != classList[docIndex]:
        errorCount += 1
        # docIndex + 1 is the 1-based position in dataSet
        print("分类错误的是:第 %s文件" %(docIndex+1))
print('错误率是:', float(errorCount) / len(testSet))
错误率是: 0.0
6.使用
import os

## Classify one further document (36.txt) with the trained model
# Build the path from components: "\贝" and "\%" are not valid escape
# sequences, and a hand-written backslash path only works on Windows.
txt36_path = os.path.join("算法数据", "贝叶斯", "%d.txt" % 36)
with open(txt36_path, "rb") as txt36_file:
    # Decode as GBK, dropping bytes that cannot be decoded
    txt36 = str_strlist(txt36_file.read().decode("gbk", "ignore"))
wordVector = wordCount(wordList, txt36)
cb = class_bayes(np.array(wordVector), pa, pb, p)
if cb == 1:
    print("该文本属于a类")
else:
    print("该文本属于b类")
该文本属于a类
本文详细介绍了一种基于贝叶斯定理的文本分类方法,包括数据预处理、特征提取、模型训练及测试过程,展示了如何利用Python实现贝叶斯分类器,并对其准确性和效率进行了评估。
481

被折叠的 条评论
为什么被折叠?



