朴素贝叶斯分类的python实现

贝叶斯分类

两组文本(一组属于a类,一组属于b类)
NBC(贝叶斯)通过计算样本在各个分类中的概率来进行分类

1.相关模块导入
import numpy as np
import re
import operator as op
2.数据获取
def str_strlist(String):
    """Split raw text into a list of lowercase tokens.

    Tokens of length <= 2 are discarded to drop noise words
    (e.g. "a", "of", "is").

    :param String: raw text to tokenize
    :return: list of lowercase tokens with len > 2
    """
    # \W+ collapses runs of non-word characters, so no empty tokens
    # are produced between consecutive separators (the original \W did,
    # relying on the length filter to discard them).
    listOfTokens = re.split(r"\W+", String)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def open_test():
    """Load the 25 class-a and 25 class-b text files.

    :return: (dataSet, classList) where dataSet is a list of token
             lists (one per document) and classList holds the label
             1 for class a, 0 for class b, interleaved a,b,a,b,...
    """
    dataSet = []
    classList = []
    for i in range(1, 26):
        # Class a (label 1).  Raw strings are essential here: in the
        # original non-raw literal '...\a\%d.txt' the "\a" was parsed
        # as the BEL control character (and "\b" below as backspace),
        # producing paths that can never match the real files.
        wordList = str_strlist(
            open(r'算法数据\贝叶斯\a\%d.txt' % i, "rb").read().decode("gbk", "ignore"))
        dataSet.append(wordList)
        classList.append(1)
        # Class b (label 0).
        wordList = str_strlist(
            open(r'算法数据\贝叶斯\b\%d.txt' % i, "rb").read().decode("gbk", "ignore"))
        dataSet.append(wordList)
        classList.append(0)
    return dataSet, classList
1.测试open_test()函数

# Smoke test for open_test(): load the corpus, print the per-document
# token lists and the parallel label list (1 = class a, 0 = class b).
dataSet,classList=open_test()
print(dataSet)
print(classList)

2.测试结果(部分显示)

[['codeine', '15mg',…,'only'], ['peter', 'with', …,'eugene']]
[1, 0,…]

3.词向量转化
def wordSet(dataSet):
    """Build the vocabulary: the union of all documents' tokens.

    :param dataSet: list of token lists, one per document
    :return: list of unique tokens (order is set-iteration order)
    """
    vocab = set()  # start empty, grow document by document
    for doc in dataSet:
        vocab.update(doc)  # in-place union removes duplicates
    return list(vocab)

def wordCount(wordList, word0):
    """Convert one document into a set-of-words (binary) vector.

    :param wordList: vocabulary list
    :param word0: token list of a single document
    :return: list of 0/1 flags, one per vocabulary word (1 = present)
    """
    # Precompute word -> position once.  The original called
    # wordList.index(...) inside the loop, making the conversion
    # O(len(word0) * len(wordList)); this is O(len(word0) + len(wordList)).
    # setdefault keeps the FIRST occurrence, matching list.index.
    positions = {}
    for i, word in enumerate(wordList):
        positions.setdefault(word, i)
    wordVec = [0] * len(wordList)
    for word1 in word0:
        pos = positions.get(word1)
        if pos is not None:
            wordVec[pos] = 1
    return wordVec
1.测试wordSet()、wordCount()函数

# Smoke test for wordSet()/wordCount(): build the vocabulary from the
# whole corpus, then encode document index 1 as a binary word vector.
dataSet,classList=open_test()
wordList=wordSet(dataSet)
wordVec=wordCount(wordList, dataSet[1])
print(wordList) ## vocabulary (unique tokens across all documents)
print(wordVec) ## binary word vector for document index 1

2.测试结果

[‘can’,‘bags’,‘sent’, ‘express’, ‘school’,…, ‘assigning’, ‘inform’, ‘income’, ‘decision’, ‘mathematics’]
[0, 0, 1, 1, 0, …,0, 0, 1, 0, 0]

4.贝叶斯分类器
def drill_bayes(traindata, trainClass):
    """Train a two-class naive Bayes model.

    :param traindata: list of binary word vectors (one per document)
    :param trainClass: parallel list of labels (1 = class a, 0 = class b)
    :return: (pa, pb, p) — per-word log-probability vectors for class 1
             and class 0, and the prior probability of class 1
    """
    doc_count = len(traindata)
    vocab_size = len(traindata[0])
    prior = sum(trainClass) / float(doc_count)
    # Laplace smoothing: word counts start at 1, denominators at 2,
    # so no word ever gets probability zero.
    count_a = np.ones(vocab_size)
    count_b = np.ones(vocab_size)
    denom_a = 2.0
    denom_b = 2.0
    for vec, label in zip(traindata, trainClass):
        if label == 1:
            count_a += vec
            denom_a += sum(vec)
        else:
            count_b += vec
            denom_b += sum(vec)
    # Log space avoids underflow when many small probabilities multiply.
    pa = np.log(count_a / denom_a)
    pb = np.log(count_b / denom_b)
    return pa, pb, prior
    
def class_bayes(data_txt, pa, pb, p):
    """Classify one binary word vector.

    :param data_txt: numpy 0/1 word vector for the document
    :param pa: log word probabilities for class 1
    :param pb: log word probabilities for class 0
    :param p: prior probability of class 1
    :return: 1 (class a) or 0 (class b)
    """
    # Log-posterior of each class, up to a shared normalizing constant.
    score_a = np.log(p) + sum(data_txt * pa)
    score_b = np.log(1.0 - p) + sum(data_txt * pb)
    return 1 if score_a > score_b else 0
5.训练及测试
## Train/test split and evaluation
dataSet,classList=open_test()
wordList=wordSet(dataSet)
trainingSet = list(range(50))
import random
# Hold out 10 random documents BEFORE training.  The original script
# trained on all 50 documents first and only then carved the "test"
# set out of them, so every test document had already been seen by
# the model (data leakage) — which is why it reported a 0.0 error rate.
testSet=[]
for i in range(10):
    randIndex=int(random.uniform(0,len(trainingSet)))
    testSet.append(trainingSet[randIndex])
    del(trainingSet[randIndex])
## Train on the remaining 40 documents
trainMat=[]; trainClasses=[]
for docIndex in trainingSet:
    trainMat.append(wordCount(wordList,dataSet[docIndex]))
    trainClasses.append(classList[docIndex])
pa,pb,p=drill_bayes(trainMat,trainClasses)
## Evaluate on the held-out documents
errorCount=0
for docIndex in testSet:
    wordVector=wordCount(wordList,dataSet[docIndex])
    if class_bayes(np.array(wordVector),pa,pb,p)!=classList[docIndex]:
        errorCount+=1
        print("分类错误的是:第 %s文件" %(docIndex+1))
print('错误率是:',float(errorCount)/len(testSet))

错误率是: 0.0

6.使用
## Classify a new document (file 36) with the trained model.
txt36 = str_strlist(
    open('算法数据\贝叶斯\%d.txt' % 36, "rb").read().decode("gbk", "ignore"))
vec36 = wordCount(wordList, txt36)
# class_bayes returns 1 for class a, 0 for class b.
if class_bayes(np.array(vec36), pa, pb, p) == 1:
    print("该文本属于a类")
else:
    print("该文本属于b类")

该文本属于a类

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值