python12讲稿 bayes+tfidf多分类(3类)及总结对比

本文探讨了朴素贝叶斯分类器在文本分类任务中的应用,并通过引入TF-IDF加权方法显著提高了分类准确率。实验结果显示,在分类数逐渐增加的情况下,使用TF-IDF的Bayes方法能够保持较高的稳定性。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

1 语料准备和加载模型
import pdb,jieba,string
#pdb.set_trace()
import os,sys
import numpy as np
#1 函数定义部分
def textParse2(bigString):
    """Tokenise a Chinese document with jieba and filter out stop words.

    Args:
        bigString: raw document text.

    Returns:
        List of tokens, excluding stop words, punctuation runs, spaces
        and newlines.
    """
    # Load the stop-word list into a set: O(1) membership instead of the
    # original O(n) list scan per token, and `with` closes the handle
    # (the original leaked an open file on every call).
    with open('d:/email/stopwords.txt', encoding='utf8') as stop_f:
        stopwords = {line.strip() for line in stop_f}
    # NOTE(review): the stop-word file is re-read on every call; cache it
    # at module level if this ever becomes a bottleneck.
    seg_list = jieba.lcut(bigString, cut_all=False)
    # `tok not in string.punctuation` is deliberately kept as a substring
    # test (matches the original behavior: it also drops multi-char
    # punctuation runs that appear contiguously in string.punctuation).
    return [tok for tok in seg_list
            if tok not in stopwords
            and tok not in string.punctuation
            and tok not in (' ', '\n')]
    
def createVocabList(dataSet, classList):
    """Build the global vocabulary and the per-class vocabularies.

    Args:
        dataSet: list of documents, each a list of tokens.
        classList: parallel list of integer class labels (0, 1 or 2).

    Returns:
        (vocab, perClass): the global vocabulary as a list, and a list of
        three sets, one vocabulary per class.
    """
    # Three independent sets — the original `[set([])]*3` aliased one set
    # object and only worked because each branch rebound the element.
    perClass = [set(), set(), set()]
    for doc, label in zip(dataSet, classList):
        if label in (0, 1, 2):
            perClass[label] |= set(doc)
    # The global vocabulary is the union of the per-class vocabularies.
    vocab = perClass[0] | perClass[1] | perClass[2]
    return list(vocab), perClass

def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a token list into a bag-of-words count vector over vocabList.

    Args:
        vocabList: ordered vocabulary (assumed to have unique entries — it
            is built from a set by createVocabList).
        inputSet: list of tokens from one document.

    Returns:
        List of ints, len(vocabList) long; entry i counts occurrences of
        vocabList[i] in inputSet. Unknown tokens are ignored.
    """
    # Precompute word -> column index once; the original called
    # vocabList.index(word) per token, which is O(tokens * vocabulary).
    wordIndex = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        i = wordIndex.get(word)
        if i is not None:
            returnVec[i] += 1
    return returnVec
    
def classList2Vec(classList):
    """Map Chinese folder-name labels to integer class ids, in place.

    Labels not present in the mapping are left untouched. The input list
    is mutated and also returned for convenience.
    """
    labelToId = {'环境200': 0, '计算机200': 1, '交通214': 2}
    for idx, label in enumerate(classList):
        if label in labelToId:
            classList[idx] = labelToId[label]
    return classList
def classifyNB(vec2Classify, re3, re):
    """Return the index of the highest-scoring class.

    Args:
        vec2Classify: feature vector (numpy array) of the document.
        re3: per-class log-probability vectors (rows align with `re`).
        re: per-class prior values (logged here before adding).

    Returns:
        Index (numpy int) of the arg-max class score.
    """
    scores = [sum(vec2Classify * classLogProbs) + np.log(prior)
              for classLogProbs, prior in zip(re3, re)]
    return np.argmax(scores)
#2 Load the data and train
#2.1 Load the corpus: one sub-directory of d:/jiqixuexi per class,
#    one document per file. The folder name doubles as the class label.
list1 = os.listdir('d:/jiqixuexi')
docList = []; classList = []; fullText = []
for l in list1:
    filepath = os.path.join('d:/jiqixuexi/', l)
    if os.path.isdir(filepath):
        for li in os.listdir(filepath):
            # Context manager closes each document file deterministically;
            # the original open(...).read() leaked the handle.
            with open(filepath + '/' + li, 'r', encoding='utf8') as doc_f:
                wordList = textParse2(doc_f.read())
            docList.append(wordList)
            fullText.extend(wordList)
            # Class label = last path component, e.g. '环境200'.
            classList.append(filepath.split('/')[-1])
    else:
        pass

# Map folder-name labels to integer ids, then build the vocabularies.
classList = classList2Vec(classList)
vocabList1, vocabList2 = createVocabList(docList, classList)
#2.2 615 documents total across three folders (环境200, 计算机200, 交通214 —
#    NOTE(review): the original comment said 交通215/"20 per folder";
#    214+200+200=614, so the 615 total looks off by one — verify against
#    the actual corpus). 60 documents are held out as the test set and
#    the remaining 555 form the training set.
trainingSet = list(range(615)); testSet = []
for i in range(60):
    # Sample over the *current* length of trainingSet. The original used
    # np.random.uniform(0, 20), which only ever drew from the first 20
    # positions of the shrinking list, so the held-out set came almost
    # entirely from the first class instead of being a random holdout.
    randIndex = int(np.random.uniform(0, len(trainingSet)))
    testSet.append(trainingSet[randIndex])
    del trainingSet[randIndex]
    
# Build the training matrix (one bag-of-words row per training document)
# and the parallel label vector.
trainMat = [bagOfWords2VecMN(vocabList1, docList[docIndex])
            for docIndex in trainingSet]
trainClasses = [classList[docIndex] for docIndex in trainingSet]
1.1 如果采用伯努利模型,则代码如下:

# --- Train the Naive Bayes model on raw word counts ---
trainMatrix = np.array(trainMat)
trainCategory = np.array(trainClasses)
numTrainDocs = len(trainMatrix)
numWords = len(trainMatrix[0])

# Class "priors": the fraction of the global vocabulary covered by each
# class's own vocabulary (note: not the usual document-frequency prior).
re = np.array([len(classVocab) / len(vocabList1) for classVocab in vocabList2],
              dtype=np.float64)

# Per-class word-count totals with add-one (Laplace) smoothing.
re1 = np.ones((3, numWords), dtype=np.float64)
for docVec, label in zip(trainMatrix, trainCategory):
    if label in (0, 1, 2):
        re1[label] += docVec

# Log conditional probabilities; the denominator is
# |class vocabulary| + |global vocabulary|.
re3 = np.log(np.vstack([re1[i] / (len(vocabList2[i]) + len(vocabList1))
                        for i in range(3)]))

结果测试:
# --- Evaluate on the held-out test set ---
errorCount = 0
for docIndex in testSet:
    bowVec = np.array(bagOfWords2VecMN(vocabList1, docList[docIndex]))
    predicted = classifyNB(bowVec, re3, re)
    if predicted != classList[docIndex]:
        errorCount += 1

# error rate over the test set
print('error rate:%f'% (float(errorCount)/len(testSet)))
# number of misclassified documents
print('errorCount=%d'% errorCount)

结果为:
error rate:0.066667
errorCount=4

1.2 如果采用TF-IDF,则结果如下:
def calc_tfidf():
    """Compute the TF-IDF matrix for the training documents.

    Reads the module-level globals numTrainDocs, numWords, vocabList1,
    docList and trainingSet.

    Returns:
        numpy array of shape (numTrainDocs, numWords) of TF-IDF weights.
    """
    tf = np.zeros([numTrainDocs, numWords])
    idf = np.ones(numWords)  # start at 1: add-one smoothing of document frequency
    # Term frequency: per-document counts normalised by document length.
    # NOTE(review): a training document containing no vocabulary tokens
    # would make np.sum(t) zero and produce NaNs — assumed not to occur.
    for idx in range(numTrainDocs):
        t = np.array(bagOfWords2VecMN(vocabList1, docList[trainingSet[idx]]),
                     dtype=np.float64)
        tf[idx] = t / np.sum(t)
    # Document frequency: precompute one token set per document (O(1)
    # membership instead of scanning each token list), and enumerate()
    # instead of the quadratic vocabList1.index(word) lookups.
    docWordSets = [set(docList[trainingSet[idx]]) for idx in range(numTrainDocs)]
    for col, word in enumerate(vocabList1):
        idf[col] += sum(1 for words in docWordSets if word in words)
    idf = np.log(float(numTrainDocs) / idf)
    return tf * idf
    
    

# --- Train the Naive Bayes model on TF-IDF weights instead of raw counts ---
trainMatrix = np.array(trainMat)
trainCategory = np.array(trainClasses)
numTrainDocs = len(trainMatrix)
numWords = len(trainMatrix[0])
tf_idf = calc_tfidf()

# Class "priors": share of the global vocabulary covered by each class
# (same convention as the count-based model above).
re = np.array([len(classVocab) / len(vocabList1) for classVocab in vocabList2],
              dtype=np.float64)

# Accumulate per-class TF-IDF weight totals, add-one smoothed.
re1 = np.ones((3, numWords), dtype=np.float64)
for weightVec, label in zip(tf_idf, trainCategory):
    if label in (0, 1, 2):
        re1[label] += weightVec

# Log "probabilities"; the denominator is
# |class vocabulary| + |global vocabulary|.
re3 = np.log(np.vstack([re1[i] / (len(vocabList2[i]) + len(vocabList1))
                        for i in range(3)]))

结果测试:
# --- Evaluate the TF-IDF model on the same held-out test set ---
errorCount = sum(
    1
    for docIndex in testSet
    if classifyNB(np.array(bagOfWords2VecMN(vocabList1, docList[docIndex])),
                  re3, re) != classList[docIndex]
)
# error rate over the test set
print('error rate:%f'% (float(errorCount)/len(testSet)))
# number of misclassified documents
print('errorCount=%d'% errorCount)

结果为:
error rate:0.000000
errorCount=0

2 bayes与bayes-tfidf的对比
| 分类数 | 测试集数量 | 不使用tf-idf 错误数 | 不使用tf-idf 错误率 | 使用tf-idf 错误数 | 使用tf-idf 错误率 |
|---|---|---|---|---|---|
| 3 | 56 | 1 | 0.017857 | 1 | 0.017857 |
| 4 | 79 | 2 | 0.025316 | 10 | 0.126582 |
| 5 | 98 | 8 | 0.081633 | 10 | 0.102041 |
| 6 | 111 | 10 | 0.090090 | 13 | 0.117117 |
| 7 | 134 | 10 | 0.074627 | 12 | 0.089552 |
| 8 | 153 | 24 | 0.156863 | 14 | 0.091503 |
| 9 | 169 | 24 | 0.142012 | 16 | 0.094674 |

结论:在分类数逐渐增多的时候,使用tf-idf的bayes方法得到的结果比较稳定.

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值