LDA-inference

This post walks through text classification with an LDA topic model: the model is trained with collapsed Gibbs sampling, a KNN classifier predicts class labels from the inferred topic proportions, and the perplexity recorded at each iteration is used to judge convergence and tune the model parameters.


LDA topic model training and inference: watch the perplexity over the iterations, then use a simple KNN classifier on the topic proportions to predict class labels.
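Two formulas drive the code below. The collapsed Gibbs sampler draws a new topic for word $j$ of document $i$ from the full conditional (with that word removed from all counts):

$$p(z_{ij}=k \mid \text{rest}) \;\propto\; \frac{n_{w,k}+\beta}{n_{\cdot,k}+V\beta} \cdot \frac{n_{i,k}+\alpha}{n_{i,\cdot}+K\alpha}$$

where $n_{w,k}$ counts how often word $w$ is assigned to topic $k$, $n_{i,k}$ counts topic $k$ in document $i$, $V$ is the vocabulary size and $K$ the number of topics. The perplexity recorded after each iteration is

$$\text{perplexity} = \exp\left(-\frac{1}{N}\sum_{i}\sum_{w \in d_i} \log \sum_{k} \phi_{k,w}\,\theta_{i,k}\right)$$

with $N$ the total number of tokens; lower is better, and a flattening curve suggests the sampler has roughly converged.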

# -*- coding: utf-8 -*-
# NOTE: this script targets Python 2 (ConfigParser, xrange, print statements, has_key).
import logging
import logging.config
import ConfigParser
import numpy as np
import random
import codecs
import os
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from collections import OrderedDict  # dict that remembers insertion order; used as the word -> id vocabulary
# current working directory
path = os.getcwd()
# load the logging configuration
logging.config.fileConfig("logging.conf")
# create the logger
logger = logging.getLogger()
# load the settings file
conf = ConfigParser.ConfigParser()
conf.read("setting.conf")
# file paths (trainfile is hard-coded here; the other paths come from setting.conf)
trainfile = 'D:\\Documents\\data\\tra'
wordidmapfile = os.path.join(path,os.path.normpath(conf.get("filepath","wordidmapfile")))
thetafile = os.path.join(path,os.path.normpath(conf.get("filepath","thetafile")))
phifile = os.path.join(path,os.path.normpath(conf.get("filepath","phifile")))
paramfile = os.path.join(path,os.path.normpath(conf.get("filepath","paramfile")))
topNfile = os.path.join(path,os.path.normpath(conf.get("filepath","topNfile")))
tassginfile = os.path.join(path,os.path.normpath(conf.get("filepath","tassginfile")))
# initial model parameters, read from setting.conf
K = int(conf.get("model_args","K"))
alpha = float(conf.get("model_args","alpha"))
beta = float(conf.get("model_args","beta"))
iter_times = int(conf.get("model_args","iter_times"))
top_words_num = int(conf.get("model_args","top_words_num"))
label = []    # class labels of the training documents
label_t = []  # class labels of the test documents
class Document(object):
    # one document: the vocabulary ids of its words and its length
    def __init__(self):
        self.words = []
        self.length = 0

class DataPreProcessing(object):
    # corpus container: the documents plus the word -> id vocabulary

    def __init__(self):
        self.docs_count = 0
        self.words_count = 0
        self.docs = []
        self.word2id = OrderedDict()  # vocabulary: word -> integer id

    def cachewordidmap(self):
        with codecs.open(wordidmapfile, 'w','utf-8') as f:
            for word, word_id in self.word2id.items():
                f.write(word + '\t' + str(word_id) + '\n')

class LDAModel(object):

    def __init__(self,dpre,dpre1):
        self.pre = []        # perplexity recorded after each iteration
        self.dpre = dpre     # preprocessed training corpus
        self.dpre1 = dpre1   # preprocessed test corpus
        #
        # model parameters:
        # number of topics K, iteration count iter_times,
        # top words per topic top_words_num, hyperparameters alpha and beta
        #
        self.K = 50                # hard-coded, overrides the K from setting.conf
        self.beta = beta
        self.alpha = alpha
        self.iter_times = 30       # hard-coded, overrides iter_times from setting.conf
        self.top_words_num = top_words_num
        # files:
        #   tokenized training corpus: trainfile
        #   word -> id map: wordidmapfile
        #   document-topic distribution: thetafile
        #   topic-word distribution: phifile
        #   top-N words of each topic: topNfile
        #   final word-topic assignments: tassginfile
        #   training parameters: paramfile
        #
        self.wordidmapfile = wordidmapfile
        self.trainfile = trainfile
        self.thetafile = thetafile
        self.phifile = phifile
        self.topNfile = topNfile
        self.tassginfile = tassginfile
        self.paramfile = paramfile
        # p: temporary probability vector used during sampling, length K
        # nw: word-topic counts, shape V x K (V = vocabulary size)
        # nwsum: number of words assigned to each topic, length K
        # nd: document-topic counts, shape M x K (M = number of documents)
        # ndsum: number of words in each document, length M
        self.p = np.zeros(self.K)
        self.nw = np.zeros((self.dpre.words_count,self.K),dtype="int")
        self.nwsum = np.zeros(self.K,dtype="int")
        self.nd = np.zeros((self.dpre.docs_count,self.K),dtype="int")
        self.ndsum = np.zeros(dpre.docs_count,dtype="int")
        self.Z = np.array([ [0 for y in xrange(dpre.docs[x].length)] for x in xrange(dpre.docs_count)])  # topic assignment of every word of every document
        # randomly assign an initial topic to every word
        for x in xrange(len(self.Z)):
            self.ndsum[x] = self.dpre.docs[x].length
            for y in xrange(self.dpre.docs[x].length):
                topic = random.randint(0,self.K-1)  # initial topic, drawn uniformly
                self.Z[x][y] = topic
                self.nw[self.dpre.docs[x].words[y]][topic] += 1
                self.nd[x][topic] += 1
                self.nwsum[topic] += 1

        self.theta = np.array([ [0.0 for y in xrange(self.K)] for x in xrange(self.dpre.docs_count) ])    # docs x K
        self.ttheta = np.array([ [0.0 for y in xrange(self.K)] for x in xrange(self.dpre1.docs_count) ])  # test docs x K
        print 'test-set theta shape:', np.shape(self.ttheta)
        self.phi = np.array([ [ 0.0 for y in xrange(self.dpre.words_count) ] for x in xrange(self.K)])    # K x vocabulary
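    # The update in sampling() below is the standard collapsed Gibbs conditional
    # (with the word being resampled removed from all counts):
    #   p(z_ij = k | rest) ∝ (nw[w][k] + beta) / (nwsum[k] + V*beta)
    #                      * (nd[i][k] + alpha) / (ndsum[i] + K*alpha)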
    def sampling(self,i,j):
        # resample the topic of word j in document i
        topic = self.Z[i][j]
        word = self.dpre.docs[i].words[j]
        # remove the current assignment from the counts
        self.nw[word][topic] -= 1
        self.nd[i][topic] -= 1
        self.nwsum[topic] -= 1
        self.ndsum[i] -= 1

        Vbeta = self.dpre.words_count * self.beta
        Kalpha = self.K * self.alpha
        self.p = (self.nw[word] + self.beta)/(self.nwsum + Vbeta) * (self.nd[i] + self.alpha) / (self.ndsum[i] + Kalpha)  # phi * theta
        # turn p into a cumulative distribution
        for k in xrange(1,self.K):
            self.p[k] += self.p[k-1]
        # roulette-wheel selection: draw u and take the first topic whose cumulative mass exceeds it
        u = random.uniform(0,self.p[self.K-1])
        for topic in xrange(self.K):
            if self.p[topic]>u:
                break
        # add the new assignment back to the counts
        self.nw[word][topic] += 1
        self.nwsum[topic] += 1
        self.nd[i][topic] += 1
        self.ndsum[i] += 1

        return topic
    def est(self):
        # run iter_times full Gibbs sweeps over the training corpus
        for x in xrange(self.iter_times):
            for i in xrange(self.dpre.docs_count):
                for j in xrange(self.dpre.docs[i].length):
                    topic = self.sampling(i,j)
                    self.Z[i][j] = topic
            # re-estimate theta and phi, then record this iteration's perplexity
            self._theta()
            self._phi()
            prep = self.perplexity()
            self.pre.append(prep)
        print 'phi shape:', np.shape(self.phi)
        self.graph_draw()
        logger.info(u"Iterations finished.")
        logger.debug(u"Saving the model")
        self.save()
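    # Point estimates below are the posterior means under the Dirichlet priors:
    #   theta[d][k] = (nd[d][k] + alpha) / (ndsum[d] + K*alpha)
    #   phi[k][w]   = (nw[w][k] + beta)  / (nwsum[k] + V*beta)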
    def _theta(self):
        # document-topic distribution
        for i in xrange(self.dpre.docs_count):
            self.theta[i] = (self.nd[i]+self.alpha)/(self.ndsum[i]+self.K * self.alpha)
        return self.theta
    def _phi(self):
        # topic-word distribution
        for i in xrange(self.K):
            self.phi[i] = (self.nw.T[i] + self.beta)/(self.nwsum[i]+self.dpre.words_count * self.beta)
        return self.phi
    def save(self):
        # save theta, the document-topic distribution
        logger.info(u"document-topic distribution saved to %s" % self.thetafile)
        with codecs.open(self.thetafile,'w') as f:
            for x in xrange(self.dpre.docs_count):
                for y in xrange(self.K):
                    f.write(str(self.theta[x][y]) + '\t')
                f.write('\n')
        # save phi, the topic-word distribution
        logger.info(u"topic-word distribution saved to %s" % self.phifile)
        with codecs.open(self.phifile,'w') as f:
            for x in xrange(self.K):
                for y in xrange(self.dpre.words_count):
                    f.write(str(self.phi[x][y]) + '\t')
                f.write('\n')
        # save the parameter settings
        logger.info(u"parameter settings saved to %s" % self.paramfile)
        with codecs.open(self.paramfile,'w','utf-8') as f:
            f.write('K=' + str(self.K) + '\n')
            f.write('alpha=' + str(self.alpha) + '\n')
            f.write('beta=' + str(self.beta) + '\n')
            f.write(u'iterations iter_times=' + str(self.iter_times) + '\n')
            f.write(u'top words shown per topic top_words_num=' + str(self.top_words_num) + '\n')
        # save the top-N words of each topic
        logger.info(u"top-N words per topic saved to %s" % self.topNfile)
        with codecs.open(self.topNfile,'w','utf-8') as f:
            self.top_words_num = min(self.top_words_num,self.dpre.words_count)
            id2word = {v: k for k, v in self.dpre.word2id.items()}  # build the reverse map once, not per word
            for x in xrange(self.K):
                f.write(u'Topic ' + str(x) + u': ')
                twords = [(n,self.phi[x][n]) for n in xrange(self.dpre.words_count)]
                twords.sort(key = lambda i:i[1], reverse= True)
                for y in xrange(self.top_words_num):
                    word = id2word[twords[y][0]]
                    f.write('  ' + word + ' ' + str(twords[y][1]) + '    ')
                f.write('\n')
        # save the final topic assignment of every word of every document
        logger.info(u"word-topic assignments saved to %s" % self.tassginfile)
        with codecs.open(self.tassginfile,'w') as f:
            for x in xrange(self.dpre.docs_count):
                for y in xrange(self.dpre.docs[x].length):
                    f.write(str(self.dpre.docs[x].words[y])+':'+str(self.Z[x][y])+ '\t')
                f.write('\n')
        logger.info(u"Model training finished.")
    def perplexity(self):
        phi = self.phi
        thetas = self.theta
        log_per = N = 0
        for i in range(len(thetas)):
            for w in self.dpre.docs[i].words:
                log_per -= np.log(np.inner(phi[:,w], thetas[i]))  # p(w|d) = sum_k phi[k][w] * theta[d][k]
            N += self.dpre.docs[i].length
        return np.exp(log_per / N)
    def graph_draw(self):  # plot perplexity against the Gibbs iteration number
        x = range(self.iter_times)
        y = self.pre
        plt.figure()
        plt.plot(x,y,color="red",linewidth=2)
        plt.xlabel("Iteration")
        plt.ylabel("Perplexity")
        plt.show()
    def t_theta(self):
        # document-topic distribution for the test corpus
        for i in xrange(self.dpre1.docs_count):
            self.ttheta[i] = (self.tnd[i]+self.alpha)/(self.tndsum[i]+self.K * self.alpha)
        return self.ttheta
    def sampling_t(self,i,j):
        # resample the topic of word j in test document i; phi stays fixed,
        # so the training counts nw/nwsum are not touched
        topic = self.tZ[i][j]
        word = self.dpre1.docs[i].words[j]

        self.tnd[i][topic] -= 1
        self.tndsum[i] -= 1

        self.tp = self.phi[:,word] * (self.tnd[i] + self.alpha) / (self.tndsum[i] + self.K*self.alpha)  # phi * theta
        # cumulative distribution plus roulette-wheel selection, as in sampling()
        for k in xrange(1,self.K):
            self.tp[k] += self.tp[k-1]
        u = random.uniform(0,self.tp[self.K-1])
        for topic in xrange(self.K):
            if self.tp[topic]>u:
                break

        self.tnd[i][topic] += 1
        self.tndsum[i] += 1

        return topic
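    # Inference on unseen documents by "folding in": the topic-word distribution
    # phi learned on the training set is kept fixed, and Gibbs sampling only
    # updates the per-document topic counts of the test documents.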
    def inference(self):
        # test-set counterparts of p, nd, ndsum and Z
        self.tp = np.zeros(self.K)
        self.tnd = np.zeros((self.dpre1.docs_count,self.K),dtype="int")
        self.tndsum = np.zeros(self.dpre1.docs_count,dtype="int")
        self.tZ = []
        self.ttheta = np.array([ [0.0 for y in xrange(self.K)] for x in xrange(self.dpre1.docs_count) ])  # test docs x K
        # randomly assign an initial topic to every word of every test document
        for x in xrange(self.dpre1.docs_count):
            self.tndsum[x] = self.dpre1.docs[x].length
            z_n = []
            for y in xrange(self.dpre1.docs[x].length):
                z = np.random.randint(0, self.K)
                self.tnd[x][z] += 1
                z_n.append(z)
            self.tZ.append(np.array(z_n))
        for ij in xrange(30):  # 30 Gibbs sweeps over the test set
            for i in xrange(self.dpre1.docs_count):
                for j in xrange(self.dpre1.docs[i].length):
                    topic = self.sampling_t(i,j)
                    self.tZ[i][j] = topic
        self.t_theta()
    def probl(self):
        # fit a KNN classifier on the training theta, then predict on the test theta
        text_clf = KNeighborsClassifier(n_neighbors=4, algorithm='brute')
        print np.shape(self.theta), np.shape(label)
        text_clf = text_clf.fit(self.theta, label)
        doc_class_predicted = text_clf.predict(self.ttheta)
        # fraction of test documents whose predicted class matches the true label
        print 'KNN accuracy on the test set:', np.mean(np.array(label_t) == doc_class_predicted)
def preprocessing():
    logger.info(u'Loading data......')
    tra_path = os.listdir(trainfile)
    docs = []
    for ea in tra_path:
        print ea
        docs.append(open(os.path.join(trainfile, ea), 'r').readlines())

    # test set
    tes_path = os.listdir('D:\\Documents\\data\\tes\\')
    docs1 = []
    for ea in tes_path:
        print ea
        docs1.append(open('D:\\Documents\\data\\tes\\' + ea, 'r').readlines())
    logger.debug(u"Data loaded; building the vocabulary and counting the corpus...")
    dpre = DataPreProcessing()   # training corpus (also owns the vocabulary)
    dpre1 = DataPreProcessing()  # test corpus
    items_idx = 0

    # read the stopword list, one word per line
    fr = open('stopwords.txt', 'r')
    stopwords_list = []
    for line in fr.readlines():
        line = line.decode('utf-8').strip().split()
        if line:  # skip blank lines
            stopwords_list.append(line[0])
    # training documents: one document per line, capped per class and per corpus
    js = 1
    for x in docs:
        js1 = 1
        for line in x:
            if line.strip() != "":
                label.append(js)  # class label, appended only when the document is kept
                tmp = line.decode('utf-8').strip().split()
                # build a Document: map every non-stopword to its vocabulary id
                doc = Document()
                tt = 0
                for item in tmp:
                    if item not in stopwords_list:
                        tt += 1
                        if dpre.word2id.has_key(item):
                            doc.words.append(dpre.word2id[item])
                        else:
                            dpre.word2id[item] = items_idx
                            doc.words.append(items_idx)
                            items_idx += 1
                doc.length = tt
                dpre.docs.append(doc)
            js1 += 1
            if js1 > 1000:  # cap on documents per class
                break
        if js > 10:  # cap on the number of classes
            break
        js = js + 1
    # test documents: same format; words missing from the training vocabulary are dropped
    ttt = 1
    for x in docs1:
        doc_1_t = 1
        for line in x:
            doc_1_t += 1
            if line.strip() != "":
                label_t.append(ttt)  # test label, appended only when the document is kept
                tmp = line.decode('utf-8').strip().split()
                doc1 = Document()
                tt = 0
                for item in tmp:
                    if dpre.word2id.has_key(item):
                        doc1.words.append(dpre.word2id.get(item))
                        tt += 1
                doc1.length = tt
                dpre1.docs.append(doc1)
            if doc_1_t > 100:  # cap on test documents per class
                break
        if ttt > 10:  # cap on the number of classes
            break
        ttt += 1
    dpre.docs_count = len(dpre.docs)
    dpre.words_count = len(dpre.word2id)
    dpre1.docs_count = len(dpre1.docs)
    logger.info(u"%s documents in total" % dpre.docs_count)
    dpre.cachewordidmap()
    logger.info(u"word-id map saved to %s" % wordidmapfile)
    return dpre, dpre1



dpre, dpre1 = preprocessing()
lda = LDAModel(dpre, dpre1)
lda.est()        # train on the training corpus with Gibbs sampling
lda.inference()  # fold the test corpus in under the learned phi
lda.probl()      # KNN classification on the document-topic features
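
A quick sanity check after a run: theta, ttheta and phi are point estimates of probability distributions, so every row should sum to (approximately) 1. A minimal sketch, assuming the script above has just finished in the same session:

# post-run sanity check: each row of theta/ttheta/phi is a distribution over K topics or V words
assert np.allclose(lda.theta.sum(axis=1), 1.0)
assert np.allclose(lda.phi.sum(axis=1), 1.0)
assert np.allclose(lda.ttheta.sum(axis=1), 1.0)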
