LDA topic model with fold-in inference: track perplexity across Gibbs iterations, then use a simple KNN classifier on the document-topic vectors.
#-*- coding:utf-8 -*-
import logging
import logging.config
import ConfigParser
import numpy as np
import random
import codecs
import os
import math
import string
import matplotlib.pyplot as plt
import re
import scipy.stats.mstats as sts
from sklearn.neighbors import KNeighborsClassifier
from collections import OrderedDict  # preserves insertion order, so word ids stay stable
# get the current working directory
path = os.getcwd()
# load the logging configuration and create the logger
logging.config.fileConfig("logging.conf")
logger = logging.getLogger()
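# A minimal logging.conf sketch that would satisfy fileConfig above
# (an assumed console-only setup, not necessarily the author's actual file):
#   [loggers]
#   keys=root
#   [handlers]
#   keys=console
#   [formatters]
#   keys=plain
#   [logger_root]
#   level=INFO
#   handlers=console
#   [handler_console]
#   class=StreamHandler
#   level=INFO
#   formatter=plain
#   args=(sys.stderr,)
#   [formatter_plain]
#   format=%(asctime)s %(levelname)s %(message)s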
# load the model configuration
conf = ConfigParser.ConfigParser()
conf.read("setting.conf")
# corpus directories (hardcoded; all remaining paths come from setting.conf)
trainfile = 'D:\\Documents\\data\\tra'
testfile = 'D:\\Documents\\data\\tes'
wordidmapfile = os.path.join(path,os.path.normpath(conf.get("filepath","wordidmapfile")))
thetafile = os.path.join(path,os.path.normpath(conf.get("filepath","thetafile")))
phifile = os.path.join(path,os.path.normpath(conf.get("filepath","phifile")))
paramfile = os.path.join(path,os.path.normpath(conf.get("filepath","paramfile")))
topNfile = os.path.join(path,os.path.normpath(conf.get("filepath","topNfile")))
tassginfile = os.path.join(path,os.path.normpath(conf.get("filepath","tassginfile")))
# model hyperparameters read from setting.conf
K = int(conf.get("model_args","K"))
alpha = float(conf.get("model_args","alpha"))
beta = float(conf.get("model_args","beta"))
iter_times = int(conf.get("model_args","iter_times"))
top_words_num = int(conf.get("model_args","top_words_num"))
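# A plausible setting.conf layout for reference (assumed values, not the
# author's actual file; the filenames here are illustrative):
#   [filepath]
#   wordidmapfile = tmp/wordidmap.dat
#   thetafile     = tmp/model_theta.dat
#   phifile       = tmp/model_phi.dat
#   paramfile     = tmp/model_parameter.dat
#   topNfile      = tmp/model_twords.dat
#   tassginfile   = tmp/model_tassgin.dat
#   [model_args]
#   K = 20
#   alpha = 0.1
#   beta = 0.1
#   iter_times = 100
#   top_words_num = 20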
label = []    # class label per training document
label_t = []  # class label per test document
class Document(object):
def __init__(self):
self.words = []
self.length = 0
class DataPreProcessing(object):
def __init__(self):
self.docs_count = 0
self.words_count = 0
self.docs = []
        self.word2id = OrderedDict()  # vocabulary: word -> integer id
    def cachewordidmap(self):
        with codecs.open(wordidmapfile, 'w', 'utf-8') as f:
            for word, id in self.word2id.items():
                f.write(word + '\t' + str(id) + '\n')
class LDAModel(object):
def __init__(self,dpre,dpre1):
        self.pre = []       # perplexity recorded at each iteration
        self.dpre = dpre    # preprocessed training corpus
        self.dpre1 = dpre1  # preprocessed test corpus
        #
        # Model parameters: topic count K, iteration count iter_times,
        # top_words_num displayed words per topic, hyperparameters alpha, beta.
        # Note: K and iter_times are hardcoded below and override setting.conf.
        #
        self.K = 50
        self.beta = beta
        self.alpha = alpha
        self.iter_times = 30
        self.top_words_num = top_words_num
        # Files:
        #   trainfile     pre-tokenized training corpus
        #   wordidmapfile word -> id mapping
        #   thetafile     document-topic distribution
        #   phifile       topic-word distribution
        #   topNfile      top-N words per topic
        #   tassginfile   final per-word topic assignments
        #   paramfile     parameters used for this run
        #
self.wordidmapfile = wordidmapfile
self.trainfile = trainfile
self.thetafile = thetafile
self.phifile = phifile
self.topNfile = topNfile
self.tassginfile = tassginfile
self.paramfile = paramfile
        # p      sampling buffer for the full conditional (length K)
        # nw     word-topic counts, shape (words_count, K)
        # nwsum  tokens assigned to each topic, length K
        # nd     document-topic counts, shape (docs_count, K)
        # ndsum  tokens per document, length docs_count
self.p = np.zeros(self.K)
self.nw = np.zeros((self.dpre.words_count,self.K),dtype="int")
self.nwsum = np.zeros(self.K,dtype="int")
self.nd = np.zeros((self.dpre.docs_count,self.K),dtype="int")
self.ndsum = np.zeros(dpre.docs_count,dtype="int")
        self.Z = np.array([[0 for y in xrange(dpre.docs[x].length)] for x in xrange(dpre.docs_count)])  # per-word topic assignment for every training document
        # randomly assign an initial topic to every word
for x in xrange(len(self.Z)):
self.ndsum[x] = self.dpre.docs[x].length
for y in xrange(self.dpre.docs[x].length):
#print self.dpre.docs[x].length,y
                topic = random.randint(0, self.K-1)  # random initial topic
self.Z[x][y] = topic
self.nw[self.dpre.docs[x].words[y]][topic] += 1
self.nd[x][topic] += 1
self.nwsum[topic] += 1
        self.theta = np.array([[0.0 for y in xrange(self.K)] for x in xrange(self.dpre.docs_count)])    # (docs_count, K)
        self.ttheta = np.array([[0.0 for y in xrange(self.K)] for x in xrange(self.dpre1.docs_count)])  # (test docs_count, K)
        print 'test-set theta shape:', np.shape(self.ttheta)
        self.phi = np.array([[0.0 for y in xrange(self.dpre.words_count)] for x in xrange(self.K)])     # (K, words_count)
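    # The sampler below is the standard collapsed-Gibbs full conditional for
    # LDA (a sketch of the math, for reference):
    #   p(z_i = k | z_-i, w) ∝ (nw[w][k] + beta) / (nwsum[k] + V*beta)
    #                         * (nd[d][k] + alpha) / (ndsum[d] + K*alpha)
    # with V = vocabulary size; the current assignment of word i is removed
    # from all counts before this is evaluated.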
    def sampling(self, i, j):  # collapsed Gibbs update for word j of document i
        topic = self.Z[i][j]
        word = self.dpre.docs[i].words[j]
        # remove the current assignment from all counts
        self.nw[word][topic] -= 1
        self.nd[i][topic] -= 1
        self.nwsum[topic] -= 1
        self.ndsum[i] -= 1
        Vbeta = self.dpre.words_count * self.beta
        Kalpha = self.K * self.alpha
        # unnormalized full conditional: phi * theta
        self.p = (self.nw[word] + self.beta) / (self.nwsum + Vbeta) * (self.nd[i] + self.alpha) / (self.ndsum[i] + Kalpha)
        # cumulative sum, then roulette-wheel (inverse-CDF) selection
        for k in xrange(1, self.K):
            self.p[k] += self.p[k-1]
        u = random.uniform(0, self.p[self.K-1])
        for topic in xrange(self.K):
            if self.p[topic] > u:
                break
        # add the new assignment back into the counts
        self.nw[word][topic] += 1
        self.nwsum[topic] += 1
        self.nd[i][topic] += 1
        self.ndsum[i] += 1
        return topic
    def est(self):
        for x in xrange(self.iter_times):
            for i in xrange(self.dpre.docs_count):
                for j in xrange(self.dpre.docs[i].length):
                    topic = self.sampling(i, j)
                    self.Z[i][j] = topic
            # re-estimate theta/phi and record this iteration's perplexity
            self._theta()
            self._phi()
            self.pre.append(self.perplexity())
        self.graph_draw()
        logger.info(u"Iterations finished.")
        logger.debug(u"Saving the model")
        self.save()
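    # Point estimates read off the final counts (standard collapsed-Gibbs):
    #   theta[d][k] = (nd[d][k] + alpha) / (ndsum[d] + K*alpha)
    #   phi[k][w]   = (nw[w][k] + beta)  / (nwsum[k] + V*beta)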
    def _theta(self):
        for i in xrange(self.dpre.docs_count):
            self.theta[i] = (self.nd[i] + self.alpha) / (self.ndsum[i] + self.K * self.alpha)
        return self.theta
    def _phi(self):
        for i in xrange(self.K):
            self.phi[i] = (self.nw.T[i] + self.beta) / (self.nwsum[i] + self.dpre.words_count * self.beta)
        return self.phi
    def save(self):
        # theta: document-topic distribution
        logger.info(u"Document-topic distribution saved to %s" % self.thetafile)
        with codecs.open(self.thetafile, 'w') as f:
            for x in xrange(self.dpre.docs_count):
                for y in xrange(self.K):
                    f.write(str(self.theta[x][y]) + '\t')
                f.write('\n')
        # phi: topic-word distribution
        logger.info(u"Topic-word distribution saved to %s" % self.phifile)
        with codecs.open(self.phifile, 'w') as f:
            for x in xrange(self.K):
                for y in xrange(self.dpre.words_count):
                    f.write(str(self.phi[x][y]) + '\t')
                f.write('\n')
        # parameters used for this run
        logger.info(u"Parameter settings saved to %s" % self.paramfile)
        with codecs.open(self.paramfile, 'w', 'utf-8') as f:
            f.write('K=' + str(self.K) + '\n')
            f.write('alpha=' + str(self.alpha) + '\n')
            f.write('beta=' + str(self.beta) + '\n')
            f.write('iter_times=' + str(self.iter_times) + '\n')
            f.write('top_words_num=' + str(self.top_words_num) + '\n')
        # top-N words per topic
        logger.info(u"Top-N words per topic saved to %s" % self.topNfile)
        with codecs.open(self.topNfile, 'w', 'utf-8') as f:
            self.top_words_num = min(self.top_words_num, self.dpre.words_count)
            # invert the vocabulary once instead of rebuilding it per word
            id2word = {v: k for k, v in self.dpre.word2id.items()}
            for x in xrange(self.K):
                f.write(u'Topic ' + str(x) + u': ')
                twords = [(n, self.phi[x][n]) for n in xrange(self.dpre.words_count)]
                twords.sort(key=lambda i: i[1], reverse=True)
                for y in xrange(self.top_words_num):
                    word = id2word[twords[y][0]]
                    f.write('  ' + word + ' ' + str(twords[y][1]) + ' ')
                f.write('\n')
        # final per-word topic assignments
        logger.info(u"Word-topic assignments saved to %s" % self.tassginfile)
        with codecs.open(self.tassginfile, 'w') as f:
            for x in xrange(self.dpre.docs_count):
                for y in xrange(self.dpre.docs[x].length):
                    f.write(str(self.dpre.docs[x].words[y]) + ':' + str(self.Z[x][y]) + '\t')
                f.write('\n')
        logger.info(u"Model training complete.")
    def perplexity(self):
        phi = self.phi
        thetas = self.theta
        log_per = N = 0
        for i in range(len(thetas)):
            for w in self.dpre.docs[i].words:
                # p(w|d) = sum_k phi[k][w] * theta[d][k]
                log_per -= np.log(np.inner(phi[:, w], thetas[i]))
            N += self.dpre.docs[i].length
        return np.exp(log_per / N)
    def graph_draw(self):  # plot perplexity against Gibbs iteration
        x = range(self.iter_times)
        y = self.pre
        plt.figure()
        plt.plot(x, y, color="red", linewidth=2)
        plt.xlabel("Iteration")
        plt.ylabel("Perplexity")
        plt.show()
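    # --- Fold-in inference for unseen documents ---
    # The trained phi is frozen; only the document-side counts (tnd, tndsum)
    # of each test document are resampled, so the full conditional reduces to
    #   p(z = k) ∝ phi[k][word] * (tnd[d][k] + alpha) / (tndsum[d] + K*alpha)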
    def t_theta(self):  # document-topic estimate for the test corpus
        for i in xrange(self.dpre1.docs_count):
            self.ttheta[i] = (self.tnd[i] + self.alpha) / (self.tndsum[i] + self.K * self.alpha)
        return self.ttheta
    def sampling_t(self, i, j):  # fold-in Gibbs update for word j of test document i
        topic = self.tZ[i][j]
        word = self.dpre1.docs[i].words[j]
        # only the document-side counts are touched; nw/nwsum stay frozen
        self.tnd[i][topic] -= 1
        self.tndsum[i] -= 1
        # unnormalized full conditional: trained phi * test-document theta
        self.tp = self.phi[:, word] * (self.tnd[i] + self.alpha) / (self.tndsum[i] + self.K * self.alpha)
        # cumulative sum, then roulette-wheel (inverse-CDF) selection
        for k in xrange(1, self.K):
            self.tp[k] += self.tp[k-1]
        u = random.uniform(0, self.tp[self.K-1])
        for topic in xrange(self.K):
            if self.tp[topic] > u:
                break
        self.tnd[i][topic] += 1
        self.tndsum[i] += 1
        return topic
    def inference(self):
        # fold in the test corpus: random initial assignments, then Gibbs
        self.tp = np.zeros(self.K)
        self.tnd = np.zeros((self.dpre1.docs_count, self.K), dtype="int")
        self.tndsum = np.zeros(self.dpre1.docs_count, dtype="int")
        self.tZ = []  # per-word topic assignments for the test documents
        self.ttheta = np.array([[0.0 for y in xrange(self.K)] for x in xrange(self.dpre1.docs_count)])  # (test docs_count, K)
        for x in xrange(self.dpre1.docs_count):
            self.tndsum[x] = self.dpre1.docs[x].length
            z_n = []
            for y in xrange(self.dpre1.docs[x].length):
                z = np.random.randint(0, self.K)
                self.tnd[x][z] += 1
                z_n.append(z)
            self.tZ.append(np.array(z_n))
        for ij in xrange(30):  # 30 fold-in iterations (hardcoded)
            for i in xrange(self.dpre1.docs_count):
                for j in xrange(self.dpre1.docs[i].length):
                    topic = self.sampling_t(i, j)
                    self.tZ[i][j] = topic
        self.t_theta()
    def probl(self):
        # KNN on the document-topic vectors: train on theta, predict on ttheta
        text_clf = KNeighborsClassifier(n_neighbors=4, algorithm='brute')
        text_clf = text_clf.fit(self.theta, label)
        doc_class_predicted = text_clf.predict(self.ttheta)
        print 'KNN accuracy:', np.mean(np.array(label_t) == doc_class_predicted)
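        # A quick way to sanity-check the choice of n_neighbors (a sketch,
        # assuming scikit-learn >= 0.18 so that model_selection is available):
        #   from sklearn.model_selection import cross_val_score
        #   print cross_val_score(KNeighborsClassifier(n_neighbors=4), self.theta, label, cv=5)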
def preprocessing():
    logger.info(u'Loading data......')
    # training corpus: one file per class, one document per line
    tra_path = os.listdir(trainfile)
    docs = []
    for ea in tra_path:
        print ea
        docs.append(open(os.path.join(trainfile, ea), 'r').readlines())
    # test corpus, same layout
    tes_path = os.listdir(testfile)
    docs1 = []
    for ea in tes_path:
        print ea
        docs1.append(open(os.path.join(testfile, ea), 'r').readlines())
    logger.debug(u"Data loaded; building the dictionary and corpus statistics...")
    dpre = DataPreProcessing()   # training corpus and vocabulary
    dpre1 = DataPreProcessing()  # test corpus (mapped through the training vocabulary)
    items_idx = 0
    # read the stopword list (one word per line, utf-8)
    stopwords_list = []
    for line in codecs.open('stopwords.txt', 'r', 'utf-8').readlines():
        parts = line.strip().split()
        if parts:
            stopwords_list.append(parts[0])
    js = 1
    for x in docs:
        js1 = 1
        for line in x:
            if line.strip():  # skip blank lines
                tmp = line.decode('utf-8').strip().split()
                # one Document per line; words are mapped to ids
                doc = Document()
                tt = 0
                for item in tmp:
                    if item not in stopwords_list:
                        tt += 1
                        if item in dpre.word2id:
                            doc.words.append(dpre.word2id[item])
                        else:
                            dpre.word2id[item] = items_idx
                            doc.words.append(items_idx)
                            items_idx += 1
                doc.length = tt
                dpre.docs.append(doc)
                label.append(js)  # class label, kept aligned with dpre.docs
            js1 += 1
            if js1 > 1000:  # at most 1000 documents per class
                break
        if js > 10:  # at most 10 classes
            break
        js = js + 1
    ttt = 1
    for x in docs1:
        doc_1_t = 1
        for line in x:
            doc_1_t += 1
            if line.strip():  # skip blank lines
                tmp = line.decode('utf-8').strip().split()
                # keep only words seen in training; unseen words have no phi entry
                doc1 = Document()
                tt = 0
                for item in tmp:
                    if item in dpre.word2id:
                        doc1.words.append(dpre.word2id[item])
                        tt += 1
                doc1.length = tt
                dpre1.docs.append(doc1)
                label_t.append(ttt)  # class label, kept aligned with dpre1.docs
            if doc_1_t > 100:  # at most 100 documents per class
                break
        if ttt > 10:
            break
        ttt += 1
    dpre.docs_count = len(dpre.docs)
    dpre.words_count = len(dpre.word2id)
    dpre1.docs_count = len(dpre1.docs)
    logger.info(u"%s training documents loaded" % dpre.docs_count)
    dpre.cachewordidmap()
    logger.info(u"Word-id mapping saved to %s" % wordidmapfile)
    return dpre, dpre1
dpre, dpre1 = preprocessing()
lda = LDAModel(dpre, dpre1)
lda.est()        # train: Gibbs sampling with per-iteration perplexity
lda.inference()  # fold the test corpus into the trained model
lda.probl()      # KNN classification on the document-topic vectors