http://www.shuang0420.com/2016/05/18/Gensim-and-LDA-Training-and-Prediction/
```python
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import sys, os
sys.path.append("../")

import logging
import jieba
from gensim import corpora, models, similarities

# load the custom recruitment-domain dictionaries into jieba
userdictRootPathDir = "D:/下载/jieba-master/jieba-master/userdict/"
if os.path.isdir(userdictRootPathDir):
    for cusdir in os.listdir(userdictRootPathDir):
        currentDir = userdictRootPathDir + cusdir + "/"
        if os.path.isdir(currentDir):
            for filename in os.listdir(currentDir):
                fileNameTotal = currentDir + filename
                jieba.load_userdict(fileNameTotal)

# configure gensim's logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# load data from file
f = open('C:/Users/lishaoxing/Desktop/topicmodel/rizhifenci.txt', 'r', encoding='utf-8')
documents = f.readlines()
f.close()

# tokenize
texts = [[word for word in jieba.cut(document, cut_all=False)] for document in documents]

# build the id->word mapping (the dictionary)
dictionary = corpora.Dictionary(texts)
# keep words that appear in at least 40 documents and in no more than 10% of them
dictionary.filter_extremes(no_below=40, no_above=0.1)
# save the dictionary
dictionary.save('C:/Users/lishaoxing/Desktop/topicmodel/dict_v1.txt')

# build the bag-of-words corpus
corpus = [dictionary.doc2bow(text) for text in texts]

# initialize a TF-IDF model
tfidf = models.TfidfModel(corpus)
# use the model to transform vectors, i.e. apply the transformation to the whole corpus
corpus_tfidf = tfidf[corpus]

# extract 100 LDA topics: one pass, updating once per chunk (2,000 documents by default), 500 iterations
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=100, iterations=500)
# save the model to files
lda.save('C:/Users/lishaoxing/Desktop/topicmodel/mylda_v1.txt')

# print the topic composition, and the scores, for the first document;
# only a few topics are represented, the others have a nil score
for index, score in sorted(lda[corpus_tfidf[0]], key=lambda tup: -1 * tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda.print_topic(index, 10)))
print("\n" + "end1" + "\n" + 100 * "-")

# log the most contributing words for each of the 100 topics
lda.print_topics(100)
print("\n" + "end2" + "\n" + 100 * "-")

# load the model and dictionary back
model = models.LdaModel.load('C:/Users/lishaoxing/Desktop/topicmodel/mylda_v1.txt')
dictionary = corpora.Dictionary.load('C:/Users/lishaoxing/Desktop/topicmodel/dict_v1.txt')

# predict unseen data
query = "未收到奖励"
query_bow = dictionary.doc2bow(jieba.cut(query, cut_all=False))
for index, score in sorted(model[query_bow], key=lambda tup: -1 * tup[1]):
    print("Score: {}\t Topic: {}".format(score, model.print_topic(index, 20)))
print("\n" + "end3" + "\n" + 100 * "-")

# to predict many lines of data from a file, do the following
f = open('C:/Users/lishaoxing/Desktop/topicmodel/zhiwei.txt', 'r', encoding='utf-8')
documents = f.readlines()
f.close()
texts = [[word for word in jieba.cut(document, cut_all=False)] for document in documents]
corpus = [dictionary.doc2bow(text) for text in texts]

# only print the topic with the highest score for each document
for c in corpus:
    flag = True
    for index, score in sorted(model[c], key=lambda tup: -1 * tup[1]):
        if flag:
            print("Score: {}\t Topic: {}".format(score, model.print_topic(index, 20)))
            flag = False  # stop after the highest-scoring topic
print("\n" + "end4" + "\n" + 100 * "-")
```
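One loose end: the script imports `similarities` but never uses it. It is typically used to rank documents against a query in the trained topic space; a minimal sketch, assuming the `lda`, `corpus_tfidf`, and `query_bow` objects defined above:

```python
from gensim import similarities

# index every document by its LDA topic vector
# (MatrixSimilarity keeps the whole index in RAM, which is fine at ~20k documents)
index = similarities.MatrixSimilarity(lda[corpus_tfidf], num_features=lda.num_topics)

# cosine similarity of the query's topic distribution against every document
sims = index[lda[query_bow]]

# five most similar documents
for doc_id, sim in sorted(enumerate(sims), key=lambda tup: -tup[1])[:5]:
    print(doc_id, sim)
```

Running the training script produces logging like the following: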
```
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\LISHAO~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.300 seconds.
Prefix dict has been built succesfully.
2018-06-05 18:15:56,857 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-06-05 18:15:59,000 : INFO : adding document #10000 to Dictionary(36575 unique tokens: ['\x01', '\n', ' ', '1', '11239']...)
2018-06-05 18:16:01,314 : INFO : adding document #20000 to Dictionary(62239 unique tokens: ['\x01', '\n', ' ', '1', '11239']...)
2018-06-05 18:16:01,347 : INFO : built Dictionary(62508 unique tokens: ['\x01', '\n', ' ', '1', '11239']...) from 20125 documents (total 4050708 corpus positions)
2018-06-05 18:16:01,653 : INFO : discarding 59320 tokens: [('\x01', 20124), ('\n', 20125), (' ', 20124), ('1', 14281), ('2', 13658), ('3', 13640), ('30', 2098), ('34271690854826', 1), ('386', 18), ('4', 10357)]...
2018-06-05 18:16:01,654 : INFO : keeping 3188 tokens which were in no less than 40 and no more than 2012 (=10.0%) documents
2018-06-05 18:16:01,677 : INFO : resulting dictionary: Dictionary(3188 unique tokens: ['11239', '主管', '亦可', '信息', '做起']...)
2018-06-05 18:16:01,682 : INFO : saving Dictionary object under C:/Users/lishaoxing/Desktop/topicmodel/dict_v1.txt, separately None
2018-06-05 18:16:01,723 : INFO : saved C:/Users/lishaoxing/Desktop/topicmodel/dict_v1.txt
2018-06-05 18:16:04,620 : INFO : collecting document frequencies
2018-06-05 18:16:04,620 : INFO : PROGRESS: processing document #0
2018-06-05 18:16:04,734 : INFO : PROGRESS: processing document #10000
2018-06-05 18:16:04,873 : INFO : PROGRESS: processing document #20000
2018-06-05 18:16:04,875 : INFO : calculating IDF weights for 20125 documents and 3187 features (778045 matrix non-zeros)
2018-06-05 18:16:04,885 : INFO : using symmetric alpha at 0.01
2018-06-05 18:16:04,885 : INFO : using symmetric eta at 0.01
2018-06-05 18:16:04,886 : INFO : using serial LDA version on this node
2018-06-05 18:16:04,938 : INFO : running online (single-pass) LDA training, 100 topics, 1 passes over the supplied corpus of 20125 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 500x with a convergence threshold of 0.001000
2018-06-05 18:16:05,467 : INFO : PROGRESS: pass 0, at document #2000/20125
2018-06-05 18:16:10,079 : INFO : merging changes from 2000 documents into a model of 20125 documents
2018-06-05 18:16:10,151 : INFO : topic #93 (0.010): 0.074*"2174" + 0.057*"后厨" + 0.025*"清洁" + 0.023*"整理" + 0.021*"烹饪" + 0.014*"凉菜" + 0.010*"厨房" + 0.010*"做" + 0.008*"辅助" + 0.008*"简单"
2018-06-05 18:16:10,151 : INFO : topic #79 (0.010): 0.041*"2174" + 0.021*"后厨" + 0.016*"14" + 0.013*"加工" + 0.013*"202" + 0.012*"342" + 0.012*"招" + 0.011*"印刷" + 0.011*"服务员" + 0.011*"服从"
2018-06-05 18:16:10,151 : INFO : topic #53 (0.010): 0.027*"2174" + 0.022*"后厨" + 0.016*"烹饪" + 0.014*"清洁" + 0.013*"卫生" + 0.011*"整理" + 0.010*"协助" + 0.010*"食材" + 0.008*"保证" + 0.008*"主播"
2018-06-05 18:16:10,152 : INFO : topic #23 (0.010): 0.025*"印刷" + 0.022*"2174" + 0.021*"制作" + 0.014*"厨房" + 0.013*"清洁" + 0.013*"2156" + 0.013*"烹饪" + 0.012*"后厨" + 0.011*"原料" + 0.010*"关经验"
2018-06-05 18:16:10,152 : INFO : topic #69 (0.010): 0.032*"2174" + 0.017*"凉菜" + 0.017*"后厨" + 0.012*"包" + 0.010*"餐饮" + 0.009*"烧烤" + 0.009*"烹饪" + 0.008*"清洁" + 0.008*"勤奋努力" + 0.007*"门店"
2018-06-05 18:16:10,154 : INFO : topic diff=85.208763, rho=1.000000
2018-06-05 18:16:10,697 : INFO : PROGRESS: pass 0, at document #4000/20125
```
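The last INFO lines above spell out the hyperparameters gensim fell back on: a symmetric alpha and eta of 0.01 (= 1/num_topics), a single pass, a chunk size of 2,000 documents, perplexity evaluation every 20,000 documents, and a convergence threshold of 0.001. Rather than inherit these silently, they can be passed explicitly and tuned; a sketch of the same training call with the logged values written out (parameter names are gensim's `LdaModel` keywords):

```python
# the same training call with the logged defaults made explicit
lda = models.LdaModel(
    corpus_tfidf,
    id2word=dictionary,
    num_topics=100,
    passes=1,               # one full pass over the corpus
    chunksize=2000,         # update the model once every 2,000 documents
    iterations=500,         # per-document inference iterations (the script's override)
    eval_every=10,          # perplexity every 10 chunks = 20,000 documents
    gamma_threshold=0.001,  # convergence threshold from the log
    alpha='symmetric',      # 1/num_topics = 0.01, as logged
    eta='symmetric',        # same symmetric prior on the topic-word weights
)
```

A couple of chunks into the same run, NumPy raises a warning from inside gensim: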
```
C:\Users\lishaoxing\AppData\Roaming\Python\Python36\site-packages\gensim\models\ldamodel.py:775: RuntimeWarning: divide by zero encountered in log
  diff = np.log(self.expElogbeta)
2018-06-05 18:16:13,270 : INFO : merging changes from 2000 documents into a model of 20125 documents
2018-06-05 18:16:13,324 : INFO : topic #31 (0.010): 0.082*"控制" + 0.037*"部门" + 0.025*"管理" + 0.021*"宿舍" + 0.020*"质量" + 0.018*"电视" + 0.017*"衣柜" + 0.016*"30周岁" + 0.015*"免费" + 0.014*"上五休"
2018-06-05 18:16:13,324 : INFO : topic #4 (0.010): 0.040*"满勤奖" + 0.031*"供" + 0.026*"点" + 0.020*"性格" + 0.020*"服从安排" + 0.018*"2174" + 0.016*"月休4天" + 0.015*"主题" + 0.015*"做事" + 0.014*"包"
2018-06-05 18:16:13,324 : INFO : topic #73 (0.010): 0.017*"事情" + 0.016*"160" + 0.014*"联系电话" + 0.013*"摄影" + 0.012*"年龄18周岁" + 0.011*"文静" + 0.011*"妹子" + 0.011*"向上" + 0.010*"广告" + 0.010*"脸型"
```
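The RuntimeWarning comes from `np.log(self.expElogbeta)` hitting a topic-word probability that has underflowed to zero, and training continues past it. One point worth flagging, though: LDA's generative model is defined over integer word counts, so feeding `LdaModel` TF-IDF weights, as this script does, is a common but mathematically loose shortcut. A hedged alternative (an assumption about this run, not a verified fix for the warning) is to train on the raw bag-of-words corpus built earlier and reserve TF-IDF for other transformations:

```python
# train LDA on plain bag-of-words counts, which is what the model is defined over;
# corpus and dictionary are the objects built in the script above
lda_bow = models.LdaModel(corpus, id2word=dictionary,
                          num_topics=100, iterations=500)
```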