
Text Preprocessing and the LDA Topic Model
This post walks through a text preprocessing pipeline and topic extraction with an LDA topic model. nltk handles tokenization, stop-word removal, and stemming; gensim trains the LDA model. A handful of sample documents are processed and analyzed to show how key information can be extracted from text.
```python
# -*- coding: utf-8 -*-
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim.models.ldamodel import LdaModel
from gensim import corpora, models, similarities


def main():
    doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
    doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
    doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
    doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
    doc_e = "Health professionals say that brocolli is good for your health."

    # compile sample documents into a list
    doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

    # Tokenize on word characters, drop English stop words, and stem with Porter
    tokenizer = RegexpTokenizer(r'\w+')
    p_stemmer = PorterStemmer()
    en_stop = get_stop_words('en')
    texts = []
    for raw in doc_set:
        raw = raw.lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [i for i in tokens if i not in en_stop]
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
    # Map each token to an integer id, then encode every document as a
    # bag-of-words vector of (token_id, count) pairs
    dictionary = corpora.Dictionary(texts)
    print(dictionary)
    print(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    print(corpus)
    """
    num_topics: 必须。LDA 模型要求用户决定应该生成多少个主题。由于我们的文档集很小,所以我们只生成三个主题。
    id2word:必须。LdaModel 类要求我们之前的 dictionary  id 都映射成为字符串。
    passes:可选。模型遍历语料库的次数。遍历的次数越多,模型越精确。但是对于非常大的语料库,遍历太多次会花费很长的时间。
    """
    ldamodel=LdaModel(corpus,num_topics=2,id2word=dictionary,passes=20)
    print(ldamodel.print_topics(num_topics=2, num_words=4))

    # Branch 1: build a TF-IDF model over the same bag-of-words corpus
    tfidf = models.TfidfModel(corpus)
    print(tfidf)
    # Transform the whole corpus into TF-IDF space
    corpus_tfidf = tfidf[corpus]
    print(corpus_tfidf)
    # Build a similarity index over the TF-IDF vectors; num_features must
    # cover every token id in the dictionary
    similarity = similarities.Similarity('Similarity-tfidf-index', corpus_tfidf,
                                         num_features=len(dictionary))
    print(similarity)
    new_sensence = "My mother spends a lot of time driving my brother around to baseball practice"
    tokens = tokenizer.tokenize(new_sensence.lower())
    tokens1 = [i for i in tokens if not i in en_stop]
    new_sen = [p_stemmer.stem(i) for i in tokens1]
    test_corpus_1 = dictionary.doc2bow(new_sen)
    vec_tfidf = tfidf[test_corpus_1]
    print vec_tfidf
    id2token={value:key for key,value in dictionary.token2id.items()}
    print id2token
    for (key,freq) in vec_tfidf:
        print id2token[key],freq
if __name__ == '__main__':
    main()
```
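The listing builds the `Similarity` index but never actually queries it. As a minimal sketch (assuming the `similarity` and `vec_tfidf` objects from `main()` are still in scope), looking the new sentence's TF-IDF vector up in the index scores it against every training document:

```python
# Minimal sketch: query the Similarity index built above.
# Assumes `similarity` and `vec_tfidf` from main() are still in scope.
sims = similarity[vec_tfidf]  # cosine similarity against each indexed document
for doc_id, score in enumerate(sims):
    print(doc_id, score)
```

Since the query sentence is `doc_b` almost verbatim, document 1 should come back with the highest score.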
LDA (Latent Dirichlet Allocation) is a generative topic model for documents, also described as a three-layer Bayesian probabilistic model with word, topic, and document layers. As a generative model, it assumes each word in an article is produced by the process "the article selects a topic with some probability, and that topic selects a word with some probability"; both the document-to-topic distribution and the topic-to-word distribution are multinomial[^2].

LDA is mainly used to classify document text under specific topics. For each document it builds topics together with their associated words. The model has been shown to give accurate results for topic-modeling use cases, but the input files need some cleaning and preprocessing before use[^1].

Contrasting discriminative and generative models: a discriminative model describes how the label Y is produced and does not model the features themselves, which suits classifiers and regression analysis; LDA, as a generative model, models the data X and the label Y jointly, which makes it better suited to unsupervised analysis[^3].

In natural language processing, judging whether documents are related requires looking at their semantics, and topic models are an effective tool for semantic mining; LDA is one of the more effective among them. Even when two sentences share no words at all, LDA may still judge them similar, because it mines the semantic information of the documents[^4].

### Code Example

The following example implements a simple LDA model with Python's `gensim` library:

```python
from gensim import corpora, models
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

# Sample documents
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey"
]

# Tokenize and remove stop words
stop_words = set(stopwords.words('english'))
texts = []
for doc in documents:
    tokens = word_tokenize(doc.lower())
    filtered_tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    texts.append(filtered_tokens)

# Build the dictionary
dictionary = corpora.Dictionary(texts)

# Build the corpus
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model
lda_model = models.LdaModel(corpus=corpus,
                            id2word=dictionary,
                            num_topics=2,  # number of topics
                            random_state=100,
                            update_every=1,
                            chunksize=100,
                            passes=10,
                            alpha='auto',
                            per_word_topics=True)

# Print the keywords of each topic
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
```
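To make the generative story concrete ("the article selects a topic, the topic selects a word"), here is a minimal sketch of sampling one toy document from the two multinomial layers. The two topics, the four-word vocabulary, and all probabilities below are invented for illustration; they are not part of the article or of any fitted model:

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical toy setup: 2 topics over a 4-word vocabulary
vocab = ["brocolli", "health", "drive", "baseball"]
alpha = [0.5, 0.5]                        # Dirichlet prior over topics per document
beta = np.array([[0.6, 0.3, 0.05, 0.05],  # topic 0: food/health words
                 [0.05, 0.1, 0.45, 0.4]]) # topic 1: driving/sports words

# Document -> topic distribution (multinomial parameters drawn from the Dirichlet)
theta = rng.dirichlet(alpha)

# Generate each word: the document picks a topic, the topic picks a word
doc = []
for _ in range(8):
    z = rng.choice(2, p=theta)   # document selects a topic with probability theta
    w = rng.choice(4, p=beta[z]) # topic selects a word with probability beta[z]
    doc.append(vocab[w])

print(theta, doc)
```

Training an LDA model, as `gensim` does above, runs this story in reverse: given only the documents, it infers the `theta` and `beta` distributions.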