基于某评论的TF-IDF下的LDA主题模型分析

努力学习各种软件

已于 2024-06-04 16:50:24 修改

阅读量1.8k

点赞数 16

文章标签： tf-idf LDA 机器学习

于 2024-06-04 16:21:44 首次发布

本文链接：https://blog.youkuaiyun.com/m0_57265868/article/details/139445833

版权

完整代码：


import numpy as np
import re
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

df1 = pd.read_csv('小红书评论.csv')  # 读取同目录下csv文件
# df1 = df1.drop_duplicates(subset=['用户id'])  # 获取一个id只评论一次的数据
pattern = u'[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!\t"@#$%^&*\\-_=+a-zA-Z，。\n《》、？：；“”‘’｛｝【】（）…￥！—┄－]+'
df1['cut'] = df1['内容'].apply(lambda x: str(x))
df1['cut'] = df1['cut'].apply(lambda x: re.sub(pattern, ' ', x))  #对评论内容作清洗,只保留中文汉字，生成新的cut行
df1['cut'] = df1['cut'].apply(lambda x: " ".join(jieba.lcut(x)))  #对评论内容作分词和拼接
print(df1['cut'])
print(type(df1['cut']))


# 1.构造TF-IDF
tf_idf_vectorizer = TfidfVectorizer()
tf_idf = tf_idf_vectorizer.fit_transform(df1['cut'])
# 2.特征词列表
feature_names = tf_idf_vectorizer.get_feature_names_out()
# 3.将特征矩阵转变为pandas DataFrame
matrix = tf_idf.toarray()
feature_names_df = pd.DataFrame(matrix,columns=feature_names)
print(feature_names_df)
# 所有的特征词组成列，所有的评论组成行，矩阵中的元素表示这个特征词在该评论中所占的重要性，即tf-idf值，0表示该句评论中没有该词。

n_topics = 5
# 定义LDA对象
lda = LatentDirichletAllocation(
    n_components=n_topics,max_iter=50,
    learning_method='online',
    learning_offset=50.,
    random_state=0
)
# 核心，将TF-IDF矩阵放入LDA模型中
lda.fit(tf_idf)

#第1部分
# 要输出的每个主题的前 n_top_words 个主题词数
n_top_words = 50
def top_words_data_frame(model: LatentDirichletAllocation,
                         tf_idf_vectorizer: TfidfVectorizer,
                         n_top_words: int) -> pd.DataFrame:
    rows = []
    feature_names = tf_idf_vectorizer.get_feature_names_out()
    for topic in model.components_:
        top_words = [feature_names[i]
                     for i in topic.argsort()[:-n_top_words - 1:-1]]
        rows.append(top_words)
    columns = [f'topic {i + 1}' for i in range(n_top_words)]
    df = pd.DataFrame(rows, columns=columns)
    return df

#2
def predict_to_data_frame(model: LatentDirichletAllocation, X: np.ndarray) -> pd.DataFrame:
    matrix = model.transform(X)
    columns = [f'P(topic {i + 1})' for i in range(len(model.components_))]
    df = pd.DataFrame(matrix, columns=columns)
    return df


# 要输出的每个主题的前 n_top_words 个主题词数


# 计算 n_top_words 个主题词
top_words_df = top_words_data_frame(lda, tf_idf_vectorizer, n_top_words)

# 获取五个主题的前五十个特征词
print(top_words_df)

# 转 tf_idf 为数组，以便后面使用它来对文本主题概率分布进行计算
X = tf_idf.toarray()

# 计算完毕主题概率分布情况
predict_df = predict