# 完整代码:
import numpy as np
import re
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Load the comments from the CSV file that sits next to this script.
df1 = pd.read_csv('小红书评论.csv')
# Optional step: keep a single comment per user id.
# df1 = df1.drop_duplicates(subset=['用户id'])

# Character class of everything to strip: whitespace, digits, ASCII letters
# and the punctuation marks listed below. Note this is a blacklist, so any
# character not listed here (not only Chinese characters) is kept.
pattern = u'[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!\t"@#$%^&*\\-_=+a-zA-Z,。\n《》、?:;“”‘’{}【】()…¥!—┄-]+'

# Build the 'cut' column in one pass over the '内容' column:
#   1. coerce every cell to str (missing values become the text 'nan'),
#   2. replace each run of unwanted characters with a single space,
#   3. segment with jieba and re-join the tokens with spaces so the result
#      can be fed to a whitespace-based vectorizer.
df1['cut'] = (
    df1['内容']
    .apply(str)
    .apply(lambda text: re.sub(pattern, ' ', text))
    .apply(lambda text: " ".join(jieba.lcut(text)))
)

# Quick visual check of the segmented text and of the column's type.
print(df1['cut'])
print(type(df1['cut']))
# 1. Fit a TF-IDF model on the space-separated tokens of every comment.
# NOTE: with its default settings TfidfVectorizer only keeps tokens made of
# two or more word characters, so single-character words are dropped here.
tf_idf_vectorizer = TfidfVectorizer()
tf_idf = tf_idf_vectorizer.fit_transform(df1['cut'])
# 2. Vocabulary learned by the vectorizer (one entry per matrix column).
feature_names = tf_idf_vectorizer.get_feature_names_out()
# 3. Convert the sparse feature matrix into a dense pandas DataFrame.
matrix = tf_idf.toarray()
feature_names_df = pd.DataFrame(matrix,columns=feature_names)
print(feature_names_df)
# Columns are the feature words and rows are the comments; each cell holds the
# TF-IDF weight of that word in that comment, and 0 means the word does not
# occur in the comment.
# Number of topics the model should extract.
n_topics = 5

# Configure the LDA estimator and fit it on the TF-IDF matrix in one
# expression; fit() returns the estimator itself, so `lda` is the fitted model.
# NOTE(review): LDA is usually fitted on raw term counts rather than TF-IDF
# weights - confirm that using TF-IDF here is intentional.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=50,
    learning_method='online',
    learning_offset=50.0,
    random_state=0,
).fit(tf_idf)

# Part 1
# How many of the highest-weighted words to list for each topic.
n_top_words = 50
def top_words_data_frame(model: LatentDirichletAllocation,
                         tf_idf_vectorizer: TfidfVectorizer,
                         n_top_words: int) -> pd.DataFrame:
    """Return the highest-weighted words of every topic as a table.

    Args:
        model: Fitted topic model; each row of ``model.components_`` holds
            the word weights of one topic.
        tf_idf_vectorizer: Fitted vectorizer whose
            ``get_feature_names_out()`` maps column indices to words.
        n_top_words: Maximum number of words to list per topic.

    Returns:
        A DataFrame with one row per topic. Words are ordered from the
        highest to the lowest weight. Columns keep their historic labels
        ``'topic 1'``, ``'topic 2'``, ... although the number is the rank
        of the word inside the topic, not a topic index.
    """
    feature_names = tf_idf_vectorizer.get_feature_names_out()
    # A small corpus can yield fewer vocabulary terms than n_top_words.
    # Size the table from what is really available so the DataFrame
    # constructor does not fail on a row/column length mismatch.
    n_words = max(0, min(n_top_words, len(feature_names)))
    rows = [
        # argsort() is ascending: reverse it, then keep the first n_words.
        [feature_names[i] for i in topic.argsort()[::-1][:n_words]]
        for topic in model.components_
    ]
    columns = [f'topic {i + 1}' for i in range(n_words)]
    return pd.DataFrame(rows, columns=columns)
#2
def predict_to_data_frame(model: LatentDirichletAllocation, X: np.ndarray) -> pd.DataFrame:
    """Return the per-document topic distribution as a labelled table.

    Args:
        model: Fitted topic model providing ``transform`` and
            ``components_``.
        X: Document-term matrix to pass to ``model.transform``.

    Returns:
        A DataFrame holding the output of ``model.transform(X)``, one row
        per row of ``X``, with columns named ``'P(topic 1)'``,
        ``'P(topic 2)'``, ... (one per row of ``model.components_``).
    """
    matrix = model.transform(X)
    # One column label per topic learned by the model.
    columns = [f'P(topic {i + 1})' for i in range(len(model.components_))]
    return pd.DataFrame(matrix, columns=columns)
# Build the table of the top n_top_words words for every topic.
top_words_df = top_words_data_frame(lda, tf_idf_vectorizer, n_top_words)
# Show the top words of each topic.
print(top_words_df)
# Convert tf_idf to a dense array so it can be used below to compute the
# topic probability distribution of each text.
X = tf_idf.toarray()
# 计算完毕主题概率分布情况
predict_df = predict