from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Sample text data used to demonstrate LDA topic extraction.
documents = [
    "This is the first document about technology and artificial intelligence.",
    "The second document is about machine learning and natural language processing.",
    "The third document discusses deep learning and neural networks.",
    "Another document talks about data mining and big data analytics.",
    "The last document covers computer vision and image recognition.",
]


def extract_topics(docs, num_topics=2, n_top_words=5, random_state=42):
    """Fit an LDA topic model on *docs* and return the top keywords per topic.

    Parameters
    ----------
    docs : list[str]
        Raw text documents.
    num_topics : int
        Number of latent topics to discover (LDA ``n_components``).
    n_top_words : int
        How many highest-weight keywords to report for each topic.
    random_state : int
        Seed for reproducible topic assignments.

    Returns
    -------
    list[list[str]]
        One inner list of ``n_top_words`` keywords per topic.
    """
    # Bag-of-words features; English stop words are removed so the topic
    # keywords are content-bearing terms rather than function words.
    vectorizer = CountVectorizer(stop_words="english")
    X = vectorizer.fit_transform(docs)

    lda = LatentDirichletAllocation(n_components=num_topics, random_state=random_state)
    lda.fit(X)

    feature_names = vectorizer.get_feature_names_out()
    topics = []
    for topic in lda.components_:
        # argsort is ascending: take the last n indices, then reverse so the
        # highest-weight keyword comes first.
        top_idx = topic.argsort()[-n_top_words:][::-1]
        topics.append([feature_names[i] for i in top_idx])
    return topics


if __name__ == "__main__":
    # Print the top keywords of each discovered topic.
    for topic_idx, top_features in enumerate(extract_topics(documents)):
        print(f"Topic {topic_idx + 1}:")
        print(top_features)
# LDA topic extraction (LDA 主题提取)
# (blog-scrape residue) Latest recommended article published 2024-11-12 18:26:33