from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
# text = ['This is the first document.', 'This is the second second document.', 'And the third one.',
# 'Is this the first document?', ]
#
# tf = TfidfVectorizer(min_df=1)
#
# X = tf.fit_transform(text)
# names = tf.get_feature_names()
# print(names)
# print(X.toarray())
# Demo: build a TF-IDF matrix for one Chinese sentence, using jieba for word
# segmentation and scikit-learn's TfidfVectorizer for weighting.
text = '今天天气真好,我要去北京天安门玩,要去景山攻牙之后,玩完大明劫'
# Segment with jieba in accurate mode (cut_all=False).
text_list = jieba.cut(text, cut_all=False)
# Re-join the tokens with "," so the vectorizer's regex tokenizer can split
# the sentence back into the words jieba found.
text_list = ",".join(text_list)
# The corpus handed to the vectorizer is a list with this single document.
context = [text_list]
print(context)
# NOTE: TfidfVectorizer's default token_pattern (r"(?u)\b\w\w+\b") only keeps
# tokens of two or more word characters, so single-character words produced by
# jieba do not appear in the vocabulary.
tf = TfidfVectorizer(min_df=1)
X = tf.fit_transform(context)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
# .tolist() turns the returned ndarray into a plain list of str so the printed
# output keeps the same shape as before.
if hasattr(tf, "get_feature_names_out"):
    names = tf.get_feature_names_out().tolist()
else:
    names = tf.get_feature_names()
print(names)
print(X.toarray())
# TfidfVectorizer统计词频
# TF-IDF与结巴分词实战
# 最新推荐文章于 2025-08-13 19:21:33 发布
# 本文介绍了一个使用TF-IDF算法结合结巴分词处理中文文本的例子。通过Python的sklearn库和jieba分词库,实现了对中文文本的特征提取,展示了如何将中文文本转换为TF-IDF矩阵。

# 500

# 被折叠的 条评论
# 为什么被折叠?



