from sklearn.cluster import KMeans
from bert_serving.client import BertClient
import csv
# Prerequisite: start the BERT serving process from the command line first, e.g.:
#   bert-serving-start -model_dir C:\Users\jason\PycharmProjects\code\chinese_L-12_H-768_A-12 -num_worker=1
#
# Purpose: K-means-cluster all cooperation ("hezuo")-related sentences into 3
# groups. Reads one sentence per CSV row (first column), encodes each sentence
# to a 768-dim vector via a local bert-as-service server, clusters the vectors,
# and appends each sentence to the output file of its cluster, plus the three
# cluster centers to a separate file.
filename = "sentence_rela_hezuo.csv"
# One combined `with` keeps every file handle open for the whole run and
# guarantees all five are closed even on error (the scraped original had the
# same five nested `with` blocks, but its indentation was lost).
with open(filename, "r", newline='', encoding='utf-8-sig') as f, \
     open("cluster3_hezuo_0.txt", "a", newline='', encoding='utf-8') as f0, \
     open("cluster3_hezuo_1.txt", "a", newline='', encoding='utf-8') as f1, \
     open("cluster3_hezuo_2.txt", "a", newline='', encoding='utf-8') as f2, \
     open("cluster3_hezuo_center.txt", "a", newline='', encoding='utf-8') as fc:
    f_reader = csv.reader(f)
    # First column of each row is one sentence; the original author notes the
    # file holds 1273 sentences.
    sentence_list = [row[0] for row in f_reader]
    # check_length=False: accept sentences longer than the server's max_seq_len.
    bc = BertClient(check_length=False)
    vecs = bc.encode(sentence_list)  # (n_sentences, 768) np.ndarray

    # NOTE(review): the scraped source was truncated right after encode(); the
    # code below reconstructs the evident intent (KMeans was imported, three
    # per-cluster files plus a centers file were opened, and the header says
    # "cluster into 3 groups") — confirm against the full original article.
    km = KMeans(n_clusters=3)
    labels = km.fit_predict(vecs)
    cluster_files = [f0, f1, f2]
    for sentence, label in zip(sentence_list, labels):
        cluster_files[label].write(sentence + "\n")
    for center in km.cluster_centers_:
        fc.write(" ".join(str(v) for v in center) + "\n")
# --- Source-article context (scraped CSDN page text, converted to comments so
# --- the file remains valid Python; paywall/comment-widget boilerplate removed)
# Article: "Clustering Chinese sentences with sklearn combined with BERT"
# (last recommended update listed as 2024-12-31 20:34:14).
# Summary: the article shows how to combine Python's sklearn library with a
# pretrained BERT model to cluster Chinese sentences: read the data from a CSV
# file, extract text features with BERT, then apply an sklearn clustering
# algorithm (such as KMeans) to the resulting vectors to group the sentences.