# -*- coding: UTF-8 -*-
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
# 分词
import jieba
def cut(sentence):
    """Segment ``sentence`` with jieba, full mode disabled (``cut_all=False``).

    Returns the tokens as a list, in the order jieba yields them.
    """
    return list(jieba.cut(sentence, cut_all=False))
# Sample texts that are compared against each other.
text0 = '多数学院已经放学,但多数学生仍在学习'
text1 = '数学专业的学生多数仍在学院学习'
text2 = '补习数学'
texts = [text0, text1, text2]
length = len(texts)
# Segment every text exactly once instead of once per pairing.
tokenized_texts = [cut(text) for text in texts]
# One entry per text: (its own tokens, token lists of all the other texts).
# "Other" is decided by position rather than by string equality, so duplicated
# texts still leave exactly ``length - 1`` token lists in every entry.
cross_text = [
    (tokens, [other for k, other in enumerate(tokenized_texts) if k != idx])
    for idx, tokens in enumerate(tokenized_texts)
]
# 文本相似度矩阵
from gensim import corpora, models, similarities
# One gensim dictionary per text, built from the token lists of the other texts
# (the second element of each ``cross_text`` entry).
cross_dictionary = [
    corpora.Dictionary(other_docs) for own_tokens, other_docs in cross_text
]
def cross(i, cross_dictionary):
    """Score text ``i`` against every other text using TF-IDF similarity.

    ``cross_dictionary[i]`` supplies the dictionary for text ``i``; the token
    lists are read from the module-level ``cross_text``. Returns a list with
    one similarity value per entry of ``cross_text[i][1]``, in that order.
    """
    vocab = cross_dictionary[i]
    query_tokens, other_docs = cross_text[i]
    bows = [vocab.doc2bow(doc) for doc in other_docs]
    # The TF-IDF model is fitted on the other texts only, never on text i.
    weighting = models.TfidfModel(bows)
    matcher = similarities.SparseMatrixSimilarity(
        weighting[bows], num_features=len(vocab.token2id))
    scores = matcher[weighting[vocab.doc2bow(query_tokens)]]
    return list(scores)
# Row i holds the similarities of text i to each of the other texts.
sim_matrix = [cross(i, cross_dictionary) for i in range(length)]
# Every row lacks the text's similarity to itself; insert 1 on the diagonal
# so the matrix becomes square.
for i, row in enumerate(sim_matrix):
    row.insert(i, 1)
# Call form of print: valid on Python 3 and, with a single argument, produces
# the same output as the print statement on Python 2.
print(sim_matrix)
# 可视化
import matplotlib.pyplot as mp, seaborn
# Draw the similarity matrix as an annotated heatmap whose colormap is centred
# on 1, then display the figure.
seaborn.heatmap(data=sim_matrix, annot=True, center=1)
mp.show()
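# Known issue: the similarity matrix is not symmetric.
# Row i is scored with a dictionary and TF-IDF model built only from the texts
# other than i, so sim_matrix[i][j] and sim_matrix[j][i] come from different
# models and generally differ.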
440

被折叠的 条评论
为什么被折叠?



