from gensim import corpora, similarities, models
import jieba
import pandas as pd
# Fuzzy-match every company name in company.csv against the `user` strings in
# data.csv: both sides are segmented with jieba, the candidates are indexed as
# TF-IDF vectors, and the three most similar candidates are written out next
# to each company row.
data = pd.read_csv("data.csv", encoding="utf-8")
find = pd.read_csv("company.csv", encoding="utf-8")

# Segment each candidate string into words; the vocabulary is built from the
# candidates only, so query words outside it are ignored by doc2bow.
data_split_word = data.user.apply(jieba.lcut)
dictionary = corpora.Dictionary(data_split_word.values)
data_corpus = data_split_word.apply(dictionary.doc2bow)

# ASCII digits in the query names are rewritten as Chinese numerals before
# segmentation.
# NOTE(review): presumably data.user spells numbers with Chinese numerals, so
# this makes the tokens line up -- confirm against the data.
trantab = str.maketrans("0123456789", "零一二三四五六七八九")
find_corpus = find.name.apply(
    lambda x: dictionary.doc2bow(jieba.lcut(x.translate(trantab))))

# Fit TF-IDF on the candidates and index their TF-IDF vectors.
tfidf = models.TfidfModel(data_corpus.to_list())
index = similarities.SparseMatrixSimilarity(
    tfidf[data_corpus], num_features=len(dictionary))

result = []
for corpus in find_corpus.values:
    # The index stores TF-IDF vectors, so the query must be projected into the
    # same space before comparison; a raw bag-of-words query would ignore the
    # IDF weighting the indexed documents carry.
    sim = pd.Series(index[tfidf[corpus]])
    # `sim` is positionally aligned with the rows of `data`, so pick the top
    # three candidates by position rather than by index label.
    top_positions = sim.nlargest(3).index
    result.append(data.user.iloc[top_positions].values)

# One column per rank: 匹配1, 匹配2, 匹配3.
result = pd.DataFrame(result)
result = result.rename(columns=lambda i: f"匹配{i+1}")
result = pd.concat([find, result], axis=1)

# NOTE(review): the output file name begins with a space -- confirm this is
# intended before changing it.
result.to_excel(r" fuzzy mapping result.xlsx", index=False)