首先下载mycorpus.txt文件,地址:https://radimrehurek.com/gensim/mycorpus.txt
导入所需的模块
from gensim import corpora, similarities,models
from pprint import pprint
接着生成词库并保存
# Stop-word list. The sample corpus has only 9 documents, so a small
# hand-written set is enough; load a fuller stop-word list for larger corpora.
stoplist = set('for a of the and to in'.split())
# Build the dictionary from the corpus file, one document per line.
# The `with` block closes the file once the dictionary has been built.
with open(r'D:\pythonplaces\DeepLearning\Word2vectest\mycorpus.txt') as corpus_file:
    dictionary = corpora.Dictionary(line.lower().split() for line in corpus_file)
# dictionary.token2id maps token -> token id. Collect the ids of the stop
# words, skipping any stop word that never occurs in the corpus (indexing
# token2id directly would raise KeyError for it).
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in stoplist
    if stopword in dictionary.token2id
]
# Ids of tokens whose frequency recorded in dictionary.dfs is exactly 1.
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
# Remove the stop words and the tokens that occur only once.
dictionary.filter_tokens(stop_ids + once_ids)
# Close the gaps left in the id sequence by the removed tokens.
dictionary.compactify()
# Save the dictionary to the same path the next step loads it from.
dictionary.save(r'.\tmp\deerwester.dict')
生成语料库文件并保存
# Load the previously saved dictionary.
dictionary = corpora.Dictionary.load(r'.\tmp\deerwester.dict')


class MyCorpus(object):
    """Corpus that reads the text file line by line, one document per line."""

    def __iter__(self):
        """Yield the bag-of-words vector of each line in the corpus file."""
        # `with` closes the file when iteration ends, so repeated passes over
        # the corpus do not leak file handles.
        with open(r'D:\pythonplaces\DeepLearning\Word2vectest\mycorpus.txt') as corpus_file:
            for line in corpus_file:
                # dictionary.doc2bow converts the tokens of a document into
                # (token id, occurrence count) pairs.
                yield dictionary.doc2bow(line.lower().split())


corpus_memory_friendly = MyCorpus()
# Materialize the streamed corpus into a list.
corpus = list(corpus_memory_friendly)
# Save the corpus in Matrix Market format.
corpora.MmCorpus.serialize(r'.\tmp\deerwester.mm', corpus)
载入语料库并进行相似度查询
# Load the corpus from the file written by MmCorpus.serialize in the
# previous step (deerwester.mm).
corpus = corpora.MmCorpus(r'.\tmp\deerwester.mm')
# Train an LSI model on the corpus with two topics.
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
# The query document.
doc = "Human computer interaction"
# Convert the query into a bag-of-words vector with dictionary.doc2bow.
vec_bow = dictionary.doc2bow(doc.lower().split())
# Project the query into the LSI space.
vec_lsi = lsi[vec_bow]
# Project the whole corpus into the LSI space and build the similarity index.
index = similarities.MatrixSimilarity(lsi[corpus])
# Save the index to disk.
index.save(r'.\tmp\deerwester.index')
# Load the index back from disk.
index = similarities.MatrixSimilarity.load(r'.\tmp\deerwester.index')
# Run the similarity query: one score per corpus document.
sims = index[vec_lsi]
pprint(list(enumerate(sims)))
结果为:
[(0, 0.97755843),
(1, 0.9790665),
(2, 0.97880435),
(3, 0.95096743),
(4, 0.9602573),
(5, 0.025995243),
(6, 0.043878958),
(7, 0.05150853),
(8, 0.19920841)]
需要指出的是,文件夹的位置需要自己设定。本文部分内容借鉴自:https://blog.csdn.net/questionfish/article/details/46746947