本文主要介绍如何使用 Python 自然语言处理包 gensim 中用于词向量建模的 word2vec 算法来训练、保存并查询词向量模型。
示例代码如下:
# encoding=utf-8
import logging
import sys
from gensim.models import Word2Vec
def tokenize_sentences(sentences):
    """Split every sentence on whitespace and return a list of token lists.

    Word2Vec expects an iterable of token lists; handing it raw strings makes
    it iterate over single characters instead of words.
    """
    return [sentence.split() for sentence in sentences]


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # Two output paths are required: the native gensim model file and the
    # plain-text word2vec vectors file.
    if len(sys.argv) < 3:
        # A string argument is written to stderr and yields exit status 1.
        sys.exit('usage: python %s <model_file> <vectors_file>' % sys.argv[0])
    model_path, vectors_path = sys.argv[1:3]
    sentences = [
        "I think that most of us know by now that water is essential to our survival We’ve probably also all heard doctors say that drinking roughly eight glasses a day is ideal",
        "yoyoyo you go home now to sleep"]
    vocab = tokenize_sentences(sentences)
    # Build the model from the tokenized corpus (not the raw strings).
    # min_count=1 keeps every word: with min_count=5 this tiny demo corpus
    # would end up with an empty vocabulary.
    # The vector dimensionality is left at gensim's default of 100 because the
    # keyword was renamed from `size` to `vector_size` in gensim 4.0.
    model = Word2Vec(vocab, window=5, min_count=1, workers=4)
    # Save the full model in gensim's native format.
    model.save(model_path)
    # Export only the word vectors in the text word2vec format; since
    # gensim 1.0 this method lives on the model's `wv` attribute.
    model.wv.save_word2vec_format(vectors_path, binary=False)
#测试模型
# encoding='utf-8'
import logging
import sys
from gensim.models import Word2Vec
if __name__ == '__main__':
    # Imported locally: the text word2vec format is loaded through
    # KeyedVectors since gensim 1.0 (Word2Vec.load_word2vec_format is gone).
    from gensim.models import KeyedVectors

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # Expect the vectors file path and the query word on the command line.
    if len(sys.argv) < 3:
        # A string argument is written to stderr and yields exit status 1.
        sys.exit('usage: python %s <vectors_file> <word>' % sys.argv[0])
    vectors_path, word = sys.argv[1:3]
    # Load the word vectors from the text-format file on disk.
    vectors = KeyedVectors.load_word2vec_format(vectors_path, binary=False)
    try:
        # Print the words most similar to the query word.
        print(vectors.most_similar(word))
    except KeyError:
        # The query word does not occur in the loaded vocabulary.
        sys.exit('word %r is not in the vocabulary' % word)
更多内容可以查看 gensim 官方文档。