# Install dependency:  pip install gensim
# Usage:               python word2vec.py patent.txt patent.model patent.vector
import logging
import os.path
import sys
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
def _vector_size_kwarg():
    """Return the keyword ``Word2Vec`` accepts for the embedding dimension.

    Gensim 4 renamed the constructor argument ``size`` to ``vector_size``.
    Probing the signature keeps this script working on either side of that
    rename instead of failing with an unexpected-keyword ``TypeError``.
    """
    import inspect  # local import: only needed for this compatibility probe

    params = inspect.signature(Word2Vec.__init__).parameters
    return 'vector_size' if 'vector_size' in params else 'size'


def main(argv=None):
    """Train a word2vec model from a corpus file and save model and vectors.

    Positional command-line arguments (after the program name):
        inp:          pre-segmented (tokenized) corpus, e.g. ``patent.txt``
        output_model: path for the saved model, e.g. ``patent.model``
        output_txt:   path for the saved word vectors, e.g. ``patent.vector``

    Args:
        argv: Full command line as a list, program name first. Defaults to
            ``sys.argv``. Arguments beyond the first three are ignored.

    Returns:
        0 on success, 1 when fewer than three arguments were supplied.
    """
    if argv is None:
        argv = sys.argv

    program = os.path.basename(argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info('running %s', ' '.join(argv))

    if len(argv) < 4:
        # The file has no module docstring, so formatting ``__doc__`` would
        # raise TypeError (None % dict); print an explicit usage line and
        # stop rather than falling through to the argument unpacking below.
        print('Usage: python %s <inp> <output_model> <output_txt>' % program)
        return 1

    inp, output_model, output_txt = argv[1:4]

    # 768-dimensional vectors, context window of 5, sg=1 (skip-gram in
    # gensim's API), and words seen fewer than 2 times are dropped.
    hyper_params = {
        _vector_size_kwarg(): 768,
        'window': 5,
        'sg': 1,
        'min_count': 2,
    }
    model = Word2Vec(LineSentence(inp), **hyper_params)

    # Persist both the full model and the plain-text vector table.
    model.save(output_model)
    model.wv.save_word2vec_format(output_txt, binary=False)
    return 0


if __name__ == '__main__':
    sys.exit(main())