1 word2vec训练
#coding=utf-8
import codecs

import jieba
import jieba.posseg as pseg
import smart_open
import xlrd
from gensim.models import Word2Vec


def _segment_file(path):
    """Segment every line of the UTF-8 text file *path* with jieba.

    Returns a list with one entry per input line, in file order; each entry
    is the list of word strings produced for that line. A blank line yields
    an empty list, so the result has exactly as many entries as the file has
    lines.
    """
    segmented = []
    with open(path, mode="r", encoding="utf-8") as f:
        for line in f:
            line = line.replace("\n", "").strip()
            # pseg.cut yields items carrying a .word attribute; only the
            # word text is kept.
            segmented.append([element.word for element in pseg.cut(line)])
    return segmented


# Read the corpus files
def read_file():
    """Segment the two corpus files and return the combined sentences.

    Loads the user dictionary "dic.txt" into jieba, segments "07.txt" and
    then "zhishiku.txt" line by line, prints the running sentence count after
    each file, and writes the result to "seg_result.txt" (one sentence per
    line, words separated by single spaces).

    Returns:
        A list of lists of str: one list of segmented words per input line.
    """
    jieba.load_userdict("dic.txt")

    user_query_segment_list = []
    for path in ("07.txt", "zhishiku.txt"):
        user_query_segment_list.extend(_segment_file(path))
        # Cumulative number of sentences read so far.
        print(len(user_query_segment_list))

    with open("seg_result.txt", mode="w", encoding="utf-8") as fw:
        for element in user_query_segment_list:
            fw.write(" ".join(element).strip() + "\n")

    return user_query_segment_list


def export_to_file(model, output_file):
    """Write every vocabulary word of *model* and its vector as plain text.

    Each output line is the word followed by its vector components, all
    separated by single spaces. The file is written as UTF-8.

    Args:
        model: object exposing ``wv.vocab`` (iterable of words) and
            ``wv[word]`` (iterable of vector components).
        output_file: path of the text file to create or overwrite.
    """
    print('done loading Word2Vec')
    # NOTE(review): `wv.vocab` is the gensim 3.x attribute; gensim 4 replaced
    # it with `wv.key_to_index` - confirm against the installed version.
    vocab = model.wv.vocab
    # The context manager closes the file even if a lookup or write fails.
    with codecs.open(output_file, 'w', 'utf-8') as output:
        for mid in vocab:
            # Index through `model.wv`; indexing the model object directly
            # is deprecated in gensim 3.x and removed in gensim 4.
            vector_str = " ".join(str(dimension) for dimension in model.wv[mid])
            output.write(mid + " " + vector_str + "\n")


if __name__ == '__main__':
    # user_query_list is a list of lists holding the segmented sentences.
    user_query_list = read_file()

    # The training code below is kept commented out for reference. The
    # export_to_file call stays commented with it because `model` is only
    # defined by that code.
    #
    # model = Word2Vec(user_query_list, size=100, window=5, min_count=1, workers=4, iter=10)
    # # model.save('/tmp/MyModel')
    # # model.save_word2vec_format('mymodel2.bin', binary=True)
    # model.wv.save_word2vec_format('mymodel3.bin', binary=True)
    # # print(model.wv.most_similar("公积金"))
    # export_to_file(model, "word2vec_by_gensim_ly_train_dev_test_0702.txt")
    #
    # model = Word2Vec()
    # model.build_vocab(user_query_list)
    # model.train(user_query_list, total_examples = model.corpus_count, epochs = 10)
    # print(model.wv.most_similar("公积金"))
    # model.wv.save_word2vec_format('mymodel2.bin', binary=True)
    # # model.save('mymodel2.bin')
    #
    # Reference: https://www.jianshu.com/p/05fb666a72c4
2 word2vec使用
import gensim
from gensim.models import word2vec

# Load previously trained word vectors from a word2vec-format file.
model = gensim.models.KeyedVectors.load_word2vec_format("skipgram_200_mincount10.vec")

# Print the similarity score of each word pair, one score per line.
for left, right in (("日期", "时间"), ("日期", "延期")):
    print(model.similarity(left, right))
https://blog.csdn.net/qq_41814556/article/details/80990976
3 fasttext使用