1.word2vec模型训练
import ast
import json
import os

import jieba
import jieba.analyse
import numpy as np
from gensim.models.word2vec import Word2Vec
# Train a character-level word2vec model on the LCQMC sentence pairs plus an
# extra plain-text corpus, then save the model to disk.
dirname = './data/LCQMC'
sentence = []  # every raw sentence of the training corpus
words = []     # one list of characters per sentence (the Word2Vec input)


def _parse_record(line):
    """Parse one LCQMC record into a dict without executing it as code."""
    try:
        return json.loads(line)
    except ValueError:
        # Fall back to Python-literal syntax (e.g. single-quoted dicts), which
        # the previous eval()-based parsing also accepted, but safely.
        return ast.literal_eval(line)


# NOTE(review): corpus files are assumed to be UTF-8 encoded — confirm.
for filename in os.listdir(dirname):
    with open(os.path.join(dirname, filename), 'r', encoding='utf-8') as lcqmc:
        for line in lcqmc:
            if not line.strip():
                continue  # tolerate blank lines instead of failing to parse
            linedict = _parse_record(line)
            sentence.append(linedict['sentence1'])
            sentence.append(linedict['sentence2'])

with open('./data/data_text', 'r', encoding='utf-8') as f:
    for line in f:
        sentence.append(line)
print("data_text size:", len(sentence))  # sentence holds the whole corpus

for string in sentence:
    # Word2Vec expects each sentence as a list of tokens. Feeding it a
    # space-joined string made every second token a space (and kept trailing
    # newlines as tokens), so only real characters are kept here.
    chars = [ch for ch in string if not ch.isspace()]
    if chars:
        words.append(chars)

# size is the dimensionality of the learned character vectors.
# NOTE(review): `size` is the gensim < 4.0 keyword (renamed `vector_size` in
# gensim 4.0); kept as-is to match the gensim version this script targets.
model = Word2Vec(words, size=128, window=4, min_count=1, sg=1, workers=2)
# Store the trained model as data/word2vecModel next to this script.
model.save('./data/word2vecModel')
2.word2vec模型的调用
# Load the previously trained word2vec model from disk.
# NOTE(review): the training script saves to './data/word2vecModel'; this path
# presumably assumes a different working directory — confirm.
model = Word2Vec.load('./gensim_word2vec/data/word2vecModel')
def wordToVector(words, max_len=30, dim=128, w2v=None):
    """Convert sentences into fixed-length sequences of character vectors.

    Args:
        words: sequence of sentences; each sentence is a string or a sequence
            of tokens that are looked up one by one in the model vocabulary.
        max_len: number of vectors produced per sentence (default 30). Longer
            sentences are truncated, shorter ones are padded with zeros.
        dim: length of the zero vector used for padding and for tokens that
            are missing from the vocabulary (default 128, the training size).
        w2v: model to query; defaults to the module-level ``model``.

    Returns:
        A list with one entry per sentence, each entry being a list of
        ``max_len`` numpy arrays.
    """
    if w2v is None:
        w2v = model
    # Look tokens up through `.wv`: indexing the model object directly is
    # deprecated in gensim 3.x and removed in gensim 4.
    vectors = w2v.wv
    result = []
    for senarr in words:
        temp = []
        for token in senarr[:max_len]:
            try:
                vec = vectors[token]
            except KeyError:
                # Token never seen during training: use a zero vector
                # instead of aborting the whole conversion.
                vec = np.zeros(dim, dtype=np.float32)
            temp.append(np.asarray(vec))
        # Pad short sentences with fresh float32 zero vectors so every entry
        # has the same dtype as the model vectors.
        temp.extend(
            np.zeros(dim, dtype=np.float32) for _ in range(max_len - len(temp))
        )
        result.append(temp)
    return result
本文档展示了如何使用Python进行word2vec模型的训练和调用。首先,通过jieba和gensim库读取并处理LCQMC数据集与额外文本数据作为语料,然后使用Word2Vec模型进行训练,并保存模型。接着,详细说明了如何加载已训练好的模型,以及如何将输入的句子转换为固定长度的字向量数组。
5310

被折叠的 条评论
为什么被折叠?



