【word2vec Example 1】

This post walks through training a Word2Vec model on a large Chinese corpus, recording each step from preprocessing the raw text to building and training the model. The corpus used here is a set of questions, and the end goal is a word-vector model that captures Chinese semantics.

# -*- coding: utf-8 -*-
import wordcut
import create_dict
import vectorize
import classify
import pickle
import psutil
import parameters
import os
from collections import deque
import gensim
import numpy as np
import csv
import processHandler
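# wordcut, create_dict, vectorize, classify, parameters and processHandler appear to be
# project-local helper modules from the author's repository, not public packages.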


def cos_sim(arrA, arrB):
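    """Cosine similarity between two 1-D numpy vectors (assumes non-zero norms)."""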
    return arrA.dot(arrB)/(np.linalg.norm(arrA)*np.linalg.norm(arrB))


# def compare(word_cutter,vectorizer,sentenceA,sentenceB):
#     tokenA = word_cutter.word_cut(sentenceA) # word_cuttter returns a list
#     tokenB = word_cutter.word_cut(sentenceB)
#     vectorA = np.array(vectorizer.vectorize(tokenA))
#     vectorB = np.array(vectorizer.vectorize(tokenB))
#     return cos_sim(vectorA,vectorB)


def main():
    current_dir = os.path.abspath('.')
    stopword_set = set()
    parameter = parameters.Parameters(os.path.join(current_dir, 'config.ini'), stopword_set)
    # w2v_training = os.path.join(current_dir, 'training_set20170810.csv')
    # pickle_path = os.path.join(current_dir, 'corpus')
    sentence_list = deque()
    # answer_path = os.path.join(current_dir, 'answer.csv')
    question_path = os.path.join(current_dir, 'question.csv')
    # concept_des_path =os.path.join(current_dir, 'concept_description.csv')
    w2v_file = os.path.join(current_dir, 'w2v_file_2017012.bin')
    word_cutter = wordcut.WordCutter(overlapping=parameter.overlapping)
    # try using stop-words
    preprocessor = processHandler.Prerocessor(False, False, False)
    # file = open(pickle_path,'rb')
    # pklist = pickle.load(file)
    # file.close()
    trainingset = []
    # i = 0
    # for row in pklist:
    #     temp = row[1].replace('\r','')
    #     temp = temp.replace('\n', '')
    #     trainingset.append(temp)
    #     temp = row[2].replace('\r','')
    #     temp = temp.replace('\n', '')
    #     trainingset.append(temp)
    #     i += 1
    #     if i > 100000:
    #         break
    # del pklist
    # file = open(answer_path,encoding='gb18030')
    # cache = file.readlines()
    # file.close()
    # for item in cache:
    #     temp = item.replace('\n','')
    #     temp = temp.replace('\r', '')
    #     trainingset.append(temp)
    with open(question_path, encoding='gb18030') as file:
        cache = file.readlines()
    for item in cache:
        temp = item.replace('\n','')
        temp = temp.replace('\r', '')
        trainingset.append(temp)
    # file = open(concept_des_path,encoding='gb18030')
    # cache = file.readlines()
    # file.close()
    # for item in cache:
    #     temp = item.replace('\n','')
    #     temp = temp.replace('\r', '')
    #     trainingset.append(temp)
    # del cache
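    # Keep only lines that contain Chinese characters and split them into
    # sentences on sentence-ending punctuation before feeding them to Word2Vec.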
    while len(trainingset) > 0:
        contain_chinese = False
        last = trainingset.pop()
        for item in last:
            if word_cutter.is_chinese(item):
                contain_chinese = True
                break
        if contain_chinese:
            temp = last
            # replace sentence-ending punctuation (full-width 。！？ and ASCII ! ?) with spaces
            for symbol in (u'。', u'！', u'？', '!', '?'):
                temp = temp.replace(symbol, ' ')
            temp = temp.split()
            for sentence in temp:
                #print(sentence)
                sentence_list.append(sentence)
    del trainingset
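    # Tokenize each sentence with the preprocessor; progress is reported every 10,000 sentences.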
    sentence_token = deque()
    total = len(sentence_list)
    i = 0
    while len(sentence_list) > 0:
        i += 1
        #print(item)
        #print(preprocessor.process_main(item))
        temp = preprocessor.process_main(sentence_list.pop())[-1]
        if temp is not None:
            #print(temp)
            sentence_token.append(temp)
        if i >= 10000:
            print([len(sentence_list), total])
            i = 0
    # Train Word2Vec (gensim pre-4.0 API: size = vector dimensionality, iter = epochs,
    # workers = parallel threads; sg=1 would select skip-gram, the default sg=0 is CBOW).
    dic = gensim.models.Word2Vec(sentence_token, size=parameter.n_neuron, workers=3, seed=1024, iter=20)
    # dic = gensim.models.Doc2Vec(sentence_token, size=parameter.n_neuron, workers=3, seed=1024, iter=20)
    sentence_token = deque()
    dic.save(w2v_file)
    # dic.save_word2vec_format(w2v_file, binary=True)



if __name__ == '__main__':
    main()
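
Once the script has finished, the saved model can be reloaded and queried. The snippet below is a minimal sketch rather than part of the original script: it assumes the same pre-4.0 gensim API used above, the w2v_file_2017012.bin file written by main(), and example query words (问题, 答案) that may or may not be in the vocabulary of your corpus.

# Minimal usage sketch: reload the trained model and query it (run separately after training).
import os
import gensim

current_dir = os.path.abspath('.')
model = gensim.models.Word2Vec.load(os.path.join(current_dir, 'w2v_file_2017012.bin'))

# Nearest neighbours of an example word, if it made it into the vocabulary.
if u'问题' in model.wv.vocab:
    print(model.wv.most_similar(u'问题', topn=5))

# Built-in cosine similarity between two in-vocabulary words,
# equivalent to applying cos_sim() to model.wv[u'问题'] and model.wv[u'答案'].
if u'问题' in model.wv.vocab and u'答案' in model.wv.vocab:
    print(model.wv.similarity(u'问题', u'答案'))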