1.xml格式转txt格式
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 修改后的代码如下:
import logging
import os.path
import sys
from gensim.corpora import WikiCorpus
def tokens_to_line(tokens):
    """Join one article's tokens into a single newline-terminated line.

    Args:
        tokens: iterable of tokens for one article. Each token may be a
            UTF-8 encoded ``bytes`` object or a ``str``.
            NOTE(review): which of the two ``WikiCorpus.get_texts()``
            yields appears to depend on the installed gensim version --
            confirm; both are accepted here.

    Returns:
        The tokens separated by single spaces, followed by ``"\\n"``.
    """
    words = [
        tok.decode('utf8') if isinstance(tok, bytes) else tok
        for tok in tokens
    ]
    return ' '.join(words) + "\n"


if __name__ == '__main__':
    # Convert a Wikipedia XML dump into a plain-text file with one article
    # per line (tokens separated by spaces).
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        # This script has no module docstring, so formatting __doc__ (None)
        # would raise TypeError; print an explicit usage line instead.
        print("usage: python %s <input.xml.bz2> <output.txt>" % program)
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    i = 0
    # The context manager closes the output file even if parsing fails.
    with open(outp, 'w', encoding='utf-8') as output:
        # NOTE(review): the `lemmatize` keyword is believed to exist only in
        # gensim releases before 4.0 -- confirm against the installed version.
        wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
        for text in wiki.get_texts():
            output.write(tokens_to_line(text))
            i += 1
            if i % 10000 == 0:
                logger.info("Saved %d articles", i)
    logger.info("Finished Saved %d articles", i)
#python process.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.text
在cmd中当前目录执行命令:
python process.py 文件名.xml.bz2 输出名.text
2.查看数据,转换为简体数据格式
opencc下载地址:https://download.youkuaiyun.com/download/weixin_43746433/11393396
https://blog.youkuaiyun.com/weixin_43746433/article/details/96838330
3.构造中文维基百科数据的word2vec词向量模型
import logging
import os.path
import sys
import multiprocessing
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
if __name__ == '__main__':
    # Train a Word2Vec model on a pre-segmented text corpus (one sentence
    # per line) and save both the full model and its word vectors.
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)

    # Input/output paths are fixed here; command-line arguments are not read.
    inp = './data/zh.jian.wiki.seg-1.3g.txt'   # segmented corpus
    outp1 = 'wiki.zh.text1.model'              # full model (Word2Vec.save)
    outp2 = 'wiki.zh.text.vector'              # vectors in word2vec text format

    # NOTE(review): `size` is the vector-dimension keyword this file uses;
    # newer gensim releases are believed to call it `vector_size` -- confirm
    # against the installed version.
    model = Word2Vec(
        LineSentence(inp),
        size=400,
        window=5,
        min_count=5,
        workers=multiprocessing.cpu_count(),
    )
    model.save(outp1)
    # The word vectors live on `model.wv`; the model has no `.model`
    # attribute, so the previous `model.model.wv` raised AttributeError.
    model.wv.save_word2vec_format(outp2, binary=False)
#python word2vec_model.py zh.jian.wiki.seg.txt wiki.zh.text.model wiki.zh.text.vector#模型的向量(注意:上面的脚本已将输入/输出路径写死,这些命令行参数不会被读取)
4. 测试模型相似度结果
from gensim.models import Word2Vec
# Load a saved Word2Vec model and print the nearest neighbours of a few
# probe words as a quick sanity check of the embedding quality.
# NOTE(review): confirm this path matches the file the training step wrote
# with Word2Vec.save().
en_wiki_word2vec_model = Word2Vec.load('./data/wiki.zh.text.model')
testwords = ['苹果', '数学', '学术', '白痴', '篮球']
for word in testwords:
    # Similarity queries go through the KeyedVectors object (`.wv`);
    # calling most_similar directly on the model is deprecated.
    res = en_wiki_word2vec_model.wv.most_similar(word)
    print(word)
    print(res)