数据源为txt文件,数据条数:20万,有数据重复出现的情况。
#coding=utf8
import os
import codecs
# Hard-coded directory that contains both input text files.
PATH = r"e:/SVN/chocolate_ime/doc"
# Input read by gen_set_word(); that function decodes it as UTF-16.
BASE_FILE = os.path.join(PATH,"HZout_NoTone.txt")#encoding:utf-16
# Input read by gen_set_cizu(); that function decodes it as GBK.
CIZU_FILE = os.path.join(PATH,"Cizu_komoxo95K.txt")#encoding:gbk
#********************set_time_consume_check********************************
def gen_set_cizu(path=None):
    """Build the set of distinct phrases listed in the phrase file.

    Each line is expected to look like ``phrase<TAB>...``; only the text
    before the first tab is kept.  Lines starting with ``;`` are skipped.
    Repeated phrases collapse into a single set entry.

    Args:
        path: File to read, decoded as GBK.  When omitted, the module-level
            ``CIZU_FILE`` is used.

    Returns:
        A set of unicode strings, one per distinct phrase.
    """
    if path is None:
        path = CIZU_FILE
    cizu_set = set()
    with codecs.open(path, encoding="gbk") as f:
        # Iterate the reader directly rather than calling readlines(), so no
        # list of every line in the file has to be built first.
        for line in f:
            if line.startswith(";"):
                continue
            # Drop the line terminator before splitting; otherwise a line
            # without a tab would be stored with its newline attached.
            phrase = line.rstrip("\r\n").split("\t", 1)[0]
            if phrase:  # blank lines / empty first column carry no phrase
                cizu_set.add(phrase)
    return cizu_set
def gen_set_word(path=None):
    """Build the set of distinct entries listed in the base word file.

    Each line is expected to look like ``word<TAB>...``; only the text
    before the first tab is kept.  Repeated words collapse into a single
    set entry.

    Args:
        path: File to read, decoded as UTF-16.  When omitted, the
            module-level ``BASE_FILE`` is used.

    Returns:
        A set of unicode strings, one per distinct word.
    """
    if path is None:
        path = BASE_FILE
    word_set = set()
    with codecs.open(path, encoding="utf-16") as f:
        # Iterate the reader directly rather than calling readlines(), so no
        # list of every line in the file has to be built first.
        for line in f:
            # Drop the line terminator before splitting; otherwise a line
            # without a tab would be stored with its newline attached.
            word = line.rstrip("\r\n").split("\t", 1)[0]
            if word:  # blank lines / empty first column carry no word
                word_set.add(word)
    return word_set
def set_combined():
cizu_