数据格式:(词,拼音,词频)
的 de 148709248
的 di 1193135
了 le 62873377
了 liao 3199200
是 shi 62432861
一 yi 58994539
不 bu 57479625
不 fou 1136895
提取文件中每个多音字词频最高的拼音并返回
def chose_high_freq_word():
    """Return the most frequent pinyin of every polyphonic word in dict.txt.

    The dictionary file ``dict.txt`` is looked up in the directory that
    contains this module.  Lines starting with ``;`` and blank lines are
    skipped; every other line must consist of exactly three tab-separated
    fields: word, pinyin, frequency.

    Returns:
        dict: maps each word that occurs with more than one reading to a
        one-element list ``[pinyin]`` holding the reading with the highest
        frequency, with all digits removed from the pinyin.  On equal
        frequencies the reading that appears first in the file wins.

    Raises:
        ValueError: if ``dict.txt`` does not exist, if a data line does not
            have exactly three tab-separated fields, or if the frequency of
            a polyphonic word is not an integer.
    """
    filepath = os.path.dirname(os.path.abspath(__file__))
    filename = os.path.join(filepath, "dict.txt")
    if not os.path.isfile(filename):
        raise ValueError("No such file:{}".format(filename))

    # word -> [[pinyin, freq], ...] in file order, for every word seen.
    readings_by_word = {}
    # Subset of the above: only words with at least two readings.  The
    # values are the very same list objects, so later readings show up here.
    polyphonic = {}
    with codecs.open(filename, encoding="utf-8") as f:
        for line in f:
            # Skip comment lines and blank lines (e.g. a trailing newline
            # at the end of the file) instead of failing on them.
            if line.startswith(";") or not line.strip():
                continue
            fields = line.split("\t")
            if len(fields) != 3:
                raise ValueError(
                    "expected 3 tab-separated fields, got {} in file {}: {!r}"
                    .format(len(fields), filename, line)
                )
            word, pinyin, freq = fields
            readings = readings_by_word.setdefault(word, [])
            readings.append([pinyin, freq.strip()])
            if len(readings) == 2:
                # Second reading seen: the word is polyphonic.
                polyphonic[word] = readings

    digits = re.compile(r"\d")
    result = {}
    for word, readings in polyphonic.items():
        # max() keeps the first maximal entry, so ties go to the reading
        # that appears first in the file.
        best_pinyin = max(readings, key=lambda entry: int(entry[1]))[0]
        result[word] = [digits.sub("", best_pinyin)]
    return result