说明
此项目需要的数据:
- dic.xlsx: 词典包含了中文词,当做词典来用
- 以变量的方式提供了部分unigram概率 word_prob
基于枚举方法来搭建中文分词工具
import xlrd
import math
# Load the dictionary of Chinese words from dic.xlsx: the first sheet
# holds one word per row in column 0; every word is appended to dic_words.
print("Reading dic...")
# Open the workbook (xlrd Book object).
workbook = xlrd.open_workbook("dic.xlsx")
# The word list lives on the first (index 0) worksheet.
booksheet = workbook.sheet_by_index(0)
# Collect column 0 of every row; a comprehension replaces the manual
# append loop for clarity and speed.
dic_words = [row[0].value for row in booksheet.get_rows()]
print("len:" + str(len(dic_words)))
# Unigram probability for a small subset of words (for simplicity only a
# handful are listed).  Any word that is in the dictionary but absent from
# this table is assigned a uniform default probability of 0.00001,
# e.g. p("学院") = p("概率") = ... = 0.00001.
word_prob = {
    "北京": 0.03, "的": 0.08, "天": 0.005, "气": 0.005, "天气": 0.06,
    "真": 0.04, "好": 0.05, "真好": 0.04, "啊": 0.01, "真好啊": 0.02,
    "今": 0.01, "今天": 0.07, "课程": 0.06, "内容": 0.06, "有": 0.05,
    "很": 0.03, "很有": 0.04, "意思": 0.06, "有意思": 0.005, "课": 0.01,
    "程": 0.005, "经常": 0.08, "意见": 0.08, "意": 0.01, "见": 0.005,
    "有意见": 0.02, "分歧": 0.04, "分": 0.02, "歧": 0.005,
}
# TODO:计算-log(x)
for word in word_prob.keys():
word_prob[word]= round(-math.log(w