import xlrd
import numpy as np
# Load the vocabulary: the first column of 'Sheet1' holds one word per row.
# NOTE(review): xlrd 2.x reportedly reads only legacy .xls files -- confirm
# that the installed xlrd version can still open this .xlsx workbook.
file_path = '综合类中文词库.xlsx'
data = xlrd.open_workbook(file_path)
table = data.sheet_by_name('Sheet1')
dic_words = table.col_values(0)

# Length of every dictionary entry and the longest one among them.
word_len = list(map(len, dic_words))
max_len = max(word_len)

# Hand-assigned unigram probabilities for the words used in the examples.
word_prob = {
    "北京": 0.03, "的": 0.08, "天": 0.005, "气": 0.005, "天气": 0.06,
    "真": 0.04, "好": 0.05, "真好": 0.04, "啊": 0.01, "真好啊": 0.02,
    "今": 0.01, "今天": 0.07, "课程": 0.06, "内容": 0.06, "有": 0.05,
    "很": 0.03, "很有": 0.04, "意思": 0.06, "有意思": 0.005, "课": 0.01,
    "程": 0.005, "经常": 0.08, "意见": 0.08, "意": 0.01, "见": 0.005,
    "有意见": 0.02, "分歧": 0.04, "分": 0.02, "歧": 0.005,
}

# Every dictionary word without a hand-assigned probability gets a small
# default, so that it can still be chosen as a segment.
word_s_prob = {}
for word in dic_words:
    if word in word_prob:
        continue
    word_s_prob[word] = 0.00001
word_prob.update(word_s_prob)
def word_segment_viterbi(input_str, probs=None, unknown_prob=1e-8):
    """Segment a string into words with the Viterbi algorithm.

    The string is viewed as a DAG whose nodes are the positions
    0..len(input_str). Every substring input_str[j:i] that is a known word
    is an edge j -> i with cost -log(P(word)). The best segmentation is the
    cheapest path from node 0 to node len(input_str), found by dynamic
    programming followed by back-tracking.

    Args:
        input_str: Text to segment, e.g. "今天天气好".
        probs: Mapping from word to unigram probability. When omitted, the
            module-level ``word_prob`` table is used.
        unknown_prob: Probability given to a single character when no known
            word ends at that position. Must be greater than 0.

    Returns:
        The list of segments in reading order, e.g. ["今天", "天气", "好"].
        An empty input yields an empty list.
    """
    if probs is None:
        probs = word_prob

    length = len(input_str)
    # best_cost[i]: lowest total -log probability of segmenting input_str[:i].
    best_cost = [0.0] * (length + 1)
    # prev_node[i]: start index of the last word on the best path to node i.
    prev_node = [0] * (length + 1)

    for end in range(1, length + 1):
        best = None
        for start in range(end):
            prob = probs.get(input_str[start:end])
            # Skip unknown words and non-positive probabilities: log(0) is
            # undefined and would poison the path cost with inf.
            if prob is None or prob <= 0:
                continue
            cost = best_cost[start] - np.log(prob)
            # Strict '<' keeps the first minimum, i.e. the longest word wins
            # on ties.
            if best is None or cost < best:
                best = cost
                prev_node[end] = start
        if best is None:
            # No known word ends here: emit the single character as its own
            # segment instead of failing on an empty candidate set.
            best = best_cost[end - 1] - np.log(unknown_prob)
            prev_node[end] = end - 1
        best_cost[end] = best

    # Walk the back-pointers from the end of the string to the start.
    segments = []
    cur_node = length
    while cur_node > 0:
        start = prev_node[cur_node]
        segments.append(input_str[start:cur_node])
        cur_node = start
    segments.reverse()
    return segments
# Demo: segment a few sample sentences and print each result on its own line.
for sentence in (
    "今天天气好",
    "北京的天气真好啊",
    "今天的课程内容很有意思",
    "经常有意见分歧",
):
    print(word_segment_viterbi(sentence))
['今天', '天气', '好']
['北京', '的', '天气', '真好啊']
['今天', '的', '课程', '内容', '很有', '意思']
['经常', '有意见', '分歧']
- log()的参数不能是0(np.log(0)会得到-inf并给出警告),所以词的概率必须大于0
- 想对某种数据类型进行操作之前,一定要先声明(初始化)该数据类型,
如:先执行 l = {},之后才可以执行 l["aaa"] = 123
- array中的元素类型本身不是int(即使看起来像整数),需要通过array.astype(int)转换成整型
参考文章:https://blog.youkuaiyun.com/u013044310/article/details/88234050
- array可以利用[:,1]这种格式取出一整列的元素,但是list不可以
- dict.values()返回的并不是list,如果要用list的方法操作,记得要用list(dict.values())转换一下
而如果想用array的方法,记得先转成list,即np.array(list(dict.values()))