前言:隐马尔可夫模型(HMM)的相关概念这里不再赘述,其参数可以表示为一个三元组 (A, B, Pi),分别表示状态转移概率、发射概率和初始状态概率。
首先是语料库的训练部分:
#!/usr/bin/python
#-*-coding:utf-8
import sys
import math
import pdb
# --- Hidden-state bookkeeping ------------------------------------------------
# Tag set used by the segmenter: B = beginning character of a word,
# M = middle character, E = ending character, S = single-character word.
state_list = ["B", "M", "E", "S"]
state_M = 4  # equals len(state_list); presumably the hidden-state count
word_N = 0  # presumably a distinct-character counter -- confirm at use site
line_num = -1  # presumably a corpus line index -- confirm at use site

# --- Accumulators for the HMM parameters (A, B, Pi) --------------------------
# All start empty and are filled in elsewhere in this file.
#   A_dic  : transition table, state -> state -> value
#   B_dic  : emission table, keyed by state
#   Pi_dic : initial-state table, keyed by state
A_dic, B_dic, Pi_dic = {}, {}, {}
Count_dic = {}  # per-state counter, keyed by state
word_set = set()  # presumably the characters seen so far -- confirm at use site

# --- File names --------------------------------------------------------------
# Training corpus: the People's Daily corpus, already segmented by hand.
INPUT_DATA = "RenMinData.txt"
PROB_START = "prob_start.py"  # initial-state probabilities
PROB_EMIT = "prob_emit.py"  # emission probabilities
PROB_TRANS = "prob_trans.py"  # transition probabilities
def init():
    """Reset the module-level HMM tables to their zero state.

    For every tag in ``state_list`` this function, in place:

    * sets ``A_dic[tag]`` to a fresh dict mapping every tag to ``0.0``,
    * sets ``Pi_dic[tag]`` to ``0.0``,
    * sets ``B_dic[tag]`` to a fresh empty dict,
    * sets ``Count_dic[tag]`` to ``0``.

    Takes no arguments and returns ``None``.
    """
    # The tables are only mutated, never rebound, so no ``global``
    # declarations are required here.
    for state in state_list:
        # dict.fromkeys builds a new dict per state; sharing the 0.0 value
        # between keys is harmless because floats are immutable.
        A_dic[state] = dict.fromkeys(state_list, 0.0)
        Pi_dic[state] = 0.0
        B_dic[state] = {}
        Count_dic[state] = 0
# Input: a word; output: its state tags. B = beginning character, M = middle character, E = ending character, S = single-character word.
def getList(input_str):
outpout_str = []
if len(input_str) == 1:
outpout_str.append('S')
elif len(input_str) == 2:
outpout_str = ['B','E