借鉴3.3.2节的实现,尝试自己实现HMM进行词性标注(语料可选用1998年人民日报词性标注集)
(注意:若没有语料库,可参考 //download.youkuaiyun.com/download/qq_26954059/12185434 ,仅供参考;其实书中也附带了资源)
使用两个hmm模型:一个是基于字的hmm分词模型,另一个是c_hmm词性标注模型。这里不做过多解释,请自己看代码注释并结合书本,加深对hmm模型的理解,需要了解前向算法、后向算法和维特比算法。本代码只是单纯走了一遍语料库,并没有多次迭代求收敛。下面直接上代码。
思路:无论分词还是词性标注,都是先找隐藏状态和可观察状态,在分词中每个字的标签为隐藏状态,字为可观察,而在词性标注时每个词语的词性为隐藏,词语为可观察
#-------------------------------------------------------------------------------
# Name: module1
# Purpose:
#
# Author: nkenen
#
# Created: 12/03/2020
# Copyright: (c) nkenen 2020
# Licence: <your licence>
#-------------------------------------------------------------------------------
class HMM(object):
def __init__(self):
import os
#用于存取算法中间结果,不同每次都训练
self.model_file = 'data/hmm_model.pkl'
self.model_c_file = 'data/hmm_c_model.pkl'
#隐藏状态值集合
self.state_list = ['B','M','E','S']
self.charact_list = ['ag','a','ad','an','b','bg','c','dg','d',
'e','eng','f','g','h','i','j','k','l','m','mg',
'ng','n','na','nr','ns','nt','nx','nz','o','p',
'q','r','rg','s','tg','t','u','vg','v','vd',
'vn','vvn','w','x','y','yg','z']
#参数加载,用于判断是否需要重新加载,model_file
self.load_para = False
self.load_c_para = False
def try_load_c_model(self,trained):
if trained:
import pickle
with open(self.model_c_file,'rb') as f:
#pickle库是数据持续化,把hmm模型反序列导入字典里
self.A_c_dic = pickle.load(f)
self.B_c_dic = pickle.load(f)
self.Pi_c_dic = pickle.load(f)
self.load_c_para = True
else:
#状态转移概率(状态->状态的条件转移概率a[i][j])
self.A_c_dic = {}
#发射概率(该状态中观察到该词语的概率b[j][o[t])
self.B_c_dic = {}
#状态的初始概率(各状态初始出现的概率)
self.Pi_c_dic = {}
self.load_c_para = False
def try_load_model(self,trained):
if trained:
import pickle
with open(self.model_file,'rb') as f:
#pickle库是数据持续化,把hmm模型