#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/7/26 9:40
# @version : python3.6
# @File : generate_datas.py.py
# @Software: PyCharm
__author__ = 'xiaohu'

# Hidden-state labels, one character each. Order is significant:
# generate_transition_probability maps the i-th count column of a row to
# hidden_states[i].
hidden_states = list("ABCDFGIJKLMPSWXZ")
def generate_transition_probability():
    """Write the state-transition probability file from the raw count matrix.

    Reads ``./data/nt.tr.txt``, skips its first line, and parses every
    remaining line as ``first_state,count_0,count_1,...``. The i-th count is
    attributed to ``hidden_states[i]`` and divided by the sum of the row's
    counts. The result is written to ``./data/transition_probability.txt``,
    one ``first_state,second_state,probability`` line per transition.

    A row whose counts sum to zero produces probability 0.0 for each of its
    transitions instead of raising ``ZeroDivisionError``.

    :return: None
    :raises ValueError: if a count field is not an integer literal.
    :raises IndexError: if a row has more count columns than ``hidden_states``
        has entries.
    """
    result = []
    with open('./data/nt.tr.txt', mode='r') as file:
        all_data = file.readlines()

    # The first line is skipped (presumably a header row - confirm against
    # the data file).
    for line in all_data[1:]:
        split_line = line.strip().split(',')
        first_state = split_line[0]
        # Parse each count once; the same values feed both the row total and
        # the per-transition numerator.
        counts = [int(field) for field in split_line[1:]]
        row_total = sum(counts)
        for index, count in enumerate(counts):
            second_state = hidden_states[index]
            # Guard against an all-zero row: no observed transitions means
            # every probability for this first state is 0.0.
            probability = float(count) / row_total if row_total else 0.0
            result.append([first_state, second_state, probability])

    # All rows are parsed before the output file is opened, so a malformed
    # input line never leaves a half-written output behind.
    with open('./data/transition_probability.txt', mode='w') as out_file:
        for first_state, second_state, probability in result:
            out_file.write('%s,%s,%s\n' % (first_state, second_state, probability))
    print('generate transition_probability.txt')
def _parse_count(token):
    """Convert a numeric text field to ``int`` when possible, else ``float``."""
    try:
        return int(token)
    except ValueError:
        return float(token)


def generate_initial_vector():
    """Write the initial probability vector (pi) of the hidden states.

    Reads ``./data/nt.txt``. On each line the first space-separated token is
    ignored here; the remaining tokens are consumed as alternating
    ``state count`` pairs. Counts are accumulated per state (every entry of
    ``hidden_states`` starts at 0) and into a grand total.

    The result is written to ``./data/initial_vector.txt``, one
    ``state,count,probability`` line per state, where ``probability`` is the
    state's count divided by the grand total. When the grand total is zero
    (for example an empty input file) every probability is written as 0.0
    instead of raising ``ZeroDivisionError``.

    Counts are parsed with ``int``/``float`` rather than ``eval`` so that text
    in the data file is never executed as Python code.

    :return: None
    :raises KeyError: if a state token is not one of ``hidden_states``.
    :raises IndexError: if a line ends with a state that has no count.
    :raises ValueError: if a count token is not numeric.
    """
    state_counts = {state: 0 for state in hidden_states}
    total = 0
    with open('./data/nt.txt', mode='r') as file:
        for line in file:
            tokens = line.strip().split(' ')[1:]
            # Tokens come in (state, count) pairs, hence the stride of 2.
            for index in range(0, len(tokens), 2):
                state = tokens[index]
                count = _parse_count(tokens[index + 1])  # parsed once, used twice
                state_counts[state] += count
                total += count

    with open('./data/initial_vector.txt', mode='w') as initial_file:
        for state, count in state_counts.items():
            probability = float(count) / total if total else 0.0
            initial_file.write('%s,%s,%s\n' % (state, count, probability))
    print('generate initial_vector.txt')
# Lists are convenient for storing records; dicts are convenient for the lookups used in the algorithm's expressions.
def generate_emit_probability():
    """Write the emission (observation) probability file.

    Reads ``./data/nt.txt``. On each line the first space-separated token is
    the observation and the remaining tokens are alternating ``state count``
    pairs. For every pair the probability is
    ``count / total frequency of that state``, where the totals come from
    ``get_initial_freq()``.

    The result is written to ``./data/emit_probability.txt``, one
    ``hidden_state,observation,probability`` line per pair.

    Frequencies are converted with ``float`` rather than ``eval``: the values
    come from a file and must not be executed as code, and ``float`` also
    accepts the integer ``0`` default that ``get_initial_freq`` leaves for
    states missing from its file (``eval`` raised ``TypeError`` on it).

    :return: None
    :raises KeyError: if a state in ``nt.txt`` has no recorded frequency.
    :raises ZeroDivisionError: if a state seen in ``nt.txt`` has frequency 0.
    :raises ValueError: if a count or frequency field is not numeric.
    """
    # Convert every frequency once up front instead of once per pair.
    state_totals = {
        state: float(frequency)
        for state, frequency in get_initial_freq().items()
    }

    result = []
    with open('./data/nt.txt', mode='r') as file:
        for line in file:
            split_line = line.strip().split(' ')
            observation = split_line[0]
            states_and_counts = split_line[1:]
            # Tokens come in (state, count) pairs, hence the stride of 2.
            for index in range(0, len(states_and_counts), 2):
                state = states_and_counts[index]
                count = float(states_and_counts[index + 1])
                result.append([state, observation, count / state_totals[state]])

    with open('./data/emit_probability.txt', mode='w') as emit_file:
        for state, observation, probability in result:
            emit_file.write('%s,%s,%s\n' % (state, observation, probability))
    print('generate: emit_probability.txt')
def get_initial_freq():
    """Load the per-state frequency column of ``./data/initial_vector.txt``.

    Each line of the file is split on commas; the first field is used as the
    key and the second field as the value.

    :return: dict keyed by state label. Values read from the file are kept as
        strings exactly as they appear; any entry of ``hidden_states`` that
        the file does not mention keeps the integer default ``0``.
    """
    frequencies = dict.fromkeys(hidden_states, 0)
    with open('./data/initial_vector.txt', mode='r') as file:
        for raw_line in file:
            fields = raw_line.strip().split(',')
            label = fields[0]
            frequencies[label] = fields[1]
    return frequencies
if __name__ == '__main__':
    # Order matters: generate_emit_probability reads initial_vector.txt
    # (through get_initial_freq), which generate_initial_vector writes.
    for build_step in (
        generate_transition_probability,
        generate_initial_vector,
        generate_emit_probability,
    ):
        build_step()
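# Source article: nlp之命名实体识别HMM方法(1) (NLP named entity recognition: the HMM method, part 1)
# Web page footer: latest recommended article published 2025-04-23 21:55:13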