源代码和数据集已经上传,见:https://download.youkuaiyun.com/download/pengchengliu/10569142
代码如下:
#读入数据,对数据进行预处理
def prehandle(path='raw_data.txt', train_ratio=0.8):
    """Load a tagged corpus and split it into a training and a test set.

    The file is read as UTF-8 and split on whitespace. Every token is split
    on ``/`` into a list (``word/tag`` becomes ``[word, tag]``). A token
    that contains no ``/`` gets the tag ``'w'`` appended, and a tag equal to
    ``'x'`` is rewritten to ``'n'``.

    The split index starts at ``round(len(dataset) * train_ratio)`` and is
    moved forward to the next token whose word is ``'。'``, so the training
    set ends on that token. If no such token is found before the end of the
    data (or the starting index is already past the end), the whole dataset
    becomes the training set and the test set is empty, instead of raising
    ``IndexError``.

    Args:
        path: Corpus file to read. Defaults to ``'raw_data.txt'``; pass
            another file (e.g. ``'test_data.txt'``) to use a different corpus.
        train_ratio: Approximate fraction of tokens placed in the training
            set before the split point is aligned. Defaults to ``0.8``.

    Returns:
        A ``(trainset, testset)`` tuple; each is a list of token lists.
    """
    # The context manager closes the file even if reading fails.
    with open(path, 'r', encoding="utf-8") as corpus:
        tokens = corpus.read().split()

    # Normalise every token into a [word, tag, ...] list.
    dataset = []
    for token in tokens:
        element = token.split("/")
        if len(element) == 1:
            element.append('w')
        if element[1] == 'x':
            element[1] = 'n'
        dataset.append(element)

    # Nothing to split: avoid indexing into an empty list.
    if not dataset:
        return [], []

    last = len(dataset) - 1
    # Clamp the starting index: round() can yield len(dataset) for short
    # inputs (e.g. round(2 * 0.8) == 2), which would be out of range.
    split = max(0, min(round(len(dataset) * train_ratio), last))
    # Advance to the next '。' token, never running past the final token.
    while split < last and dataset[split][0] != '。':
        split += 1

    return dataset[:split + 1], dataset[split + 1:]
#参数学习
def learnMordel(trainset):
s_num = {}
o_num = {}
pai_num = {}
A_tmp = {} #用于存放每个动词产生多少其他词。每个名次产生多少词。
A_num = {}
B_num = {}
for i in range(len(trainset)):
# s_num {n:200 , v:300 , ...}
# 用于标记每个状态出现的次数
if trainset[i][1] in s_num:
s_num[trainset[i][1]] += 1
else:
s_num[trainset[i][1]] = 1
# o_num {'吃':2 , "跑":30 , ...}
# 用于标记每个观察值出现的次数
if trainset[i][0] in o_num:
o_num[trainset[i][0]] += 1
else:
o_num[trainset[i][0]] = 1
# pai_num
#用于记录句号后面一个词性
if i == 0 or trainset[i - 1][0] == '。