博客内容有空了再补充。先贴代码。
数据地址:链接: https://pan.baidu.com/s/1-RbHi5xxBwJDG1gqAYUReQ 密码: rkup
完整代码如下:
import argparse
import time
# Command-line interface: both corpus paths are supplied as optional flags.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--train',
    help='input a training file',
)
parser.add_argument(
    '--test',
    help='input a testing file',
)
# Parsed once at module level; the result is kept in the global `args`.
args = parser.parse_args()
class POSTagging():
    # ====== Input file handling ======
    def __init__(self, train_path, test_path):
        """Read the training and test files and split each into sentences.

        Both files are opened as UTF-8 text. Every line is stripped of
        surrounding whitespace; a line whose text before the first '/' is
        '###' marks a sentence boundary, and every other non-empty line is
        stored verbatim as one item of the current sentence.

        Args:
            train_path: path of the training file.
            test_path: path of the test file.

        Raises:
            OSError: if either file cannot be opened.

        Attributes set:
            train_sent_lst / test_sent_lst: list of sentences, each sentence
                being a list of the stripped lines that belong to it.
            tags_cnt, words_cnt, tag2num, num2tag, word2num: initialised to
                empty values here; nothing in this constructor fills them.
        """
        # Open both files up front so a bad path fails before any parsing.
        with open(train_path, 'r', encoding='utf8') as f1, \
                open(test_path, 'r', encoding='utf8') as f2:
            self.train_sent_lst = self._split_sentences(f1)
            self.test_sent_lst = self._split_sentences(f2)

        self.tags_cnt, self.words_cnt = 0, 0
        self.tag2num, self.num2tag = {}, []
        self.word2num = {}

    @staticmethod
    def _split_sentences(lines):
        """Group an iterable of raw lines into sentences at '###' separators.

        Args:
            lines: iterable of strings (for example an open text file).

        Returns:
            A list of sentences; each sentence is a list of stripped,
            non-empty lines in their original order.
        """
        sentences, current = [], []
        for raw_line in lines:
            line = raw_line.strip()
            if line.partition('/')[0] == '###':
                # NOTE(review): a separator always flushes the current
                # sentence, so a leading or repeated separator stores an
                # empty list. Kept as-is because code outside this view may
                # depend on it -- confirm before filtering empties out.
                sentences.append(current)
                current = []
            elif line:
                current.append(line)
        # Flush a trailing sentence that is not closed by a separator.
        if current:
            sentences.append(current)
        return sentences
# =========计算概率矩阵==========
def train(self):
emission_cnt = {}
trigram_cnt = {}
self.all_tags = set()
self.all_words = set()
self.all_words.add('UN

该博客虽未补充具体内容,但分享了数据地址及完整代码,涉及Python、算法、NLP、深度学习和动态规划等信息技术领域内容。
最低0.47元/天 解锁文章
980

被折叠的 条评论
为什么被折叠?



