我最近在做一个自然语言处理的作业:词性标注。
第一步是对词性标注的数据集进行预处理。
下面直接上代码,之后再逐段解释。
# -*- coding: utf-8 -*-
import nltk
def trans_chaToNum(tagged):
    """Encode the tags of a tagged token sequence as integer ids.

    Args:
        tagged: Iterable of ``(word, tag)`` pairs. Only element ``[1]`` of
            each pair is read.

    Returns:
        A ``(Pred, trans_pred)`` tuple. ``Pred`` is a dict mapping every
        distinct tag to an integer id in ``0..k-1``; ``trans_pred`` is a
        list with one id per input pair, in input order.
    """
    # 1. Pull the tag out of every (word, tag) pair.
    pred = [tag[1] for tag in tagged]

    # 2. Number the distinct tags in order of first appearance. Iterating a
    #    set of strings would give an order that changes between interpreter
    #    runs (string hashing is randomized), so ids would not be
    #    reproducible; dict insertion order is.
    Pred = {}
    for label in pred:
        Pred.setdefault(label, len(Pred))

    # 3. Replace every tag in the sequence with its id.
    trans_pred = [Pred[label] for label in pred]
    return Pred, trans_pred
if __name__ == '__main__':
    # Load every line of the training file. The ``with`` block closes the
    # file handle as soon as the lines are read instead of leaking it.
    with open('D:/pythonProject/CwsPosNerCNNRNNLSTM-master/CwsPosNerCNNRNNLSTM-master/traindata.txt', "r",
              encoding='utf-8') as f:
        A = f.readlines()

    # Flat list of (word, tag) pairs collected across all lines, e.g.
    # [('to', 'TO'), ('hold', 'VB'), ('on', 'RP'), ('to', 'TO'), ('its', 'PRP$'), ('paper', 'NN'), ('.', '.')]
    tagged_sent = []
    for a in A:
        # Whitespace-separated tokens of one line, e.g. ['./.']
        sent = a.split()
        for item in sent:
            # Convert one token string into a pair, e.g. './.' -> ('.', '.')
            pair = nltk.str2tuple(item)
            tagged_sent.append(pair)