# The code that raised the error was changed to the following code:
# -*- coding: utf-8 -*-
import json
import codecs
import codecs
class ProcessDgreData:
    """Convert entity-annotated JSON-lines text into character-level BIO records.

    Every non-blank input line is parsed as a JSON object that must provide
    ``originalText`` (the sentence) and ``entities`` (a list of objects with
    ``label_type``, ``start_pos`` and ``end_pos``).  For each record one JSON
    object with the keys ``id``, ``text`` and ``labels`` is appended to the
    output file, one object per line.
    """

    def __init__(self, train_file="./test.json", output_file="./final_test.json"):
        """Store the input and output paths.

        Args:
            train_file: Path of the JSON-lines file to read.
            output_file: Path of the JSON-lines file the BIO records are
                appended to.
        """
        self.train_file = train_file
        self.output_file = output_file

    @staticmethod
    def _bio_labels(sentence, entities):
        """Return one BIO tag per character of ``sentence``.

        ``end_pos`` is treated as exclusive: the last tagged character is
        ``end_pos - 1``.  Entities are applied in order, so a later entity
        overwrites the tags of an earlier overlapping one.

        Raises:
            KeyError: If an entity lacks one of the expected keys.
            IndexError: If an entity position lies outside the sentence.
        """
        labels = ["O"] * len(sentence)
        for entity in entities:
            tag = entity['label_type']
            start = entity['start_pos']
            last = entity['end_pos'] - 1
            labels[start] = "B-" + tag
            for pos in range(start + 1, last + 1):
                labels[pos] = "I-" + tag
        return labels

    def get_ner_data(self):
        """Read ``self.train_file`` and append BIO records to ``self.output_file``.

        Record ids are ``"BIO"`` followed by the 1-based line number of the
        input line.  Blank lines are skipped instead of being handed to the
        JSON parser.  The sentence length, the label count and a separator
        line are printed for every record.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record or entity lacks an expected key.
        """
        # Distinct handle names: the input and output streams must not shadow
        # each other, and the output is opened once rather than once per record.
        with open(self.train_file, 'r', encoding='utf-8') as src, \
                open(self.output_file, 'a', encoding='utf-8') as dst:
            for num, line in enumerate(src, 1):
                if not line.strip():
                    continue
                record = json.loads(line)
                sentence = record['originalText']
                labels = self._bio_labels(sentence, record['entities'])
                print(len(sentence))
                print(len(labels))
                print('=========================')
                bio_record = {
                    "id": "BIO" + str(num),
                    "text": list(sentence),
                    "labels": labels,
                }
                json.dump(bio_record, dst, ensure_ascii=False)
                dst.write('\n')
if __name__ == "__main__":
    # Script entry point: build the converter with its default settings and
    # run the conversion once.
    ProcessDgreData().get_ner_data()