1.原始数据集格式2.转换后的数据集格式
{"id": "BIO0", "text": ["记", "得", "小", "时", "候", ",", "妈", "妈", "说", "起", "哪", "个", "典", "型", "败", "家", "子", "形", "象", ",", "挂", "在", "嘴", "边", "的", "就", "是", "那", "人", "吃", "喝", "嫖", "赌", "瘾", "五", "毒", "俱", "全", "。"], "labels": ["O", "O", "B-时间", "I-时间", "I-时间", "O", "B-人物", "I-人物", "O", "O", "O", "O", "O", "O", "B-人物", "I-人物", "I-人物", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-人物", "I-人物", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
3.具体代码
# -*- coding: utf-8 -*-
import json
import codecs
import codecs
def split_list_into_chunks(input_list, chunks):
"""
将列表input_list分割成chunks个子列表。
参数:
input_list -- 要分割的原始列表。
chunks -- 分割成的子列表数量。
返回:
一个包含所有子列表的列表。
"""
# 计算每个子列表的理想长度
n = len(input_list) // chunks
# 计算是否有剩余元素需要分配
remainder = len(input_list) % chunks
# 初始化结果列表
result = []
start = 0
for i in range(chunks):
# 如果有剩余元素,当前子列表多一个元素
end = start + n + (1 if i < remainder else 0)
# 从原列表中切片并添加到结果列表
result.append(input_list[start:end])
# 更新下一次切片的起始位置
start = end
return result
class ProcessDgreData:
def __init__(self):
self.train_file ="./data/dev.json"
def get_ner_data(self):
with open(self.train_file,'r',encoding='utf-8') as fp:
dic = json.load(fp)
for num,i in enumerate(dic):
sentence_list = []
h_list= []
e_list = []
type_list = []
sentence = i['sentence']
entities = i['entities']
tmp = {}
label_list = ["O"] * len(sentence)
for c in sentence:
sentence_list.append(c)
for j in entities:
name = j['name']
label = j['type']
h_start = j['pos'][0]
h_end = j['pos'][1] -1
h_list.append(h_start)
e_list.append(h_end)
type_list.append(label)
for i,j,k in zip(h_list,e_list,type_list):
# print(i,j,k)
label_list[i] = "B-" + k
for i in range(i + 1, j + 1):
label_list[i] = "I-" + k
# print(len(sentence))
# print(len(label_list))
# print(sentence)
# print(label_list)
# print('=========================')
tmp["id"] = "BIO" + str(num)
tmp["text"] = sentence_list
tmp["labels"] = label_list
with open('./final_data/dev.json','a',encoding='utf-8') as fp:
json.dump(tmp,fp,ensure_ascii=False)
fp.write('\n')
if __name__ == "__main__":
processDgreData = ProcessDgreData()
processDgreData.get_ner_data()