[BERT Series] Named Entity Recognition

This article walks through named entity recognition with a BERT model, covering the key steps of environment setup, data preparation, model training, and evaluation, and ends with high-accuracy entity recognition results on the validation set.


This is the second article in the BERT-in-practice series; it uses BERT for named entity recognition (a sequence labeling task).

1. Preparation

1.1 Environment

  • python 3.7
  • pytorch 1.3
  • transformers 2.3 (installation tutorial); a quick version check is sketched after this list.
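
A minimal sketch for confirming the environment roughly matches; the version numbers are simply what this article assumes:

```python
# Sanity-check the library versions this article assumes.
import torch
import transformers

print('torch        :', torch.__version__)          # expected around 1.3
print('transformers :', transformers.__version__)   # expected around 2.3
print('CUDA available:', torch.cuda.is_available())
```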

1.2 Data

  • Data download: https://pan.baidu.com/s/1spwmV3_07U0HA9mlde2wMg (extraction code: reic); the record layout the code expects is sketched below.
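
The training script below reads `train.json`, `text.json`, and `label.json` from `./data/names/`. A minimal sketch of the record layout it assumes (field names come from the code; the concrete tag set and entities are only illustrative):

```python
# label.json: a two-element list [id2label, label2id]; id2label keys are string ids.
label_json = [
    {"0": "O", "1": "B-PER", "2": "I-PER"},   # id2label (hypothetical tag set)
    {"O": 0, "B-PER": 1, "I-PER": 2},         # label2id
]

# train.json: a list of samples with one BIO label per character of `text`.
train_sample = {
    "text": "张三去了北京",
    "labels": ["B-PER", "I-PER", "O", "O", "O", "O"],
}

# text.json (evaluation set): each sample also carries gold entities as (text, type) pairs.
eval_sample = {
    "text": "张三去了北京",
    "entities": [["张三", "PER"]],
}
```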

2. Implementation

2.1 Training code


import json
import logging
import time

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler
from tqdm import tqdm
from transformers import (AdamW, BertConfig, BertForTokenClassification,
                          BertTokenizer, get_linear_schedule_with_warmup)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

lr = 5e-5
max_length = 256
batch_size = 8
epoches = 20
cuda = True
# cuda = False
max_grad_norm = 1
warmup_steps = 3000
train_steps = 60000
train_dataset_file_path = './data/names/train.json'
eval_dataset_file_path = './data/names/text.json'

tokenizer = BertTokenizer('./bert_model/vocab.txt')

with open('./data/names/label.json', mode='r', encoding='utf8') as f:
    id2label, label2id = json.load(f)


# Build the attention mask: 1 for real tokens, 0 for padding
def get_atten_mask(tokens_ids, pad_index=0):
    return list(map(lambda x: 1 if x != pad_index else 0, tokens_ids))


class NerDataSet(Dataset):

    def __init__(self, file_path):
        token_ids = []
        token_attn_mask = []
        token_seg_type = []
        labels = []

        with open(file_path, mode='r', encoding='utf8') as f:
            data_set = json.load(f)
            # data_set = data_set[:5]  # uncomment to smoke-test on a few samples

        for data in data_set:
            text = data['text']
            tmp_token_ids = tokenizer.encode(text, max_length=max_length, pad_to_max_length=True)
            if len(text) < max_length - 2:
                tmp_labels = [label2id['O']] + [label2id[item] for item in data['labels']] + [label2id['O']] * (
                        max_length - len(data['labels']) - 1)
            else:
                tmp_labels = [label2id['O']] + [label2id[item] for item in data['labels']][:max_length - 2] + [
                    label2id['O']]
            tmp_token_attn_mask = get_atten_mask(tmp_token_ids)
            tmp_seg_type = tokenizer.create_token_type_ids_from_sequences(tmp_token_ids[1:-1])
            token_ids.append(tmp_token_ids)
            token_attn_mask.append(tmp_token_attn_mask)
            token_seg_type.append(tmp_seg_type)
            labels.append(tmp_labels)

        self.TOKEN_IDS = torch.from_numpy(np.array(token_ids)).long()
        self.TOKEN_ATTN_MASK = torch.from_numpy(np.array(token_attn_mask)).long()
        self.TOKEN_SEG_TYPE = torch.from_numpy(np.array(token_seg_type)).long()
        self.LABELS = torch.from_numpy(np.array(labels)).long()

    def __len__(self):
        return self.LABELS.shape[0]

    def __getitem__(self, item):
        return self.TOKEN_IDS[item], self.TOKEN_SEG_TYPE[item], \
               self.TOKEN_ATTN_MASK[item], self.LABELS[item]


def train(train_dataset, model: BertForTokenClassification, scheduler, optimizer: AdamW, batch_size=batch_size,
          device=None):
    train_sampler = RandomSampler(train_dataset)
    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
    model.train()
    tr_loss = 0.0
    tr_acc = 0
    global_step = 0
    if cuda:
        torch.cuda.empty_cache()
    for step, batch in tqdm(enumerate(train_loader)):
        # print(step)
        inputs = {
            'input_ids': batch[0].to(device),
            'token_type_ids': batch[1].to(device),
            'attention_mask': batch[2].to(device),
            'labels': batch[3].to(device)
        }
        outputs = model(**inputs)
        loss = outputs[0]
        # print(loss)
        logits = outputs[1].view(-1, len(label2id))

        tr_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # advance the LR schedule after the optimizer update
        model.zero_grad()
        # compute token-level accuracy (padding positions included)
        _, pred = logits.max(1)
        number_corr = (pred == batch[-1].to(device).view(-1)).long().sum().item()
        tr_acc += number_corr
        global_step += 1

    return tr_loss / global_step, tr_acc / (len(train_dataset) * max_length)


class NER(tuple):
    """Hashable (entity_text, entity_type) pair, used for set comparison during evaluation."""

    def __init__(self, ner):
        self.ner = ner

    def __hash__(self):
        return self.ner.__hash__()

    def __eq__(self, other):
        return self.ner == other


def get_entities(text_list, label_list):
    """Decode a character list and its BIO label list into (entity_text, entity_type) pairs."""
    result_ent = []
    buf_ent = []
    ner_clas = ''
    for i, item in enumerate(label_list):
        item = str(item)
        item = item.strip()
        if item == 'O':
            if len(buf_ent) > 0:
                result_ent.append((''.join(buf_ent), ner_clas))
                buf_ent = []
            continue

        pre_item, ner_item = item.split('-')

        if pre_item == 'B':
            if len(buf_ent) > 0:
                result_ent.append((''.join(buf_ent), ner_clas))
                buf_ent = []
            buf_ent.append(text_list[i])
            ner_clas = ner_item
        else:
            if ner_item == ner_clas:
                buf_ent.append(text_list[i])
            else:
                logger.warning('ner error: I- tag does not match the current entity type')
    return result_ent


def predict_func(text, model, device=None):
    text = text.strip()
    token_ids = tokenizer.encode(text, max_length=max_length, pad_to_max_length=True)
    token_attn_mask = get_atten_mask(token_ids)
    seq_type_ids = tokenizer.create_token_type_ids_from_sequences(token_ids[1:-1])

    token_ids = torch.from_numpy(np.array(token_ids)).unsqueeze(0).long()
    token_attn_mask = torch.from_numpy(np.array(token_attn_mask)).unsqueeze(0).long()
    seq_type_ids = torch.from_numpy(np.array(seq_type_ids)).unsqueeze(0).long()

    inputs = {
        'input_ids': token_ids.to(device),
        'token_type_ids': seq_type_ids.to(device),
        'attention_mask': token_attn_mask.to(device),
    }
    output = model(**inputs)[0]
    output = output.squeeze()
    output = output[1:len(text) + 1, :]
    _, output = output.max(1)
    label_list = list(output.cpu().numpy())
    return get_entities(list(text), [id2label[str(item)] for item in label_list])


def evalate(model: BertForTokenClassification, device=None):
    with open('./data/names/text.json', mode='r', encoding='utf8') as f:
        test_data = json.load(f)
    X, Y, Z = 1e-10, 1e-10, 1e-10
    f1, precision, recall = 0.0, 0.0, 0.0
    result_list = []
    pbar = tqdm(total=len(test_data))
    for data in test_data:
        predict_entities = predict_func(data['text'], model, device)
        predict_entities = [NER((item[0], item[1])) for item in predict_entities]

        entities = [NER((item[0], item[1])) for item in data['entities']]

        R = set(predict_entities)
        T = set(entities)

        X += len(R & T)
        Y += len(R)
        Z += len(T)
        f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
        pbar.update()
        pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f' %
                             (f1, precision, recall))

        s = {
            'text': data['text'],
            'ent_list': list(T),
            'ent_list_pred': list(R),
            'new': list(R - T),
            'lack': list(T - R),
        }
        result_list.append(s)
        with open('./predict.json', mode='w', encoding='utf8') as f:
            json.dump(result_list, f, indent=4, ensure_ascii=False)
    pbar.close()
    with open('./predict.json', mode='w', encoding='utf8') as f:
        json.dump(result_list, f, indent=4, ensure_ascii=False)
    return f1, precision, recall

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


if __name__ == '__main__':
    # make the classification head match the size of the label set
    config = BertConfig.from_pretrained('./bert_model/bert_config.json', num_labels=len(label2id))
    device = torch.device('cuda' if cuda else 'cpu')
    model = BertForTokenClassification.from_pretrained('./bert_model/pytorch_model.bin', config=config).to(device)

    no_decay = ['bias', 'LayerNorm.weight']
    # note: both groups below use weight_decay=0.0, so the grouping is currently a no-op;
    # raise the first group's value (e.g. 0.01) to actually apply weight decay
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=1e-8)

    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, train_steps)

    logger.info('create train dataset')
    train_dataset = NerDataSet(train_dataset_file_path)

    # logger.info('create eval dataset')
    # eval_dataset = NerDataSet(eval_dataset_file_path)

    eval_best_f1 = 0.0
    for e in range(1, epoches + 1):  # run the full number of epochs
        start_time = time.time()
        train_loss, train_acc = train(train_dataset, model, scheduler, optimizer, batch_size, device)
        # eval_acc = evalate(eval_dataset, model, batch_size, device)
        eval_result = evalate(model, device)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        logger.info('Epoch: {:02} | Time: {}m {}s'.format(e, epoch_mins, epoch_secs))
        logger.info(
            'Train Loss: {:.6f} | Eval f1: {:.6f} | Eval Pre: {:.6f} | Eval Rec: {:.6f}'.format(train_loss,
                                                                                                eval_result[0],
                                                                                                eval_result[1],
                                                                                                eval_result[2]))
        if eval_result[0] > eval_best_f1:
            eval_best_f1 = eval_result[0]
            torch.save(model.state_dict(), './models/model_{}'.format(e))
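
After training, a saved checkpoint can be loaded back for standalone inference with `predict_func`. A minimal usage sketch (the checkpoint name `./models/model_3` and the input sentence are placeholders, not values from the article):

```python
# Load a saved checkpoint and extract entities from a new sentence.
config = BertConfig.from_pretrained('./bert_model/bert_config.json', num_labels=len(label2id))
model = BertForTokenClassification(config)
model.load_state_dict(torch.load('./models/model_3', map_location='cpu'))  # hypothetical checkpoint
model.eval()

device = torch.device('cuda' if cuda else 'cpu')
model.to(device)

with torch.no_grad():
    entities = predict_func('张三去了北京', model, device)  # placeholder sentence
print(entities)  # e.g. [('张三', 'PER'), ('北京', 'LOC')], depending on the label set
```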

3. Results

  • Final results on the validation set: F1 0.9247, Precision 0.925, Recall 0.924 (how these entity-level scores are computed is sketched below).
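
These numbers are entity-level micro-averaged metrics, accumulated over the whole evaluation set in `evalate`: predictions and gold annotations are compared as sets of (entity_text, entity_type) pairs. A small self-contained illustration of the arithmetic (the entities and types are made up):

```python
# Entity-level precision / recall / F1, mirroring the arithmetic in evalate().
R = {('张三', 'PER'), ('北京', 'LOC')}       # predicted entities (hypothetical)
T = {('张三', 'PER'), ('北京大学', 'ORG')}    # gold entities (hypothetical)

X = len(R & T)                    # correctly predicted entities -> 1
precision = X / len(R)            # 0.5
recall = X / len(T)               # 0.5
f1 = 2 * X / (len(R) + len(T))    # 0.5
```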