NLP Project 4 - Predicting the Middle Word

This post walks through an NLP project whose goal is to predict the middle word of a sentence. It first loads the DistilRoBERTa tokenizer, then batch-encodes and filters the data. The sentences are then truncated and arranged into the format the model expects, a downstream-task model is defined, trained, and tested, and finally the model is saved and reloaded for later predictions.


1. Load the tokenizer

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
print(tokenizer)
RobertaTokenizerFast(name_or_path='distilroberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

2. Batch-encoding trial run

tokenizer.batch_encode_plus([
    'hide new secretions from the parental units',
    'this moive is great'])
{'input_ids': [[0, 37265, 92, 3556, 2485, 31, 5, 20536, 2833, 2], [0, 9226, 7458, 2088, 16, 372, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}
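As a quick optional check of what these IDs mean, they can be mapped back to subword tokens with the `tokenizer` loaded above:

tokenizer.convert_ids_to_tokens([0, 37265, 92, 3556, 2485, 31, 5, 20536, 2833, 2])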

3. Load the dataset

from datasets import load_dataset, load_from_disk
# dataset = load_dataset(path='glue', name='sst2')
dataset = load_from_disk('../data/glue_sst2/')
dataset
DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

4. Batch-encode the sentence field of every example

def f(data, tokenizer):
    return tokenizer.batch_encode_plus(data['sentence'])
dataset = dataset.map(f,
           batched=True,
           batch_size=1000,
           num_proc=12,
           remove_columns=['sentence', 'idx', 'label'],
           fn_kwargs={'tokenizer': tokenizer})

5. Filter out short sentences (fewer than 9 tokens)

def f(data):
    return [len(i) >= 9 for i in data['input_ids']]
dataset = dataset.filter(f, batched=True, batch_size=1000, num_proc=12)                            
dataset
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 44279
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 861
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1776
    })
})

tokenizer.get_vocab()['<mask>']
50264
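The same ID is also exposed directly as an attribute, which avoids the magic number used later:

tokenizer.mask_token_id # also 50264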

6. Truncate the sentences and arrange them into the format the model needs

def f(data):
    b = len(data['input_ids'])
    data['labels'] = data['attention_mask'].copy()
    for i in range(b):
        data['input_ids'][i] = data['input_ids'][i][:9] # truncate every sentence to 9 tokens
        data['attention_mask'][i] = [1] * 9
        data['labels'][i] = [-100] * 9
        data['input_ids'][i][-1] = 2 # force the last token to be </s> (id 2)
        data['labels'][i][4] = data['input_ids'][i][4] # keep the true token at position 4 as the label
        data['input_ids'][i][4] = 50264 # replace position 4 of every sentence with <mask> (id 50264)
    return data
dataset = dataset.map(f, batched=True, batch_size=1000, num_proc=12)                                  
dataset
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 44279
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 861
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1776
    })
})

dataset['train'][0]
{'input_ids': [0, 37265, 92, 3556, 50264, 31, 5, 20536, 2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, -100, -100, -100, 2485, -100, -100, -100, -100]}
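An optional sanity check, decoding the processed example, confirms that the mask sits in the middle and the label keeps the original token:

sample = dataset['train'][0]
print(tokenizer.decode(sample['input_ids']))
print(tokenizer.decode([i for i in sample['labels'] if i != -100]))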

7. Data loader

import torch
from transformers.data.data_collator import default_data_collator
loader = torch.utils.data.DataLoader(
    dataset=dataset['train'],
    batch_size=8,
    collate_fn=default_data_collator,
    shuffle=True,
    drop_last=True)
for data in loader:
    break
    
len(loader), data
(5534,
 {'input_ids': tensor([[    0,  1264,     9,   475, 50264,     4,  1855,   873,     2],
          [    0,  8155,    34,  1348, 50264,   888,  2609,     5,     2],
...
          [    0, 10859,  2156,  1537, 50264, 16016,    66,     5,     2],
          [    0, 19746,    47,   619, 50264,    47,   393,   236,     2]]),
  'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1],
...
          [1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1]]),
  'labels': tensor([[ -100,  -100,  -100,  -100,   338,  -100,  -100,  -100,  -100],
          [ -100,  -100,  -100,  -100, 40350,  -100,  -100,  -100,  -100],
...
          [ -100,  -100,  -100,  -100,  1472,  -100,  -100,  -100,  -100],
          [ -100,  -100,  -100,  -100,    14,  -100,  -100,  -100,  -100]])})

8. Define the downstream-task model

from transformers import RobertaModel, AutoModelForCausalLM
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.pretrained = RobertaModel.from_pretrained('distilroberta-base')
        decoder = torch.nn.Linear(768, tokenizer.vocab_size)
        decoder.bias = torch.nn.Parameter(torch.zeros(tokenizer.vocab_size))
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(768, 768),
            torch.nn.GELU(),
            torch.nn.LayerNorm(768, eps=1e-5),
            decoder)
        # load the weights of each fc layer from the pretrained LM head
        parameters = AutoModelForCausalLM.from_pretrained('distilroberta-base')
        self.fc[0].load_state_dict(parameters.lm_head.dense.state_dict())
        self.fc[2].load_state_dict(parameters.lm_head.layer_norm.state_dict())
        self.fc[3].load_state_dict(parameters.lm_head.decoder.state_dict())
        self.criterion = torch.nn.CrossEntropyLoss()
    def forward(self, input_ids, attention_mask, labels=None):
        logits = self.pretrained(input_ids, attention_mask)
        logits = logits.last_hidden_state
        logits = self.fc(logits)
        loss = None
        if labels is not None:
            shifted_logits = logits[:, :-1].reshape(-1, tokenizer.vocab_size)
            shifted_labels = labels[:, 1:].reshape(-1)
            loss = self.criterion(shifted_logits, shifted_labels)
        return {'loss': loss, 'logits': logits}
model = Model()
Downloading ()"pytorch_model.bin";:   0%|          | 0.00/331M [00:00<?, ?B/s]

print(sum(i.numel() for i in model.parameters())) # total number of parameters
121364313

out = model(**data)
out['loss'], out['logits'].shape
(tensor(19.5734, grad_fn=<NllLossBackward0>), torch.Size([8, 9, 50265]))
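As an optional peek before training, the logits at the masked position can be turned into the top-5 candidate tokens for the first sample of this batch (reusing `out` and `tokenizer` from above):

topk = out['logits'][0, 4].topk(5)
tokenizer.convert_ids_to_tokens(topk.indices.tolist())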

9. Test 1

def test(model):
    model.eval()
    loader_test = torch.utils.data.DataLoader(
        dataset = dataset['test'],
        batch_size=8,
        collate_fn=default_data_collator,
        shuffle=True,
        drop_last=True)
    correct = 0
    total = 0
    for i, data in enumerate(loader_test):
        label = data['labels'][:, 4].clone()
        data['labels'] = None # drop the labels from the batch so the model cannot cheat
        with torch.no_grad(): # inference only, no gradients needed
            out = model(**data)
        out = out['logits'].argmax(dim=2)[:, 4]  # argmax over the vocab turns [8, 9, 50265] into [8, 9], then take position 4
        correct += (label == out).sum().item()
        total += 8
        if i % 10 == 0:
            print(i)
            print(label)
            print(out)
        if i == 50:
            break 
    print('accuracy: ', correct / total)
    for i in range(8):
        print(tokenizer.decode(data['input_ids'][i]))
        print(tokenizer.decode(label[i]), tokenizer.decode(out[i])) # ground truth vs. prediction
        print()
test(model)
0
tensor([  822,  2789, 11783,   408,     9,   241,  1073,    12])
tensor([ 1816, 13536, 30609, 13670,   396,   241, 23250,    12])
...
50
tensor([    5,    29,   480, 10238,   110,     9,    10,  2156])
tensor([  664,    29,  4338, 21844,   239,     9,    98,   878])
accuracy:  0.35294117647058826
<s>the talents of<mask> actors helps ``</s>
 the  young

<s>when it '<mask> all wet,</s>
s s

<s>wait for video<mask> and then do</s>
 --  footage

<s>a tender and<mask> drama, based</s>
 touching  heartfelt

<s>if you pitch<mask> expectations at an</s>
 your  high

<s>a gentle blend<mask> present day testim</s>
 of  of

<s>theirs is<mask> simple and heart</s>
 a  so

<s>morton is<mask> as usual,</s>
,  running

10. Training

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
device(type='cuda', index=0)

from transformers import AdamW
from transformers.optimization import get_scheduler
def train():
    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(name='linear',
                             num_warmup_steps=0,
                             num_training_steps=len(loader),
                             optimizer=optimizer)
    model.to(device)
    model.train()
    for i, data in enumerate(loader):
        input_ids, attention_mask, labels = data['input_ids'], data['attention_mask'], data['labels']
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = out['loss']
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # clip gradients for training stability
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        model.zero_grad()
        if i % 50 == 0:
            label = data['labels'][:, 4].to(device)
            out = out['logits'].argmax(dim=2)[:, 4]
            correct = (label == out).sum().item()
            accuracy = correct / 8
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            print(i, loss.item(), accuracy, lr)
train()
0 18.58106231689453 0.25 1.9996385977593064e-05
50 6.724194049835205 0.125 1.9815684857246115e-05
...
5450 2.665070056915283 0.375 2.9996385977593064e-07
5500 1.7032554149627686 0.625 1.1926273942898448e-07
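An optional sketch for checking accuracy on the validation split right after training; the model is still on `device` at this point, so the batches are moved there too and no labels are passed to the forward call:

model.eval()
loader_val = torch.utils.data.DataLoader(
    dataset=dataset['validation'],
    batch_size=8,
    collate_fn=default_data_collator)
correct = total = 0
with torch.no_grad():
    for data in loader_val:
        out = model(input_ids=data['input_ids'].to(device),
                    attention_mask=data['attention_mask'].to(device))
        pred = out['logits'].argmax(dim=2)[:, 4].cpu()
        correct += (pred == data['labels'][:, 4]).sum().item()
        total += len(pred)
print('validation accuracy:', correct / total)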

11. Save the model

torch.save(model, '../data/预测中间词.model')

12. Load the model

model2 = torch.load('../data/预测中间词.model', map_location='cpu')
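An alternative sketch: instead of pickling the whole module, only the weights could be saved and restored through a state dict (the file name below is just illustrative):

torch.save(model.state_dict(), '../data/predict_middle_word.pt')
model3 = Model()
model3.load_state_dict(torch.load('../data/predict_middle_word.pt', map_location='cpu'))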

13. Test 2

test(model2)
0
tensor([5712, 2156,  409, 6670,  576,  189,   23,  295])
tensor([5712, 2156,  409,   34,  576,  189,   31,  295])
...
50
tensor([ 3486,   143,    19, 10713,     5,     5,   668, 32894])
tensor([ 3486,    41,    19, 10713,    10, 44009,   668, 32894])
accuracy:  0.5098039215686274
<s>neither funny<mask> suspenseful nor</s>
 nor  nor

<s>and forget about<mask> attempt at a</s>
 any  an

<s>a film made<mask> as little wit</s>
 with  with

<s>here, ad<mask> lyne comes</s>
rian rian

<s>the fact that<mask> rookie is a</s>
 the  a

<s>the scope of<mask> silberstein</s>
 the  david

<s>reign of<mask> may be little</s>
 fire  fire

<s>a solidly seaw<mask> chiller.</s>
orthy orthy
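A closing sketch of how the reloaded model might be used on a fresh sentence: encode it, truncate to 9 tokens as in section 6, mask position 4, and predict. The example sentence below is only an illustration and is assumed to be at least 9 tokens long after encoding:

text = 'this movie is full of surprises and warmth'
ids = tokenizer.encode(text)[:9]
ids[-1] = tokenizer.eos_token_id  # keep </s> as the last token, as in section 6
true_id = ids[4]
ids[4] = tokenizer.mask_token_id  # 50264
with torch.no_grad():
    out = model2(input_ids=torch.tensor([ids]),
                 attention_mask=torch.ones(1, 9, dtype=torch.long))
pred_id = out['logits'][0, 4].argmax().item()
print(tokenizer.decode([true_id]), '->', tokenizer.decode([pred_id]))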