NLP Project 3: Predicting the Last Word

1. Load the vocabulary and tokenizer (Hugging Face)

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilgpt2', use_fast=True)
tokenizer.batch_encode_plus([
    'hide new secretions from the parental units',
    'this moive is great'])  # note: the misspelled 'moive' is split into two subword tokens
{'input_ids': [[24717, 649, 3200, 507, 422, 262, 21694, 4991], [5661, 6941, 425, 318, 1049]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

print(tokenizer)
GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'})
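
As a quick sanity check, the ids can be decoded back to text. A minimal sketch, assuming the tokenizer object above; it also shows why the misspelled 'moive' costs two subword tokens:

# sketch: round-trip one of the sentences through the tokenizer
ids = tokenizer.batch_encode_plus(['this moive is great'])['input_ids'][0]
print(tokenizer.convert_ids_to_tokens(ids)) # 'moive' is broken into two subword pieces
print(tokenizer.decode(ids))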

2. Define the dataset

# Predicting the last word is really a multi-class classification problem over the vocabulary
from datasets import load_dataset
dataset = load_dataset(path='glue', name='sst2')
  
dataset
DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})
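
Before tokenizing, it helps to look at one raw example. A minimal sketch using the dataset object above; the sentiment label is irrelevant for next-word prediction and is dropped in the next step:

# sketch: inspect one raw training example (fields: 'sentence', 'label', 'idx')
print(dataset['train'][0])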

3. Tokenize

def f(data, tokenizer):
    return tokenizer.batch_encode_plus(data['sentence'])
dataset = dataset.map(f, batched=True, batch_size=1000, num_proc=12, # num_proc: number of worker processes
                     remove_columns=['sentence', 'idx', 'label'], 
                     fn_kwargs={'tokenizer': tokenizer})             
dataset
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

4. Filter out short sentences: require each sentence to contain at least 8 tokens

def f(data):
    return [len(i) >= 8 for i in data['input_ids']] # keep only sentences with at least 8 tokens
dataset = dataset.filter(f, batched=True, batch_size=1000, num_proc=12)                              
dataset
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 39905
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 848
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1730
    })
})

5. Truncate sentences

def f(data):
    data['input_ids'] = [i[:8] for i in data['input_ids']] # truncate to the first 8 tokens
    data['attention_mask'] = [[1] * 8] * len(data['attention_mask'])
    # the model's forward pass handles the one-position shift, so labels can simply equal input_ids;
    # the first 7 tokens are used to predict tokens 2-8 (see the sketch below)
    data['labels'] = data['input_ids']
    return data
dataset = dataset.map(f, batched=True, batch_size=1000, num_proc=12)                                
dataset
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 39905
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 848
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1730
    })
})

dataset['train'][0]
{'input_ids': [24717, 649, 3200, 507, 422, 262, 21694, 4991],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [24717, 649, 3200, 507, 422, 262, 21694, 4991]}
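
The shift mentioned in step 5 can be made concrete: for a causal language model, the prediction at position i is scored against the token at position i+1. A minimal sketch of that alignment on the example above:

# sketch: how inputs and targets line up once the model applies the shift
example = dataset['train'][0]
inputs  = example['input_ids'][:-1] # tokens 1-7: the context each prediction is conditioned on
targets = example['labels'][1:]     # tokens 2-8: the token each position should predict
for i, t in zip(inputs, targets):
    print(tokenizer.decode(i), '->', tokenizer.decode(t))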

6. Data loader

import torch
from transformers.data.data_collator import default_data_collator
loader = torch.utils.data.DataLoader(
    dataset=dataset['train'],
    batch_size=16,
    collate_fn=default_data_collator,
    shuffle=True,
    drop_last=True)
for data in loader:
    break
len(loader), data
(2494,
 {'input_ids': tensor([[   64,  1643,   286,   257,   866,   263,   290,   257],
          [ 6738,   923,   284,  5461,   837,  9593,   257,  2121],
...
          [34751,  1363,   435,  8809,   318,   299,   470,  1016],
          [43395,    13,  4636,  2433,   837,   257, 28892,  3437]]),
  'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1],
...
          [1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1]]),
  'labels': tensor([[   64,  1643,   286,   257,   866,   263,   290,   257],
          [ 6738,   923,   284,  5461,   837,  9593,   257,  2121],
...
          [34751,  1363,   435,  8809,   318,   299,   470,  1016],
          [43395,    13,  4636,  2433,   837,   257, 28892,  3437]])})
from transformers import AutoModelForCausalLM, GPT2Model
tokenizer.vocab_size * 768 # number of weights in the language-model head (the fc layer defined below)
38597376

7. Define the downstream-task model

class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # simpler alternative: model = AutoModelForCausalLM.from_pretrained('distilgpt2') (see the sketch below)
        self.pretrained = GPT2Model.from_pretrained('distilgpt2')
        self.fc = torch.nn.Linear(768, tokenizer.vocab_size, bias=False)
        # load the pretrained lm_head weights into the fc layer
        parameters = AutoModelForCausalLM.from_pretrained('distilgpt2')
        self.fc.load_state_dict(parameters.lm_head.state_dict())
        self.criterion = torch.nn.CrossEntropyLoss()
    def forward(self, input_ids, attention_mask, labels=None):
        logits = self.pretrained(input_ids=input_ids, attention_mask=attention_mask)
        logits = logits.last_hidden_state
        logits = self.fc(logits)
        loss = None
        if labels is not None:
            shift_logits = logits[:, :-1].reshape(-1, tokenizer.vocab_size) # predictions from tokens 1-7
            shift_labels = labels[:, 1:].reshape(-1) # targets are tokens 2-8
            loss = self.criterion(shift_logits, shift_labels)
        return {'loss': loss, 'logits': logits}
model = Model()
print(sum(i.numel() for i in model.parameters()) / 10000) # parameter count in units of 10,000 (~120.5M total)
12050.9952
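
The ~120.5M parameters split into the distilgpt2 backbone and the untied fc head; a small sketch of that breakdown:

# sketch: where the ~120.5M parameters come from
backbone = sum(p.numel() for p in model.pretrained.parameters()) # distilgpt2 body, roughly 82M
head = sum(p.numel() for p in model.fc.parameters())             # 50257 * 768 = 38,597,376 (untied copy)
print(backbone, head, backbone + head)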

out = model(**data) # in Python, ** unpacks a dict into keyword arguments: {'x': 1, 'y': 2} -> x=1, y=2
print(out['loss'], out['logits'].shape) # [16, 8, 50257]: batch size, 8 tokens, number of classes (vocab size)
tensor(6.5038, grad_fn=<NllLossBackward0>) torch.Size([16, 8, 50257])
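
The simpler alternative mentioned in the comment above does the same shift and cross-entropy internally. A minimal sketch, assuming the same data batch from the loader:

# sketch: equivalent forward pass with the off-the-shelf causal LM head
ref = AutoModelForCausalLM.from_pretrained('distilgpt2')
ref_out = ref(input_ids=data['input_ids'], attention_mask=data['attention_mask'], labels=data['labels'])
print(ref_out.loss, ref_out.logits.shape) # same logits shape; the loss is computed the same way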

8. Test 1

def test(model):
    model.eval()
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset['test'],
        batch_size=16,
        collate_fn=default_data_collator,
        shuffle=True,
        drop_last=True)
    correct = 0
    total = 0
    for i, data in enumerate(loader_test):
        label = data['input_ids'][:, -1].clone()  # accuracy is measured only on the last token; clone it first
        data['input_ids'][:, -1] = 0 # then erase the last token from the input so the model cannot cheat
        data['labels'][:, :]  = 0 # labels are not needed here (no loss is computed)
        with torch.no_grad(): # inference only, no gradients
            out = model(**data)
        # because of the one-position shift, the prediction for the last token sits at the second-to-last position
        out = out['logits'].argmax(dim=2)[:, -2]
        correct += (label==out).sum().item()
        total += 16
        if i % 10 == 0:
            print(i)
            print(label)
            print(out) 
        if i == 50:
            break 
    print('accuracy: ', correct / total)
    for i in range(8):
        print(tokenizer.decode(data['input_ids'][i, :-1]))
        print(tokenizer.decode(label[i]), tokenizer.decode(out[i]))
        print()
test(model) # without any fine-tuning, the pretrained weights alone already reach about 20% accuracy
0
tensor([ 3146, 11982,   264,   764, 39769,  2646,   764,   290,   705,  1502,
          511, 10997,  2644,   290,   906,   287])
tensor([ 976,   13, 3435,   13,   11, 1907,   13,  290,  821,  262,  484,  680,
          13,  290,  561,   13])
...
50
tensor([  287,  9137,   326, 12121,   287,  2429,   546,   705,   262,  1936,
          764,   326,  2565,   288, 23365,   284])
tensor([  287,  1169,   326,   290,    11,    12,   546,    13,   262,   584,
           13,   546,  3092,   976, 23365,   284])
accuracy:  0.2034313725490196
soul is what's lacking
 in  in

the long-range appeal of ``
 minority the

formula 51 is so trite
 that  that

a worthy entry into a very difficult
 genre  and

there is not an ounce of honesty
 in ,

it extends the writings of jean
 gen -

` dragonfly'is a movie
 about  about

but based on cq, i
 ' .

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
device(type='cuda', index=0)

9. Training

from transformers import AdamW # deprecated in newer transformers versions; torch.optim.AdamW also works
from transformers.optimization import get_scheduler
def train():
    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)
    model.to(device)
    model.train()
    for i, data in enumerate(loader):
        input_ids, attention_mask, labels = data['input_ids'], data['attention_mask'], data['labels']
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = out['loss']
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1) # clip gradients for training stability
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        model.zero_grad()
        if i % 50 == 0:
            labels = labels[:, 1:]
            out = out['logits'].argmax(dim=2)[:, :-1]
            correct = (labels == out).sum().item()
            accuracy = correct / (16 * 7)
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            print(i, loss.item(), accuracy, lr)
train()
0    5.916006565093994   0.21428571428571427   1.9991980753809144e-05
50   5.455381870269775   0.20535714285714285   1.959101844426624e-05
...
2400 5.163261413574219   0.23214285714285715   7.457898957497996e-07
2450 4.403899669647217   0.2767857142857143    3.4482758620689656e-07

10. Save the model

torch.save(model, '../data/预测最后一个词.model')

11. Load the model

model2 = torch.load('../data/预测最后一个词.model', map_location='cpu')
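
Note that torch.save(model, ...) pickles the whole Python class, so loading requires the exact Model definition to be importable. A more portable sketch (the file name here is just illustrative) saves only the weights:

# sketch: save/load via state_dict instead of pickling the module (hypothetical path)
torch.save(model.state_dict(), '../data/last_word_weights.pt')
model3 = Model()
model3.load_state_dict(torch.load('../data/last_word_weights.pt', map_location='cpu'))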

12. Test 2

test(model2)
0
tensor([ 6232,   379,    82,   532,  6958,  4035, 16223,  5321,   837,  2099,
          318,   286,  5701,  3729, 43527, 10378])
tensor([  262,   379,    82,    12,   257,  4035, 13289,  5321,  2003,  1611,
          318,   284, 10997,   257,   262,  6042])
...
50
tensor([  358, 32044,   703,   303,   581,   477,   477,   407,   290, 13437,
         8925,  3296,  1628,   706,  5688,   286])
tensor([  358, 32044,   262,   303,  1621,   262,  2642,   290,   475, 46374,
         2568,   220,   837,   329,  2646,   284])
accuracy:  0.28799019607843135
legendary irish writer bre
nd nd

if you can get past the fant
astical astical

enough similarities to gymkata and
 how  the

zany, exuberantly irre
ve ve

makmalbaf follows a
 res  story

you have to pay attention to follow
 all  the

only an epic documentary could get it
 all  wrong

bad company leaves a bad taste,
 not  and
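
Fine-tuning lifts last-word accuracy from about 20% to roughly 29% on this test slice. To try the model on a free-form prompt, here is a minimal greedy-prediction sketch (the prompt string is only an example):

# sketch: predict the next word for an arbitrary prompt with the fine-tuned model
prompt = 'the movie is surprisingly' # illustrative prompt
enc = tokenizer(prompt, return_tensors='pt')
model2.eval()
with torch.no_grad():
    logits = model2(input_ids=enc['input_ids'], attention_mask=enc['attention_mask'])['logits']
next_id = logits[0, -1].argmax().item()
print(prompt, '->', tokenizer.decode(next_id))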