from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilgpt2', use_fast=True)
tokenizer.batch_encode_plus(['hide new secretions from the parental units',
                             'this moive is great'])
{'input_ids': [[24717, 649, 3200, 507, 422, 262, 21694, 4991], [5661, 6941, 425, 318, 1049]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}
print(tokenizer)
GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token':'<|endoftext|>','eos_token':'<|endoftext|>','unk_token':'<|endoftext|>'})
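Note that GPT2TokenizerFast sets bos, eos, and unk all to <|endoftext|> and defines no pad token, so padding a batch of uneven sentences fails out of the box. Below is a minimal sketch of one common workaround, reusing the end-of-text token as the pad token; whether this pipeline ever needs padded batches is an assumption, not something shown in the original code:
# Assumption: padded batches are wanted somewhere downstream. GPT-2 ships
# without a pad token, so a common fix is to reuse <|endoftext|> for padding.
tokenizer.pad_token = tokenizer.eos_token
batch = tokenizer(['hide new secretions from the parental units',
                   'this moive is great'],
                  padding=True,          # pad the shorter sentence to the longer one
                  return_tensors='pt')   # return PyTorch tensors instead of lists
print(batch['input_ids'].shape)          # torch.Size([2, 8])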
def test(model):
    model.eval()
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset['test'],
        batch_size=16,
        collate_fn=default_data_collator,
        shuffle=True,
        drop_last=True)
    correct = 0
    total = 0
    for i, data in enumerate(loader_test):
        # accuracy is measured only on the last token; clone it as the label
        label = data['input_ids'][:, -1].clone()
        # erase the last token from the input so the model cannot cheat
        data['input_ids'][:, -1] = 0
        # labels are not needed during evaluation
        data['labels'][:, :] = 0
        with torch.no_grad():
            out = model(**data)
        # due to the causal shift, the prediction for the last token is at position -2 of the logits
        out = out['logits'].argmax(dim=2)[:, -2]
        correct += (label == out).sum().item()
        total += 16
        if i % 10 == 0:
            print(i)
            print(label)
            print(out)
        if i == 50:
            break
    print('accuracy: ', correct / total)
    for i in range(8):
        print(tokenizer.decode(data['input_ids'][i, :-1]))
        print(tokenizer.decode(label[i]), tokenizer.decode(out[i]))
        print()
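The [:, -2] index above follows from the causal shift: a GPT-style model's logit at position t scores the token at position t + 1, so the erased last token is predicted at the second-to-last position. A toy illustration of that bookkeeping with made-up shapes (random logits stand in for a real forward pass):
import torch

# Toy shapes only: with 8 input positions, logits[:, 6] (i.e. position -2)
# is the model's distribution over the token at input position 7,
# which is exactly the token the test loop erased and wants to recover.
fake_logits = torch.randn(16, 8, 50257)        # [batch, seq_len, vocab_size]
pred_last = fake_logits.argmax(dim=2)[:, -2]   # predicted id of the final token
print(pred_last.shape)                         # torch.Size([16])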
# with no fine-tuning at all, the pretrained weights alone already reach about 20% accuracy
test(model)
0
tensor([3146, 11982, 264, 764, 39769, 2646, 764, 290, 705, 1502, 511, 10997, 2644, 290, 906, 287])
tensor([976, 13, 3435, 13, 11, 1907, 13, 290, 821, 262, 484, 680, 13, 290, 561, 13])
...
50
tensor([287, 9137, 326, 12121, 287, 2429, 546, 705, 262, 1936, 764, 326, 2565, 288, 23365, 284])
tensor([287, 1169, 326, 290, 11, 12, 546, 13, 262, 584, 13, 546, 3092, 976, 23365, 284])
accuracy:  0.2034313725490196
soul is what's lacking
in in
the long-range appeal of ``
minority the
formula 51 is so trite
that that
a worthy entry into a very difficult
genre and
there is not an ounce of honesty
in,
it extends the writings of jean
gen -
` dragonfly'is a movie
about about
but based on cq, i
' .
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
device(type='cuda', index=0)
9. Training
from transformers import AdamW
from transformers.optimization import get_scheduler
def train():
    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)
    model.to(device)
    model.train()
    for i, data in enumerate(loader):
        input_ids, attention_mask, labels = data['input_ids'], data['attention_mask'], data['labels']
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = out['loss']
        loss.backward()
        # clip gradients to keep training stable
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        model.zero_grad()
        if i % 50 == 0:
            # shift labels and predictions by one position to measure token accuracy
            labels = labels[:, 1:]
            out = out['logits'].argmax(dim=2)[:, :-1]
            correct = (labels == out).sum().item()
            accuracy = correct / (16 * 7)
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            print(i, loss.item(), accuracy, lr)
train()
0 5.916006565093994 0.21428571428571427 1.9991980753809144e-05
50 5.455381870269775 0.20535714285714285 1.959101844426624e-05
...
2400 5.163261413574219 0.23214285714285715 7.457898957497996e-07
2450 4.403899669647217 0.2767857142857143 3.4482758620689656e-07
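The next cell evaluates model2, which is not defined anywhere in this excerpt. A minimal sketch under the assumption that model2 is simply the fine-tuned model moved back to the CPU (the test DataLoader yields CPU tensors); the save_pretrained calls and the directory name are illustrative, not from the original:
# Assumption: model2 is the fine-tuned model brought back to the CPU so the
# CPU-side test loader can feed it directly.
model2 = model.to('cpu')
# Optionally persist the weights for later reuse (directory name is illustrative).
# model2.save_pretrained('./distilgpt2-finetuned')
# tokenizer.save_pretrained('./distilgpt2-finetuned')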
test(model2)
0
tensor([6232, 379, 82, 532, 6958, 4035, 16223, 5321, 837, 2099, 318, 286, 5701, 3729, 43527, 10378])
tensor([262, 379, 82, 12, 257, 4035, 13289, 5321, 2003, 1611, 318, 284, 10997, 257, 262, 6042])
...
50
tensor([358, 32044, 703, 303, 581, 477, 477, 407, 290, 13437, 8925, 3296, 1628, 706, 5688, 286])
tensor([358, 32044, 262, 303, 1621, 262, 2642, 290, 475, 46374, 2568, 220, 837, 329, 2646, 284])
accuracy:  0.28799019607843135
legendary irish writer bre
nd nd
if you can get past the fant
astical astical
enough similarities to gymkata and
how the
zany, exuberantly irre
ve ve
makmalbaf follows a
res story
you have to pay attention to follow
all the
only an epic documentary could get it
all wrong
bad company leaves a bad taste,
not and