pytorch-transformers (BERT) fine-tuning

This post walks through fine-tuning a BERT model with the PyTorch-Transformers library, which provides a unified API covering 7 different Transformer architectures and 30 sets of pretrained weights.


import time

import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import DataLoader, Dataset

import pytorch_transformers
# from pytorch_transformers import *
from pytorch_transformers import BertModel, BertTokenizer, AdamW, BertForTokenClassification

print(torch.__version__)


PyTorch-Transformers has a unified API for 7 transformer architectures and 30 pretrained weights.

#       Model           |  Tokenizer          |  Pretrained weights shortcut
# MODELS = [(BertModel,       BertTokenizer,      'bert-base-uncased'),
#           (OpenAIGPTModel,  OpenAIGPTTokenizer, 'openai-gpt'),
#           (GPT2Model,       GPT2Tokenizer,      'gpt2'),
#           (TransfoXLModel,  TransfoXLTokenizer, 'transfo-xl-wt103'),
#           (XLNetModel,      XLNetTokenizer,     'xlnet-base-cased'),
#           (XLMModel,        XLMTokenizer,       'xlm-mlm-enfr-1024'),
#           (RobertaModel,    RobertaTokenizer,   'roberta-base')]
# Let's encode some text in a sequence of hidden-states using each model:
# for model_class, tokenizer_class, pretrained_weights in MODELS:
#     # Load pretrained model/tokenizer
#     tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
#     model = model_class.from_pretrained(pretrained_weights)

#     # Encode text
#     input_ids = torch.tensor([tokenizer.encode("Here is some text to encode ", add_special_tokens=True)])  
#     print("input_ids = ",input_ids)
#     # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
#     with torch.no_grad():
#         last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples
#         print("last_hidden_states = ",last_hidden_states)
#         print(last_hidden_states.size())
    
#     break  # we only need the BERT model here, so stop after the first iteration
# pretrained_weights = 'bert-base-chinese'
# model = BertModel.from_pretrained(pretrained_weights)
# tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
# print("bert_model = ",model)
# print("bert_tokenizer = ",tokenizer)
# BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
#                       BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification,
#                       BertForQuestionAnswering]

# All the classes for an architecture can be initiated from pretrained weights for this architecture
# Note that additional weights added for fine-tuning are only initialized
# and need to be trained on the down-stream task
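To make the note above concrete, here is a small added sketch (not from the original post) that loads BertForTokenClassification from the same Chinese checkpoint; num_labels=4 is an arbitrary illustrative value. The encoder weights are restored from the checkpoint, while the classification head is newly initialized and still has to be trained on the downstream task.

# Added sketch: num_labels=4 is a placeholder for the real label count of the downstream task.
# tc_model = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=4)
# print(tc_model.classifier)                # Linear(in_features=768, out_features=4, bias=True)
# print(tc_model.classifier.weight[0][:5])  # freshly (randomly) initialized, not pretrained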
# pretrained_weights = 'bert-base-chinese'
# model = BertModel.from_pretrained(pretrained_weights,
#                                  output_hidden_states = True,
#                                  output_attentions = True
#                                  )
# tokenizer = BertTokenizer.from_pretrained(pretrained_weights)

# input_ids = torch.tensor([tokenizer.encode("让我们看看在这个文本中的隐层和感知层")]) # a single sentence of 18 characters
# print(input_ids) #tensor([[6375, 2769,  812, 4692, 4692, 1762, 6821,  702, 3152, 3315,  704, 4638,7391, 2231, 1469, 2697, 4761, 2231]])
# print(input_ids.size()) #torch.Size([1, 18])
# out = model(input_ids)
# print("len(out) = ", len(out))   # 4
# # With output_hidden_states=True and output_attentions=True the output is a 4-element tuple:
# # (1) last_hidden_state: final-layer hidden states, shape (batch_size, sequence_length, hidden_size)
# # (2) pooler_output: pooled representation of the first token, shape (batch_size, hidden_size)
# # (3) hidden_states: embedding output plus one tensor per layer (13 in total), each (batch_size, sequence_length, hidden_size)
# # (4) attentions: one tensor per layer (12 in total), each (batch_size, num_heads, sequence_length, sequence_length)
# all_hidden_states, all_attentions = out[-2:]
# last_hidden_states = out[0]  # The last hidden-state is the first element of the output tuple
# print("len(out[0]) = ", len(out[0]), "    out[0].size() = ", out[0].size())  # 1, torch.Size([1, 18, 768])
# print("len(out[1]) = ", len(out[1]), "    out[1].size() = ", out[1].size())  # 1, torch.Size([1, 768])
# print("len(all_hidden_states) = ", len(all_hidden_states))   # 13
# print("len(all_attentions) = ", len(all_attentions))         # 12
# print("all_hidden_states[-1].size() = ", all_hidden_states[-1].size())  # [1, 18, 768] = [batch, sequence_length, hidden_size]
# print("all_attentions[-1].size() = ", all_attentions[-1].size())        # [1, 12, 18, 18] = [batch, num_heads, seq_len, seq_len]
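As a quick sanity check of the structure just described, the hidden-state and attention tuples can be stacked into single tensors. This is a small added snippet, assuming the model (loaded with output_hidden_states/output_attentions) and input_ids from the commented-out code above.

# Added snippet: assumes `model` and `input_ids` from the code above.
# with torch.no_grad():
#     last_hidden, pooled, hidden_states, attentions = model(input_ids)
# print(torch.stack(hidden_states).size())   # torch.Size([13, 1, 18, 768]) -- embeddings + 12 layers
# print(torch.stack(attentions).size())      # torch.Size([12, 1, 12, 18, 18]) -- one tensor per layer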
# Models are compatible with Torchscript
# model = model_class.from_pretrained(pretrained_weights, torchscript=True)
# traced_model = torch.jit.trace(model, (input_ids,))
# print("traced_model = ",traced_model)
# Simple serialization for models and tokenizers
# model.save_pretrained('./modelsave/bert/save_model_1/')  # save
# model = BertModel.from_pretrained('./modelsave/bert/save_model_1/') #reload
# tokenizer.save_pretrained('./modelsave/bert/save_token_1/') #save
# tokenizer = BertTokenizer.from_pretrained('./modelsave/bert/save_token_1/')
def read_data():
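The original code breaks off at read_data above. Below is a minimal sketch of how the data loading and the fine-tuning loop for token classification might continue, matching the imports at the top (pandas, DataLoader/Dataset, AdamW, BertForTokenClassification). The file name data.csv, the column names text and labels, MAX_LEN, and NUM_LABELS are all hypothetical placeholders, not taken from the original post; for simplicity no attention mask is used, so padded positions also contribute to the loss.

# ---- hypothetical sketch: file name, column names and label count are placeholders ----
MAX_LEN = 64          # maximum sequence length (placeholder)
NUM_LABELS = 4        # number of token classes (placeholder)
pretrained_weights = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)

def read_data(path='data.csv'):
    # Assumes one text per row and a space-separated label id per character.
    df = pd.read_csv(path)
    texts = df['text'].tolist()
    labels = [[int(x) for x in str(s).split()] for s in df['labels']]
    return texts, labels

class TokenDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        ids = tokenizer.encode(self.texts[idx])[:MAX_LEN]
        labs = self.labels[idx][:len(ids)]
        pad = MAX_LEN - len(ids)      # pad to a fixed length so default collation can stack tensors
        ids = ids + [0] * pad
        labs = labs + [0] * pad
        return torch.tensor(ids), torch.tensor(labs)

texts, labels = read_data()
loader = DataLoader(TokenDataset(texts, labels), batch_size=16, shuffle=True)

model = BertForTokenClassification.from_pretrained(pretrained_weights, num_labels=NUM_LABELS)
optimizer = AdamW(model.parameters(), lr=2e-5)

model.train()
for epoch in range(3):
    start = time.time()
    for input_ids, label_ids in loader:
        optimizer.zero_grad()
        loss, logits = model(input_ids, labels=label_ids)[:2]  # pytorch_transformers returns (loss, scores, ...)
        loss.backward()
        optimizer.step()
    print('epoch %d  loss %.4f  time %.1fs' % (epoch, loss.item(), time.time() - start))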