Fine-tuning BERT with pytorch-transformers
import time

import torch
import torch.nn as nn
import pandas as pd
import pytorch_transformers
# from pytorch_transformers import *
from pytorch_transformers import BertModel, BertTokenizer, AdamW, BertForTokenClassification
from torch.utils.data import DataLoader, Dataset

print(torch.__version__)
PyTorch-Transformers provides a unified API for 7 transformer architectures and 30 pretrained weights.
Model | Tokenizer | Pretrained weights shortcut
# MODELS = [(BertModel,      BertTokenizer,      'bert-base-uncased'),
#           (OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
#           (GPT2Model,      GPT2Tokenizer,      'gpt2'),
#           (TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
#           (XLNetModel,     XLNetTokenizer,     'xlnet-base-cased'),
#           (XLMModel,       XLMTokenizer,       'xlm-mlm-enfr-1024'),
#           (RobertaModel,   RobertaTokenizer,   'roberta-base')]
# Let's encode some text in a sequence of hidden-states using each model:
# for model_class, tokenizer_class, pretrained_weights in MODELS:
#     # Load pretrained model/tokenizer
#     tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
#     model = model_class.from_pretrained(pretrained_weights)
#     # Encode text
#     input_ids = torch.tensor([tokenizer.encode("Here is some text to encode ", add_special_tokens=True)])
#     print("input_ids = ", input_ids)
#     # add_special_tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model
#     with torch.no_grad():
#         last_hidden_states = model(input_ids)[0]  # model outputs are now tuples
#     print("last_hidden_states = ", last_hidden_states)
#     print(last_hidden_states.size())
#     break  # only the BERT model is needed here, so stop after the first iteration
# pretrained_weights = 'bert-base-chinese'
# model = BertModel.from_pretrained(pretrained_weights)
# tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
# print("bert_model = ",model)
# print("bert_tokenizer = ",tokenizer)
# BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
#                       BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification,
#                       BertForQuestionAnswering]
# All the classes for an architecture can be initialized from the pretrained weights of that architecture.
# Note that the additional weights added for fine-tuning (e.g. a classification head) are only randomly initialized
# and still need to be trained on the downstream task, as in the sketch below.
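# A minimal, hedged sketch of that point (not from the original notebook): BertForTokenClassification puts a freshly
# initialized classification layer on top of the pretrained encoder, and AdamW fine-tunes it; num_labels=10, the
# dummy labels and the learning rate below are placeholder assumptions.
# tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# model = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=10)  # num_labels is a placeholder
# optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# input_ids = torch.tensor([tokenizer.encode("让我们看看在这个文本中的隐层和感知层")])
# labels = torch.zeros_like(input_ids)                 # dummy per-token labels, only to illustrate one training step
# loss, scores = model(input_ids, labels=labels)[:2]   # the loss comes first when labels are passed
# loss.backward()
# optimizer.step()
# optimizer.zero_grad()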
# pretrained_weights = 'bert-base-chinese'
# model = BertModel.from_pretrained(pretrained_weights,
#                                   output_hidden_states=True,
#                                   output_attentions=True)
# tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
# input_ids = torch.tensor([tokenizer.encode("让我们看看在这个文本中的隐层和感知层")])  # one sentence of 18 characters
# print(input_ids)  # tensor([[6375, 2769, 812, 4692, 4692, 1762, 6821, 702, 3152, 3315, 704, 4638, 7391, 2231, 1469, 2697, 4761, 2231]])
# print(input_ids.size())  # torch.Size([1, 18])
# out = model(input_ids)
# print("len(out) = ", len(out))  # 4
# # With output_hidden_states=True and output_attentions=True the model returns a 4-element tuple:
# #   (1) last_hidden_state: final-layer hidden states, shape (batch_size, sequence_length, hidden_size)
# #   (2) pooler_output: the pooled [CLS] representation, shape (batch_size, hidden_size)
# #   (3) hidden_states: one tensor per layer plus the embedding output (13 in total), each (batch_size, sequence_length, hidden_size)
# #   (4) attentions: one tensor per layer (12 in total), each (batch_size, num_heads, sequence_length, sequence_length)
# all_hidden_states, all_attentions = out[-2:]
# last_hidden_states = out[0]  # the last hidden state is the first element of the output tuple; out[1] is the pooled output
# print("len(out[0]) = ", len(out[0]), " out[0].size() = ", out[0].size())  # 1, torch.Size([1, 18, 768])
# print("len(out[1]) = ", len(out[1]), " out[1].size() = ", out[1].size())  # 1, torch.Size([1, 768])
# print("len(all_hidden_states) = ", len(all_hidden_states))  # 13
# print("len(all_attentions) = ", len(all_attentions))  # 12
# print("all_hidden_states[-1].size() = ", all_hidden_states[-1].size())  # [1, 18, 768] = [batch, sequence length, hidden size]
# print("all_attentions[-1].size() = ", all_attentions[-1].size())  # [1, 12, 18, 18] = [batch, num heads, seq len, seq len]
# Models are compatible with TorchScript
# model = model_class.from_pretrained(pretrained_weights, torchscript=True)
# traced_model = torch.jit.trace(model, (input_ids,))
# print("traced_model = ",traced_model)
# Simple serialization for models and tokenizers
# model.save_pretrained('./modelsave/bert/save_model_1/') # save
# model = BertModel.from_pretrained('./modelsave/bert/save_model_1/') #reload
# tokenizer.save_pretrained('./modelsave/bert/save_token_1/') #save
# tokenizer = BertTokenizer.from_pretrained('./modelsave/bert/save_token_1/')
def read_data():