A First Attempt at the BERT Model in PyTorch
I won't say much about the BERT model itself here. I had long wanted to find a reasonably concise piece of code for learning how BERT is used under PyTorch, and yesterday I came across the Bert-Chinese-Text-Classification-Pytorch repository on GitHub, which looks quite good. I have distilled its code down to just the BERT part, to make it easier to understand step by step how BERT works and how to use it; this post is a record of that.
The dataset can be obtained from that GitHub repository.
The pretrained model is chinese_L-12_H-768_A-12 (Google's original TensorFlow checkpoint), which first needs to be converted to the PyTorch format.
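A minimal conversion sketch, assuming the transformers CLI is available and TensorFlow is installed (the unpacked checkpoint directory and output paths below are placeholders):

transformers-cli convert --model_type bert \
    --tf_checkpoint ./chinese_L-12_H-768_A-12/bert_model.ckpt \
    --config ./chinese_L-12_H-768_A-12/bert_config.json \
    --pytorch_dump_output ./model/pytorch_model.bin

Afterwards, copy bert_config.json into ./model as config.json, and vocab.txt into ./model as well, so that from_pretrained('./model') can find everything it needs.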
The full code is as follows:
from transformers import BertModel, BertTokenizer
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.nn.functional import cross_entropy
from sklearn import metrics
from torch.utils.data import Dataset, DataLoader
PAD, CLS = '[PAD]', '[CLS]'  # padding token and BERT's aggregate-information token; PAD itself is unused below, since padding uses id 0 ([PAD]'s id) directly
class TitleDataset(Dataset):
    def __init__(self, path, tokenizer, pad_size=32):
        self.contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content, label = lin.split('\t')
                self.contents.append((content, label))
        self.length = len(self.contents)
        self.pad_size = pad_size
        self.tokenizer = tokenizer
    def __getitem__(self, index):
        content, label = self.contents[index]
        label = int(label)
        token = self.tokenizer.tokenize(content)
        token = [CLS] + token  # prepend [CLS]; BERT pools the whole title's information into it
        mask = []
        token_ids = self.tokenizer.convert_tokens_to_ids(token)
        if self.pad_size:
            if len(token) < self.pad_size:
                # pad with 0 (the vocab id of [PAD]) and mask out the padded positions
                mask = [1] * len(token_ids) + [0] * (self.pad_size - len(token))
                token_ids += [0] * (self.pad_size - len(token))
            else:
                # "pad the short, cut the long": truncate to pad_size
                mask = [1] * self.pad_size
                token_ids = token_ids[:self.pad_size]
        mask = np.array(mask)
        bert_input = np.array(token_ids)
        output = {"input_ids": bert_input,
                  'mask': mask,
                  }, label
        return output
    def __len__(self):
        return self.length
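# A quick sanity check of what one item looks like (hypothetical title, pad_size=32):
# for the line "今天股市大涨\t3" we would get roughly
#   token     = ['[CLS]', '今', '天', '股', '市', '大', '涨']        7 tokens
#   input_ids = [101, <6 character ids>, 0, 0, ..., 0]              padded to length 32
#   mask      = [1, 1, 1, 1, 1, 1, 1, 0, ..., 0]                    7 ones, 25 zeros
# together with the integer label 3.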
class Model(nn.Module):
    def __init__(self, bert_path, hidden_size, num_classes):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(bert_path)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, data):
        context = data['input_ids']  # the input sentences
        mask = data['mask']  # masks the padding; same size as the sentence, 0 at padded positions, e.g. [1, 1, 1, 1, 0, 0]
        # index [1] is the pooled [CLS] representation; indexing works both on the tuple
        # returned by older transformers versions and on the model-output object of newer ones
        pooled = self.bert(context, attention_mask=mask)[1]
        out = self.fc(pooled)
        return out
def train(model, data_loader, epoch):
    model.train()
    data_dict = {}
    losses = 0
    i = 0
    pbar = tqdm(data_loader)
    for data_labels in pbar:
        data = data_labels[0]
        labels = data_labels[1].to(device).long()
        data_dict['input_ids'] = data['input_ids'].to(device).long()
        data_dict['mask'] = data['mask'].to(device).long()
        outputs = model(data_dict)
        loss = cross_entropy(outputs, labels)
        losses = losses + loss.item()
        i = i + 1
        # optimizer and device are module-level names defined in the __main__ block
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        pbar.set_description(f"epoch:{epoch}-losses:{losses/i:.4f}")
def evaluate(model, data_loader):
    model.eval()
    data_dict = {}
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        pbar = tqdm(data_loader)
        for data_labels in pbar:
            data = data_labels[0]
            labels = data_labels[1].cpu().long()
            data_dict['input_ids'] = data['input_ids'].to(device).long()
            data_dict['mask'] = data['mask'].to(device).long()
            outputs = model(data_dict)
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)
    acc = metrics.accuracy_score(labels_all, predict_all)
    print('acc:', acc)
def set_seed():
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # make runs reproducible
if __name__ == '__main__':
    USE_GPU = True
    train_path = './data/train.txt'  # training set
    dev_path = './data/dev.txt'  # validation set
    test_path = './data/test.txt'  # test set
    class_list = [x.strip() for x in open('./data/class.txt', encoding='UTF-8').readlines()]  # class names
    save_path = './saved_dict/' + '.ckpt'  # where a trained model would be saved (unused below)
    device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')  # device
    num_classes = len(class_list)  # number of classes
    num_epochs = 3  # number of epochs
    batch_size = 2  # mini-batch size
    pad_size = 32  # every title is padded/truncated to this length
    learning_rate = 5e-5  # learning rate
    bert_path = './model'
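    # assumption: ./model contains the converted pytorch_model.bin together with
    # config.json and vocab.txt, so from_pretrained can load both model and tokenizer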
    tokenizer = BertTokenizer.from_pretrained(bert_path)
    hidden_size = 768
    set_seed()
    print("Loading data...")
    train_data = TitleDataset(train_path, tokenizer, pad_size)
    dev_data = TitleDataset(dev_path, tokenizer, pad_size)
    # test_data = TitleDataset(test_path, tokenizer, pad_size)
    train_loader = DataLoader(train_data, batch_size, shuffle=True)
    dev_loader = DataLoader(dev_data, batch_size, shuffle=True)
    # test_loader = DataLoader(test_data, batch_size, shuffle=False)
    # train
    model = Model(bert_path, hidden_size, num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):
        train(model, train_loader, epoch)
        evaluate(model, dev_loader)
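To round things off, here is a way to try the trained model on a single title. This is only a sketch under the same assumptions as the script above (it reuses the module-level tokenizer, device and class_list; predict is a hypothetical helper, not part of the original repository):

def predict(model, text, pad_size=32):
    # preprocess exactly as TitleDataset.__getitem__ does
    token = [CLS] + tokenizer.tokenize(text)
    token_ids = tokenizer.convert_tokens_to_ids(token)
    if len(token_ids) < pad_size:
        mask = [1] * len(token_ids) + [0] * (pad_size - len(token_ids))
        token_ids += [0] * (pad_size - len(token_ids))
    else:
        mask = [1] * pad_size
        token_ids = token_ids[:pad_size]
    data = {'input_ids': torch.tensor([token_ids], device=device).long(),
            'mask': torch.tensor([mask], device=device).long()}
    model.eval()
    with torch.no_grad():
        out = model(data)  # shape [1, num_classes]
    return class_list[out.argmax(dim=1).item()]

# e.g. print(predict(model, '央行宣布下调存款准备金率'))  # prints the predicted class name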