Data and Model Preparation
Model: google-bert/bert-base-chinese (Hugging Face)
NER dataset: MSRA Chinese named entity recognition dataset
Text classification dataset: github.com
Named Entity Recognition
Named entity recognition (NER) with BERT, built on the Transformers library. The main steps are:
- Dataset preparation: choose a dataset suited to the NER task (e.g. CoNLL-03 or OntoNotes), with annotated entity labels such as person, location, and organization names.
- Data preprocessing: tokenize the text with the BERT tokenizer and convert it into the model's input format (input_ids and attention_mask). Annotate the entities in BIO (Begin-Inside-Outside) format, or another standard tagging scheme, and align the labels one-to-one with the input tokens; an illustrative data sample follows this list.
- Model selection: use a pre-trained BERT model (e.g. bert-base-uncased or chinese-bert-wwm) for the NER task. The assignment requires building on BertModel rather than BertForTokenClassification.
- Model training: set the training hyperparameters, define the loss function (e.g. cross-entropy), and optimize with AdamW. Train the BERT model on the training set and evaluate on the validation set to monitor its performance.
- Result analysis: track the loss, accuracy, precision, recall, and F1-score during training. Report the model's entity predictions on the test set, compare them against the gold labels, and discuss prediction quality and potential improvements.
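For reference, a minimal illustration of the file format that the NERDataset loader below expects in train.txt / dev.txt / test.txt: one character (or word) per line followed by its BIO tag, separated by whitespace, with a blank line between sentences. The sentence shown here is an invented placeholder, not taken from the MSRA data.

中 B-ORG
国 I-ORG
银 I-ORG
行 I-ORG
在 O
北 B-LOC
京 I-LOC
成 O
立 O

A blank line after the last tag marks the end of the sentence.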
Code
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, BertConfig
import numpy as np
from seqeval.metrics import classification_report
from tqdm import tqdm

# Configuration
class Config:
    def __init__(self):
        self.model_path = r"F:\Desktop\ai\BERT\model"  # local path of the pre-trained BERT model
        self.train_file = "data/train.txt"
        self.dev_file = "data/dev.txt"
        self.test_file = "data/test.txt"
        self.max_len = 128
        self.batch_size = 16
        self.epochs = 3
        self.lr = 2e-5
        self.label2id = {'O': 0, 'B-ORG': 1, 'I-ORG': 2, 'B-PER': 3, 'I-PER': 4, 'B-LOC': 5, 'I-LOC': 6}
        self.id2label = {v: k for k, v in self.label2id.items()}

config = Config()
# Data processing
class NERDataset(Dataset):
    def __init__(self, filename, tokenizer):
        self.data = []
        current_words = []
        current_labels = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line == "":
                    # A blank line marks the end of a sentence
                    if current_words:
                        self.add_entry(current_words, current_labels, tokenizer)
                        current_words = []
                        current_labels = []
                    continue
                parts = line.split()
                if len(parts) >= 2:
                    current_words.append(parts[0])
                    current_labels.append(parts[1])
        if current_words:  # handle the last sentence
            self.add_entry(current_words, current_labels, tokenizer)

    def add_entry(self, words, labels, tokenizer):
        tokens = []
        label_ids = []
        for word, label in zip(words, labels):
            word_tokens = tokenizer.tokenize(word)
            tokens.extend(word_tokens)
            # Sub-word label alignment: the first sub-token keeps the original label,
            # the following sub-tokens get the corresponding I- label (or O)
            label_ids.extend([config.label2id[label]] +
                             [config.label2id['I' + label[1:]] if label != 'O' else config.label2id['O']] *
                             (len(word_tokens) - 1))
        # Truncate, reserving room for [CLS] and [SEP]
        tokens = tokens[:config.max_len - 2]
        label_ids = label_ids[:config.max_len - 2]
        # Add the special tokens
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        label_ids = [config.label2id['O']] + label_ids + [config.label2id['O']]
        # Convert tokens to IDs
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_ids)
        # Pad to max_len
        padding_len = config.max_len - len(input_ids)
        input_ids += [0] * padding_len
        attention_mask += [0] * padding_len
        label_ids += [config.label2id['O']] * padding_len
        self.data.append({
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'labels': torch.tensor(label_ids)
        })

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
# Model definition
class BertForNER(nn.Module):
    def __init__(self, bert_model, num_labels):
        super().__init__()
        self.bert = bert_model
        self.classifier = nn.Linear(bert_model.config.hidden_size, num_labels)
        self.num_labels = num_labels

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        logits = self.classifier(sequence_output)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            # Compute the loss only over non-padding positions
            active_loss = attention_mask.view(-1) == 1
            active_logits = logits.view(-1, self.num_labels)[active_loss]
            active_labels = labels.view(-1)[active_loss]
            loss = loss_fct(active_logits, active_labels)
        return {'loss': loss, 'logits': logits}

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained(config.model_path)
bert_config = BertConfig.from_pretrained(config.model_path)
bert_model = BertModel.from_pretrained(config.model_path, config=bert_config)
model = BertForNER(bert_model, num_labels=len(config.label2id))
# Build the datasets
train_dataset = NERDataset(config.train_file, tokenizer)
dev_dataset = NERDataset(config.dev_file, tokenizer)
test_dataset = NERDataset(config.test_file, tokenizer)

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr)

# Training loop
for epoch in range(config.epochs):
    model.train()
    total_loss = 0
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs['loss']
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1} Average Loss: {avg_loss:.4f}")
# Evaluation
def evaluate(model, dataset):
    model.eval()
    all_preds = []
    all_labels = []
    dataloader = DataLoader(dataset, batch_size=config.batch_size)
    with torch.no_grad():
        for batch in dataloader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].cpu().numpy()
            outputs = model(**inputs)
            logits = outputs['logits'].cpu().numpy()
            preds = np.argmax(logits, axis=-1)
            for i in range(len(preds)):
                mask = batch['attention_mask'][i].cpu().numpy().astype(bool)
                valid_preds = [config.id2label[p] for p, m in zip(preds[i], mask) if m]
                valid_labels = [config.id2label[l] for l, m in zip(labels[i], mask) if m]
                # Drop the [CLS] and [SEP] positions before scoring
                all_preds.append(valid_preds[1:-1])
                all_labels.append(valid_labels[1:-1])
    # Return the raw label sequences for detailed analysis
    return all_labels, all_preds

print("\nValidation set results:")
dev_labels, dev_preds = evaluate(model, dev_dataset)
print(classification_report(dev_labels, dev_preds))

print("\nTest set results:")
test_labels, test_preds = evaluate(model, test_dataset)
print(classification_report(test_labels, test_preds))
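To make the test-set comparison in the result analysis concrete, the following is a minimal inference sketch that tags one raw sentence with the trained model; the predict_sentence helper and the example sentence are illustrative additions, not part of the assignment code.

def predict_sentence(text):
    """Tag one raw sentence and print each token with its predicted label."""
    model.eval()
    tokens = ['[CLS]'] + tokenizer.tokenize(text)[:config.max_len - 2] + ['[SEP]']
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)]).to(device)
    attention_mask = torch.ones_like(input_ids)
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask)['logits']
    pred_ids = logits.argmax(dim=-1)[0].cpu().tolist()
    # Skip the [CLS]/[SEP] positions when pairing tokens with labels
    for token, pred_id in zip(tokens[1:-1], pred_ids[1:-1]):
        print(token, config.id2label[pred_id])

predict_sentence("中国银行在北京成立。")  # hypothetical example sentence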
Results
The training loss fell from 0.40 to 0.048, showing good convergence on the training set, but the micro-averaged F1 on the validation/test sets is only 0.36, indicating substantial overfitting. PER entities perform best (F1 = 0.70) thanks to the regularity of personal titles; ORG entities show high recall but low precision (36% vs. 26%) due to interference from locational words; LOC entities are recognized worst (F1 = 0.11), affected by nested structures. Error analysis shows that 38% of organization errors stem from misreading locational words such as "中央", and 26% of location errors involve combinations of administrative divisions. Possible improvements include augmenting the data with a lexicon of organization suffixes, adding a BiLSTM-CRF layer to enforce label-transition constraints (see the sketch below), and adopting a curriculum-learning schedule that trains in stages, in order to improve generalization on complex entities.
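As a pointer for the BiLSTM-CRF improvement suggested above, here is a minimal sketch of how the classification head of BertForNER could be replaced; it assumes the third-party pytorch-crf package (torchcrf) is available, the hidden size lstm_hidden=256 is an arbitrary choice, and the model has not been tuned or validated on this dataset.

import torch
import torch.nn as nn
from torchcrf import CRF  # assumption: the pytorch-crf package is installed

class BertBiLSTMCRF(nn.Module):
    def __init__(self, bert_model, num_labels, lstm_hidden=256):
        super().__init__()
        self.bert = bert_model
        # A BiLSTM re-encodes the BERT sequence output before tagging
        self.lstm = nn.LSTM(bert_model.config.hidden_size, lstm_hidden,
                            batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(2 * lstm_hidden, num_labels)
        # The CRF layer learns label-transition scores, discouraging invalid
        # sequences such as I-ORG directly following B-PER
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        sequence_output = self.bert(input_ids, attention_mask=attention_mask).last_hidden_state
        lstm_out, _ = self.lstm(sequence_output)
        emissions = self.classifier(lstm_out)
        mask = attention_mask.bool()
        loss = None
        if labels is not None:
            # pytorch-crf returns a log-likelihood, so negate it to get a loss
            loss = -self.crf(emissions, labels, mask=mask, reduction='mean')
        # Viterbi decoding returns one list of label ids per sequence (padding excluded)
        return {'loss': loss, 'predictions': self.crf.decode(emissions, mask=mask)}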
Text Classification
Text classification with BERT, built on the Transformers library. The main steps are:
- Dataset preparation: choose a dataset suited to text classification (e.g. sentiment analysis or news topic classification); an illustrative line of the expected input format follows this list.
- Data preprocessing: tokenize the text and convert it into the model's input format, in particular input_ids and attention_mask.
- Model selection: use a pre-trained BERT model (e.g. chinese-bert-wwm, which supports Chinese). The assignment requires building on BertModel rather than BertForSequenceClassification.
- Model training: set the training hyperparameters, define the loss function, train the model, and evaluate it on the validation set.
- Result analysis: analyze the training loss, accuracy, and prediction results.
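For reference, a sketch of the line format in data.txt that the ClassificationDataset loader below parses: fields are separated by "_!_", the third field is the category label, the fourth is the text, and an optional non-empty fifth field holds comma-separated keywords that get appended to the text. The placeholder values are invented, not real dataset entries.

<news_id>_!_<category_code>_!_news_culture_!_<headline text>_!_<keyword1>,<keyword2>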
Code
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, BertConfig
from sklearn.metrics import classification_report
from tqdm import tqdm

# Configuration
class Config:
    def __init__(self):
        self.model_path = r"F:\Desktop\ai\BERT\model"  # local path of the pre-trained BERT model
        self.data_file = "data/data.txt"  # data path
        self.max_len = 128
        self.batch_size = 16
        self.epochs = 3
        self.lr = 2e-5
        self.label2id = {
            'news_story': 0,
            'news_culture': 1,
            'news_entertainment': 2,
            'news_sports': 3,
            'news_finance': 4,
            'news_house': 5,
            'news_car': 6,
            'news_edu': 7,
            'news_tech': 8,
            'news_military': 9,
            'news_travel': 10,
            'news_world': 11,
            'stock': 12,
            'news_agriculture': 13,
            'news_game': 14
        }
        self.id2label = {v: k for k, v in self.label2id.items()}

config = Config()
# Dataset class
class ClassificationDataset(Dataset):
    def __init__(self, filename, tokenizer):
        self.data = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                parts = line.split('_!_')
                if len(parts) < 4:
                    continue
                label = parts[2]
                text = parts[3]
                if len(parts) >= 5 and parts[4]:  # append the keyword field to the text
                    text += " " + parts[4].replace(',', ' ')
                encoding = tokenizer.encode_plus(
                    text,
                    max_length=config.max_len,
                    padding='max_length',
                    truncation=True,
                    return_tensors='pt'
                )
                self.data.append({
                    'input_ids': encoding['input_ids'].squeeze(0),
                    'attention_mask': encoding['attention_mask'].squeeze(0),
                    'labels': torch.tensor(config.label2id[label], dtype=torch.long)
                })

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
# BERT classification model
class BertForClassification(nn.Module):
    def __init__(self, bert_model, num_labels):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # hidden state of the [CLS] token
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)
        return {'loss': loss, 'logits': logits}

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained(config.model_path)
bert_config = BertConfig.from_pretrained(config.model_path)
bert_model = BertModel.from_pretrained(config.model_path, config=bert_config)
model = BertForClassification(bert_model, num_labels=len(config.label2id))
# Build the dataset and split it 80/20 into train/validation
dataset = ClassificationDataset(config.data_file, tokenizer)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr)

# Training loop
for epoch in range(config.epochs):
    model.train()
    total_loss = 0
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, labels=labels)
        loss = outputs['loss']
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1} Average Loss: {avg_loss:.4f}")
# Evaluation
def evaluate(model, dataset):
    model.eval()
    all_preds = []
    all_labels = []
    dataloader = DataLoader(dataset, batch_size=config.batch_size)
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs['logits'], dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels)
    # Restrict the report to the labels that actually occur
    present_labels = sorted(set(all_labels + all_preds))
    target_names = [config.id2label[label] for label in present_labels]
    print(classification_report(
        all_labels,
        all_preds,
        labels=present_labels,
        target_names=target_names,
        zero_division=0
    ))

print("\nValidation set results:")
evaluate(model, val_dataset)
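A minimal single-text prediction sketch mirroring the evaluation loop above; the predict_text helper and the sample headline are hypothetical additions for illustration.

def predict_text(text):
    """Classify one raw text string and return the predicted category name."""
    model.eval()
    encoding = tokenizer.encode_plus(text, max_length=config.max_len,
                                     padding='max_length', truncation=True,
                                     return_tensors='pt')
    with torch.no_grad():
        logits = model(encoding['input_ids'].to(device),
                       encoding['attention_mask'].to(device))['logits']
    return config.id2label[logits.argmax(dim=1).item()]

print(predict_text("新款手机发布,搭载新一代处理器"))  # hypothetical headline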
Results
Training went well: the loss decreased steadily over the three epochs (1.2195 → 0.2354) and overall validation accuracy reached 82%. Large categories such as culture (F1 0.88) and education (F1 0.93) perform very well, but small-sample categories such as military (recall 0.65) and games (recall 0.69) suffer from missed detections, and finance is the weakest overall (F1 0.69). Future work should add more samples for the small categories or apply class-weight adjustment (see the sketch below) to improve recognition of long-tail categories.
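As a pointer for the class-weight adjustment suggested above, a minimal sketch that derives inverse-frequency weights from the training split and passes them to CrossEntropyLoss; the inverse-frequency weighting scheme is an assumption rather than something prescribed by the assignment, and the weighted loss would replace the default one inside BertForClassification.forward.

from collections import Counter

# Count how often each class occurs in the training split
label_counts = Counter(int(sample['labels']) for sample in train_dataset)
num_classes = len(config.label2id)
total = sum(label_counts.values())

# Inverse-frequency weights; classes absent from the split fall back to weight 1.0
weights = torch.tensor(
    [total / (num_classes * label_counts[i]) if label_counts[i] > 0 else 1.0
     for i in range(num_classes)],
    dtype=torch.float
).to(device)

loss_fct = nn.CrossEntropyLoss(weight=weights)  # weighted replacement for the plain loss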