#23 Counter Cache Column

If you need to display the record count for a has_many association, you can improve performance by caching that number in a column.
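Consider a projects index page that lists each project along with its number of tasks. Without a cache, every call to project.tasks.size issues a separate SELECT COUNT(*) query, one per project rendered. A minimal sketch of the uncached view (the @projects variable and the name attribute are assumed for illustration):

<!-- projects/index.rhtml (before) -->
<% for project in @projects %>
  <p><%= project.name %>: <%= pluralize project.tasks.size, 'task' %></p>
<% end %>

To cache the count instead, first add a tasks_count column to the projects table with a migration that also backfills existing records: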
# migrations/006_add_tasks_count.rb
class AddTasksCount < ActiveRecord::Migration
  def self.up
    add_column :projects, :tasks_count, :integer, :default => 0

    # Pick up the new column, then backfill the counter for existing rows.
    # Note tasks.length, not tasks.size: length loads and counts the actual
    # task records, while size would read the brand-new (still zero) cache column.
    Project.reset_column_information
    Project.find(:all).each do |p|
      Project.update_counters p.id, :tasks_count => p.tasks.length
    end
  end

  def self.down
    remove_column :projects, :tasks_count
  end
end
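Run the migration as usual:

rake db:migrate

Next, enable the counter cache on the belongs_to side of the association: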

# models/task.rb
class Task < ActiveRecord::Base
  belongs_to :project, :counter_cache => true
end
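With :counter_cache => true, Rails increments and decrements projects.tasks_count automatically whenever a task is created or destroyed, so no further bookkeeping code is needed. A hypothetical script/console session (the record ID and counts are made up for illustration):

>> project = Project.find(1)
>> project.tasks_count
=> 5
>> project.tasks.create(:name => 'New task')
>> project.reload.tasks_count    # incremented by the counter cache callback
=> 6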
The view can stay exactly as it was:

<!-- projects/index.rhtml -->
<%= pluralize project.tasks.size, 'task' %>

The trick is that size, unlike length, reads the counter cache column when one is available, so this line no longer triggers a COUNT query. Reload the index page and the per-project COUNT queries disappear from the log.