SSD (Single Shot MultiBox Detector): Plotting Training Loss and Accuracy Curves

This post walks through plotting the loss and accuracy curves of an SSD training run from the Caffe/SSD training log, using Caffe's bundled log-parsing tool together with gnuplot for data processing and visualization, so you can see at a glance how training is going.

For plotting loss/accuracy curves with standard Caffe, see the very detailed post《Caffe 绘制训练过程loss,accuracy曲线》. For SSD the job is a little simpler, because the SSD training script already redirects the log output for you: during training, the log is written to a file under the jobs folder.
So all we need to do is plot the training loss and accuracy curves directly from that log file.
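For the stock VOC2007 SSD_300x300 example, the log ends up under a path like the one below; a quick way to check (assuming the usual directory layout the SSD scripts create under $ssd_root):

ls $ssd_root/jobs/VGGNet/VOC2007/SSD_300x300/*.log
# -> VGG_VOC2007_SSD_300x300.log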

Parsing the log into data files

Caffe ships a log-parsing tool in tools/extra.
In $ssd_root/tools/extra, run (substitute the log file name for your own run):

./parse_log.sh ../../jobs/VGGNet/VOC2007/SSD_300x300/VGG_VOC2007_SSD_300x300.log 

This generates two parsed files in the current directory: VGG_VOC2007_SSD_300x300.log.test and VGG_VOC2007_SSD_300x300.log.train.
Both are plain whitespace-separated tables.
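As a rough illustration of the format (the columns match the fields documented in plot_log.gnuplot below; the numbers here are made up purely for illustration):

head -n 4 VGG_VOC2007_SSD_300x300.log.train
#Iters Seconds TrainingLoss LearningRate
0 3.1 21.74 0.001
10 12.4 18.32 0.001
20 21.6 16.95 0.001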

Adjusting the gnuplot settings

In $ssd_root/tools/extra, run:

cp plot_log.gnuplot.example plot_log.gnuplot

Plotting a single train-loss curve

Work on the copy plot_log.gnuplot made above from plot_log.gnuplot.example; the main change is around line 44 of that file.
(Screenshot: rabbitvcs diff between plot_log.gnuplot.example and the edited plot_log.gnuplot)
With the original example comments removed, here is the full content of plot_log.gnuplot, with annotations:

reset
set terminal png
# output file name; no need to change it
set output "VGG_VOC2007_SSD_300x300.png"
set style data lines
set key right

###### Fields in the data file your_log_name.log.train are
###### Iters Seconds TrainingLoss LearningRate

# Training loss vs. training iterations
# plot title
set title "SSD Training loss vs. training iterations"
# x and y axis labels
set xlabel "Training iterations"
set ylabel "Training loss"
# draw the curve; "title" sets the legend entry for the curve
# "using 1:3" takes columns 1 and 3 of the .log.train file as x and y
# VGG_VOC2007_SSD_300x300.log.train is the parsed file generated in the previous step
plot "VGG_VOC2007_SSD_300x300.log.train" using 1:3 title "train loss"

In $ssd_root/tools/extra, run:

gnuplot plot_log.gnuplot

This generates VGG_VOC2007_SSD_300x300.png in the current folder.
(Figure: the resulting training-loss curve, VGG_VOC2007_SSD_300x300.png)
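Training an SSD model takes a while, so it is convenient to refresh the picture from time to time. A minimal sketch, run from $ssd_root/tools/extra, using the same commands as above:

#!/usr/bin/env bash
# Re-parse the current training log and redraw the loss curve.
# Rerun this whenever you want an up-to-date picture during training.
LOG=../../jobs/VGGNet/VOC2007/SSD_300x300/VGG_VOC2007_SSD_300x300.log
./parse_log.sh "$LOG"
gnuplot plot_log.gnuplot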

Plotting loss and accuracy curves together

One way to get both curves into a single picture is to keep the loss on the left y axis and put the test accuracy (the detection_eval value that parse_log.sh writes into the .log.test file) on a second y axis, for example:

reset
set terminal png
# output file name (same as above; rename it if you want to keep both pictures)
set output "VGG_VOC2007_SSD_300x300.png"
set style data lines
set key right

###### Fields in the data file your_log_name.log.train are
###### Iters Seconds TrainingLoss LearningRate
###### Fields in the data file your_log_name.log.test are
###### Iters Seconds TestAccuracy TestLoss

# Training loss and test accuracy vs. training iterations
set title "SSD Training loss / test accuracy vs. training iterations"
# loss on the left y axis, accuracy on the right y axis
set xlabel "Training iterations"
set ylabel "Training loss"
set y2label "Test accuracy (detection_eval)"
set y2tics
# "using 1:3" of .log.train is (Iters, TrainingLoss)
# "using 1:3" of .log.test is (Iters, TestAccuracy), which for SSD holds detection_eval
plot "VGG_VOC2007_SSD_300x300.log.train" using 1:3 title "train loss" axes x1y1, \
     "VGG_VOC2007_SSD_300x300.log.test" using 1:3 title "test accuracy" axes x1y2

(Figure: the resulting loss/accuracy plot)
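If you only need a quick look at the accuracy numbers, you can also pull them straight out of the raw log without plotting. A rough one-liner, run from $ssd_root/tools/extra and assuming the usual SSD test output lines of the form "Test net output #0: detection_eval = ...":

grep 'detection_eval = ' ../../jobs/VGGNet/VOC2007/SSD_300x300/VGG_VOC2007_SSD_300x300.log | awk '{print $NF}'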

References

《Caffe 绘制训练过程loss,accuracy曲线》
