基于YOLOv3-Tiny 的智能门铃的人体检测模型的实现（上）

技术与健康

于 2025-07-12 09:36:19 发布

阅读量656

点赞数 3

CC 4.0 BY-SA版权

文章标签： YOLO pytorch

本文为博主原创文章，未经博主允许不得转载。

本文链接：https://blog.youkuaiyun.com/Practicer2015/article/details/149289180

【边缘AI】如何构建智能门铃的人体检测模型这篇文章我们分享了模型的架构和设计，今天我们基于 YOLOv3-Tiny 的来开发这个模型。包括模型定义、数据集加载、训练循环、评估以及量化感知训练 (QAT) 的集成。

考虑到代码的完整性和可读性，我将代码分成几个核心部分：

配置 (Config)
模型定义 (Model Definition)
数据集与数据加载器 (Dataset & Dataloader)
工具函数 (Utility Functions)
训练脚本 (Training Script)
量化感知训练 (QAT) 集成
评估 (Evaluation)

前提条件

在运行代码之前，请确保你已经安装了必要的库：

Bash

pip install torch torchvision numpy opencv-python matplotlib tqdm Pillow

1. 配置 (Config)

我们首先定义一些常用的配置参数。

# config.py
import torch
import os

class Config:
    # 路径配置
    DATA_DIR = "data/doorbell_person_dataset"  # 你的数据集根目录
    TRAIN_IMG_DIR = os.path.join(DATA_DIR, "images/train")
    TRAIN_LABEL_DIR = os.path.join(DATA_DIR, "labels/train")
    VAL_IMG_DIR = os.path.join(DATA_DIR, "images/val")
    VAL_LABEL_DIR = os.path.join(DATA_DIR, "labels/val")
    ANCHORS = [
        [(10, 13), (16, 30), (33, 23)],  # P3/8 (针对小目标)
        [(30, 61), (62, 45), (59, 119)], # P4/16 (针对中等目标)
        [(116, 90), (156, 198), (373, 326)], # P5/32 (针对大目标)
    ] # 注意：这些是针对 COCO 的默认锚框，需要根据你的数据集进行聚类分析
    NUM_CLASSES = 1 # 只有“人”一个类别
    
    # 训练配置
    LEARNING_RATE = 1e-3
    BATCH_SIZE = 16
    NUM_EPOCHS = 100 # 初始轮次，可能需要更多
    IMAGE_SIZE = 416 # 输入图像尺寸 (YOLOv3-Tiny 通常使用 416x416 或 320x320)
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    SAVE_MODEL = True
    LOAD_MODEL = False
    MODEL_PATH = "yolov3_tiny_person.pth.tar"
    LOAD_MODEL_PATH = "pretrained_yolov3_tiny.pth.tar" # 如果有预训练模型

    # 数据增强配置
    AUGMENTATION_CONFIG = {
        "hflip_prob": 0.5,
        "vflip_prob": 0.0,
        "brightness": 0.2,
        "contrast": 0.2,
        "saturation": 0.2,
        "hue": 0.05,
        "noise_prob": 0.1,
        "cutout_prob": 0.1, # Cutout 数据增强
    }

    # 评估配置
    CONF_THRESHOLD = 0.05 # 用于NMS的初始置信度阈值
    NMS_IOU_THRESH = 0.5 # NMS的IoU阈值
    MAP_IOU_THRESH = 0.5 # 计算mAP的IoU阈值
    
    # 量化配置
    QUANT_MODE = False # 是否开启量化感知训练
    NUM_CALIBRATION_BATCHES = 10 # QAT 校准批次数量

2. 模型定义 (Model Definition)

我们将定义 YOLOv3-Tiny 的骨干网络和头部。

# model.py
import torch
import torch.nn as nn

"""
YOLOv3-Tiny 的实现
主要模块:
1. ConvBlock: 卷积层 + BatchNorm + LeakyReLU
2. YOLOLayer: YOLO检测头，处理输出特征图并生成边界框预测
"""

class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, use_bn=True, activation=True):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=not use_bn)
        self.bn = nn.BatchNorm2d(out_channels) if use_bn else None
        self.activation = nn.LeakyReLU(0.1) if activation else None

    def forward(self, x):
        x = self.conv(x)
        if self.bn is not None:
            x = self.bn(x)
        if self.activation is not None:
            x = self.activation(x)
        return x

class YOLOLayer(nn.Module):
    def __init__(self, anchors, num_classes, img_size):
        super().__init__()
        self.anchors = torch.tensor(anchors, dtype=torch.float32, requires_grad=False)
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        self.ignore_thres = 0.5 # IoU阈值，低于此值的预测不计入损失
        self.mse_loss = nn.MSELoss()
        self.bce_loss = nn.BCEWithLogitsLoss()
        self.obj_scale = 1 # 目标置信度损失权重
        self.noobj_scale = 100 # 无目标置信度损失权重 (YOLOv3-Tiny 常用更高无目标权重)
        self.metrics = {}
        self.img_size = img_size

    def forward(self, x, targets=None):
        # x: (batch_size, num_anchors * (5 + num_classes), grid_h, grid_w)
        num_batches = x.size(0)
        grid_h, grid_w = x.size(2), x.size(3)
        
        # 将输出调整为 (batch_size, num_anchors, grid_h, grid_w, 5 + num_classes)
        prediction = (
            x.view(num_batches, self.num_anchors, self.num_classes + 5, grid_h, grid_w)
            .permute(0, 1, 3, 4, 2)
            .contiguous()
        )

        # 提取预测
        x = torch.sigmoid(prediction[..., 0])  # x
        y = torch.sigmoid(prediction[..., 1])  # y
        w = prediction[..., 2]                 # w
        h = prediction[..., 3]                 # h
        conf = torch.sigmoid(prediction[..., 4]) # 置信度
        pred_cls = torch.sigmoid(prediction[..., 5:]) # 类别概率 (这里是1个类别)

        # 计算网格坐标
        grid_x = torch.arange(grid_w, device=x.device).repeat(grid_h, 1).view([1, 1, grid_h, grid_w]).type(torch.float32)
        grid_y = torch.arange(grid_h, device=x.device).repeat(grid_w, 1).t().view([1, 1, grid_h, grid_w]).type(torch.float32)
        
        # 锚框尺寸 (相对网格)
        scaled_anchors = self.anchors.view(1, self.num_anchors, 1, 1, 2).to(x.device) / (self.img_size / grid_h)

        # 调整边界框中心和尺寸到图像坐标
        pred_boxes = torch.empty_like(prediction[..., :4])
        pred_boxes[..., 0] = x + grid_x
        pred_boxes[..., 1] = y + grid_y
        pred_boxes[..., 2] = torch.exp(w) * scaled_anchors[..., 0]
        pred_boxes[..., 3] = torch.exp(h) * scaled_anchors[..., 1]
        
        # 将预测调整到 (num_batches, num_anchors * grid_h * grid_w, 5 + num_classes)
        output = torch.cat(
            (pred_boxes.view(num_batches, -1, 4) * (self.img_size / grid_h), # 将相对网格坐标转换为绝对图像坐标
             conf.view(num_batches, -1, 1),
             pred_cls.view(num_batches, -1, self.num_classes)),
            -1,
        )

        if targets is None:
            return output
        else:
            # 计算损失
            # targets: (num_targets, 6) -> (batch_idx, class, x, y, w, h) (归一化)
            
            # 创建目标张量 (所有目标都初始化为0)
            obj_mask = torch.zeros(num_batches, self.num_anchors, grid_h, grid_w, dtype=torch.bool, device=x.device)
            noobj_mask = torch.ones(num_batches, self.num_anchors, grid_h, grid_w, dtype=torch.bool, device=x.device)
            class_mask = torch.zeros(num_batches, self.num_anchors, grid_h, grid_w, dtype=torch.float32, device=x.device)
            iou_scores = torch.zeros(num_batches, self.num_anchors, grid_h, grid_w, dtype=torch.float32, device=x.device)
            tx = torch.zeros(num_batches, self.num_anchors, grid_h, grid_w, dtype=torch.float32, device=x.device)
            ty = torch.zeros(num_batches, self.num_anchors, grid_h, grid_w, dtype=torch.float32, device=x.device)
            tw = torch.zeros(num_batches, self.num_anchors, grid_h, grid_w, dtype=torch.float32, device=x.device)
            th = torch.zeros(num_batches, self.num_anchors, grid_h, grid_w, dtype=torch.float32, device=x.device)
            tconf = torch.zeros(num_batches, self.num_anchors, grid_h, grid_w, dtype=torch.float32, device=x.device)
            tcls = torch.zeros(num_batches, self.num_anchors, grid_h, grid_w, self.num_classes, dtype=torch.float32, device=x.device)

            # 将目标框归一化到网格尺度
            # (target_x, target_y, target_w, target_h) = (center_x, center_y, width, height)
            # targets 维度: (batch_idx, class_idx, x_norm, y_norm, w_norm, h_norm)
            # 将归一化坐标转换为当前特征图的网格坐标
            target_boxes = targets[:, 2:] * self.img_size # 转换为绝对图像坐标
            gxy = target_boxes[:, 0:2] * (grid_h / self.img_size) # 转换为网格中心坐标
            gwh = target_boxes[:, 2:4] * (grid_h / self.img_size) # 转换为网格宽高
            
            # 获取目标在网格内的整数坐标和浮点坐标
            gij = gxy.long()
            gi, gj = gij[:, 0], gij[:, 1] # gi: x索引, gj: y索引

            # 选择最佳锚框
            # 计算每个目标框与所有锚框的 IoU
            ious_target_anchors = bbox_wh_iou(gwh, scaled_anchors.view(-1, 2))
            best_n = torch.argmax(ious_target_anchors, 1) # 对每个目标，找到 IoU 最大的锚框

            num_targets = targets.size(0)
            b = targets[:, 0].long() # batch_idx
            target_labels = targets[:, 1].long() # class_idx
            
            for i in range(num_targets):
                n = best_n[i] # 最佳锚框索引
                
                # 检查索引是否越界 (可能因为目标尺寸与锚框不匹配)
                if gj[i] < grid_h and gi[i] < grid_w:
                    obj_mask[b[i], n, gj[i], gi[i]] = 1
                    noobj_mask[b[i], n, gj[i], gi[i]] = 0 # 有目标的网格不再是无目标
                    
                    # 目标在网格内的偏移
                    tx[b[i], n, gj[i], gi[i]] = gxy[i, 0] - gi[i].type(torch.float32)
                    ty[b[i], n, gj[i], gi[i]] = gxy[i, 1] - gj[i].type(torch.float32)
                    
                    # 目标宽高 (相对于锚框)
                    tw[b[i], n, gj[i], gi[i]] = torch.log(gwh[i, 0] / scaled_anchors[0, n, 0, 0])
                    th[b[i], n, gj[i], gi[i]] = torch.log(gwh[i, 1] / scaled_anchors[0, n, 0, 1])
                    
                    # 置信度目标
                    tconf[b[i], n, gj[i], gi[i]] = 1
                    
                    # 类别目标 (One-hot 编码)
                    tcls[b[i], n, gj[i], gi[i], target_labels[i]] = 1
            
            # 计算 IoU
            # pred_boxes (x, y, w, h)
            # targets_on_grid (batch_idx, anchor_idx, grid_y, grid_x, target_x, target_y, target_w, target_h)
            
            # 将所有预测框展开
            all_pred_boxes = pred_boxes[obj_mask == 1]
            all_target_boxes = torch.stack((
                tx[obj_mask == 1] + gi[best_n][obj_mask == 1], # gx
                ty[obj_mask == 1] + gj[best_n][obj_mask == 1], # gy
                torch.exp(tw[obj_mask == 1]) * scaled_anchors[0, best_n, 0, 0][obj_mask == 1], # gw
                torch.exp(th[obj_mask == 1]) * scaled_anchors[0, best_n, 0, 1][obj_mask == 1], # gh
            ), dim=1)
            
            # 将所有预测框和目标框转换为 (x1, y1, x2, y2) 格式进行 IoU 计算
            # 注意：这里需要确保坐标系一致，我们将它们都转换为相对于网格的绝对坐标
            all_pred_boxes_xywh = torch.stack((
                (all_pred_boxes[:, 0] - grid_x[obj_mask == 1]),
                (all_pred_boxes[:, 1] - grid_y[obj_mask == 1]),
                all_pred_boxes[:, 2],
                all_pred_boxes[:, 3],
            ), dim=1) # 转换回预测的原始形式 (相对于网格左上角)

            all_target_boxes_xywh = torch.stack((
                tx[obj_mask == 1], ty[obj_mask == 1],
                torch.exp(tw[obj_mask == 1]) * scaled_anchors[0, best_n, 0, 0][obj_mask == 1],
                torch.exp(th[obj_mask == 1]) * scaled_anchors[0, best_n, 0, 1][obj_mask == 1],
            ), dim=1)
            
            # 使用 IoU 计算损失
            iou_loss = bbox_giou_loss(xywh2xyxy(all_pred_boxes_xywh), xywh2xyxy(all_target_boxes_xywh))

            # 置信度损失 (有目标和无目标)
            loss_conf_obj = self.bce_loss(conf[obj_mask == 1], tconf[obj_mask == 1])
            loss_conf_noobj = self.bce_loss(conf[noobj_mask == 1], tconf[noobj_mask == 1])
            loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
            
            # 类别损失
            loss_cls = self.bce_loss(pred_cls[obj_mask == 1], tcls[obj_mask == 1])

            # 总损失
            total_loss = iou_loss + loss_conf + loss_cls

            # 记录指标 (可选)
            self.metrics["iou_loss"] = iou_loss.item()
            self.metrics["conf_loss"] = loss_conf.item()
            self.metrics["cls_loss"] = loss_cls.item()
            self.metrics["total_loss"] = total_loss.item()
            
            return output, total_loss

class YOLOv3Tiny(nn.Module):
    def __init__(self, in_channels=3, num_classes=1, config_anchors=None):
        super().__init__()
        if config_anchors is None:
            raise ValueError("Anchors must be provided in Config.")
        
        self.num_classes = num_classes
        self.img_size = Config.IMAGE_SIZE

        # Darknet-Tiny Backbone
        self.features = nn.Sequential(
            ConvBlock(in_channels, 16, 3, 1, 1), # 0
            nn.MaxPool2d(2, 2), # 1
            ConvBlock(16, 32, 3, 1, 1), # 2
            nn.MaxPool2d(2, 2), # 3
            ConvBlock(32, 64, 3, 1, 1), # 4
            nn.MaxPool2d(2, 2), # 5
            ConvBlock(64, 128, 3, 1, 1), # 6
            nn.MaxPool2d(2, 2), # 7
            ConvBlock(128, 256, 3, 1, 1), # 8 -> P4/16 (route to concatenate)
            nn.MaxPool2d(2, 2), # 9
            ConvBlock(256, 512, 3, 1, 1), # 10
            nn.ZeroPad2d((0, 1, 0, 1)), # PaddedMaxPool (equivalent to max_pool_wh in Darknet)
            nn.MaxPool2d(2, 1), # 11 (stride=1 to avoid too much downsampling)
            ConvBlock(512, 1024, 3, 1, 1), # 12 (Darknet-Tiny v3 has 1024 here, but usually it's 512)
            ConvBlock(1024, 256, 1, 1, 0), # 13
            ConvBlock(256, 512, 3, 1, 1), # 14
        )

        # Detection Head 1 (Large objects - P5/32 scale)
        self.head1_conv = ConvBlock(512, len(config_anchors[2]) * (5 + num_classes), 1, 1, 0, use_bn=False, activation=False)
        self.yolo_layer1 = YOLOLayer(config_anchors[2], num_classes, Config.IMAGE_SIZE)

        # Detection Head 2 (Medium objects - P4/16 scale, using route from layer 8)
        self.head2_conv1 = ConvBlock(256, 128, 1, 1, 0) # 从layer 8的256特征图，通过1x1卷积降维
        self.head2_upsample = nn.Upsample(scale_factor=2, mode="nearest") # 上采样回 P4/16 尺寸

        # 将 P4/16 的特征图 (来自layer 8) 和上采样后的 P5/32 特征图拼接
        # 256 (from layer 8) + 128 (upsampled) = 384
        self.head2_conv2 = ConvBlock(128 + 256, 256, 3, 1, 1) # 拼接后的卷积
        self.head2_output = ConvBlock(256, len(config_anchors[1]) * (5 + num_classes), 1, 1, 0, use_bn=False, activation=False)
        self.yolo_layer2 = YOLOLayer(config_anchors[1], num_classes, Config.IMAGE_SIZE)

    def forward(self, x, targets=None):
        is_training = targets is not None
        outputs = []
        losses = []

        # Darknet-Tiny forward pass
        for i, layer in enumerate(self.features):
            x = layer(x)
            if i == 8: # 保存 P4/16 的特征图 (从第8层输出)
                route_p4 = x
        
        # P5/32 scale detection
        out_p5 = self.head1_conv(x)
        if is_training:
            output_p5, loss_p5 = self.yolo_layer1(out_p5, targets)
            losses.append(loss_p5)
        else:
            output_p5 = self.yolo_layer1(out_p5)
        outputs.append(output_p5)

        # P4/16 scale detection
        x = self.head2_conv1(x)
        x = self.head2_upsample(x)
        
        # 确保通道数匹配 before concatenation
        # 如果 route_p4 的通道数是 256, x 的通道数是 128, 拼接后 384
        x = torch.cat([x, route_p4], 1)
        
        x = self.head2_conv2(x)
        out_p4 = self.head2_output(x)

        if is_training:
            output_p4, loss_p4 = self.yolo_layer2(out_p4, targets)
            losses.append(loss_p4)
        else:
            output_p4 = self.yolo_layer2(out_p4)
        outputs.append(output_p4)
        
        if is_training:
            return torch.cat(outputs, 1), sum(losses)
        else:
            return torch.cat(outputs, 1) # (batch_size, total_predictions, 5 + num_classes)

# 用于 IoU 和 GIoU 损失计算的辅助函数
def bbox_iou(box1, box2, x1y1x2y2=True):
    """
    Returns the IoU of two bounding boxes
    """
    if not x1y1x2y2:
        # Transform from center coords to x1y1x2y2
        b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
        b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
        b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
        b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
    else:
        b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
        b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

    # Intersection area
    inter_area = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \
                 (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)

    # Union Area
    b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
    b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)

    iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

    return iou

def bbox_wh_iou(wh1, wh2):
    # wh1: (N, 2), wh2: (M, 2)
    wh1 = wh1.unsqueeze(1) # (N, 1, 2)
    wh2 = wh2.unsqueeze(0) # (1, M, 2)
    
    inter_area = torch.min(wh1[..., 0], wh2[..., 0]) * torch.min(wh1[..., 1], wh2[..., 1])
    union_area = wh1[..., 0] * wh1[..., 1] + wh2[..., 0] * wh2[..., 1] - inter_area + 1e-16
    return inter_area / union_area

def bbox_giou_loss(pred_boxes, target_boxes):
    """
    Calculates the GIOU loss between predicted and target bounding boxes.
    pred_boxes, target_boxes: (N, 4) in (x1, y1, x2, y2) format.
    """
    if pred_boxes.shape[0] == 0:
        return torch.tensor(0.0, device=pred_boxes.device)
    
    # Calculate IoU
    iou = bbox_iou(pred_boxes, target_boxes)

    # C coordinates
    c_x1 = torch.min(pred_boxes[:, 0], target_boxes[:, 0])
    c_y1 = torch.min(pred_boxes[:, 1], target_boxes[:, 1])
    c_x2 = torch.max(pred_boxes[:, 2], target_boxes[:, 2])
    c_y2 = torch.max(pred_boxes[:, 3], target_boxes[:, 3])

    # C area
    c_area = (c_x2 - c_x1) * (c_y2 - c_y1) + 1e-16

    # Union Area
    b1_area = (pred_boxes[:, 2] - pred_boxes[:, 0]) * (pred_boxes[:, 3] - pred_boxes[:, 1])
    b2_area = (target_boxes[:, 2] - target_boxes[:, 0]) * (target_boxes[:, 3] - target_boxes[:, 1])
    union = b1_area + b2_area - iou * b1_area # 这里 IoU 是按 b1_area 计算的，所以要修正

    # Original IoU calculation without explicit union area
    inter_area = (torch.min(pred_boxes[:, 2], target_boxes[:, 2]) - torch.max(pred_boxes[:, 0], target_boxes[:, 0])).clamp(0) * \
                 (torch.min(pred_boxes[:, 3], target_boxes[:, 3]) - torch.max(pred_boxes[:, 1], target_boxes[:, 1])).clamp(0)
    union = b1_area + b2_area - inter_area + 1e-16
    iou = inter_area / union

    giou = iou - (c_area - union) / c_area
    return (1 - giou).mean()

def xywh2xyxy(x):
    # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
    y = x.clone()
    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
    return y

3. 数据集与数据加载器 (Dataset & Dataloader)

你需要将你的数据集组织成以下结构：

data/doorbell_person_dataset/
├── images/
│   ├── train/
│   │   ├── 000001.jpg
│   │   └── ...
│   └── val/
│       └── ...
└── labels/
    ├── train/
    │   ├── 000001.txt  (YOLO格式: class_id x_center y_center width height)
    │   └── ...
    └── val/
        └── ...

标注文件格式 (YOLO格式)：每个图像对应一个 .txt 文件，内容如下：

<class_id> <x_center> <y_center> <width> <height>
<class_id> <x_center> <y_center> <width> <height>
...

其中所有坐标都是归一化到 [0, 1] 的。对于只有“人”一个类别，class_id 始终是 0。

# dataset.py
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import os
from PIL import Image
import numpy as np
import cv2 # 用于图像处理和数据增强

class YOLODataset(Dataset):
    def __init__(self, img_dir, label_dir, anchors, img_size=416, num_classes=1, is_train=True, augment_config=None):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.img_size = img_size
        self.num_classes = num_classes
        self.anchors = anchors
        self.is_train = is_train
        self.augment_config = augment_config if augment_config else {}

        self.img_files = [f for f in os.listdir(img_dir) if f.endswith(('.jpg', '.png', '.jpeg'))]
        self.label_files = [f.replace('.jpg', '.txt').replace('.png', '.txt').replace('.jpeg', '.txt') for f in self.img_files]
        
        # 图像变换 (不包含数据增强的部分，增强在 __getitem__ 中手动处理)
        self.transform = T.Compose([
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), # ImageNet 标准化
        ])

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, index):
        img_path = os.path.join(self.img_dir, self.img_files[index])
        label_path = os.path.join(self.label_dir, self.label_files[index])

        image = Image.open(img_path).convert("RGB")
        original_w, original_h = image.size

        # 加载标签 (YOLO格式: class_id x_center y_center width height)
        # 这些坐标是归一化到 [0, 1] 的
        boxes = []
        if os.path.exists(label_path) and os.path.getsize(label_path) > 0:
            with open(label_path, 'r') as f:
                for line in f.readlines():
                    parts = list(map(float, line.strip().split()))
                    class_id = int(parts[0])
                    x_center, y_center, width, height = parts[1:]
                    boxes.append([class_id, x_center, y_center, width, height])
        
        boxes = torch.tensor(boxes, dtype=torch.float32)

        # 数据增强 (仅在训练模式下)
        if self.is_train and self.augment_config:
            image, boxes = self._augment_image_and_boxes(image, boxes, original_w, original_h)

        # 调整图像大小
        image = image.resize((self.img_size, self.img_size))
        image = self.transform(image)
        
        # 返回图片和边界框 (class_id, x_center, y_center, width, height) 归一化到 [0, 1]
        return image, boxes

    def _augment_image_and_boxes(self, image_pil, boxes, original_w, original_h):
        # 将 PIL 图像转换为 OpenCV 格式进行增强
        image_np = np.array(image_pil)
        image_cv2 = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)

        # 随机水平翻转
        if np.random.rand() < self.augment_config.get("hflip_prob", 0.0):
            image_cv2 = cv2.flip(image_cv2, 1)
            if boxes.numel() > 0:
                boxes[:, 1] = 1 - boxes[:, 1] # x_center = 1 - x_center

        # 随机亮度、对比度、饱和度
        image_cv2 = self._adjust_brightness_contrast_saturation(image_cv2, 
                                                                 self.augment_config.get("brightness", 0), 
                                                                 self.augment_config.get("contrast", 0), 
                                                                 self.augment_config.get("saturation", 0))

        # 添加高斯噪声
        if np.random.rand() < self.augment_config.get("noise_prob", 0.0):
            row, col, ch = image_cv2.shape
            mean = 0
            var = 0.01
            sigma = var**0.5
            gauss = np.random.normal(mean, sigma, (row, col, ch))
            gauss = gauss.reshape(row, col, ch)
            image_cv2 = image_cv2 + gauss * 255 # 乘以255使噪声可见
            image_cv2 = np.clip(image_cv2, 0, 255).astype(np.uint8)

        # Cutout (随机挖去一块)
        if np.random.rand() < self.augment_config.get("cutout_prob", 0.0) and boxes.numel() > 0:
            image_cv2 = self._apply_cutout(image_cv2)
            
        # 转换回 PIL 图像
        image_pil = Image.fromarray(cv2.cvtColor(image_cv2, cv2.COLOR_BGR2RGB))
        return image_pil, boxes

    def _adjust_brightness_contrast_saturation(self, image, brightness, contrast, saturation):
        # 简单的图像处理，可以替换为 imgaug 等更专业的库
        if brightness:
            image = self._adjust_brightness(image, brightness)
        if contrast:
            image = self._adjust_contrast(image, contrast)
        if saturation:
            image = self._adjust_saturation(image, saturation)
        return image

    def _adjust_brightness(self, img, delta):
        return np.clip(img * (1 + delta) + 0 * delta, 0, 255).astype(np.uint8)

    def _adjust_contrast(self, img, delta):
        return np.clip(127 + delta * (img - 127), 0, 255).astype(np.uint8)

    def _adjust_saturation(self, img, delta):
        img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        img_hsv[:, :, 1] = np.clip(img_hsv[:, :, 1] * (1 + delta), 0, 255)
        return cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR).astype(np.uint8)

    def _apply_cutout(self, img):
        h, w, _ = img.shape
        cutout_size = np.random.randint(min(h, w) // 8, min(h, w) // 4)
        x1 = np.random.randint(0, w - cutout_size)
        y1 = np.random.randint(0, h - cutout_size)
        x2 = x1 + cutout_size
        y2 = y1 + cutout_size
        img[y1:y2, x1:x2] = 0 # 填充黑色
        return img


# 自定义 collate_fn 用于处理不同数量的边界框
def custom_collate_fn(batch):
    images = []
    targets = [] # (batch_idx, class_id, x, y, w, h)
    for i, (img, boxes) in enumerate(batch):
        images.append(img)
        if boxes.numel() > 0: # 确保有目标
            batch_idx = torch.full((boxes.shape[0], 1), i, dtype=torch.float32)
            targets.append(torch.cat((batch_idx, boxes), dim=1))
    
    images = torch.stack(images, 0)
    if targets:
        targets = torch.cat(targets, 0)
    else:
        targets = torch.empty(0, 6) # 返回空张量，保持维度一致
    
    return images, targets

4. 工具函数 (Utility Functions)

一些用于 NMS、mAP 计算和模型保存/加载的辅助函数。

# utils.py
import torch
import numpy as np
from collections import Counter
import os

def non_max_suppression(bboxes, iou_threshold, conf_threshold, num_classes=1):
    """
    Performs Non-Maximum Suppression (NMS) on input bounding boxes.
    bboxes: (list) of list [x1, y1, x2, y2, obj_conf, class_conf, class_pred]
            where x1, y1, x2, y2 are absolute coordinates.
    iou_threshold: (float) IoU threshold for suppressing boxes.
    conf_threshold: (float) Object confidence threshold for filtering boxes.
    num_classes: (int) Number of classes.
    
    Returns:
        list of bboxes after NMS, same format as input.
    """
    assert type(bboxes) == list

    bboxes = [box for box in bboxes if box[4] > conf_threshold] # 筛选置信度低的框
    bboxes = sorted(bboxes, key=lambda x: x[4], reverse=True) # 按置信度降序排序
    
    # 因为我们只有一个类别“人”，所以 class_pred 都是0，无需按类别分组
    # 如果有多个类别，这里需要按 class_pred 分组进行 NMS
    
    bboxes_after_nms = []

    while bboxes:
        chosen_box = bboxes.pop(0) # 取出置信度最高的框
        bboxes_after_nms.append(chosen_box)

        # 移除与 chosen_box IoU 过高的其他框
        bboxes = [
            box
            for box in bboxes
            if bbox_iou_numpy(
                torch.tensor(chosen_box[0:4]), torch.tensor(box[0:4]), x1y1x2y2=True
            ).item() < iou_threshold
        ]
    return bboxes_after_nms

def cells_to_boxes(predictions, img_size=416):
    """
    Converts model predictions (from YOLOLayer) into bounding box format [x1, y1, x2, y2, object_conf, class_conf, class_pred].
    predictions: (tensor) output from YOLOLayer, shape (N, total_predictions, 5 + num_classes)
                 (x, y, w, h) are absolute image coordinates
    img_size: (int) input image size
    
    Returns:
        list of lists, each inner list is [x1, y1, x2, y2, obj_conf, class_conf, class_pred]
    """
    all_boxes = []
    # predictions: (batch_size, num_total_predictions, 5 + num_classes)
    # 5 + num_classes: x, y, w, h, obj_conf, class_probs (for each class)
    
    for i in range(predictions.shape[0]): # 遍历每个batch
        batch_boxes = []
        for pred in predictions[i]: # 遍历当前batch的所有预测
            x, y, w, h, obj_conf = pred[0:5].tolist()
            class_probs = pred[5:]
            
            # 由于只有一个类别，直接取 class_probs[0]
            class_pred = 0 # 对于人体检测，类别始终是0
            class_conf = class_probs[0] # 类别置信度

            # 将 (x, y, w, h) (中心点坐标+宽高) 转换为 (x1, y1, x2, y2) (左上角+右下角)
            x1 = x - w / 2
            y1 = y - h / 2
            x2 = x + w / 2
            y2 = y + h / 2
            
            batch_boxes.append([x1, y1, x2, y2, obj_conf, class_conf, class_pred])
        all_boxes.append(batch_boxes)
    
    return all_boxes

# numpy 版本的 IoU, 用于 NMS
def bbox_iou_numpy(box1, box2, x1y1x2y2=True):
    """
    Returns the IoU of two bounding boxes (numpy version)
    """
    if not x1y1x2y2:
        # Transform from center coords to x1y1x2y2
        b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2
        b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2
        b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2
        b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2
    else:
        b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
        b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]

    # Intersection area
    inter_area = (np.min(b1_x2, b2_x2) - np.max(b1_x1, b2_x1)).clip(0) * \
                 (np.min(b1_y2, b2_y2) - np.max(b1_y1, b2_y1)).clip(0)

    # Union Area
    b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
    b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)

    iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

    return iou

def mean_average_precision(pred_boxes, true_boxes, iou_threshold=0.5, num_classes=1):
    """
    Calculates mean average precision (mAP)
    Arguments:
        pred_boxes (list): List of lists containing all bboxes ordered by class
                           [[train_idx, class_pred, obj_conf, x1, y1, x2, y2], ...]
        true_boxes (list): List of lists containing all ground truths
                          [[train_idx, class_pred, x1, y1, x2, y2], ...]
        iou_threshold (float): threshold where predicted bboxes is correct
        num_classes (int): number of classes

    Returns:
        float: mAP value across all classes
    """

    average_precisions = []
    
    # 对于人体检测，num_classes=1，class_0
    for c in range(num_classes):
        detections = []
        ground_truths = []

        for detection in pred_boxes:
            if detection[1] == c: # class_pred
                detections.append(detection)

        for true_box in true_boxes:
            if true_box[1] == c: # class_pred
                ground_truths.append(true_box)

        # img 0 has 3 bboxes
        # img 1 has 5 bboxes
        # amount_bboxes = {0:3, 1:5}
        amount_bboxes = Counter([gt[0] for gt in ground_truths])

        for key, val in amount_bboxes.items():
            amount_bboxes[key] = torch.zeros(val)

        # detections = [[train_idx, class_pred, obj_conf, x1, y1, x2, y2], ...]
        detections.sort(key=lambda x: x[2], reverse=True) # Sort by object confidence

        TP = torch.zeros((len(detections)))
        FP = torch.zeros((len(detections)))
        total_true_bboxes = len(ground_truths)

        if total_true_bboxes == 0:
            continue

        for detection_idx, detection in enumerate(detections):
            # Only take out the ground_truths that have the same training idx as detection
            ground_truth_img = [
                bbox for bbox in ground_truths if bbox[0] == detection[0]
            ]

            num_gts = len(ground_truth_img)
            best_iou = 0

            for idx, gt in enumerate(ground_truth_img):
                iou = bbox_iou_numpy(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[2:]),
                    x1y1x2y2=True,
                )

                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            if best_iou > iou_threshold:
                # Only detect ground truth detection once
                if amount_bboxes[detection[0]][best_gt_idx] == 0:
                    TP[detection_idx] = 1
                    amount_bboxes[detection[0]][best_gt_idx] = 1
                else:
                    FP[detection_idx] = 1
            else:
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        
        # Recall: TP / (TP + FN) = TP / total_true_bboxes
        recalls = TP_cumsum / (total_true_bboxes + 1e-6)
        
        # Precision: TP / (TP + FP)
        precisions = TP_cumsum / (TP_cumsum + FP_cumsum + 1e-6)

        # Append 0 and 1 to start and end of precisions and recalls to compute AP
        precisions = torch.cat((torch.tensor([1.0]), precisions))
        recalls = torch.cat((torch.tensor([0.0]), recalls))

        # Integrate precision-recall curve using trapezoidal rule
        average_precisions.append(torch.trapz(precisions, recalls))

    if len(average_precisions) == 0:
        return 0.0 # No detections or ground truths

    return sum(average_precisions) / len(average_precisions)


def save_checkpoint(model, optimizer, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    checkpoint = {
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict(),
    }
    torch.save(checkpoint, filename)

def load_checkpoint(checkpoint_file, model, optimizer=None):
    print("=> Loading checkpoint")
    checkpoint = torch.load(checkpoint_file, map_location=Config.DEVICE)
    model.load_state_dict(checkpoint["state_dict"])
    if optimizer:
        optimizer.load_state_dict(checkpoint["optimizer"])
    # If continuing training, set model to train mode
    # model.train()