白话Pytorch-YOLOv4 Loss-CSDN博客

本文链接：https://blog.csdn.net/Noonebirdyou/article/details/111043462
本文深入剖析了pytorch-YOLOv4项目的损失函数计算原理及实现细节，包括Loss的构成、各类Loss的计算方法及其背后的逻辑。
摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >
最近抽时间看了一下 pytorch-YOLOv4 的源码，其中 Loss 的计算方式对第一次阅读源码的同学不太友好。看完之后，我在原有源码的基础上补充了大量注释，并用白话解释了作者的实现思路，这样读起来就容易多了。下面附上注释版代码。
class Yolo_loss(nn.Module):
    """YOLOv4 training loss summed over the three detection heads.

    The heads use strides 8, 16 and 32. For every head the raw network output
    is decoded into boxes, ground-truth boxes are assigned to the anchor (and
    grid cell) responsible for them, and four terms are accumulated:
    centre offset (xy), size (wh), objectness (obj) and class (cls).

    Args:
        n_classes: number of object classes.
        n_anchors: number of anchors predicted per grid cell. The anchor
            masks below assume 3 anchors per head.
        device: device on which the helper tensors are created.
        batch: batch size used to pre-tile the cached grid/anchor tensors.
            ``forward`` broadcasts over the batch dimension, so other batch
            sizes (e.g. a smaller last batch) are accepted as well.
        image_size: side length of the (square) network input used to
            pre-compute the grids. Feature maps of another size are handled
            in ``forward`` by rebuilding the grids on the fly.
    """

    def __init__(self, n_classes=80, n_anchors=3, device=None, batch=2, image_size=608):
        super(Yolo_loss, self).__init__()
        self.device = device
        self.strides = [8, 16, 32]
        self.n_classes = n_classes
        self.n_anchors = n_anchors

        # Anchor sizes (w, h) in input-image pixels; anch_masks selects the
        # three anchors that belong to each head.
        self.anchors = [[12, 16], [19, 36], [40, 28], [36, 75], [76, 55], [72, 146], [142, 110], [192, 243], [459, 401]]
        self.anch_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
        # Predictions whose best IoU with a truth box exceeds this threshold
        # are excluded from the "no object" penalty.
        self.ignore_thre = 0.5

        self.masked_anchors, self.ref_anchors, self.grid_x, self.grid_y, self.anchor_w, self.anchor_h = [], [], [], [], [], []

        # One set of helper tensors per head (W/H reduced by 8/16/32).
        for level, stride in enumerate(self.strides):
            # All nine anchors rescaled from image pixels to this head's grid units.
            all_anchors_grid = [(w / stride, h / stride) for w, h in self.anchors]
            # The three anchors owned by this head, in grid units.
            masked_anchors = np.array([all_anchors_grid[j] for j in self.anch_masks[level]], dtype=np.float32)
            # (9, 4) boxes (0, 0, w, h): all anchors placed at the origin, used
            # to match truth boxes to anchors by size only.
            ref_anchors = np.zeros((len(all_anchors_grid), 4), dtype=np.float32)
            ref_anchors[:, 2:] = np.array(all_anchors_grid, dtype=np.float32)
            ref_anchors = torch.from_numpy(ref_anchors)
            # Side length of this head's feature map.
            fsize = image_size // stride
            grid_x, grid_y, anchor_w, anchor_h = self._make_grids(masked_anchors, fsize, batch)

            self.masked_anchors.append(masked_anchors)
            self.ref_anchors.append(ref_anchors)
            self.grid_x.append(grid_x)
            self.grid_y.append(grid_y)
            self.anchor_w.append(anchor_w)
            self.anchor_h.append(anchor_h)

    def _make_grids(self, masked_anchors, fsize, batch):
        """Build the cell-offset and anchor-size tensors for one head.

        Every returned tensor has shape (batch, len(masked_anchors), fsize, fsize).
        """
        n_level = len(masked_anchors)
        # grid_x[..., y, x] == x and grid_y[..., y, x] == y.
        grid_x = torch.arange(fsize, dtype=torch.float).repeat(batch, n_level, fsize, 1).to(self.device)
        grid_y = torch.arange(fsize, dtype=torch.float).repeat(batch, n_level, fsize, 1).permute(0, 1, 3, 2).to(
            self.device)
        # Anchor width/height broadcast over every cell of the feature map.
        anchor_w = torch.from_numpy(masked_anchors[:, 0]).repeat(batch, fsize, fsize, 1).permute(0, 3, 1, 2).to(
            self.device)
        anchor_h = torch.from_numpy(masked_anchors[:, 1]).repeat(batch, fsize, fsize, 1).permute(0, 3, 1, 2).to(
            self.device)
        return grid_x, grid_y, anchor_w, anchor_h

    def _grids_for(self, output_id, fsize):
        """Return grid/anchor tensors for a head that broadcast over any batch size.

        The tensors cached by ``__init__`` are reused (first batch entry only)
        when their spatial size matches ``fsize``; otherwise they are rebuilt.
        """
        cached = (self.grid_x[output_id], self.grid_y[output_id],
                  self.anchor_w[output_id], self.anchor_h[output_id])
        if cached[0].shape[0] > 0 and cached[0].shape[-1] == fsize:
            # Leading dimension of 1 broadcasts against the actual batch size.
            return tuple(t[:1] for t in cached)
        return self._make_grids(self.masked_anchors[output_id], fsize, 1)

    def build_target(self, pred, labels, batchsize, fsize, n_ch, output_id):
        """Assign ground-truth boxes to predictions for one head.

        Args:
            pred: decoded boxes (x, y, w, h) in grid units, shape
                (batchsize, n_anchors, fsize, fsize, 4).
            labels: ground-truth tensor indexed as labels[b, k, 0:5]. Columns
                0-3 are used as corner coordinates (x1, y1, x2, y2) and column
                4 as the class id; rows whose sum is not positive are treated
                as padding. The original author notes a fixed shape of
                (batchsize, 60, 5) -- verify against the dataset.
            batchsize: number of images in ``pred``.
            fsize: side length of this head's feature map.
            n_ch: channels per prediction (5 + n_classes).
            output_id: index of the head (0, 1 or 2).

        Returns:
            obj_mask: (batchsize, n_anchors, fsize, fsize); 1 where the
                objectness term is evaluated, 0 where it is ignored.
            tgt_mask: (..., 4 + n_classes); 1 for the predictions responsible
                for a truth box, 0 elsewhere.
            tgt_scale: (..., 2); per-box weight applied to the w/h terms.
            target: (..., n_ch); ground truth laid out like the network output.
        """
        # Marks the predictions responsible for a truth box; their box and
        # class entries are kept, all others are zeroed out of the loss.
        tgt_mask = torch.zeros(batchsize, self.n_anchors, fsize, fsize, 4 + self.n_classes).to(device=self.device)
        # Marks the predictions whose objectness score enters the loss.
        obj_mask = torch.ones(batchsize, self.n_anchors, fsize, fsize).to(device=self.device)
        # Scale factor for w/h, set only at responsible predictions.
        tgt_scale = torch.zeros(batchsize, self.n_anchors, fsize, fsize, 2).to(self.device)
        # Ground truth written in the same layout as the network output.
        target = torch.zeros(batchsize, self.n_anchors, fsize, fsize, n_ch).to(self.device)

        # Number of valid (non-padding) boxes per image.
        nlabel = (labels.sum(dim=2) > 0).sum(dim=1)

        stride = self.strides[output_id]
        # Box centre and size converted to this head's grid units.
        truth_x_all = (labels[:, :, 2] + labels[:, :, 0]) / (stride * 2)
        truth_y_all = (labels[:, :, 3] + labels[:, :, 1]) / (stride * 2)
        truth_w_all = (labels[:, :, 2] - labels[:, :, 0]) / stride
        truth_h_all = (labels[:, :, 3] - labels[:, :, 1]) / stride
        # Integer cell (column i, row j) that contains each box centre.
        truth_i_all = truth_x_all.to(torch.int16).cpu().numpy()
        truth_j_all = truth_y_all.to(torch.int16).cpu().numpy()

        anch_mask = self.anch_masks[output_id]
        # This head's anchors as a tensor, built once instead of per truth box.
        level_anchors = torch.Tensor(self.masked_anchors[output_id])

        for b in range(batchsize):
            n = int(nlabel[b])
            if n == 0:
                continue
            # One row per real box of this image; only w/h are filled in first
            # so that the anchor matching below depends on size alone.
            truth_box = torch.zeros(n, 4).to(self.device)
            truth_box[:n, 2] = truth_w_all[b, :n]
            truth_box[:n, 3] = truth_h_all[b, :n]
            truth_i = truth_i_all[b, :n]
            truth_j = truth_j_all[b, :n]

            # Match every truth box against all nine anchors (both anchored at
            # the origin) to decide which head/anchor should predict it.
            # bboxes_iou is defined elsewhere; it is used here as a pairwise
            # (n_truth, n_anchor) score matrix.
            anchor_ious_all = bboxes_iou(truth_box.cpu(), self.ref_anchors[output_id], CIoU=True)

            # Index (0-8) of the best-matching anchor for every truth box.
            best_n_all = anchor_ious_all.argmax(dim=1)
            # Position of that anchor inside its own head.
            best_n = best_n_all % len(anch_mask)
            # True where the best anchor belongs to the current head.
            best_n_mask = ((best_n_all == anch_mask[0]) |
                           (best_n_all == anch_mask[1]) |
                           (best_n_all == anch_mask[2]))

            # No box of this image is to be predicted by this head.
            if not best_n_mask.any():
                continue

            # Now add the real centres so IoU with the predictions is meaningful.
            truth_box[:n, 0] = truth_x_all[b, :n]
            truth_box[:n, 1] = truth_y_all[b, :n]

            # IoU of every prediction of this head with every truth box.
            pred_ious = bboxes_iou(pred[b].view(-1, 4), truth_box, xyxy=False)
            # Best IoU per prediction, thresholded: True means the prediction
            # overlaps some truth box well enough to be ignored.
            pred_best_iou, _ = pred_ious.max(dim=1)
            pred_best_iou = (pred_best_iou > self.ignore_thre)
            pred_best_iou = pred_best_iou.view(pred[b].shape[:3])
            # Keep the objectness term only where no truth box is matched.
            obj_mask[b] = ~ pred_best_iou

            for ti in range(best_n.shape[0]):
                # Skip boxes that belong to another head.
                if not best_n_mask[ti]:
                    continue
                # Cell and anchor responsible for this truth box.
                i, j = int(truth_i[ti]), int(truth_j[ti])
                a = int(best_n[ti])
                # The responsible prediction always takes part in the
                # objectness term, with target 1 (set below).
                obj_mask[b, a, j, i] = 1
                # Its box and class entries take part in the loss.
                tgt_mask[b, a, j, i, :] = 1
                # Centre offset inside the cell.
                target[b, a, j, i, 0] = truth_x_all[b, ti] - i
                target[b, a, j, i, 1] = truth_y_all[b, ti] - j
                # Width/height in the log-space the network predicts:
                # w = exp(t_w) * anchor_w  =>  t_w = log(w / anchor_w).
                target[b, a, j, i, 2] = torch.log(truth_w_all[b, ti] / level_anchors[a, 0] + 1e-16)
                target[b, a, j, i, 3] = torch.log(truth_h_all[b, ti] / level_anchors[a, 1] + 1e-16)
                # Objectness target.
                target[b, a, j, i, 4] = 1
                # One-hot class target; class channels start at index 5.
                target[b, a, j, i, 5 + int(labels[b, ti, 4])] = 1
                # Larger weight for smaller boxes (relative to the feature map).
                tgt_scale[b, a, j, i, :] = torch.sqrt(2 - truth_w_all[b, ti] * truth_h_all[b, ti] / fsize / fsize)
        return obj_mask, tgt_mask, tgt_scale, target

    def forward(self, xin, labels=None):
        """Compute the loss for all heads.

        Args:
            xin: iterable of raw head outputs, each of shape
                (batchsize, n_anchors * (5 + n_classes), fsize, fsize).
                NOTE: the tensors are modified in place (activation/masking).
            labels: ground-truth tensor as described in ``build_target``.
                Despite the ``None`` default it is required.

        Returns:
            Tuple ``(loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2)``
            where ``loss`` is the sum of the xy, wh, obj and cls terms and
            ``loss_l2`` is a plain squared-error value reported alongside.
        """
        loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = 0, 0, 0, 0, 0
        n_ch = 5 + self.n_classes
        for output_id, output in enumerate(xin):
            batchsize = output.shape[0]
            fsize = output.shape[2]

            # (B, A * n_ch, H, W) -> (B, A, H, W, n_ch)
            output = output.view(batchsize, self.n_anchors, n_ch, fsize, fsize)
            output = output.permute(0, 1, 3, 4, 2)

            # Logistic activation for xy, objectness and classes (not w/h).
            output[..., :2] = torch.sigmoid(output[..., :2])
            output[..., 4:] = torch.sigmoid(output[..., 4:])

            # Decode boxes to grid units: xy are offsets from the cell's
            # top-left corner, w/h are exponents applied to the anchor size.
            grid_x, grid_y, anchor_w, anchor_h = self._grids_for(output_id, fsize)
            pred = output[..., :4].clone()
            pred[..., 0] += grid_x
            pred[..., 1] += grid_y
            pred[..., 2] = torch.exp(pred[..., 2]) * anchor_w
            pred[..., 3] = torch.exp(pred[..., 3]) * anchor_h

            obj_mask, tgt_mask, tgt_scale, target = self.build_target(pred, labels, batchsize, fsize, n_ch, output_id)

            # Zero out everything that must not contribute, identically on the
            # prediction and the target, and apply the w/h weighting.
            output[..., 4] *= obj_mask
            output[..., :4] *= tgt_mask[..., :4]
            output[..., 5:] *= tgt_mask[..., 4:]
            output[..., 2:4] *= tgt_scale

            target[..., 4] *= obj_mask
            target[..., :4] *= tgt_mask[..., :4]
            target[..., 5:] *= tgt_mask[..., 4:]
            target[..., 2:4] *= tgt_scale

            loss_xy += F.binary_cross_entropy(input=output[..., :2], target=target[..., :2],
                                              weight=tgt_scale * tgt_scale, reduction='sum')
            loss_wh += F.mse_loss(input=output[..., 2:4], target=target[..., 2:4], reduction='sum') / 2
            loss_obj += F.binary_cross_entropy(input=output[..., 4], target=target[..., 4], reduction='sum')
            loss_cls += F.binary_cross_entropy(input=output[..., 5:], target=target[..., 5:], reduction='sum')
            loss_l2 += F.mse_loss(input=output, target=target, reduction='sum')

        loss = loss_xy + loss_wh + loss_obj + loss_cls

        return loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2