faster_rcnn代码解析（6）

最新推荐文章于 2023-01-04 10:11:31 发布

原创最新推荐文章于 2023-01-04 10:11:31 发布 · 306 阅读

4 ·

CC 4.0 BY-SA版权

faster_rcnn代码详解专栏收录该内容

16 篇文章

订阅专栏

本文详细介绍了Fast R-CNN中的关键函数bbox_transform和bbox_transform_inv，解释了如何计算目标框回归参数，以及如何利用这些参数调整候选区域，实现目标检测。

lib/fast_rcnn/bbox_transform.py

# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

import numpy as np

#函数作用：返回anchor相对gt的（dx,dy,dw,dh）四个回归值，shape（len(anchors),4）
def bbox_transform(ex_rois, gt_rois):
    #计算每一个anchor的width与heigt
    ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0   # xmax-xmin =width
    ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0   #ymax-ymin = height
    #计算每一个anchor的中心点x,y坐标
    ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths   #x轴的中心坐标
    ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights  #y轴的中心坐标

    #注意：当前的gt不是最一开始传进来的所有gt，而是与对应的anchor最匹配的gt,可能有重复信息
    #计算每一个gt的width和height
    gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0  #
    gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
    #计算每一个gt的中心点x,y坐标
    gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
    gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights

    #要对bbox进行回归需要四个量，dx,dy,dw,dh,分别为横纵平移量、宽高缩放比
    #此回归与fast-rcnn回归不同，fast要做的实在卷积完之后的特征向量进行回归，dx,dy,dw,dh都是对应与特征向量
    #此时由于对原图像可视野中的anchor进行回归，更直观
    #Tx=Pwdx(P)+Px Ty=Phdy(P)+Py Tw=Pwexp(dw(P)) Th=Phexp(dh(P))
    # P为anchor，T为target，最后要使得T～G，G为ground-True
    # 回归量dx(P)，dy(P)，dw(P)，dh(P)，即dx、dy、dw、dh
    #targets_dx, targets_dy, targets_dw, targets_dh都为（anchors.shape[0]，）大小
    # 所以targets为（anchors.shape[0]，4）
    targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
    targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
    targets_dw = np.log(gt_widths / ex_widths)
    targets_dh = np.log(gt_heights / ex_heights)

    targets = np.vstack(
        (targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
    return targets

#函数作用:得到改善后的anchor的信息（x1,y1,x2,y2）
def bbox_transform_inv(boxes, deltas):
    #boxes.shape[0]=K*A=Height*Width*A
    if boxes.shape[0] == 0:
        return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)

    boxes = boxes.astype(deltas.dtype, copy=False) #转换数据类型

    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights
    #deltas本来就只有4列，依次存（dx,dy,dw,dh）,每一行表示一个anchor
    #0::4表示先取第一个元素，以后每4个取一个，所以取的index为（0,4,8,12,16...），但是deltas本来就只有4列，所以只能取到一个值
    dx = deltas[:, 0::4]
    dy = deltas[:, 1::4]
    dw = deltas[:, 2::4]
    dh = deltas[:, 3::4]

    #预测后的中心点，与w与h
    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
    #给原数组增加一个维度，根据np.newaxis放的位置不同，产生的新数组也不同
    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
    pred_w = np.exp(dw) * widths[:, np.newaxis]
    pred_h = np.exp(dh) * heights[:, np.newaxis]
    #预测后的（x1,y1,x2,y2）存入 pred_boxes
    pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
    # x1
    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w  #xmin
    # y1
    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h   #xmax
    # x2
    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w   #ymin
    # y2
    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h    #ymax

    return pred_boxes #[xmin,ymin,xmax,ymax]

#函数作用：使得boxes位于图片内
def clip_boxes(boxes, im_shape):
    """
    Clip削减 boxes to image boundaries.
    """
    #im_shape[0]为图片高，im_shape[1]为图片宽
    # x1 >= 0
    boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) #
    # y1 >= 0
    boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
    # x2 < im_shape[1]
    boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
    # y2 < im_shape[0]
    boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
    return boxes