ddd

最新推荐文章于 2024-12-01 23:54:40 发布

zouxiaolv

最新推荐文章于 2024-12-01 23:54:40 发布

阅读量334

点赞数

1、detectron整个的模型建立过程，以关键点为例说明
这里写图片描述

2、整体的流程如上，但是从
FPN.add_fpn_rpn_outputs(model, blob_in, dim_in, spatial_scale_in)开始我们要详细介绍一下
1）在这个函数里面会除了加上rpn的输出之外，还会顺带加上proposal的生成过程

model.GenerateProposals(
         [rpn_cls_probs_fpn, rpn_bbox_pred_fpn, 'im_info'],
         ['rpn_rois_fpn' + slvl, 'rpn_roi_probs_fpn' + slvl],
         anchors=lvl_anchors,
         spatial_scale=sc
)#产生的proposal将以rpn_rois_fpn_x命名，rpn_roi_probs_fpn_x将会表示每一个proposal的分数

2）接下来是model.CollectAndDistributeFpnRpnProposals()这个函数，这个函数将上面产生的proposal分配给各个level的FPN网络并且声称一系列的标签

class CollectAndDistributeFpnRpnProposalsOp(object):
    def __init__(self, train):
        self._train = train

    def forward(self, inputs, outputs):
        """See modeling.detector.CollectAndDistributeFpnRpnProposals for
        inputs/outputs documentation.
        """
        # inputs is
        # [rpn_rois_fpn2, ..., rpn_rois_fpn6,
        #  rpn_roi_probs_fpn2, ..., rpn_roi_probs_fpn6]                                       
        # If training with Faster R-CNN, then inputs will additionally include
        #  + [roidb, im_info]                                                              #输入的blob是这些
        rois = collect(inputs, self._train)                                                #collect函数的作用是将之前各个level的rois都连在一起，取分数最高的前2000个，获得这些rois和对应的分数
        if self._train:
            # During training we reuse the data loader code. We populate roidb
            # entries on the fly using the rois generated by RPN.
            # im_info: [[im_height, im_width, im_scale], ...]
            im_info = inputs[-1].data                                                      #获取到图像信息
            im_scales = im_info[:, 2]                                                      #把图像缩放尺度拿出来
            roidb = blob_utils.deserialize(inputs[-2].data)                                #roidb拿出来
            # For historical consistency with the original Faster R-CNN
            # implementation we are *not* filtering crowd proposals.
            # This choice should be investigated in the future (it likely does
            # not matter).

            json_dataset.add_proposals(roidb, rois, im_scales, crowd_thresh=0)            #向roidb里面加入proposal的信息，详细看下面该函数的介绍，经过了这个函数，roidb已经焕然一新了，加入proposal的信息了
            # Compute training labels for the RPN proposals; also handles
            # distributing the proposals over FPN levels
            output_blob_names = roi_data.fast_rcnn.get_fast_rcnn_blob_names()             #获得fasterrcnn所需要的blob
            blobs = {k: [] for k in output_blob_names}
            roi_data.fast_rcnn.add_fast_rcnn_blobs(blobs, im_scales, roidb)               #往faster-rcnn的blob里面添加对应的元素
            for i, k in enumerate(output_blob_names):
                blob_utils.py_op_copy_blob(blobs[k], outputs[i])                          #将blob里面的送到output里面去
        else:
            # For inference we have a special code path that avoids some data
            # loader overhead
            distribute(rois, None, outputs, self._train)
def collect(inputs, is_training):
    cfg_key = 'TRAIN' if is_training else 'TEST'
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    k_max = cfg.FPN.RPN_MAX_LEVEL
    k_min = cfg.FPN.RPN_MIN_LEVEL
    num_lvls = k_max - k_min + 1
    roi_inputs = inputs[:num_lvls]
    score_inputs = inputs[num_lvls:]
    if is_training:
        score_inputs = score_inputs[:-2]

    # rois are in [[batch_idx, x0, y0, x1, y2], ...] format
    # Combine predictions across all levels and retain the top scoring
    rois = np.concatenate([blob.data for blob in roi_inputs])
    #pdb.set_trace()
    scores = np.concatenate([blob.data for blob in score_inputs]).squeeze()
    inds = np.argsort(-scores)[:post_nms_topN]
    rois = rois[inds, :]
    return rois

def add_proposals(roidb, rois, scales, crowd_thresh):
    """Add proposal boxes (rois) to an roidb that has ground-truth annotations
    but no proposals. If the proposals are not at the original image scale,
    specify the scale factor that separate them in scales.
    """
    box_list = []
    for i in range(len(roidb)):
        inv_im_scale = 1. / scales[i]                                                    #算出将图像变为原来的尺度需要多大的缩放
        idx = np.where(rois[:, 0] == i)[0]                                               #因为roi都是有编号的，所以把属于该张图片的roi拿出来
        box_list.append(rois[idx, 1:] * inv_im_scale)                                    #把roi乘以刚刚算出来的尺度变换参数把他变换到原来的图片空间上的proposal！！！！！！！！！！！！！！！！！这里是第一次设计尺度还原
    _merge_proposal_boxes_into_roidb(roidb, box_list)                                    #调用该函数把proposal加入到roidb中区
    if crowd_thresh > 0:
        _filter_crowd_proposals(roidb, crowd_thresh)                                     #过滤人群
    _add_class_assignments(roidb)                                                        #设置每个proposal的分类，是属于哪个分类的，以及最大重叠式多少同时做一系列的检查


def _merge_proposal_boxes_into_roidb(roidb, box_list):
    """Add proposal boxes to each roidb entry."""
    assert len(box_list) == len(roidb)
    for i, entry in enumerate(roidb):
        boxes = box_list[i]
        num_boxes = boxes.shape[0]
        gt_overlaps = np.zeros(
            (num_boxes, entry['gt_overlaps'].shape[1]),
            dtype=entry['gt_overlaps'].dtype
        )
        box_to_gt_ind_map = -np.ones(
            (num_boxes), dtype=entry['box_to_gt_ind_map'].dtype
        )

        # Note: unlike in other places, here we intentionally include all gt
        # rois, even ones marked as crowd. Boxes that overlap with crowds will
        # be filtered out later (see: _filter_crowd_proposals).
        gt_inds = np.where(entry['gt_classes'] > 0)[0]
        if len(gt_inds) > 0:
            gt_boxes = entry['boxes'][gt_inds, :]                                    #将gt框都拿出来
            gt_classes = entry['gt_classes'][gt_inds]                                #将gt对应的类别拿出来
            proposal_to_gt_overlaps = box_utils.bbox_overlaps(                       #计算proposal和gt之间的overlap
                boxes.astype(dtype=np.float32, copy=False),
                gt_boxes.astype(dtype=np.float32, copy=False)
            )
            # Gt box that overlaps each input box the most
            # (ties are broken arbitrarily by class order)
            argmaxes = proposal_to_gt_overlaps.argmax(axis=1)                        #算出每个proposal对应的gt是谁
            # Amount of that overlap
            maxes = proposal_to_gt_overlaps.max(axis=1)                              #把重叠最大的面积都拿出来，找出重叠不为0的那些
            # Those boxes with non-zero overlap with gt boxes
            I = np.where(maxes > 0)[0]                                               
            # Record max overlaps with the class of the appropriate gt box
            gt_overlaps[I, gt_classes[argmaxes[I]]] = maxes[I]                       #gt_overlap对应的分类的地方就要设置相应的分数，例如proposalA与gtB最大重合，就在gtB所对应的位置设置重叠
            box_to_gt_ind_map[I] = gt_inds[argmaxes[I]]                              #每一个proposal对应的是哪个gt
        entry['boxes'] = np.append(                                                  #之后向roidb的每个入口加入proposal的信息，同时保留之前的gt信息
            entry['boxes'],
            boxes.astype(entry['boxes'].dtype, copy=False),
            axis=0
        )
        entry['gt_classes'] = np.append(
            entry['gt_classes'],
            np.zeros((num_boxes), dtype=entry['gt_classes'].dtype)
        )
        entry['seg_areas'] = np.append(
            entry['seg_areas'],
            np.zeros((num_boxes), dtype=entry['seg_areas'].dtype)
        )
        entry['gt_overlaps'] = np.append(
            entry['gt_overlaps'].toarray(), gt_overlaps, axis=0
        )
        entry['gt_overlaps'] = scipy.sparse.csr_matrix(entry['gt_overlaps'])
        entry['is_crowd'] = np.append(
            entry['is_crowd'],
            np.zeros((num_boxes), dtype=entry['is_crowd'].dtype)
        )
        entry['box_to_gt_ind_map'] = np.append(
            entry['box_to_gt_ind_map'],
            box_to_gt_ind_map.astype(
                entry['box_to_gt_ind_map'].dtype, copy=False
            )
        )


def _filter_crowd_proposals(roidb, crowd_thresh):
    """Finds proposals that are inside crowd regions and marks them as
    overlap = -1 with each ground-truth rois, which means they will be excluded
    from training.
    """
    for entry in roidb:
        gt_overlaps = entry['gt_overlaps'].toarray()
        crowd_inds = np.where(entry['is_crowd'] == 1)[0]
        non_gt_inds = np.where(entry['gt_classes'] == 0)[0]
        if len(crowd_inds) == 0 or len(non_gt_inds) == 0:
            continue
        crowd_boxes = box_utils.xyxy_to_xywh(entry['boxes'][crowd_inds, :])
        non_gt_boxes = box_utils.xyxy_to_xywh(entry['boxes'][non_gt_inds, :])
        iscrowd_flags = [int(True)] * len(crowd_inds)
        ious = COCOmask.iou(non_gt_boxes, crowd_boxes, iscrowd_flags)
        bad_inds = np.where(ious.max(axis=1) > crowd_thresh)[0]
        gt_overlaps[non_gt_inds[bad_inds], :] = -1
        entry['gt_overlaps'] = scipy.sparse.csr_matrix(gt_overlaps)


def _add_class_assignments(roidb):
    """Compute object category assignment for each box associated with each
    roidb entry.
    """
    for entry in roidb:
        gt_overlaps = entry['gt_overlaps'].toarray()
        # max overlap with gt over classes (columns)
        max_overlaps = gt_overlaps.max(axis=1)
        # gt class that had the max overlap
        max_classes = gt_overlaps.argmax(axis=1)
        entry['max_classes'] = max_classes
        entry['max_overlaps'] = max_overlaps
        # sanity checks
        # if max overlap is 0, the class must be background (class 0)
        zero_inds = np.where(max_overlaps == 0)[0]
        assert all(max_classes[zero_inds] == 0)
        # if max overlap > 0, the class must be a fg class (not class 0)
        nonzero_inds = np.where(max_overlaps > 0)[0]
        assert all(max_classes[nonzero_inds] != 0)

再来看add_fast_rcnn_blobs

def add_fast_rcnn_blobs(blobs, im_scales, roidb):
    """Add blobs needed for training Fast R-CNN style models."""
    # Sample training RoIs from each image and append them to the blob lists
    for im_i, entry in enumerate(roidb):
        frcn_blobs = _sample_rois(entry, im_scales[im_i], im_i)                       #这个函数也是返回一个blob，这个blob里面有什么呢？看下面非代码部分的解释
        for k, v in frcn_blobs.items():
            blobs[k].append(v)
    # Concat the training blob lists into tensors
    for k, v in blobs.items():
        if isinstance(v, list) and len(v) > 0:
            blobs[k] = np.concatenate(v)
    # Add FPN multilevel training RoIs, if configured
    if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS:
        _add_multilevel_rois(blobs)


    # Perform any final work and validity checks after the collating blobs for
    # all minibatch images
    valid = True
    if cfg.MODEL.KEYPOINTS_ON:
        valid = roi_data.keypoint_rcnn.finalize_keypoint_minibatch(blobs, valid)

    return valid

sample_rois这个函数最终返回的是从proposal里面按照一个batch的大小拿出来的proposal，正负样本比例参考配置文件，默认256，最终的blob返回的是
blob_dict = dict(
labels_int32=sampled_labels.astype(np.int32, copy=False),
rois=sampled_rois,
bbox_targets=bbox_targets,
bbox_inside_weights=bbox_inside_weights,
bbox_outside_weights=bbox_outside_weights
)
如果有关键点加入训练的话，还要加入关键点的一些信息，关键点有哪些信息呢？
blobs[‘keypoint_rois’] = sampled_fg_rois
blobs[‘keypoint_locations_int32’] = heats.astype(np.int32, copy=False)
blobs[‘keypoint_weights’] = weights
经过了rois之后的blob的输出是
这里写图片描述

对于关键点部分的blob加入，是在前景对象上面设置关键点的信息

def add_keypoint_rcnn_blobs(
    blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx
):
    """Add Mask R-CNN keypoint specific blobs to the given blobs dictionary."""
    # Note: gt_inds must match how they're computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]                                       #找出gt
    max_overlaps = roidb['max_overlaps']                                                 #找出roidb里面的max_overlaps的对应
    gt_keypoints = roidb['gt_keypoints']                                                 #找出gt的keypoint信息

    ind_kp = gt_inds[roidb['box_to_gt_ind_map']]                                         #找出所有的box对应的gt的索引值,长度为N，也即boxes的数量
    within_box = _within_box(gt_keypoints[ind_kp, :, :], roidb['boxes'])                 #判断所有的关键点是否都在roidb的boxes范围内，此时within——box是一个Nx17的数组，其中N是所有的boxes的数量
    vis_kp = gt_keypoints[ind_kp, 2, :] > 0                                              #vis_kp的大小为Nx17，也即每个box对应的那个gt的每个关键点的可见性
    is_visible = np.sum(np.logical_and(vis_kp, within_box), axis=1) > 0                  #这句代码的目的是为了判断N个roi是否都有可见的关键点，is_visible是一个N维的向量，只有那些可见关键点在roi内的数量大于0的roi才会被采纳作为下一步的roi进行使用
    kp_fg_inds = np.where(
        np.logical_and(max_overlaps >= cfg.TRAIN.FG_THRESH, is_visible)                  #找出那些重叠度大于一定阈值的作为关键点部分proposal的正样本
    )[0]

    kp_fg_rois_per_this_image = np.minimum(fg_rois_per_image, kp_fg_inds.size)          #
    if kp_fg_inds.size > kp_fg_rois_per_this_image:
        kp_fg_inds = np.random.choice(
            kp_fg_inds, size=kp_fg_rois_per_this_image, replace=False
        )

    sampled_fg_rois = roidb['boxes'][kp_fg_inds]                                         #找出选取的那些作为人体关键点对应的那些box
    box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds]                           #找出选取的那些作为人体关键点对应的那些box所对应的gt

    num_keypoints = gt_keypoints.shape[2]                                                #Nx3x17
    sampled_keypoints = -np.ones(                                                        #预先定义采样的关键点，通通设置为-1，-1标签的点都是不参与训练的
        (len(sampled_fg_rois), gt_keypoints.shape[1], num_keypoints),
        dtype=gt_keypoints.dtype
    )
    for ii in range(len(sampled_fg_rois)):
        ind = box_to_gt_ind_map[ii]
        if ind >= 0:
            sampled_keypoints[ii, :, :] = gt_keypoints[gt_inds[ind], :, :]              #找到采样得到的框，然后找到其对应的关键点的gt信息，然后赋给它
            assert np.sum(sampled_keypoints[ii, 2, :]) > 0

    heats, weights = keypoint_utils.keypoints_to_heatmap_labels(                       #接下来制作heatmap标签，这是最关键的一步，给出sampled-roi和sampled-keypoints，来制作对应的heat和weight
        sampled_keypoints, sampled_fg_rois
    )

    shape = (sampled_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS, 1)                   #N*17
    heats = heats.reshape(shape)                                                      #reshape成Nx17
    weights = weights.reshape(shape)                                                  #reshape成NX17

    sampled_fg_rois *= im_scale                                                       #将rois的尺度刻画到缩放后的图像的尺度
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_fg_rois.shape[0], 1)
    )
    sampled_fg_rois = np.hstack((repeated_batch_idx, sampled_fg_rois))                #然后将选择出来的roi加上第几张图片这个信息

    blobs['keypoint_rois'] = sampled_fg_rois                                          #将blob里面加上对应的信息
    blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False)
    blobs['keypoint_weights'] = weights

接下来看一下keypoint是怎么转换为最终的label的
最终的map是要转为56x56的，所以会计算将原来的roi变成56x56，x和y方向各需要放大缩小多少倍，记住是要计算roi的怎么缩放到这个的倍数
1）首先计算缩放倍数，把关键点映射到最终的56x56的图上
2）来看一下关键点是不是合理的，也即是否映射到56x56的map图上
3）要看关键点合不合理，除了要做上述的判断还要看一下关键点是否是可见
4）然后将关键点映射到56x56map图上的2维位置变成一维的
5）最后返回生成的label和weight,都是NX17的大小
6）返回后将label和weight拉成一维向量

看一下关键点roi的选择原则：
1）首先roi内必须有可见关键点
2）然后roi的max_overlap必须达到一定的阈值
只有这些roi才有资格入选keypoint的roi,在关键点标签制作的时候，那么有一点就是那么那些在roi之外的关键点怎么办？
在刚刚的keypoint装换为label的时候会将roi之外的可见关键点设置为invalid，也即valid是false的
valid_loc = np.logical_and(
np.logical_and(x >= 0, y >= 0),
np.logical_and(
x < cfg.KRCNN.HEATMAP_SIZE, y < cfg.KRCNN.HEATMAP_SIZE))

就是这段话，会判断一个位置是不是一个合理的位置，要看gt_keypoints 经过映射之后是否还在界内，通过这些也可以发现关键点部分的训练是单独训练关键点的，就是用RPN提出来的proposal来训练关键点。详细内容参考keypoint.py文件和keypoint_rcnn.py

上面流程图中把distribute在肢解就变成了如下形式
这里写图片描述
最后的add_multilevel_roi_blobs的这个函数将roi分配给对应level的blob，因为roi分配的时候不同的level对应的索引是不同的，为了方便之后的恢复，blob里面有一个key就是做这个工作的

def add_multilevel_roi_blobs(
    blobs, blob_prefix, rois, target_lvls, lvl_min, lvl_max
):
    """Add RoI blobs for multiple FPN levels to the blobs dict.

    blobs: a dict mapping from blob name to numpy ndarray
    blob_prefix: name prefix to use for the FPN blobs
    rois: the source rois as a 2D numpy array of shape (N, 5) where each row is
      an roi and the columns encode (batch_idx, x1, y1, x2, y2)
    target_lvls: numpy array of shape (N, ) indicating which FPN level each roi
      in rois should be assigned to
    lvl_min: the finest (highest resolution) FPN level (e.g., 2)
    lvl_max: the coarest (lowest resolution) FPN level (e.g., 6)
    """
    rois_idx_order = np.empty((0, ))
    rois_stacked = np.zeros((0, 5), dtype=np.float32)  # for assert
    for lvl in range(lvl_min, lvl_max + 1):
        idx_lvl = np.where(target_lvls == lvl)[0]
        blobs[blob_prefix + '_fpn' + str(lvl)] = rois[idx_lvl, :]
        rois_idx_order = np.concatenate((rois_idx_order, idx_lvl))
        rois_stacked = np.vstack(
            [rois_stacked, blobs[blob_prefix + '_fpn' + str(lvl)]]
        )
    pdb.set_trace()
    rois_idx_restore = np.argsort(rois_idx_order).astype(np.int32, copy=False)
    blobs[blob_prefix + '_idx_restore_int32'] = rois_idx_restore
    # Sanity check that restore order is correct
    assert (rois_stacked[rois_idx_restore] == rois).all()

_idx_restore_int32这个参数就是存储对应顺序的一个参数，当然这只是个后缀，在分配roi的时候一共有4个level有别于rpn的5个level，其中边长224代表的是第四个level，112代表的是第三个level，56是第二个，448是第四个

做完这些，有一步是

    if cfg.MODEL.KEYPOINTS_ON:
        valid = roi_data.keypoint_rcnn.finalize_keypoint_minibatch(blobs, valid)

    return valid

来判断这一个batch是不是合格，判断标准是valid的关键点的数量是多少，是不是大于20个，如果这一个batch的参与训练的关键点的数量没有达到阈值，那么就是不合格的，valid返回false，否则返回true,除此之外，还会加入一个关键点的norm参数作为一个blob，blobs[‘keypoint_loss_normalizer’] = np.array(norm, dtype=np.float32)

3、加入fast-rcnn的head
这里写图片描述

来看一下RoIFeatureTransform函数
这个函数实现的是将各个level的roi进行roipooling，detectron采用的是roialign,由于roi之前存在了blob里面，按照不同的level进行了存储，不同型号的roi对应的level自然也是不一样的，rois_fpn_2,rois_fpn_3,rois_fpn_4,rois_fpn_5,对于上述的4中roi分别用fpn_res2_2_sum，fpn_res3_3_sum，fpn_res4_5_sum，fpn_res5_2_sum这几层的特征进行roi-pooling对于每一个level的roi做完pooling之后，将所有的pooling特征concat在一起，形成所有的roi的特征，也即

xform_shuffled, _ = self.net.Concat(
         bl_out_list, [blob_out + '_shuffled', '_concat_' + blob_out],
         axis=0
)

之前存储的时候保存了一个blob[‘roi_idx_restore_int32’]这样的一个blob，目的是为了恢复没有进行roi的level分配之前的roi顺序，因为之前制作标签的时候都是没有分配的顺序，所以为了之后的损失是和对应的标签是对应的，所以最后要将顺序进行还原，用这个restore_int32就可以实现

xform_out = self.net.BatchPermutation(
          [xform_shuffled, restore_bl], blob_out
)

总体代码如下

    def RoIFeatureTransform(
        self,
        blobs_in,
        blob_out,
        blob_rois='rois',
        method='RoIPoolF',
        resolution=7,
        spatial_scale=1. / 16.,
        sampling_ratio=0
    ):
        """Add the specified RoI pooling method. The sampling_ratio argument
        is supported for some, but not all, RoI transform methods.

        RoIFeatureTransform abstracts away:
          - Use of FPN or not
          - Specifics of the transform method
        """
        assert method in {'RoIPoolF', 'RoIAlign'}, \
            'Unknown pooling method: {}'.format(method)
        has_argmax = (method == 'RoIPoolF')
        if isinstance(blobs_in, list):
            # FPN case: add RoIFeatureTransform to each FPN level
            k_max = cfg.FPN.ROI_MAX_LEVEL  # coarsest level of pyramid
            k_min = cfg.FPN.ROI_MIN_LEVEL  # finest level of pyramid
            assert len(blobs_in) == k_max - k_min + 1
            bl_out_list = []
            for lvl in range(k_min, k_max + 1):
                bl_in = blobs_in[k_max - lvl]  # blobs_in is in reversed order
                sc = spatial_scale[k_max - lvl]  # in reversed order
                bl_rois = blob_rois + '_fpn' + str(lvl)
                bl_out = blob_out + '_fpn' + str(lvl)
                bl_out_list.append(bl_out)
                bl_argmax = ['_argmax_' + bl_out] if has_argmax else []
                self.net.__getattr__(method)(
                    [bl_in, bl_rois], [bl_out] + bl_argmax,
                    pooled_w=resolution,
                    pooled_h=resolution,
                    spatial_scale=sc,
                    sampling_ratio=sampling_ratio
                )
            # The pooled features from all levels are concatenated along the
            # batch dimension into a single 4D tensor.

            xform_shuffled, _ = self.net.Concat(
                bl_out_list, [blob_out + '_shuffled', '_concat_' + blob_out],
                axis=0
            )
            # Unshuffle to match rois from dataloader
            restore_bl = blob_rois + '_idx_restore_int32'
            xform_out = self.net.BatchPermutation(
                [xform_shuffled, restore_bl], blob_out
            )
        else:
            # Single feature level
            bl_argmax = ['_argmax_' + blob_out] if has_argmax else []
            # sampling_ratio is ignored for RoIPoolF
            xform_out = self.net.__getattr__(method)(
                [blobs_in, blob_rois], [blob_out] + bl_argmax,
                pooled_w=resolution,
                pooled_h=resolution,
                spatial_scale=spatial_scale,
                sampling_ratio=sampling_ratio
            )
        # Only return the first blob (the transformed features)
        return xform_out