Code walkthrough: proposal_layer.py and proposal_target_layer.py

This post dissects the Faster R-CNN detection pipeline: how the RPN generates region proposals, and how the Fast R-CNN head then performs the final classification and bounding-box regression. It focuses on the two key components, proposal_layer and proposal_target_layer, and their implementation details.


proposal_layer.py

 

# --------------------------------------------------------
# Faster R-CNN
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Xinlei Chen
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
from model.config import cfg
from model.bbox_transform import bbox_transform_inv, clip_boxes, bbox_transform_inv_tf, clip_boxes_tf
from model.nms_wrapper import nms
'''
The function actually used is proposal_layer_tf, called from network.py.
As a rule, every function whose name ends in "_layer" is invoked from the network code.
'''
def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors):
  """A simplified version compared to fast/er RCNN
     For details please see the technical report
  """
  # Pull out the RPN parameters preset in the config:
  # the cap on boxes before NMS (12000), the cap after NMS (300), and the NMS threshold
  if type(cfg_key) == bytes:
      cfg_key = cfg_key.decode('utf-8')
  # Number of top scoring boxes to keep before applying NMS to RPN proposals (12000)
  pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
  # Number of top scoring boxes to keep after applying NMS to RPN proposals (300)
  post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
  # NMS threshold used on RPN proposals (0.7)
  nms_thresh = cfg[cfg_key].RPN_NMS_THRESH

  # Get the scores and bounding boxes
  # From the inputs, pull out the classification probabilities and the bbox predictions
  scores = rpn_cls_prob[:, :, :, num_anchors:]
  rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4))
  scores = scores.reshape((-1, 1))
  proposals = bbox_transform_inv(anchors, rpn_bbox_pred)
  proposals = clip_boxes(proposals, im_info[:2])

  # Pick the top region proposals
  # Sort the scores in descending order; `order` holds the pre-sort indices of the sorted entries
  order = scores.ravel().argsort()[::-1]
  # Keep only the top RPN_PRE_NMS_TOP_N proposals, per the config
  if pre_nms_topN > 0:
    order = order[:pre_nms_topN]
  proposals = proposals[order, :]
  # Slice the matching scores out of `scores` using the same indices
  scores = scores[order]

  # Non-maximal suppression
  # Apply non-maximum suppression
  keep = nms(np.hstack((proposals, scores)), nms_thresh)

  # Pick the top region proposals after NMS
  # Keep only the first RPN_POST_NMS_TOP_N proposals that survive NMS
  if post_nms_topN > 0:
    keep = keep[:post_nms_topN]
  proposals = proposals[keep, :]
  scores = scores[keep]

  # Only support single image as input
  # Create an all-zero column of batch indices
  batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
  # Build the blob for this single image: each row is (batch_ind, x1, y1, x2, y2)
  blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))

  return blob, scores
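
# Aside (not in the original file): a minimal NumPy sketch of the greedy,
# IoU-based suppression that nms() above performs. Illustrative only --
# the repo's actual implementation is a Cython/GPU routine.
def _nms_sketch(dets, thresh):
  """dets: (N, 5) rows of (x1, y1, x2, y2, score)."""
  x1, y1, x2, y2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4]
  areas = (x2 - x1 + 1.0) * (y2 - y1 + 1.0)
  order = scores.argsort()[::-1]  # indices sorted by descending score
  keep = []
  while order.size > 0:
    i = order[0]
    keep.append(i)  # always keep the best remaining box
    # intersection of box i with every other remaining box
    xx1 = np.maximum(x1[i], x1[order[1:]])
    yy1 = np.maximum(y1[i], y1[order[1:]])
    xx2 = np.minimum(x2[i], x2[order[1:]])
    yy2 = np.minimum(y2[i], y2[order[1:]])
    inter = np.maximum(0.0, xx2 - xx1 + 1.0) * np.maximum(0.0, yy2 - yy1 + 1.0)
    iou = inter / (areas[i] + areas[order[1:]] - inter)
    order = order[1:][iou <= thresh]  # discard boxes that overlap box i too much
  return keep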


def proposal_layer_tf(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors):
  # This is the version actually called. It is slightly condensed compared with the
  # NumPy version above: some steps are folded into tf.image.non_max_suppression or
  # handled elsewhere (note that pre_nms_topN is read below but never used).
  if type(cfg_key) == bytes:
    cfg_key = cfg_key.decode('utf-8')
  pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
  post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
  nms_thresh = cfg[cfg_key].RPN_NMS_THRESH

  # Get the scores and bounding boxes
  scores = rpn_cls_prob[:, :, :, num_anchors:]
  scores = tf.reshape(scores, shape=(-1,))
  rpn_bbox_pred = tf.reshape(rpn_bbox_pred, shape=(-1, 4))

  proposals = bbox_transform_inv_tf(anchors, rpn_bbox_pred)
  proposals = clip_boxes_tf(proposals, im_info[:2])

  # Non-maximal suppression
  indices = tf.image.non_max_suppression(proposals, scores, max_output_size=post_nms_topN, iou_threshold=nms_thresh)

  boxes = tf.gather(proposals, indices)
  boxes = tf.to_float(boxes)
  scores = tf.gather(scores, indices)
  scores = tf.reshape(scores, shape=(-1, 1))

  # Only support single image as input
  batch_inds = tf.zeros((tf.shape(indices)[0], 1), dtype=tf.float32)
  blob = tf.concat([batch_inds, boxes], 1)

  return blob, scores
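
The decoding done by bbox_transform_inv is worth pinning down. Below is a minimal NumPy sketch of the standard Faster R-CNN delta parameterization (the repo's real implementation lives in model/bbox_transform.py; treat this as an illustration): each anchor's center is shifted by (dx, dy) in units of the anchor's own size, its width and height are rescaled by exp(dw) and exp(dh), and clip_boxes then clamps the result to the image boundary.

import numpy as np

def bbox_transform_inv_sketch(anchors, deltas):
  # anchor sizes and centers
  widths  = anchors[:, 2] - anchors[:, 0] + 1.0
  heights = anchors[:, 3] - anchors[:, 1] + 1.0
  ctr_x = anchors[:, 0] + 0.5 * widths
  ctr_y = anchors[:, 1] + 0.5 * heights
  dx, dy, dw, dh = deltas[:, 0], deltas[:, 1], deltas[:, 2], deltas[:, 3]
  # shift the center in units of the anchor size; rescale the size in log space
  pred_ctr_x = dx * widths + ctr_x
  pred_ctr_y = dy * heights + ctr_y
  pred_w = np.exp(dw) * widths
  pred_h = np.exp(dh) * heights
  # convert back to corner format (x1, y1, x2, y2)
  return np.stack([pred_ctr_x - 0.5 * pred_w,
                   pred_ctr_y - 0.5 * pred_h,
                   pred_ctr_x + 0.5 * pred_w,
                   pred_ctr_y + 0.5 * pred_h], axis=1)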


proposal_target_layer.py

 

# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick, Sean Bell and Xinlei Chen
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import numpy.random as npr
from model.config import cfg
from model.bbox_transform import bbox_transform
from utils.cython_bbox import bbox_overlaps



def proposal_target_layer(rpn_rois, rpn_scores, gt_boxes, _num_classes):
  """
  Assign object detection proposals to ground-truth targets. Produces proposal
  classification labels and bounding-box regression targets.
  Using the ground truth, label each RPN proposal with a class and compute its regression offsets.
  """

  # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
  # (i.e., rpn.proposal_layer.ProposalLayer), or any other source
  all_rois = rpn_rois
  all_scores = rpn_scores

  # Include ground-truth boxes in the set of candidate rois
  if cfg.TRAIN.USE_GT:
    # This flag is False in the config, so this branch normally does not run,
    # but it is still worth reading.
    # Build a zero column with as many rows as there are gt boxes
    zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
    # vstack stacks arrays vertically (adds rows); hstack stacks them horizontally (adds columns).
    # First prepend the zero column to the gt coordinates, then append those rows to the rois.
    all_rois = np.vstack(
      (all_rois, np.hstack((zeros, gt_boxes[:, :-1])))
    )
    # not sure if it a wise appending, but anyway i am not using it
    # The scores are extended the same way.
    all_scores = np.vstack((all_scores, zeros))
  # TRAIN.BATCH_SIZE is the number of regions of interest sampled per minibatch;
  # rois_per_image is that per-image RoI budget. The same cap appears elsewhere
  # under different names -- it is simply a limit parameter.
  num_images = 1
  rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images
  # cfg.TRAIN.FG_FRACTION caps how many RoIs in each image's batch may be foreground:
  # e.g. with rois_per_image = 100, at most 100 * 0.25 = 25 foreground RoIs are selected
  fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)

  # Sample rois with classification labels and bounding box regression
  # targets
  # _sample_rois randomly samples each image's batch of RoIs according to the config
  labels, rois, roi_scores, bbox_targets, bbox_inside_weights = _sample_rois(
    all_rois, all_scores, gt_boxes, fg_rois_per_image,
    rois_per_image, _num_classes)
  # Reshape everything for the operations downstream.
  # The weights are loss weights: every RoI contributes to the loss, all weighted uniformly.
  rois = rois.reshape(-1, 5)
  roi_scores = roi_scores.reshape(-1)
  labels = labels.reshape(-1, 1)
  bbox_targets = bbox_targets.reshape(-1, _num_classes * 4)
  bbox_inside_weights = bbox_inside_weights.reshape(-1, _num_classes * 4)
  bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32)

  return rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights


def _get_bbox_regression_labels(bbox_target_data, num_classes):
  """Bounding-box regression targets (bbox_target_data) are stored in a
  compact form N x (class, tx, ty, tw, th)

  This function expands those targets into the 4-of-4*K representation used
  by the network (i.e. only one class has non-zero targets).

  Returns:
      bbox_target (ndarray): N x 4K blob of regression targets
      bbox_inside_weights (ndarray): N x 4K blob of loss weights
	  
  bbox_target_data = N x (class, tx, ty, tw, th).
  Allocate an all-zero array to hold bbox_targets plus an all-zero weight
  array of the same shape, then fill both in, one foreground RoI at a time.
  """
  
  clss = bbox_target_data[:, 0]
  bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
  bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
  # Find the boxes whose class label is > 0, i.e. the foreground bboxes
  inds = np.where(clss > 0)[0]
  # For each foreground bbox:
  for ind in inds:
    cls = clss[ind]  # the class label of this RoI
    start = int(4 * cls)  # each class owns a 4-wide slot: class c occupies columns [4c, 4c+4)
    end = start + 4
    # e.g. cls = 1 fills columns 4:8 and cls = 2 fills columns 8:12; class 0 (background)
    # would map to columns 0:4, but never gets here since only foreground indices are iterated
    # e.g. for cls = 1: bbox_targets[ind, 4:8] = bbox_target_data[ind, 1:]
    bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
    bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS
    # In other words, each bbox's four offsets are encoded into a single 4K-wide vector,
    # and their position within that vector is determined by the box's class.
  return bbox_targets, bbox_inside_weights  # the inside weights are returned alongside
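
# Aside (not in the original file): a tiny worked example of the expansion above,
# assuming num_classes = 3 and cfg.TRAIN.BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0):
#   bbox_target_data = [[2, 0.1, 0.2, 0.3, 0.4],   # foreground RoI of class 2
#                       [0, 0.0, 0.0, 0.0, 0.0]]   # background RoI
# produces a (2, 12) bbox_targets in which only columns 8:12 of row 0 hold
# (0.1, 0.2, 0.3, 0.4); bbox_inside_weights is 1 exactly at those positions and
# 0 everywhere else, so the loss only flows through the true class's slot.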


def _compute_targets(ex_rois, gt_rois, labels):
  """Compute bounding-box regression targets for an image."""
  # (i.e. compute how much each RoI must be shifted and scaled to match its gt)
  assert ex_rois.shape[0] == gt_rois.shape[0]
  assert ex_rois.shape[1] == 4
  assert gt_rois.shape[1] == 4
  # bbox_transform compares the four coordinates of the gt boxes with the four
  # coordinates of the extracted RoIs and converts the difference into offsets
  targets = bbox_transform(ex_rois, gt_rois)
  # The flag below toggles normalization of the targets. The means and stds are not
  # computed from the data at hand; they are fixed values defined directly in the
  # config (BBOX_NORMALIZE_MEANS / BBOX_NORMALIZE_STDS), so the standardization is
  # only approximate.
  if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
    # Optionally normalize targets by a precomputed mean and stdev
    targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS))
               / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS))
  return np.hstack(
    (labels[:, np.newaxis], targets)).astype(np.float32, copy=False)
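
# Aside (not in the original file): the encoding that bbox_transform performs is the
# exact inverse of the decoding sketched under proposal_layer.py -- a minimal NumPy
# version of the standard parameterization, for reference only:
def _bbox_transform_sketch(ex_rois, gt_rois):
  ex_w = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
  ex_h = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
  ex_cx = ex_rois[:, 0] + 0.5 * ex_w
  ex_cy = ex_rois[:, 1] + 0.5 * ex_h
  gt_w = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
  gt_h = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
  gt_cx = gt_rois[:, 0] + 0.5 * gt_w
  gt_cy = gt_rois[:, 1] + 0.5 * gt_h
  # center offsets normalized by the RoI size; size ratios taken in log space
  dx = (gt_cx - ex_cx) / ex_w
  dy = (gt_cy - ex_cy) / ex_h
  dw = np.log(gt_w / ex_w)
  dh = np.log(gt_h / ex_h)
  return np.vstack((dx, dy, dw, dh)).transpose()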


def _sample_rois(all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
  """
  Sample foreground and background RoIs from the proposals.
  """
  # overlaps: (rois x gt_boxes)
  # bbox_overlaps computes the IoU between every RoI and every gt box
  overlaps = bbox_overlaps(
    np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float),
    np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float))
  gt_assignment = overlaps.argmax(axis=1)  # per row (RoI): index of the best-overlapping gt column
  max_overlaps = overlaps.max(axis=1)  # same, but the value itself, i.e. the overlap ratio
  labels = gt_boxes[gt_assignment, 4]  # label each RoI with the class of its assigned gt
  # the label sits at index 4 because indices 0-3 hold the coordinates

  # Select foreground RoIs as those with >= FG_THRESH overlap
  # i.e. define the foreground set
  fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
  # Guard against the case when an image has fewer than fg_rois_per_image
  # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
  bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                     (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
  # Define the background: it is a range, set to [0.1, 0.5) in the config

  # Small modification to the original version where we ensure a fixed number of regions are sampled
  # The original version is modified so that the number of sampled regions is fixed
  if fg_inds.size > 0 and bg_inds.size > 0:
  # If there are many positives (more than batch * fg fraction), subsample them randomly;
  # the background quota is then the batch size minus the foreground count.
  # anchor_target_layer.py performs a similar operation.
    fg_rois_per_image = min(fg_rois_per_image, fg_inds.size)
    fg_inds = npr.choice(fg_inds, size=int(fg_rois_per_image), replace=False)
    bg_rois_per_image = rois_per_image - fg_rois_per_image
    to_replace = bg_inds.size < bg_rois_per_image
    bg_inds = npr.choice(bg_inds, size=int(bg_rois_per_image), replace=to_replace)
  elif fg_inds.size > 0:
    to_replace = fg_inds.size < rois_per_image
    fg_inds = npr.choice(fg_inds, size=int(rois_per_image), replace=to_replace)
    fg_rois_per_image = rois_per_image
  elif bg_inds.size > 0:
    to_replace = bg_inds.size < rois_per_image
    bg_inds = npr.choice(bg_inds, size=int(rois_per_image), replace=to_replace)
    fg_rois_per_image = 0
  else:
    import pdb
    pdb.set_trace()

  # The indices that we're selecting (both fg and bg)
  # the finally selected foreground and background indices
  keep_inds = np.append(fg_inds, bg_inds)
  # Select sampled values from various arrays:
  # pull out the corresponding labels
  labels = labels[keep_inds]
  # Clamp labels for the background RoIs to 0
  # Re-assert the assignment: everything after the first fg_rois_per_image entries
  # (the first 25 in the earlier example) is set to background
  labels[int(fg_rois_per_image):] = 0
  rois = all_rois[keep_inds]
  # pull out the corresponding rois and their scores
  roi_scores = all_scores[keep_inds]
  # _compute_targets, defined above, computes the bbox offsets
  bbox_target_data = _compute_targets(
    rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)
  # From those offsets, build the expanded regression labels. The computation is
  # deliberately split into two steps; the weights only appear in this second one.
  bbox_targets, bbox_inside_weights = \
    _get_bbox_regression_labels(bbox_target_data, num_classes)

  return labels, rois, roi_scores, bbox_targets, bbox_inside_weights
# Returns the fg/bg labels, the sampled RoIs with their scores, the bbox correction
# targets, and the inside weights

Compared with anchor_target_layer, there is some overlap in functionality, but anchor_target_layer works on anchors, i.e. at the input stage of the RPN, whereas proposal_target_layer post-processes what the RPN has already produced. The inputs make this clear: rpn_rois, rpn_scores, gt_boxes, _num_classes. Although foreground and background are still being distinguished here, _num_classes is now involved, so the labels carry concrete foreground classes and can drive the training of the stage after the RPN.
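
To make the sampling budget concrete, here is a quick numeric check of the fg/bg arithmetic described above, under assumed default-style values (TRAIN.BATCH_SIZE = 128, TRAIN.FG_FRACTION = 0.25; check your own config):

import numpy as np

num_images = 1
rois_per_image = 128 / num_images                    # cfg.TRAIN.BATCH_SIZE / num_images
fg_rois_per_image = np.round(0.25 * rois_per_image)  # cfg.TRAIN.FG_FRACTION cap
bg_rois_per_image = rois_per_image - fg_rois_per_image
print(fg_rois_per_image, bg_rois_per_image)          # 32.0 96.0: at most 32 fg RoIs, the rest bg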

 
