References for this Faster R-CNN code walkthrough:
https://github.com/adityaarun1/pytorch_fast-er_rcnn
https://github.com/jwyang/faster-rcnn.pytorch
The rpn_head described earlier is mainly responsible for producing, for every anchor on the feature map, an fg/bg classification score (and bbox regression deltas).
The proposal_layer then runs NMS over all of those anchors, i.e. it further removes the unnecessary ones and keeps only the best proposals.
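Before reading the implementation, a minimal sketch of what an NMS call returns may help. It uses torchvision.ops.nms as a stand-in; the repos above ship their own nms helper, but it is assumed here to follow the same (boxes, scores, iou_threshold) interface. The call returns the indices of the boxes that survive, sorted by descending score.

import torch
from torchvision.ops import nms  # stand-in for the project's own nms helper

# Three boxes in (x1, y1, x2, y2) form; the first two overlap heavily.
boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                      [0.5, 0.5, 10.5, 10.5],
                      [20.0, 20.0, 30.0, 30.0]])
scores = torch.tensor([0.9, 0.8, 0.7])

keep = nms(boxes, scores, iou_threshold=0.7)
print(keep)  # tensor([0, 2]): box 1 is suppressed by the higher-scoring box 0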
# Imports needed by this snippet; the remaining helpers
# (cfg, generate_anchors_pre, bbox_transform_inv, clip_boxes_batch, nms)
# are project-specific and defined elsewhere in the repo.
import numpy as np
import torch
import torch.nn as nn

class proposal_layer(nn.Module):
    # Generates proposals from the anchors laid out on the feature map.
    """
    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").
    """
    def __init__(self, feat_stride, scales, ratios):
        super(proposal_layer, self).__init__()
        self.feat_stride = feat_stride                 # feature stride (downsampling factor)
        self.anchor_scales = scales                    # anchor scales
        self.anchor_ratios = ratios                    # anchor aspect ratios
        self.num_anchors = len(scales) * len(ratios)   # number of anchors per feature-map cell

    def forward(self, rpn_cls_logit, rpn_bbox_pred, im_info, model='train'):
        # rpn_cls_logit = softmax(rpn_cls_score): [batch, feat_h, feat_w, (num_anchors * 2)]
        # rpn_bbox_pred: [batch, feat_h, feat_w, (num_anchors * 4)]
        # Algorithm:
        #   for each (H, W) location i
        #     generate A anchor boxes centered on cell i
        #     apply predicted bbox deltas at cell i to each of the A anchors
        #   clip predicted boxes to image
        #   remove predicted boxes with either height or width < threshold
        #   sort all (proposal, score) pairs by score from highest to lowest
        #   take top pre_nms_topN proposals before NMS
        #   apply NMS with threshold 0.7 to remaining proposals
        #   take after_nms_topN proposals after NMS
        #   return the top proposals (-> RoIs top, scores top)
        # The first set of num_anchors channels are the bg probs,
        # the second set are the fg probs.
        if model == 'train':
            _pre_nms_topN = cfg['train_rpn_pre_nms_top_n']
            _post_nms_topN = cfg['train_rpn_post_nms_top_n']
            _nms_thresh = cfg['train_rpn_nms_thresh']
            _min_size = cfg['train_rpn_min_size']
        else:  # model == 'test'
            _pre_nms_topN = cfg['test_rpn_pre_nms_top_n']
            _post_nms_topN = cfg['test_rpn_post_nms_top_n']
            _nms_thresh = cfg['test_rpn_nms_thresh']
            _min_size = cfg['test_rpn_min_size']

        # Feature-map size: rpn_cls_logit is [batch, feat_h, feat_w, (num_anchors * 2)]
        batch_size, feat_height, feat_width = rpn_cls_logit.shape[0:3]

        # Generate anchor_length anchors on the feature map
        anchors, anchor_length = generate_anchors_pre(feat_height, feat_width, self.feat_stride,
                                                      self.anchor_scales, self.anchor_ratios)
        # Replicate the single-image anchors for every image in the batch
        anchors = np.stack((anchors,) * batch_size, axis=0)
        self.anchors = torch.from_numpy(anchors).to(rpn_bbox_pred.device)  # move anchors to the right device
        self.anchor_length = anchor_length                                 # number of anchors

        # Get the scores and bounding boxes.
        # rpn_cls_logit: [batch, feat_h, feat_w, (num_anchors * 2)] -> [batch, feat_h, feat_w, num_anchors]
        scores = rpn_cls_logit[:, :, :, self.num_anchors:]  # second set of channels = fg scores
        # rpn_bbox_pred: [batch, feat_h, feat_w, (num_anchors * 4)] -> [batch, feat_h * feat_w * num_anchors, 4]
        rpn_bbox_pred = rpn_bbox_pred.view((batch_size, -1, 4))
        # fg scores: [batch, feat_h, feat_w, num_anchors] -> [batch, feat_h * feat_w * num_anchors, 1]
        scores = scores.contiguous().view(batch_size, -1, 1)

        # self.anchors holds all anchors of the feature map in two-corner (x1, y1, x2, y2) form,
        # rpn_bbox_pred holds the predicted offsets; applying the offsets to the anchors yields
        # the predicted proposals. anchors and rpn_bbox_pred have the same layout because both
        # are generated per feature-map cell.
        # [batch, feat_h * feat_w * num_anchors, 4]
        proposals = bbox_transform_inv(self.anchors, rpn_bbox_pred)  # still in two-corner form
        # Clip proposals that extend beyond the (resized) input image
        proposals = clip_boxes_batch(proposals, im_info[0, :2], batch_size)

        scores_keep = scores        # [batch, feat_h * feat_w * num_anchors, 1] fg scores
        proposals_keep = proposals  # [batch, feat_h * feat_w * num_anchors, 4] matching proposals

        # Output buffers; the number of survivors may be smaller than _post_nms_topN,
        # in which case the remaining rows stay zero.
        blob_batch = proposals.new(batch_size, _post_nms_topN, 5).zero_()
        scores_batch = proposals.new(batch_size, _post_nms_topN, 1).zero_()

        for i in range(batch_size):
            # 3. remove predicted boxes with either height or width < threshold
            #    (NOTE: convert min_size to input image scale stored in im_info[2])
            proposals_single = proposals_keep[i]  # [feat_h * feat_w * num_anchors, 4] proposals of image i
            scores_single = scores_keep[i]        # [feat_h * feat_w * num_anchors, 1] scores of image i

            # 4. sort all (proposal, score) pairs by score from highest to lowest
            scores_single, order_single = scores_single.view(-1).sort(descending=True)
            # 5. take top pre_nms_topN (e.g. 6000) proposals before NMS
            if _pre_nms_topN > 0 and _pre_nms_topN < scores_single.numel():
                order_single = order_single[:_pre_nms_topN]
                scores_single = scores_single[:_pre_nms_topN]
            scores_single = scores_single.view(-1, 1)             # [pre_nms_topN, 1]
            proposals_single = proposals_single[order_single, :]  # [pre_nms_topN, 4]

            # 6. apply nms (e.g. threshold = 0.7)
            # 7. take after_nms_topN (e.g. 300)
            # 8. return the top proposals (-> RoIs top)
            keep_idx_i = nms(proposals_single, scores_single.squeeze(1), _nms_thresh)
            keep_idx_i = keep_idx_i.long().view(-1)
            if _post_nms_topN > 0:
                keep_idx_i = keep_idx_i[:_post_nms_topN]
            proposals_single = proposals_single[keep_idx_i, :]  # proposals kept after NMS
            scores_single = scores_single[keep_idx_i, :]        # and their scores
            scores_batch[i, :scores_single.shape[0], :] = scores_single

            # Prepend the batch index so every RoI is (batch_ind, x1, y1, x2, y2)
            batch_inds = proposals_single.new(proposals_single.size(0), 1).fill_(i)
            blob = torch.cat((batch_inds, proposals_single), 1)
            blob_batch[i, :scores_single.shape[0], :] = blob

        # blob_batch: [batch, post_nms_topN, 5], scores_batch: [batch, post_nms_topN, 1]
        return blob_batch, scores_batch
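As a quick sanity check of the shapes, here is a hypothetical usage sketch. It assumes the repo's cfg dictionary and the helper functions (generate_anchors_pre, bbox_transform_inv, clip_boxes_batch, nms) are available in scope; the feat_stride/scales/ratios values are just the common VGG16 RPN settings, not something fixed by this code.

import torch

layer = proposal_layer(feat_stride=16, scales=(8, 16, 32), ratios=(0.5, 1, 2))

batch, feat_h, feat_w, num_anchors = 1, 38, 50, 9
rpn_cls_logit = torch.rand(batch, feat_h, feat_w, num_anchors * 2)   # softmax scores from rpn_head
rpn_bbox_pred = torch.randn(batch, feat_h, feat_w, num_anchors * 4)  # predicted deltas
im_info = torch.tensor([[600.0, 800.0, 1.0]])                        # (im_h, im_w, scale) of the resized input

rois, roi_scores = layer(rpn_cls_logit, rpn_bbox_pred, im_info, model='test')
print(rois.shape)        # [batch, test_rpn_post_nms_top_n, 5]
print(roi_scores.shape)  # [batch, test_rpn_post_nms_top_n, 1]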
This part is probably not entirely right, in particular the following two lines: the number of proposals that survive NMS can easily be smaller than _post_nms_topN, but the buffers were written this way so that the results can be stacked into a batch, leaving the unused rows as zeros. In the end batch=1 was used anyway, so the padding is arguably unnecessary.
blob_batch = proposals.new(batch_size, _post_nms_topN, 5).zero_()   # may not actually be filled up to _post_nms_topN
scores_batch = proposals.new(batch_size, _post_nms_topN, 1).zero_() # may not actually be filled up to _post_nms_topN
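If the zero padding is undesirable, one simple alternative (only a sketch, not what the repo does) is to also record how many proposals were actually kept per image and slice the padded tensors back into per-image lists afterwards. trim_padded_proposals and num_valid below are hypothetical names; num_valid[i] would be recorded inside the batch loop as scores_single.shape[0].

import torch

def trim_padded_proposals(blob_batch, scores_batch, num_valid):
    # blob_batch:   [batch, post_nms_topN, 5], zero-padded
    # scores_batch: [batch, post_nms_topN, 1], zero-padded
    # num_valid[i]: number of proposals actually kept for image i
    rois, scores = [], []
    for i, n in enumerate(num_valid):
        rois.append(blob_batch[i, :n])      # [n, 5] valid RoIs of image i
        scores.append(scores_batch[i, :n])  # [n, 1] their fg scores
    return rois, scores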
2. Processing steps
- Fetch the NMS parameters (pre/post-NMS top-N counts, IoU threshold, min size) from cfg, depending on whether we are in train or test mode.
- rpn_cls_logit is the score after the softmax in rpn_head; it is needed because the first step of NMS is to sort the proposals by score.
- Use generate_anchors_pre to generate anchor_length anchors. This is where the anchors are actually laid out directly on the feature map; rpn_head only predicts scores and deltas through convolutions, so during training the anchors themselves still have to be generated on the feature map first.
- The single-image anchors are then stacked batch_size times into a batch. Note that anchor generation depends only on the shape of the feature map (plus stride/scales/ratios); no other information about the data affects it.
- Take the last num_anchors channels of rpn_cls_logit, i.e. the fg scores: [batch, feat_h, feat_w, (num_anchors * 2)] -> [batch, feat_h, feat_w, num_anchors]. Which half is fg/bg is simply a convention fixed at training time (here the first set of channels are bg, the second set fg).
- Reshape rpn_bbox_pred from [batch, feat_h, feat_w, (num_anchors * 4)] to [batch, feat_h * feat_w * num_anchors, 4]; this is only a view.
- Similarly reshape the fg scores from [batch, feat_h, feat_w, num_anchors] to [batch, feat_h * feat_w * num_anchors, 1].
- Use bbox_transform_inv to apply the deltas rpn_bbox_pred predicted by rpn_head to the anchors just generated (rpn_head predicts offsets, not boxes). The resulting proposals keep the shape [batch, feat_h * feat_w * num_anchors, 4], the same as rpn_bbox_pred, and like the anchors are in two-corner (x1, y1, x2, y2) form; see the sketch after this list.
- Clip the proposals that extend outside the image, since at this point the proposals already live in input-image coordinates.
- Keep the _pre_nms_topN highest-scoring proposals, then run NMS: proposals whose IoU with an already-kept, higher-scoring proposal exceeds the threshold are suppressed, the rest are kept.
- From the surviving proposals take at most _post_nms_topN. I got this slightly wrong before: the number of survivors is not necessarily >= _post_nms_topN, so the output contains many zero-padded rows; the padding exists only so the results can be stacked into a batch.
- Return the kept proposals and their scores.
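To make the delta-to-box and clipping steps concrete, here is a minimal sketch following the standard Faster R-CNN parameterization (cx' = cx + dx*w, cy' = cy + dy*h, w' = w*exp(dw), h' = h*exp(dh)). The actual bbox_transform_inv and clip_boxes_batch helpers in the repo may differ in details such as the +1 width convention and the argument layout.

import torch

def bbox_transform_inv_sketch(anchors, deltas):
    # anchors, deltas: [batch, N, 4]; anchors are (x1, y1, x2, y2), deltas are (dx, dy, dw, dh)
    widths  = anchors[..., 2] - anchors[..., 0]
    heights = anchors[..., 3] - anchors[..., 1]
    ctr_x   = anchors[..., 0] + 0.5 * widths
    ctr_y   = anchors[..., 1] + 0.5 * heights

    dx, dy, dw, dh = deltas[..., 0], deltas[..., 1], deltas[..., 2], deltas[..., 3]
    pred_ctr_x = ctr_x + dx * widths   # shift the center
    pred_ctr_y = ctr_y + dy * heights
    pred_w = widths * torch.exp(dw)    # scale width and height
    pred_h = heights * torch.exp(dh)

    # back to two-corner form
    return torch.stack((pred_ctr_x - 0.5 * pred_w,
                        pred_ctr_y - 0.5 * pred_h,
                        pred_ctr_x + 0.5 * pred_w,
                        pred_ctr_y + 0.5 * pred_h), dim=-1)

def clip_boxes_sketch(boxes, im_h, im_w):
    # Clamp (x1, y1, x2, y2) boxes to the image boundaries.
    boxes[..., 0::2] = boxes[..., 0::2].clamp(0, im_w - 1)  # x1, x2
    boxes[..., 1::2] = boxes[..., 1::2].clamp(0, im_h - 1)  # y1, y2
    return boxes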
3. Summary
The proposal layer mainly applies the offsets predicted by rpn_head to the corresponding anchors and then filters the results with non-maximum suppression. Put another way: rpn_head produces, per anchor, a score plus four regression values (a shift of the center and a scaling of w and h); the proposal layer then uses those regression values together with NMS to remove or merge redundant bounding boxes.
'''
in:
    rpn_cls_logit = softmax(rpn_cls_score): [batch, feat_h, feat_w, (num_anchors * 2)]  # cls scores after softmax
    rpn_bbox_pred: [batch, feat_h, feat_w, (num_anchors * 4)]  # bbox shift/scale deltas
out:
    rpn_proposal: [batch, post_nms_topN, 5]  # proposals mapped to the input image; column 0 is concatenated in via batch_inds and should hold the index of the image in the batch, not all zeros
    rpn_proposal_scores: [batch, post_nms_topN, 1]  # score of each rpn_proposal, i.e. the fg scores kept after NMS
'''