# File: proposal_layer.py
# --------------------------------------------------------
# Faster R-CNN
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Xinlei Chen
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
from model.config import cfg
from model.bbox_transform import bbox_transform_inv, clip_boxes, bbox_transform_inv_tf, clip_boxes_tf
from model.nms_wrapper import nms
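'''
The function actually used is proposal_layer_tf, which is called from the network code.
Every function whose name ends in "layer" is called from inside the network.
'''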
def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors):
    """NumPy implementation of the RPN proposal layer.

    A simplified version compared to fast/er RCNN.
    For details please see the technical report.

    Decodes the predicted box deltas against the anchors, clips the boxes to
    the image, keeps the highest-scoring boxes, runs NMS and keeps the
    highest-scoring survivors.

    Args:
        rpn_cls_prob: 4-D array of RPN class probabilities; the channels from
            index ``num_anchors`` onward on the last axis are used as scores
            (presumably the foreground probabilities -- confirm with caller).
        rpn_bbox_pred: array of predicted box deltas, reshaped to (-1, 4).
        im_info: image info; only its first two entries are passed to
            ``clip_boxes``.
        cfg_key: key into ``cfg`` (``str`` or UTF-8 encoded ``bytes``)
            selecting the section that holds the RPN settings.
        _feat_stride: unused here; kept for interface compatibility.
        anchors: anchor boxes handed to ``bbox_transform_inv``.
        num_anchors: index on the last axis where the used scores start.

    Returns:
        (blob, scores): ``blob`` is float32 with a leading column of zeros
        (batch index, single image only) followed by the kept proposals;
        ``scores`` is the matching (N, 1) array of scores.
    """
    # The key may arrive as bytes; decode it so it can index cfg.
    if isinstance(cfg_key, bytes):
        cfg_key = cfg_key.decode('utf-8')

    # Number of top scoring boxes to keep before applying NMS to RPN proposals
    # (the original notes cite 12000 as the configured value).
    pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
    # Number of top scoring boxes to keep after applying NMS to RPN proposals
    # (the original notes cite 300).
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    # NMS threshold used on RPN proposals (the original notes cite 0.7).
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH

    # Get the scores and bounding boxes
    scores = rpn_cls_prob[:, :, :, num_anchors:]
    rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4))
    scores = scores.reshape((-1, 1))
    proposals = bbox_transform_inv(anchors, rpn_bbox_pred)
    proposals = clip_boxes(proposals, im_info[:2])

    # Pick the top region proposals: indices that sort the scores in
    # descending order.
    order = scores.ravel().argsort()[::-1]
    # Keep only the first pre_nms_topN proposals (a non-positive value
    # disables the limit).
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]

    # Non-maximal suppression
    keep = nms(np.hstack((proposals, scores)), nms_thresh)

    # Pick the top region proposals after NMS (a non-positive value disables
    # the limit).
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]

    # Only support single image as input: the batch index column is all zeros.
    batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
    blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))

    return blob, scores
def proposal_layer_tf(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors):
    """TensorFlow-op implementation of the RPN proposal layer.

    According to the original notes this is the variant actually used by the
    network. It is more compact than ``proposal_layer``: there is no explicit
    pre-NMS top-N truncation, and the post-NMS limit is enforced through
    ``max_output_size`` of ``tf.image.non_max_suppression``.

    Args:
        rpn_cls_prob: 4-D tensor of RPN class probabilities; the channels from
            index ``num_anchors`` onward on the last axis are used as scores
            (presumably the foreground probabilities -- confirm with caller).
        rpn_bbox_pred: tensor of predicted box deltas, reshaped to (-1, 4).
        im_info: image info; only its first two entries are passed to
            ``clip_boxes_tf``.
        cfg_key: key into ``cfg`` (``str`` or UTF-8 encoded ``bytes``)
            selecting the section that holds the RPN settings.
        _feat_stride: unused here; kept for interface compatibility.
        anchors: anchor boxes handed to ``bbox_transform_inv_tf``.
        num_anchors: index on the last axis where the used scores start.

    Returns:
        (blob, scores): ``blob`` is a float32 tensor with a leading column of
        zeros (batch index, single image only) followed by the selected
        boxes; ``scores`` is the matching (N, 1) tensor of scores.
    """
    # The key may arrive as bytes; decode it so it can index cfg.
    if isinstance(cfg_key, bytes):
        cfg_key = cfg_key.decode('utf-8')
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH

    # Get the scores and bounding boxes
    scores = rpn_cls_prob[:, :, :, num_anchors:]
    scores = tf.reshape(scores, shape=(-1,))
    rpn_bbox_pred = tf.reshape(rpn_bbox_pred, shape=(-1, 4))

    proposals = bbox_transform_inv_tf(anchors, rpn_bbox_pred)
    proposals = clip_boxes_tf(proposals, im_info[:2])

    # Non-maximal suppression
    indices = tf.image.non_max_suppression(
        proposals, scores,
        max_output_size=post_nms_topN, iou_threshold=nms_thresh)

    boxes = tf.gather(proposals, indices)
    # tf.cast replaces the deprecated tf.to_float with the same result.
    boxes = tf.cast(boxes, tf.float32)
    scores = tf.gather(scores, indices)
    scores = tf.reshape(scores, shape=(-1, 1))

    # Only support single image as input: the batch index column is all zeros.
    batch_inds = tf.zeros((tf.shape(indices)[0], 1), dtype=tf.float32)
    blob = tf.concat([batch_inds, boxes], 1)

    return blob, scores
# File: proposal_target_layer.py
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick, Sean Bell and Xinlei Chen
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import numpy.random as npr
from model.config import cfg
from model.bbox_transform import bbox_transform
from utils.cython_bbox import bbox_overlaps
'''
'''
def proposal_target_layer(rpn_rois, rpn_scores, gt_boxes, _num_classes):
    """Assign object detection proposals to ground-truth targets.

    Produces proposal classification labels and bounding-box regression
    targets: using the ground truth, every sampled RPN proposal receives a
    class label and its regression offsets.

    Args:
        rpn_rois: proposal RoIs (0, x1, y1, x2, y2) coming from the RPN or
            any other source.
        rpn_scores: scores of the proposals, one row per RoI.
        gt_boxes: ground-truth boxes; columns 0-3 are used as coordinates and
            column 4 as the class label.
        _num_classes: number of classes, used to size the regression targets.

    Returns:
        Tuple ``(rois, roi_scores, labels, bbox_targets,
        bbox_inside_weights, bbox_outside_weights)`` reshaped to
        (-1, 5), (-1,), (-1, 1), (-1, 4 * _num_classes),
        (-1, 4 * _num_classes) and (-1, 4 * _num_classes) respectively.
    """
    all_rois = rpn_rois
    all_scores = rpn_scores

    # Include ground-truth boxes in the set of candidate rois.
    # NOTE(review): the original notes say this flag is False in the config,
    # so this branch is normally skipped -- confirm against model.config.
    if cfg.TRAIN.USE_GT:
        # One zero per ground-truth box: used as the batch-index column of
        # the appended RoIs and as their score.
        zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
        # Prepend the zero column to the gt boxes (dropping their last
        # column) and stack the result below the proposals.
        all_rois = np.vstack(
            (all_rois, np.hstack((zeros, gt_boxes[:, :-1])))
        )
        # not sure if it a wise appending, but anyway i am not using it
        all_scores = np.vstack((all_scores, zeros))

    # TRAIN.BATCH_SIZE is the number of RoIs sampled per batch; with a single
    # image, that is also the per-image budget.
    num_images = 1
    rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images
    # Maximum number of foreground RoIs per image, e.g. a budget of 100 RoIs
    # and a fraction of 0.25 allows at most 25 foreground RoIs.
    fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)

    # Sample rois with classification labels and bounding box regression
    # targets
    labels, rois, roi_scores, bbox_targets, bbox_inside_weights = _sample_rois(
        all_rois, all_scores, gt_boxes, fg_rois_per_image,
        rois_per_image, _num_classes)

    # Normalise the shapes for the consumers of this layer.
    rois = rois.reshape(-1, 5)
    roi_scores = roi_scores.reshape(-1)
    labels = labels.reshape(-1, 1)
    bbox_targets = bbox_targets.reshape(-1, _num_classes * 4)
    bbox_inside_weights = bbox_inside_weights.reshape(-1, _num_classes * 4)
    # Outside weights are 1.0 wherever the inside weight is positive and 0.0
    # elsewhere.
    bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32)

    return rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights
def _get_bbox_regression_labels(bbox_target_data, num_classes):
"""Bounding-box regression targets (bbox_target_data) are stored in a
compact form N x (class, tx, ty, tw, th)
This function expands those targets into the 4-of-4*K representation used
by the network (i.e. only one class has non-zero targets).
Returns:
bbox_target (ndarray): N x 4K blob of regression targets
bbox_inside_weights (ndarray): N x 4K blob of loss weights
bbox_target_data=N x (class, tx, ty, tw, th)
然后生成一个全0数组用于存放bbox_targets
以及生成一个全0权重。
然后就计算
"""
clss = bbox_target_data[:, 0]
bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
#找出类别标号大于0的框,也就是找出前景的bbox
inds = np.where(clss > 0)[0]
#对于每一个前景bbox
for ind in inds:
cls = clss[ind]#首先提取出来类别标号
start = int(4 * cls)#类别标号扩大4倍数再加4.如果前景=1那么start=1 end=8
end = start + 4
#如果是2的话,就是8-12。其实就是一个k类*4的向量,第一类的4个target在4-8之间,第二类在8-12之间,0类也就是背景在1-4之间
#bbox_targets[ind,4:8]= bbox_target_data[ind ]
bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS
#也就是说,对于不同类别的bbox,根据类别进行编码,编码到一个统一的向量里面,四个偏移所在的位置,是根据其所属类别来定位的。
return bbox_targets, bbox_inside_weights#同时返回权重。
def _compute_targets(ex_rois, gt_rois, labels):
"""Compute bounding-box regression targets for an image."""
#计算bbox回归量
assert ex_rois.shape[0] == gt_rois.shape[0]
assert ex_rois.shape[1] == 4
assert gt_rois.shape[1] == 4
#bbox_transform这个函数就是gt的四个坐标和extrect的roi四个坐标比对,然后转换出偏移
targets = bbox_transform(ex_rois, gt_rois)
#下面的参数是一个正则化开关,是什么正则化?
#事先进行了规定,得到means和stds,然后再计算的时候,用这些参数对目标进行...归一化?
#这些means和stds是怎么计算出来的呢?
#在config里面直接定义了,而means应该是均值把,stds因该是统计计算出来的标准差。
#那么就是用了数学的归一化方法对数据进行了一些正则化,但直接用了参数,所以..
#可能是不是不准?
if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
# Optionally normalize targets by a precomputed mean and stdev
targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS))
/ np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS))
return np.hstack(
(labels[:, np.newaxis], targets)).astype(np.float32, copy=False)
def _sample_rois(all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
    """Sample a fixed-size set of foreground and background RoIs.

    Each RoI is matched to the ground-truth box it overlaps most. RoIs are
    split into foreground and background by that overlap, randomly sampled to
    fill the per-image budget, and turned into labels and regression targets.

    Args:
        all_rois: candidate RoIs; columns 1-4 are the box coordinates.
        all_scores: scores of the candidates, one row per RoI.
        gt_boxes: ground-truth boxes; columns 0-3 are coordinates, column 4
            is the class label.
        fg_rois_per_image: maximum number of foreground RoIs to sample.
        rois_per_image: total number of RoIs to sample.
        num_classes: number of classes, used to size the regression targets.

    Returns:
        Tuple ``(labels, rois, roi_scores, bbox_targets,
        bbox_inside_weights)`` for the sampled RoIs, foreground first.

    Raises:
        ValueError: if no RoI qualifies as either foreground or background.
    """
    # overlaps: (rois x gt_boxes)
    # np.float64 is what the removed np.float alias resolved to.
    overlaps = bbox_overlaps(
        np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float64),
        np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float64))
    # Per RoI: index of the best-overlapping gt box and that overlap value.
    gt_assignment = overlaps.argmax(axis=1)
    max_overlaps = overlaps.max(axis=1)
    # The matched gt box supplies the label (column 4; columns 0-3 hold the
    # coordinates).
    labels = gt_boxes[gt_assignment, 4]

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                       (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]

    # Small modification to the original version where we ensure a fixed
    # number of regions are sampled
    if fg_inds.size > 0 and bg_inds.size > 0:
        # Guard against the case when an image has fewer than
        # fg_rois_per_image foreground RoIs; the background fills the rest of
        # the budget, sampling with replacement only when it is too small.
        fg_rois_per_image = min(fg_rois_per_image, fg_inds.size)
        fg_inds = npr.choice(fg_inds, size=int(fg_rois_per_image), replace=False)
        bg_rois_per_image = rois_per_image - fg_rois_per_image
        to_replace = bg_inds.size < bg_rois_per_image
        bg_inds = npr.choice(bg_inds, size=int(bg_rois_per_image), replace=to_replace)
    elif fg_inds.size > 0:
        # Foreground only: fill the whole budget with foreground RoIs.
        to_replace = fg_inds.size < rois_per_image
        fg_inds = npr.choice(fg_inds, size=int(rois_per_image), replace=to_replace)
        fg_rois_per_image = rois_per_image
    elif bg_inds.size > 0:
        # Background only: fill the whole budget with background RoIs.
        to_replace = bg_inds.size < rois_per_image
        bg_inds = npr.choice(bg_inds, size=int(rois_per_image), replace=to_replace)
        fg_rois_per_image = 0
    else:
        # Previously this dropped into pdb, which stalls or aborts
        # unattended training runs; fail with a clear message instead.
        raise ValueError(
            'No foreground or background RoIs could be sampled: every '
            'candidate overlap is below both FG_THRESH and BG_THRESH_LO.')

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Select sampled values from various arrays:
    labels = labels[keep_inds]
    # Clamp labels for the background RoIs to 0 (everything after the sampled
    # foreground entries).
    labels[int(fg_rois_per_image):] = 0
    rois = all_rois[keep_inds]
    roi_scores = all_scores[keep_inds]

    # First compute the compact (label, offsets) targets ...
    bbox_target_data = _compute_targets(
        rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)

    # ... then expand them to the per-class layout; the inside weights are
    # produced in this step.
    bbox_targets, bbox_inside_weights = \
        _get_bbox_regression_labels(bbox_target_data, num_classes)

    return labels, rois, roi_scores, bbox_targets, bbox_inside_weights
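# Returns the foreground/background labels of the boxes, the RoIs and their scores, the bbox corrections needed, and the inside weights.
# There is some overlap with anchor_target_layer, but that layer mainly works on anchors, i.e. at the RPN input stage, whereas this one post-processes the results produced by the RPN.
# As the inputs (rpn_rois, rpn_scores, gt_boxes, _num_classes) show, foreground and background are still defined here, but because _num_classes is involved the labels carry the specific foreground class, so they can be used for the training stage after the RPN.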