Model Architecture
backbone
The model's backbone is ResNet-50, which mainly consists of four layers. Before the input reaches the first layer it is downsampled twice by the stem, i.e., the following operations:
nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
norm_layer(64),nn.ReLU(inplace=True)
nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
The four layers consist of 3, 4, 6, and 3 blocks respectively.
The output channels of the four layers are [256, 512, 1024, 2048]. The second, third, and fourth layers downsample; in each of them, the first block contains a strided convolution that performs the downsampling.
The block is the basic structural unit of this backbone; the 3×3 convolution in each block can be performed either by a regular convolution or by a deformable convolution (DCN).
block
class Bottleneck(nn.Module):
    """ Adapted from torchvision.models.resnet """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, norm_layer=nn.BatchNorm2d, dilation=1, use_dcn=False):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False, dilation=dilation)
        self.bn1 = norm_layer(planes)
        if use_dcn:
            self.conv2 = DCN(planes, planes, kernel_size=3, stride=stride,
                             padding=dilation, dilation=dilation, deformable_groups=1)
            self.conv2.bias.data.zero_()
            self.conv2.conv_offset_mask.weight.data.zero_()
            self.conv2.conv_offset_mask.bias.data.zero_()
        else:
            self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                                   padding=dilation, bias=False, dilation=dilation)
        self.bn2 = norm_layer(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False, dilation=dilation)
        self.bn3 = norm_layer(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out
class ResNetBackbone(nn.Module):
    """ Adapted from torchvision.models.resnet """

    def __init__(self, layers, dcn_layers=[0, 0, 0, 0], dcn_interval=1, atrous_layers=[], block=Bottleneck, norm_layer=nn.BatchNorm2d):
        super().__init__()

        # These will be populated by _make_layer
        self.num_base_layers = len(layers)
        self.layers = nn.ModuleList()
        self.channels = []
        self.norm_layer = norm_layer
        self.dilation = 1
        self.atrous_layers = atrous_layers

        # From torchvision.models.resnet.Resnet
        self.inplanes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = norm_layer(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self._make_layer(block, 64, layers[0], dcn_layers=dcn_layers[0], dcn_interval=dcn_interval)
        self._make_layer(block, 128, layers[1], stride=2, dcn_layers=dcn_layers[1], dcn_interval=dcn_interval)
        self._make_layer(block, 256, layers[2], stride=2, dcn_layers=dcn_layers[2], dcn_interval=dcn_interval)
        self._make_layer(block, 512, layers[3], stride=2, dcn_layers=dcn_layers[3], dcn_interval=dcn_interval)

        # This contains every module that should be initialized by loading in pretrained weights.
        # Any extra layers added onto this that won't be initialized by init_backbone will not be
        # in this list. That way, Yolact::init_weights knows which backbone weights to initialize
        # with xavier, and which ones to leave alone.
        self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)]

    def _make_layer(self, block, planes, blocks, stride=1, dcn_layers=0, dcn_interval=1):
        """ Here one layer means a string of n Bottleneck blocks. """
        downsample = None

        # This is actually just to create the connection between layers, and not necessarily to
        # downsample. Even if the second condition is met, it only downsamples when stride != 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            if len(self.layers) in self.atrous_layers:
                self.dilation += 1
                stride = 1

            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False,
                          dilation=self.dilation),
                self.norm_layer(planes * block.expansion),
            )

        layers = []
        use_dcn = (dcn_layers >= blocks)
        layers.append(block(self.inplanes, planes, stride, downsample, self.norm_layer, self.dilation, use_dcn=use_dcn))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            use_dcn = ((i + dcn_layers) >= blocks) and (i % dcn_interval == 0)
            layers.append(block(self.inplanes, planes, norm_layer=self.norm_layer, use_dcn=use_dcn))

        layer = nn.Sequential(*layers)
        self.channels.append(planes * block.expansion)
        self.layers.append(layer)

        return layer

    def forward(self, x):
        """ Returns a list of convouts for each layer. """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        outs = []
        for layer in self.layers:
            x = layer(x)
            outs.append(x)

        return tuple(outs)
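To make the feature-map sizes concrete, here is a minimal sketch (assuming torch and the classes above are importable, and leaving dcn_layers at zero so the DCN branch is never taken) that runs the backbone on a dummy 550×550 input:

import torch

# ResNet-50 layout is layers=[3, 4, 6, 3]
backbone = ResNetBackbone([3, 4, 6, 3])

x = torch.randn(1, 3, 550, 550)
outs = backbone(x)
for i, o in enumerate(outs):
    print(f'layer{i + 1}:', tuple(o.shape))
# Expected for a 550x550 input:
# layer1: (1, 256, 138, 138)   stride 4
# layer2: (1, 512, 69, 69)     stride 8
# layer3: (1, 1024, 35, 35)    stride 16
# layer4: (1, 2048, 18, 18)    stride 32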
FPN
The FPN takes the outputs of the backbone's layers 2, 3, and 4 as input and produces five levels, all with 256 output channels. The two extra levels are obtained by strided convolutions that downsample the last selected level. The first (largest) FPN output is not only fed to the heads but is also upsampled and used to compute the prototype masks. The largest feature map entering the FPN is 1/8 the size of the model input.
class FPN(ScriptModuleWrapper):
    """
    Implements a general version of the FPN introduced in
    https://arxiv.org/pdf/1612.03144.pdf

    Parameters (in cfg.fpn):
        - num_features (int): The number of output features in the fpn layers.
        - interpolation_mode (str): The mode to pass to F.interpolate.
        - num_downsample (int): The number of downsampled layers to add onto the selected layers.
          These extra layers are downsampled from the last selected layer.

    Args:
        - in_channels (list): For each conv layer you supply in the forward pass,
          how many features will it have?
    """
    __constants__ = ['interpolation_mode', 'num_downsample', 'use_conv_downsample', 'relu_pred_layers',
                     'lat_layers', 'pred_layers', 'downsample_layers', 'relu_downsample_layers']

    def __init__(self, in_channels):
        super().__init__()

        self.lat_layers = nn.ModuleList([
            nn.Conv2d(x, cfg.fpn.num_features, kernel_size=1)
            for x in reversed(in_channels)
        ])

        # This is here for backwards compatability
        padding = 1 if cfg.fpn.pad else 0  # 1
        self.pred_layers = nn.ModuleList([
            nn.Conv2d(cfg.fpn.num_features, cfg.fpn.num_features, kernel_size=3, padding=padding)
            for _ in in_channels
        ])

        if cfg.fpn.use_conv_downsample:  # true
            self.downsample_layers = nn.ModuleList([
                nn.Conv2d(cfg.fpn.num_features, cfg.fpn.num_features, kernel_size=3, padding=1, stride=2)
                for _ in range(cfg.fpn.num_downsample)
            ])  # num_downsample = 2

        self.interpolation_mode = cfg.fpn.interpolation_mode  # bilinear
        self.num_downsample = cfg.fpn.num_downsample  # 2
        self.use_conv_downsample = cfg.fpn.use_conv_downsample  # true
        self.relu_downsample_layers = cfg.fpn.relu_downsample_layers  # false
        self.relu_pred_layers = cfg.fpn.relu_pred_layers  # true

    @script_method_wrapper
    def forward(self, convouts:List[torch.Tensor]):
        """
        Args:
            - convouts (list): A list of convouts for the corresponding layers in in_channels.
        Returns:
            - A list of FPN convouts in the same order as x with extra downsample layers if requested.
        """
        out = []
        x = torch.zeros(1, device=convouts[0].device)
        for i in range(len(convouts)):
            out.append(x)

        # For backward compatability, the conv layers are stored in reverse but the input and output is
        # given in the correct order. Thus, use j=-i-1 for the input and output and i for the conv layers.
        j = len(convouts)
        for lat_layer in self.lat_layers:
            j -= 1

            if j < len(convouts) - 1:
                _, _, h, w = convouts[j].size()
                x = F.interpolate(x, size=(h, w), mode=self.interpolation_mode, align_corners=False)

            x = x + lat_layer(convouts[j])
            out[j] = x

        # This janky second loop is here because TorchScript.
        j = len(convouts)
        for pred_layer in self.pred_layers:
            j -= 1
            out[j] = pred_layer(out[j])

            if self.relu_pred_layers:
                F.relu(out[j], inplace=True)

        cur_idx = len(out)  # (used by relu_downsample_layers in the full code; unused in this excerpt)
        for downsample_layer in self.downsample_layers:
            out.append(downsample_layer(out[-1]))

        return out
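A quick shape check of the five-level output described above. This sketch assumes cfg.fpn is configured as annotated in the comments (num_features=256, pad=1, use_conv_downsample=True, num_downsample=2); c3/c4/c5 stand in for the backbone's layer 2-4 outputs from the example above:

c3 = torch.randn(1, 512, 69, 69)
c4 = torch.randn(1, 1024, 35, 35)
c5 = torch.randn(1, 2048, 18, 18)

fpn = FPN([512, 1024, 2048])
outs = fpn([c3, c4, c5])
for i, o in enumerate(outs):
    print(f'P{i + 3}:', tuple(o.shape))
# P3: (1, 256, 69, 69)  <- fed to the heads and (after upsampling) to protonet
# P4: (1, 256, 35, 35)
# P5: (1, 256, 18, 18)
# P6: (1, 256, 9, 9)    <- extra strided-conv levels
# P7: (1, 256, 5, 5)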
head
Each spatial position of a head output carries three anchors with aspect ratios [0.5, 1, 2]. There are five heads, one per FPN level, and their anchor scales are [24, 48, 96, 192, 384] respectively.
Each head consists of a bbox_layer, a conf_layer, and a mask_layer (mask_dim = 32):
self.bbox_layer = nn.Conv2d(out_channels, self.num_priors * 4, **cfg.head_layer_params)
self.conf_layer = nn.Conv2d(out_channels, self.num_priors * self.num_classes, **cfg.head_layer_params)
self.mask_layer = nn.Conv2d(out_channels, self.num_priors * self.mask_dim, **cfg.head_layer_params)
protonet
The protonet takes the largest FPN level as input (size input_size/8); its output is linearly combined with the coefficients predicted by the head's mask_layer to produce the masks.
The protonet contains one upsampling step that doubles the spatial size, so its output is one quarter the size of the model input. (From the paper: "we upsample it to one fourth the dimensions of the input image to increase performance on small objects.")
Sequential(
(0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): ReLU(inplace=True)
(2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(3): ReLU(inplace=True)
(4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(5): ReLU(inplace=True)
(6): InterpolateModule()
(7): ReLU(inplace=True)
(8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(9): ReLU(inplace=True)
(10): Conv2d(256, 32, kernel_size=(1, 1), stride=(1, 1))
)
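The "linear combination" mentioned above is just a matrix product between the prototypes and the per-detection coefficients, followed by a sigmoid; in the real code the result is additionally cropped to the predicted box. A minimal sketch with made-up shapes:

import torch

proto = torch.randn(138, 138, 32)   # protonet output for one image
coef = torch.randn(5, 32)           # mask coefficients for 5 detections (head mask_layer output)

masks = torch.sigmoid(proto @ coef.t())  # (138, 138, 5): one soft mask per detection
print(masks.shape)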
The final outputs have the following shapes:
loc: torch.Size([1, 19248, 4])
conf: torch.Size([1, 19248, 2])
mask: torch.Size([1, 19248, 32])
priors: torch.Size([19248, 4])
proto: torch.Size([1, 138, 138, 32]) # input_size / 4
segm: torch.Size([1, 1, 69, 69])
segm is the branch used for the auxiliary semantic segmentation loss; its input is the first FPN output.
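As a sanity check on the 19248 priors above: with a 550×550 input the five head levels have feature maps of sizes 69, 35, 18, 9, and 5, and each location carries 3 anchors (one scale per level × 3 aspect ratios):

sizes = [69, 35, 18, 9, 5]          # P3..P7 feature-map sizes for a 550x550 input
num_priors = sum(3 * s * s for s in sizes)
print(num_priors)                   # 19248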
train
cfg
'positive_iou_threshold': 0.5,
'negative_iou_threshold': 0.4,
'ohem_negpos_ratio': 3,
'pred_aspect_ratios': [1, 1/2, 2],
'pred_scales': [[24], [48], [96], [192], [384]],
'conf_alpha': 1,
'bbox_alpha': 1.5,
'mask_alpha': 0.4 / 256 * 140 * 140,  # Some funky equation. Don't worry about it.
'nms_top_k': 200,
'nms_conf_thresh': 0.05,
'nms_thresh': 0.4,
'crowd_iou_threshold': 0.7,
match
Match each prior box with the ground truth box of the highest jaccard overlap, encode the bounding boxes, then return the matched indices corresponding to both confidence and location preds.
def match(pos_thresh, neg_thresh, truths, priors, labels, crowd_boxes, loc_t, conf_t, idx_t, idx, loc_data,
          keypoints, keypoints_t):
    """
    Args:
        pos_thresh: (float) IoU > pos_thresh ==> positive.
        neg_thresh: (float) IoU < neg_thresh ==> negative.
        truths: (tensor) Ground truth boxes, Shape: [num_obj, 4].
        priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors, 4].
        labels: (tensor) All the class labels for the image, Shape: [num_obj].
        crowd_boxes: (tensor) All the crowd box annotations or None if there are none.
        loc_t: (tensor) Tensor to be filled w/ encoded location targets.
        conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. Note: -1 means neutral.
        idx_t: (tensor) Tensor to be filled w/ the index of the matched gt box for each prior.
        idx: (int) current batch index.
        loc_data: (tensor) The predicted bbox regression coordinates for this batch.
        keypoints: (tensor) Ground truth keypoints, Shape: [num_obj, 34].
        keypoints_t: (tensor) Tensor to be filled w/ encoded keypoint targets.
    Return:
        The matched indices corresponding to 1) location and 2) confidence preds.
    """
    decoded_priors = point_form(priors)

    # Size [num_objects, num_priors]
    overlaps = jaccard(truths, decoded_priors) if not cfg.use_change_matching else change(truths, decoded_priors)

    # Size [num_priors] best ground truth for each prior
    best_truth_overlap, best_truth_idx = overlaps.max(0)

    # We want to ensure that each gt gets used at least once so that we don't
    # waste any training data. In order to do that, find the max overlap anchor
    # with each gt, and force that anchor to use that gt.
    for _ in range(overlaps.size(0)):
        # Find j, the gt with the highest overlap with a prior
        # In effect, this will loop through overlaps.size(0) in a "smart" order,
        # always choosing the highest overlap first.
        best_prior_overlap, best_prior_idx = overlaps.max(1)
        j = best_prior_overlap.max(0)[1]  # pick the gt with the highest IoU with any anchor

        # Find i, the highest overlap anchor with this gt
        i = best_prior_idx[j]

        # Set all other overlaps with i to be -1 so that no other gt uses it
        overlaps[:, i] = -1
        # Set all other overlaps with j to be -1 so that this loop never uses j again
        overlaps[j, :] = -1

        # Overwrite i's score to be 2 so it doesn't get thresholded ever
        best_truth_overlap[i] = 2
        # Set the gt to be used for i to be j, overwriting whatever was there
        best_truth_idx[i] = j

    matches = truths[best_truth_idx]            # Shape: [num_priors, 4]
    matches_kp = keypoints[best_truth_idx]      # Shape: [num_priors, 34]
    conf = labels[best_truth_idx] + 1           # Shape: [num_priors]

    conf[best_truth_overlap < pos_thresh] = -1  # label as neutral
    conf[best_truth_overlap < neg_thresh] = 0   # label as background

    # Deal with crowd annotations for COCO
    if crowd_boxes is not None and cfg.crowd_iou_threshold < 1:
        # Size [num_priors, num_crowds]
        crowd_overlaps = jaccard(decoded_priors, crowd_boxes, iscrowd=True)
        # Size [num_priors]
        best_crowd_overlap, best_crowd_idx = crowd_overlaps.max(1)
        # Set non-positives with crowd iou of over the threshold to be neutral.
        conf[(conf <= 0) & (best_crowd_overlap > cfg.crowd_iou_threshold)] = -1

    loc = encode(matches, priors, cfg.use_yolo_regressors)
    kp = encode_kp(matches_kp, priors)
    keypoints_t[idx] = kp
    loc_t[idx] = loc             # [num_priors, 4] encoded offsets to learn
    conf_t[idx] = conf           # [num_priors] top class label for each prior
    idx_t[idx] = best_truth_idx  # [num_priors] indices for lookup
@torch.jit.script
def encode(matched, priors, use_yolo_regressors:bool=False):
    # SSD-style box encoding: priors are (cx, cy, w, h), matched gt boxes
    # are (xmin, ymin, xmax, ymax).
    variances = [0.1, 0.2]

    # Offset of the gt center from the prior center, scaled by prior size
    g_cxcy = (matched[:, :2] + matched[:, 2:]) / 2 - priors[:, :2]
    g_cxcy /= (variances[0] * priors[:, 2:])

    # Log-space ratio of gt size to prior size
    g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
    g_wh = torch.log(g_wh) / variances[1]

    loc = torch.cat([g_cxcy, g_wh], 1)
    return loc
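For reference, the inverse of this encoding recovers boxes from predicted offsets. This is a sketch of the standard SSD-style decode matching the variances above (decode_sketch is a hypothetical name; YOLACT's own decode also handles the use_yolo_regressors branch):

def decode_sketch(loc, priors):
    # priors: (cx, cy, w, h); loc: encoded offsets from encode() above
    variances = [0.1, 0.2]
    cxcy = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
    wh = priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])
    # convert (cx, cy, w, h) -> (xmin, ymin, xmax, ymax)
    return torch.cat([cxcy - wh / 2, cxcy + wh / 2], dim=1)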
multibox loss
bbox loss
The bbox loss is computed with smooth L1 over the positive priors:
loc_data = predictions['loc']
conf_data = predictions['conf']
mask_data = predictions['mask']
priors = predictions['priors']
proto_data = predictions['proto']

loc_t = loc_data.new(batch_size, num_priors, 4)
gt_box_t = loc_data.new(batch_size, num_priors, 4)
conf_t = loc_data.new(batch_size, num_priors).long()
idx_t = loc_data.new(batch_size, num_priors).long()

# match (the keypoint arguments from the match() signature above are omitted in this excerpt)
for idx in range(batch_size):
    match(self.pos_threshold, self.neg_threshold,
          truths, priors.data, labels[idx], crowd_boxes,
          loc_t, conf_t, idx_t, idx, loc_data[idx])

    gt_box_t[idx, :, :] = truths[idx_t[idx]]

pos = conf_t > 0
pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)

loc_p = loc_data[pos_idx].view(-1, 4)
loc_t = loc_t[pos_idx].view(-1, 4)
losses['B'] = F.smooth_l1_loss(loc_p, loc_t, reduction='sum') * cfg.bbox_alpha
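For reference, F.smooth_l1_loss applies the standard piecewise function elementwise (quadratic below beta, linear above; PyTorch's default beta is 1.0). A sketch of the definition:

import torch

def smooth_l1(x, beta=1.0):
    # 0.5 * x**2 / beta where |x| < beta, |x| - 0.5 * beta elsewhere
    absx = x.abs()
    return torch.where(absx < beta, 0.5 * x ** 2 / beta, absx - 0.5 * beta)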
mask loss
The mask loss is a binary cross-entropy: the mask prototypes are linearly combined with the mask coefficients, passed through a sigmoid, and then compared against the ground-truth masks with BCE.
def lincomb_mask_loss(self, pos, idx_t, loc_data, mask_data, priors, proto_data, masks, gt_box_t, score_data,
                      inst_data, labels, interpolation_mode='bilinear'):
    mask_h = proto_data.size(1)
    mask_w = proto_data.size(2)

    process_gt_bboxes = cfg.mask_proto_normalize_emulate_roi_pooling or cfg.mask_proto_crop  # True

    if cfg.mask_proto_remove_empty_masks:  # false
        # Make sure to store a copy of this because we edit it to get rid of all-zero masks
        pos = pos.clone()

    loss_m = 0
    loss_d = 0  # Coefficient diversity loss

    maskiou_t_list = []
    maskiou_net_input_list = []
    label_t_list = []

    for idx in range(mask_data.size(0)):
        with torch.no_grad():
            downsampled_masks = F.interpolate(masks[idx].unsqueeze(0), (mask_h, mask_w),
                                              mode=interpolation_mode, align_corners=False).squeeze(0)
            downsampled_masks = downsampled_masks.permute(1, 2, 0).contiguous()

            if cfg.mask_proto_binarize_downsampled_gt:  # True
                downsampled_masks = downsampled_masks.gt(0.5).float()

        cur_pos = pos[idx]
        # For each positive prior, look up the index of its matched gt
        pos_idx_t = idx_t[idx, cur_pos]
        pos_gt_box_t = gt_box_t[idx, cur_pos]

        if pos_idx_t.size(0) == 0:
            continue

        proto_masks = proto_data[idx]
        proto_coef = mask_data[idx, cur_pos, :]

        num_pos = proto_coef.size(0)
        old_num_pos = num_pos  # in the full code, masks may be subsampled to cfg.masks_to_train here

        # mask_t => (138, 138, num_pos)
        mask_t = downsampled_masks[:, :, pos_idx_t]  # through best truth idx find its mask target
        label_t = labels[idx][pos_idx_t]

        # Size: [mask_h, mask_w, num_pos]
        pred_masks = proto_masks @ proto_coef.t()  # (138,138,32) @ (num_pos,32).t() = (138,138,num_pos)
        pred_masks = cfg.mask_proto_mask_activation(pred_masks)  # sigmoid
        pred_masks = crop(pred_masks, pos_gt_box_t)

        pre_loss = F.binary_cross_entropy(torch.clamp(pred_masks, 0, 1), mask_t, reduction='none')

        if cfg.mask_proto_normalize_emulate_roi_pooling:  # true
            weight = mask_h * mask_w if cfg.mask_proto_crop else 1
            pos_gt_csize = center_size(pos_gt_box_t)  # convert to (cx, cy, w, h)
            gt_box_width = pos_gt_csize[:, 2] * mask_w
            gt_box_height = pos_gt_csize[:, 3] * mask_h
            pre_loss = pre_loss.sum(dim=(0, 1)) / gt_box_width / gt_box_height * weight

        # If the number of masks was limited, scale the loss accordingly
        if old_num_pos > num_pos:
            pre_loss *= old_num_pos / num_pos

        loss_m += torch.sum(pre_loss)

        if cfg.use_maskiou:
            if cfg.discard_mask_area > 0:  # 25
                gt_mask_area = torch.sum(mask_t, dim=(0, 1))
                select = gt_mask_area > cfg.discard_mask_area  # keep only objects with area > 25
                if torch.sum(select) < 1:
                    continue

                pos_gt_box_t = pos_gt_box_t[select, :]
                pred_masks = pred_masks[:, :, select]
                mask_t = mask_t[:, :, select]
                label_t = label_t[select]

            maskiou_net_input = pred_masks.permute(2, 0, 1).contiguous().unsqueeze(1)
            pred_masks = pred_masks.gt(0.5).float()
            maskiou_t = self._mask_iou(pred_masks, mask_t)

            maskiou_net_input_list.append(maskiou_net_input)
            maskiou_t_list.append(maskiou_t)
            label_t_list.append(label_t)

    losses = {'M': loss_m * cfg.mask_alpha / mask_h / mask_w}

    if cfg.use_maskiou:
        # discard_mask_area discarded every mask in the batch, so nothing to do here
        if len(maskiou_t_list) == 0:
            return losses, None

        maskiou_t = torch.cat(maskiou_t_list)
        label_t = torch.cat(label_t_list)
        maskiou_net_input = torch.cat(maskiou_net_input_list)

        num_samples = maskiou_t.size(0)  # (maskiou resampling logic elided in this excerpt)
        return losses, [maskiou_net_input, maskiou_t, label_t]

    return losses
def _mask_iou(self, mask1, mask2):
    intersection = torch.sum(mask1 * mask2, dim=(0, 1))
    area1 = torch.sum(mask1, dim=(0, 1))
    area2 = torch.sum(mask2, dim=(0, 1))
    union = (area1 + area2) - intersection
    ret = intersection / union
    return ret
hard negative mining
def ohem_conf_loss(self, conf_data, conf_t, pos, num):
    # Compute max conf across batch for hard negative mining
    batch_conf = conf_data.view(-1, self.num_classes)
    loss_c = log_sum_exp(batch_conf) - batch_conf[:, 0]

    # Hard Negative Mining
    loss_c = loss_c.view(num, -1)
    loss_c[pos] = 0         # filter out pos boxes
    loss_c[conf_t < 0] = 0  # filter out neutrals (conf_t = -1)
    _, loss_idx = loss_c.sort(1, descending=True)
    _, idx_rank = loss_idx.sort(1)
    num_pos = pos.long().sum(1, keepdim=True)
    num_neg = torch.clamp(self.negpos_ratio * num_pos, max=pos.size(1) - 1)
    neg = idx_rank < num_neg.expand_as(idx_rank)

    # Just in case there aren't enough negatives, don't start using positives as negatives
    neg[pos] = 0
    neg[conf_t < 0] = 0  # Filter out neutrals

    # Confidence Loss Including Positive and Negative Examples
    pos_idx = pos.unsqueeze(2).expand_as(conf_data)
    neg_idx = neg.unsqueeze(2).expand_as(conf_data)
    conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view(-1, self.num_classes)
    targets_weighted = conf_t[(pos + neg).gt(0)]
    loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='none')
    loss_c = loss_c.sum()

    return cfg.conf_alpha * loss_c  # 1
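The back-to-back sort is a compact rank trick: the first sort orders priors by loss, and argsorting those indices gives each prior's rank in that ordering, so idx_rank < num_neg selects exactly the num_neg highest-loss negatives. A small demo with made-up losses:

import torch

loss_c = torch.tensor([[0.2, 0.9, 0.1, 0.5]])
_, loss_idx = loss_c.sort(1, descending=True)  # [[1, 3, 0, 2]]
_, idx_rank = loss_idx.sort(1)                 # [[2, 0, 3, 1]]: rank of each prior by loss
# With num_neg = 2, `idx_rank < 2` keeps exactly the two highest-loss priors (indices 1 and 3).
print(idx_rank)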
test
fast nms
1. For each class, select the top-k highest-scoring anchors.
2. Within each class, with the boxes sorted by score, compute the pairwise IoU matrix.
3. Take the maximum of each column of the (upper-triangular) IoU matrix; if that maximum exceeds the threshold, the box corresponding to that column is suppressed, because a higher-scoring box of the same class overlaps it too much.
def fast_nms(self, boxes, keypoints, keypoints_score, masks, scores, iou_threshold:float=0.5, top_k:int=200, second_threshold:bool=False):
    scores, idx = scores.sort(1, descending=True)

    idx = idx[:, :top_k].contiguous()  # shape = (num_classes, top_k): select the top-k predictions per class
    scores = scores[:, :top_k]

    num_classes, num_dets = idx.size()

    # Box selection is class-agnostic, so just flatten idx and gather with it
    boxes = boxes[idx.view(-1), :].view(num_classes, num_dets, 4)
    keypoints = keypoints[idx.view(-1), :].view(num_classes, num_dets, 34)
    keypoints_score = keypoints_score[idx.view(-1), :].view(num_classes, num_dets, 17)
    masks = masks[idx.view(-1), :].view(num_classes, num_dets, -1)

    iou = jaccard(boxes, boxes)
    iou.triu_(diagonal=1)
    iou_max, _ = iou.max(dim=1)  # shape (num_classes, top_k)

    # Now just filter out the ones higher than the threshold
    keep = (iou_max <= iou_threshold)

    # We should also only keep detections over the confidence threshold, but at the cost of
    # maxing out your detection count for every image, you can just not do that. Because we
    # have such a minimal amount of computation per detection (matrix multiplication only),
    # this increase doesn't affect us much (+0.2 mAP for 34 -> 33 fps), so we leave it out.
    # However, when you implement this in your method, you should do this second threshold.
    if second_threshold:
        keep *= (scores > self.conf_thresh)

    # Assign each kept detection to its corresponding class
    classes = torch.arange(num_classes, device=boxes.device)[:, None].expand_as(keep)
    classes = classes[keep]

    boxes = boxes[keep]
    keypoints = keypoints[keep]
    keypoints_score = keypoints_score[keep]
    masks = masks[keep]
    scores = scores[keep]

    # Only keep the top cfg.max_num_detections highest scores across all classes
    scores, idx = scores.sort(0, descending=True)
    idx = idx[:cfg.max_num_detections]
    scores = scores[:cfg.max_num_detections]

    classes = classes[idx]
    boxes = boxes[idx]
    keypoints = keypoints[idx]
    keypoints_score = keypoints_score[idx]
    masks = masks[idx]

    return boxes, keypoints, keypoints_score, masks, classes, scores
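To see why the upper-triangular trick works, here is a toy run of the core idea for a single class, with hypothetical boxes already sorted by score and the IoU matrix written out by hand:

import torch

# Three boxes sorted by score: the second heavily overlaps the first.
boxes = torch.tensor([[0.0, 0.0, 1.0, 1.0],
                      [0.1, 0.1, 1.0, 1.0],
                      [2.0, 2.0, 3.0, 3.0]])

# Pairwise IoU (what jaccard() computes in the code), written out by hand:
iou = torch.tensor([[1.00, 0.81, 0.0],
                    [0.81, 1.00, 0.0],
                    [0.00, 0.00, 1.0]])
iou.triu_(diagonal=1)        # keep only overlaps with higher-scoring boxes
iou_max, _ = iou.max(dim=0)  # for each box, its largest overlap with a better box
keep = iou_max <= 0.5
print(keep)  # tensor([True, False, True]): box 1 is suppressed by box 0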