@des: Build an SSD network.
@p1: Name of the backbone network (resnet_xx, vgg_xx, etc.).
@p2: Input image size.
@p3: Stages of the feature extractor whose outputs are used as feature maps.
@p4: Output channels of the extra convolutional layers (list).
@p5: Anchor box sizes (1-D list).
@p6: Anchor box ratios (1-D or 2-D list).
@p7: Step size of anchor boxes in each output layer.
@p8: Number of classes.
@p9: Whether the extra layers use 1×1 convolutions.
@p10: Whether the extra layers use BatchNorm.
@p11: Channel reduction ratio of the transition layers.
@p12: Minimum channel number of transition layers.
@p13: Whether to use a global pooling layer as the last layer.
@p14: Whether to load pretrained parameters.
@p15: Std values to be divided/multiplied to box encoded values.
@p16: Non-maximum suppression threshold.
@p17: Apply NMS to the top k detection results; use -1 to disable, so that every detection result is used in NMS.
@p18: Only return the top `post_nms` detection results; the rest are discarded.
@p19: For advanced users. Define `anchor_alloc_size` to generate large enough anchor maps, which will later be saved in parameters.
@p20: GPU/CPU selection.
@p21: Normalization layer.
@p22: Arguments for the normalization layer.
@p23: Directory storing the pretrained parameters.
@p24: If `minimal_opset` is `True`, the network will use a minimal set of operators, good for e.g. `TVM`.
@p25: Kernel size of the prediction convolutions.
@p26: Padding of the prediction convolutions.
@p27: Anchor box generator.
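To make the argument shapes concrete, here is a hedged construction sketch; the feature names and the size/ratio/step values are illustrative stand-ins, not an official gluoncv preset. With 2 backbone outputs, 4 extra layers and no global pool there are 6 pyramid levels, so sizes needs 7 entries:

net = SSD(network='resnet50_v1', base_size=512,
          features=['stage3_activation5', 'stage4_activation2'],  # hypothetical output names
          num_filters=[512, 512, 256, 256],                       # 4 extra conv stages
          sizes=[51, 102, 189, 276, 363, 450, 492],               # 7 values -> 6 (min, max) pairs
          ratios=[[1, 2, 0.5]] * 6,                               # same ratios on every level
          steps=[16, 32, 64, 128, 256, 512],                      # anchor stride per level
          classes=('cat', 'dog'), pretrained=False)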
class SSD(HybridBlock):
    def __init__(self, network, base_size, features, num_filters, sizes, ratios,
                 steps, classes, use_1x1_transition=True, use_bn=True,
                 reduce_ratio=1.0, min_depth=128, global_pool=False, pretrained=False,
                 stds=(0.1, 0.1, 0.2, 0.2), nms_thresh=0.45, nms_topk=400, post_nms=100,
                 anchor_alloc_size=128, ctx=mx.cpu(),
                 norm_layer=nn.BatchNorm, norm_kwargs=None,
                 root=os.path.join('~', '.mxnet', 'models'), minimal_opset=False,
                 predictors_kernel=(3, 3), predictors_pad=(1, 1),
                 anchor_generator=SSDAnchorGenerator, **kwargs):
        super(SSD, self).__init__(**kwargs)
        if norm_kwargs is None:
            norm_kwargs = {}
        if network is None:
            num_layers = len(ratios)
        else:
            ##-- Number of pyramid levels (feature maps) in the whole SSD.
            num_layers = len(features) + len(num_filters) + int(global_pool)
        ##-- The anchors of each feature map come from pairing adjacent entries of the
        ##-- sizes list, so the list must be exactly one longer than the number of layers.
        assert len(sizes) == num_layers + 1
        ##-- Pair the sizes into a 2-D list: sizes[:-1] is the first n-1 entries and
        ##-- sizes[1:] is the last n-1 entries.
        sizes = list(zip(sizes[:-1], sizes[1:]))
        assert isinstance(ratios, (tuple, list)), "Must provide ratios as tuple or list"
        ##-- If ratios is a 1-D list, the same ratios are used for every feature layer.
        if not isinstance(ratios[0], (tuple, list)):
            ratios = ratios * num_layers  # propagate to all layers if use same ratio
        ##-- Make sure every feature layer has its own sizes and ratios.
        assert num_layers == len(sizes) == len(ratios), \
            "Mismatched (number of layers) vs (sizes) vs (ratios): {}, {}, {}".format(
                num_layers, len(sizes), len(ratios))
        assert num_layers > 0, "SSD require at least one layer, suggest multiple."
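        ##-- For example (illustrative numbers): sizes = [51, 102, 189, 276, 363, 450, 492]
        ##-- becomes [(51, 102), (102, 189), (189, 276), (276, 363), (363, 450), (450, 492)],
        ##-- one (this_size, next_size) pair per pyramid level.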
        self._num_layers = num_layers
        self.classes = classes
        self.nms_thresh = nms_thresh
        self.nms_topk = nms_topk
        self.post_nms = post_nms
        with self.name_scope():
            if network is None:
                ##-- What on earth is this?
                # use fine-grained manually designed block as features
                try:
                    self.features = features(pretrained=pretrained, ctx=ctx, root=root,
                                             norm_layer=norm_layer, norm_kwargs=norm_kwargs)
                except TypeError:
                    self.features = features(pretrained=pretrained, ctx=ctx, root=root)
            else:
                ##-- Here the extra layers are appended to the backbone network.
                try:
                    self.features = FeatureExpander(
                        network=network, outputs=features, num_filters=num_filters,
                        use_1x1_transition=use_1x1_transition,
                        use_bn=use_bn, reduce_ratio=reduce_ratio, min_depth=min_depth,
                        global_pool=global_pool, pretrained=pretrained, ctx=ctx,
                        norm_layer=norm_layer, norm_kwargs=norm_kwargs, root=root)
                except TypeError:
                    self.features = FeatureExpander(
                        network=network, outputs=features, num_filters=num_filters,
                        use_1x1_transition=use_1x1_transition,
                        use_bn=use_bn, reduce_ratio=reduce_ratio, min_depth=min_depth,
                        global_pool=global_pool, pretrained=pretrained, ctx=ctx, root=root)
            self.class_predictors = nn.HybridSequential()
            self.box_predictors = nn.HybridSequential()
            self.anchor_generators = nn.HybridSequential()
            asz = anchor_alloc_size
            im_size = (base_size, base_size)
            ##-- Add an anchor generator, a class predictor and a bbox regressor
            ##-- for every feature map.
            for i, s, r, st in zip(range(num_layers), sizes, ratios, steps):
                ##-- Build the anchor generator for this feature-map branch.
                branch_anchor_generator = anchor_generator(i, im_size, s, r, st, (asz, asz))
                self.anchor_generators.add(branch_anchor_generator)
                asz = max(asz // 2, 16)  # pre-compute larger than 16x16 anchor map
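                ##-- With the default anchor_alloc_size=128 this walks 128 -> 64 -> 32 -> 16
                ##-- across the layers and is clamped at 16, so even the coarsest layer
                ##-- pre-allocates at least a 16x16 anchor map.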
                ##-- Number of anchors per pixel on this feature map.
                num_anchors = branch_anchor_generator.num_depth
                ##-- Output channels of the class prediction layer = num_anchors * (num_classes + 1):
                ##-- every anchor predicts all classes plus the background.
                self.class_predictors.add(ConvPredictor(num_anchors * (len(self.classes) + 1),
                                                        kernel=predictors_kernel,
                                                        pad=predictors_pad))
                ##-- Every anchor predicts four offsets.
                self.box_predictors.add(ConvPredictor(num_anchors * 4,
                                                      kernel=predictors_kernel,
                                                      pad=predictors_pad))
            ##-- bbox decoder
            self.bbox_decoder = NormalizedBoxCenterDecoder(stds, minimal_opset=minimal_opset)
            ##-- class prediction decoder
            self.cls_decoder = MultiPerClassDecoder(len(self.classes) + 1, thresh=0.01)

    @property
    @des: Return the number of non-background classes.
    def num_classes(self):
        return len(self.classes)

    @des: Set the non-maximum suppression parameters.
    def set_nms(self, nms_thresh=0.45, nms_topk=400, post_nms=100):
        self._clear_cached_op()
        self.nms_thresh = nms_thresh
        self.nms_topk = nms_topk
        self.post_nms = post_nms
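    ##-- A note on what bbox_decoder computes below (my paraphrase of the standard SSD
    ##-- decoding; see NormalizedBoxCenterDecoder for the exact operators). For an anchor
    ##-- (acx, acy, aw, ah) and a prediction (p0, p1, p2, p3):
    ##--   cx = p0 * stds[0] * aw + acx        cy = p1 * stds[1] * ah + acy
    ##--   w  = exp(p2 * stds[2]) * aw         h  = exp(p3 * stds[3]) * ah
    ##-- and the resulting (cx, cy, w, h) box is converted to corner format.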
    # pylint: disable=arguments-differ
    def hybrid_forward(self, F, x):
        """Hybrid forward"""
        ##-- Get the list of 6 feature maps, each in (N, C, H, W) layout; with a 512x512
        ##-- input image the first one is 1x1024x32x32.
        features = self.features(x)
        ##-- Run class prediction on each feature map: cp(feat) -> 1x84x32x32 (4 anchors
        ##-- per pixel, 21 classes per anchor, so C = 84; the spatial size is unchanged).
        ##-- Move the channel dimension to the end -> 1x32x32x84, then flatten -> 1x86016:
        ##-- all channel values of the first position come first, then those of the next
        ##-- position, and so on. With p = pixel, a = anchor, c = class, the layout is
        ##-- [p0a0c0, p0a0c1, ..., p0a1c0, ..., p1a0c0, ...].
        cls_preds = [F.flatten(F.transpose(cp(feat), (0, 2, 3, 1)))
                     for feat, cp in zip(features, self.class_predictors)]
        ##-- Run box-offset prediction on each feature map: bp(feat) -> 1x16x32x32 (4 anchors
        ##-- per pixel, 4 offsets per anchor, so C = 16; the spatial size is unchanged).
        ##-- Move the channel dimension to the end -> 1x32x32x16, then flatten -> 1x16384,
        ##-- again position by position.
        box_preds = [F.flatten(F.transpose(bp(feat), (0, 2, 3, 1)))
                     for feat, bp in zip(features, self.box_predictors)]
        ##-- Each feature map produces its own set of anchors; the coordinates are packed
        ##-- into one dimension (32*32*4 anchors x 4 coordinates each), so the first entry
        ##-- has shape (1, 16384).
        anchors = [F.reshape(ag(feat), shape=(1, -1))
                   for feat, ag in zip(features, self.anchor_generators)]
        ##-- concat puts all the dim-1 data together -> (1, 86016+...) ->
        ##-- [f0p0a0c0, f0p0a0c1, ...]; reshape then groups it in units of num_classes+1
        ##-- -> (1, 6132, 21) -> [f0p0a0[c0-c20], f0p0a1[c0-c20], ...].
        cls_preds = F.concat(*cls_preds, dim=1).reshape((0, -1, self.num_classes + 1))
        ##-- concat -> (1, 16384+...) -> [f0p0a0b0, f0p0a0b1, ...]; reshape ->
        ##-- [f0p0a0b[0-3], f0p0a1b[0-3], ...] -> (1, 6132, 4).
        box_preds = F.concat(*box_preds, dim=1).reshape((0, -1, 4))
        ##-- Anchors end up shaped exactly like box_preds: an anchor has four coordinates,
        ##-- a box prediction has four offsets.
        anchors = F.concat(*anchors, dim=1).reshape((1, -1, 4))
        ##-- During training we can return right here; the data is already arranged.
        if autograd.is_training():
            return [cls_preds, box_preds, anchors]
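        ##-- A toy check of the flatten layout above (my own illustration):
        ##--   a = mx.nd.arange(16).reshape((1, 4, 2, 2))              # C=4: 2 anchors x 2 classes
        ##--   mx.nd.flatten(mx.nd.transpose(a, (0, 2, 3, 1)))[0, :4]  # -> [0, 4, 8, 12]
        ##-- The first four flattened values are the four channel values at pixel (0, 0),
        ##-- i.e. exactly the [p0a0c0, p0a0c1, p0a1c0, p0a1c1] ordering.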
        ##-- Anchors plus predicted transforms give the final bounding boxes -> (1, 6132, 4).
        bboxes = self.bbox_decoder(box_preds, anchors)
        ##-- Softmax first turns the class predictions into probability distributions;
        ##-- the decoded output drops the background class, so the shape is (1, 6132, 20).
        cls_ids, scores = self.cls_decoder(F.softmax(cls_preds, axis=-1))
        results = []
        for i in range(self.num_classes):
            ##-- Gather all predictions of one class -> (1, 6132, 1).
            cls_id = cls_ids.slice_axis(axis=-1, begin=i, end=i+1)
            ##-- Gather the prediction scores of that class -> (1, 6132, 1).
            score = scores.slice_axis(axis=-1, begin=i, end=i+1)
            # per class results
            ##-- Attach the corresponding bbox coordinates -> (1, 6132, 6).
            per_result = F.concat(*[cls_id, score, bboxes], dim=-1)
            results.append(per_result)
        ##-- Concatenate all 20 classes -> (1, 6132*20, 6).
        result = F.concat(*results, dim=1)
        if self.nms_thresh > 0 and self.nms_thresh < 1:
            ##-- Non-maximum suppression removes overlapping boxes; the results also come
            ##-- back sorted by score in descending order.
            result = F.contrib.box_nms(
                result, overlap_thresh=self.nms_thresh, topk=self.nms_topk, valid_thresh=0.01,
                id_index=0, score_index=1, coord_start=2, force_suppress=False)
            ##-- Keep only the results ranked in the top post_nms by score.
            if self.post_nms > 0:
                result = result.slice_axis(axis=1, begin=0, end=self.post_nms)
        ##-- Split the combined result back into its parts.
        ids = F.slice_axis(result, axis=2, begin=0, end=1)
        scores = F.slice_axis(result, axis=2, begin=1, end=2)
        bboxes = F.slice_axis(result, axis=2, begin=2, end=6)
        return ids, scores, bboxes
    def reset_class(self, classes, reuse_weights=None):
        """Reset class categories and class predictors.
Parameters
----------
classes : iterable of str
The new categories. ['apple', 'orange'] for example.
reuse_weights : dict
A {new_integer : old_integer} mapping dict, a {new_name : old_name} mapping dict,
or a list of [name0, name1, ...] if class names don't change.
This allows the new predictor to reuse the
previously trained weights specified.
Example
-------
>>> net = gluoncv.model_zoo.get_model('ssd_512_resnet50_v1_voc', pretrained=True)
>>> # use direct name to name mapping to reuse weights
>>> net.reset_class(classes=['person'], reuse_weights={'person':'person'})
>>> # or use integer mapping; person has index 14 in VOC
>>> net.reset_class(classes=['person'], reuse_weights={0:14})
>>> # you can even mix them
>>> net.reset_class(classes=['person'], reuse_weights={'person':14})
>>> # or use a list of strings if class names don't change
>>> net.reset_class(classes=['person'], reuse_weights=['person'])
"""
        self._clear_cached_op()
        old_classes = self.classes
        self.classes = classes
        # trying to reuse weights by mapping old and new classes
        if isinstance(reuse_weights, (dict, list)):
            if isinstance(reuse_weights, dict):
                # trying to replace str with indices
                new_keys = []
                new_vals = []
                for k, v in reuse_weights.items():
                    if isinstance(v, str):
                        try:
                            new_vals.append(old_classes.index(v))  # raise ValueError if not found
                        except ValueError:
                            raise ValueError(
                                "{} not found in old class names {}".format(v, old_classes))
                    else:
                        if v < 0 or v >= len(old_classes):
                            raise ValueError(
                                "Index {} out of bounds for old class names".format(v))
                        new_vals.append(v)
                    if isinstance(k, str):
                        try:
                            new_keys.append(self.classes.index(k))  # raise ValueError if not found
                        except ValueError:
                            raise ValueError(
                                "{} not found in new class names {}".format(k, self.classes))
                    else:
                        if k < 0 or k >= len(self.classes):
                            raise ValueError(
                                "Index {} out of bounds for new class names".format(k))
                        new_keys.append(k)
                reuse_weights = dict(zip(new_keys, new_vals))
            else:
                new_map = {}
                for x in reuse_weights:
                    try:
                        new_idx = self.classes.index(x)
                        old_idx = old_classes.index(x)
                        new_map[new_idx] = old_idx
                    except ValueError:
                        warnings.warn("{} not found in old: {} or new class names: {}".format(
                            x, old_classes, self.classes))
                reuse_weights = new_map
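        ##-- For example, with classes=['person'] on a VOC-trained net, both
        ##-- reuse_weights={'person': 'person'} and reuse_weights=['person'] are
        ##-- normalized here to {0: 14}: new index 0 reuses old index 14.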
        # replace class predictors
        with self.name_scope():
            class_predictors = nn.HybridSequential(prefix=self.class_predictors.prefix)
            for i, ag in zip(range(len(self.class_predictors)), self.anchor_generators):
                # Re-use the same prefix and ctx_list as used by the current ConvPredictor
                prefix = self.class_predictors[i].prefix
                old_pred = self.class_predictors[i].predictor
                ctx = list(old_pred.params.values())[0].list_ctx()
                # to avoid deferred init, number of in_channels must be defined
                in_channels = list(old_pred.params.values())[0].shape[1]
                new_cp = ConvPredictor(ag.num_depth * (self.num_classes + 1),
                                       in_channels=in_channels, prefix=prefix)
                new_cp.collect_params().initialize(ctx=ctx)
                if reuse_weights:
                    assert isinstance(reuse_weights, dict)
                    for old_params, new_params in zip(old_pred.params.values(),
                                                      new_cp.predictor.params.values()):
                        old_data = old_params.data()
                        new_data = new_params.data()
                        for k, v in reuse_weights.items():
                            if k >= len(self.classes) or v >= len(old_classes):
                                warnings.warn("reuse mapping {}/{} -> {}/{} out of range".format(
                                    k, self.classes, v, old_classes))
                                continue
                            # always increment k and v (background is always the 0th)
                            new_data[k+1::len(self.classes)+1] = old_data[v+1::len(old_classes)+1]
                        # reuse background weights as well
                        new_data[0::len(self.classes)+1] = old_data[0::len(old_classes)+1]
                        # set data to new conv layers
                        new_params.set_data(new_data)
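                        ##-- Toy view of the strided copy (my own illustration): with 2 anchors,
                        ##-- old classes [cat, dog, bird] and new classes [dog], the output
                        ##-- channels per anchor are laid out as [bg, classes...]:
                        ##--   old channels: [bg, cat, dog, bird, bg, cat, dog, bird]
                        ##--   new channels: [bg, dog, bg, dog]
                        ##-- new_data[k+1::2] = old_data[v+1::4] with k=0, v=1 copies both 'dog'
                        ##-- slots, and new_data[0::2] = old_data[0::4] copies both backgrounds.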
                class_predictors.add(new_cp)
            self.class_predictors = class_predictors
            self.cls_decoder = MultiPerClassDecoder(len(self.classes) + 1, thresh=0.01)
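A minimal inference sketch against the model-zoo entry built from this class (assuming gluoncv is installed; the random input is only there to show the output shapes under the default post_nms=100):

from gluoncv import model_zoo
import mxnet as mx

# Load a pretrained SSD built by the class above (downloads weights on first use).
net = model_zoo.get_model('ssd_512_resnet50_v1_voc', pretrained=True)
x = mx.nd.random.uniform(shape=(1, 3, 512, 512))  # stand-in for a normalized image batch
ids, scores, bboxes = net(x)
# With the default post_nms=100: ids (1, 100, 1), scores (1, 100, 1), bboxes (1, 100, 4).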