GluonCV ------ ssd.py

This post walks through how the SSD (Single Shot MultiBox Detector) network is built in GluonCV, covering the network structure, parameter settings, feature extraction, anchor generation, class prediction, and non-maximum suppression. By varying the backbone, input size, anchor ratios, and other parameters, the SSD model can be customized flexibly. It also covers initializing the model from pretrained weights and adjusting the non-maximum suppression parameters.


1. Names exported via `import *`

__all__ = ['SSD', 'get_ssd', 'custom_ssd']
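
As a quick refresher on the mechanism (a minimal sketch, not part of ssd.py): `__all__` restricts which names `from module import *` binds in the caller's namespace.

# mymodule.py -- hypothetical module for illustration
__all__ = ['public_fn']

def public_fn():
    return 'exported'

def _helper():
    return 'hidden'

# elsewhere: `from mymodule import *` binds public_fn but not _helper,
# because the wildcard import only exports the names listed in __all__.

So SSD, get_ssd, and custom_ssd are exactly what a wildcard import of this module exposes.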

2. The SSD network class

@des: 	build an SSD network
@p1:	backbone network name (resnet_xx, vgg_xx, etc.)
@p2:	input image size
@p3:	stages of the feature extractor to use as feature maps
@p4:	output channels of the extra convolutional layers (list)
@p5:	anchor box sizes (1-D list)
@p6:	anchor box ratios (1-D or 2-D list)
@p7:	step size of the anchor boxes in each output layer
@p8:	number of classes
@p9:	whether the extra layers use 1×1 transition convolutions
@p10:	whether the extra layers use BatchNorm
@p11:	channel reduction ratio of the transition layers
@p12:	minimum channel number of the transition layers
@p13:	whether to append a global pooling layer at the end
@p14:	whether to load pretrained weights
@p15:	std values by which the encoded box values are divided/multiplied
@p16:	non-maximum suppression (NMS) threshold
@p17:	apply NMS to the top k detection results; use -1 to disable, so that every detection result is used in NMS
@p18:	only return the top `post_nms` detection results; the rest are discarded
@p19:	for advanced users: define `anchor_alloc_size` to generate anchor maps that are large enough, which are later saved in the parameters
@p20:	GPU/CPU context selection
@p21:	normalization layer
@p22:	kwargs for the normalization layer
@p23:	directory where the pretrained parameters are stored
@p24:	if `minimal_opset` is `True`, the network uses a minimal set of operators, good for e.g. `TVM`
@p25:	prediction-layer convolution kernel
@p26:	prediction-layer padding
@p27:	anchor box generator
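
A hedged sketch of how these parameters map onto a constructor call; the values below are illustrative, loosely following the shape of GluonCV's ssd_512_resnet50_v1_voc preset rather than copying it exactly:

import mxnet as mx

net = SSD(network='resnet50_v1',                      # @p1: backbone name
          base_size=512,                              # @p2: input size
          features=['stage3_activation5', 'stage4_activation2'],   # @p3: backbone stages
          num_filters=[512, 512, 256, 256],           # @p4: 4 extra conv stages
          # 2 backbone stages + 4 extra stages = 6 layers, hence 7 sizes, 6 ratios, 6 steps
          sizes=[51.2, 102.4, 189.4, 276.8, 364.8, 452.1, 539.5],  # @p5
          ratios=[[1, 2, 0.5]] + [[1, 2, 0.5, 1.0 / 3]] * 3 + [[1, 2, 0.5]] * 2,  # @p6
          steps=[16, 32, 64, 128, 256, 512],          # @p7
          classes=['person', 'car'],                  # @p8: hypothetical class list
          ctx=mx.cpu())                               # @p20: run on CPU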
class SSD(HybridBlock):
    def __init__(self, network, base_size, features, num_filters, sizes, ratios,
                 steps, classes, use_1x1_transition=True, use_bn=True,
                 reduce_ratio=1.0, min_depth=128, global_pool=False, pretrained=False,
                 stds=(0.1, 0.1, 0.2, 0.2), nms_thresh=0.45, nms_topk=400, post_nms=100,
                 anchor_alloc_size=128, ctx=mx.cpu(),
                 norm_layer=nn.BatchNorm, norm_kwargs=None,
                 root=os.path.join('~', '.mxnet', 'models'), minimal_opset=False,
                 predictors_kernel=(3, 3), predictors_pad=(1, 1),
                 anchor_generator=SSDAnchorGenerator, **kwargs):
        super(SSD, self).__init__(**kwargs)
        if norm_kwargs is None:
            norm_kwargs = {}
        if network is None:
            num_layers = len(ratios)
        else:
            ##-- total number of pyramid levels (feature maps) in the SSD.
            num_layers = len(features) + len(num_filters) + int(global_pool)
        ##-- the anchors of each feature map are built by pairing adjacent sizes from the list, so it must contain exactly one more entry than the number of layers.
        assert len(sizes) == num_layers + 1
        ##-- pair up adjacent sizes into a 2-D list: sizes[:-1] is the first n-1 entries, sizes[1:] is the last n-1 entries.
        sizes = list(zip(sizes[:-1], sizes[1:]))
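        ##-- Worked example (illustrative values, not from the source):
        ##--   sizes = [51.2, 102.4, 189.4] -> [(51.2, 102.4), (102.4, 189.4)]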
        assert isinstance(ratios, (tuple, list)), "Must provide ratios as tuple or list"
        ##-- if ratios is a 1-D list, the same ratios are applied to every feature layer.
        if not isinstance(ratios[0], (tuple, list)):
            ratios = ratios * num_layers  # propagate to all layers if use same ratio
        ##-- make sure every feature layer has matching sizes and ratios.
        assert num_layers == len(sizes) == len(ratios), \
            "Mismatched (number of layers) vs (sizes) vs (ratios): {}, {}, {}".format(
                num_layers, len(sizes), len(ratios))
        assert num_layers > 0, "SSD require at least one layer, suggest multiple."
        self._num_layers = num_layers
        self.classes = classes
        self.nms_thresh = nms_thresh
        self.nms_topk = nms_topk
        self.post_nms = post_nms

        with self.name_scope():
            if network is None:
                ##-- when network is None, `features` is itself a callable that builds a hand-designed feature block.
                # use fine-grained manually designed block as features
                try:
                    self.features = features(pretrained=pretrained, ctx=ctx, root=root,
                                             norm_layer=norm_layer, norm_kwargs=norm_kwargs)
                except TypeError:
                    self.features = features(pretrained=pretrained, ctx=ctx, root=root)
            else:
                ##-- here FeatureExpander appends the extra layers to the backbone.
                try:
                    self.features = FeatureExpander(
                        network=network, outputs=features, num_filters=num_filters,
                        use_1x1_transition=use_1x1_transition,
                        use_bn=use_bn, reduce_ratio=reduce_ratio, min_depth=min_depth,
                        global_pool=global_pool, pretrained=pretrained, ctx=ctx,
                        norm_layer=norm_layer, norm_kwargs=norm_kwargs, root=root)
                except TypeError:
                    self.features = FeatureExpander(
                        network=network, outputs=features, num_filters=num_filters,
                        use_1x1_transition=use_1x1_transition,
                        use_bn=use_bn, reduce_ratio=reduce_ratio, min_depth=min_depth,
                        global_pool=global_pool, pretrained=pretrained, ctx=ctx, root=root)
            self.class_predictors = nn.HybridSequential()
            self.box_predictors = nn.HybridSequential()
            self.anchor_generators = nn.HybridSequential()
            asz = anchor_alloc_size
            im_size = (base_size, base_size)
            ##-- add an anchor generator, a class predictor, and a bbox regressor for every feature map.
            for i, s, r, st in zip(range(num_layers), sizes, ratios, steps):
                ##-- generate the anchors for this feature-map branch.
                branch_anchor_generator = anchor_generator(i, im_size, s, r, st, (asz, asz))
                self.anchor_generators.add(branch_anchor_generator)
                asz = max(asz // 2, 16)  # pre-compute larger than 16x16 anchor map
                ##-- number of anchors assigned to each pixel of the feature map.
                num_anchors = branch_anchor_generator.num_depth
                ##-- class-predictor output channels = num_anchors * (num_classes + 1): every anchor predicts all classes plus background.
                self.class_predictors.add(ConvPredictor(num_anchors * (len(self.classes) + 1),
                                                        kernel=predictors_kernel,
                                                        pad=predictors_pad))
                ##-- every anchor predicts four box offsets.
                self.box_predictors.add(ConvPredictor(num_anchors * 4,
                                                      kernel=predictors_kernel,
                                                      pad=predictors_pad))
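                ##-- Worked arithmetic (illustrative, VOC-style): num_anchors = 4 and 20 classes
                ##--   give 4 * (20 + 1) = 84 class channels and 4 * 4 = 16 box channels.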
            ##-- bbox decoder.
            self.bbox_decoder = NormalizedBoxCenterDecoder(stds, minimal_opset=minimal_opset)
            ##-- class-prediction decoder.
            self.cls_decoder = MultiPerClassDecoder(len(self.classes) + 1, thresh=0.01)

	@des: 	return the number of foreground (non-background) classes
    @property
    def num_classes(self):
        return len(self.classes)

	@des:	set the non-maximum suppression parameters
    def set_nms(self, nms_thresh=0.45, nms_topk=400, post_nms=100):
        self._clear_cached_op()
        self.nms_thresh = nms_thresh
        self.nms_topk = nms_topk
        self.post_nms = post_nms

    # pylint: disable=arguments-differ
    def hybrid_forward(self, F, x):
        """Hybrid forward"""
        ##-- get the list of 6 feature maps, each in (N, C, H, W) layout; the first one, e.g., is 1*1024*32*32 for a 512*512 input image.
        features = self.features(x)
        ##-- first run class prediction on each feature map: cp(feat) -> 1*84*32*32 (each pixel has 4 anchors and each anchor predicts 21 classes, so C = 84; the spatial size is unchanged).
        ##-- then move the channel dimension to the end -> 1*32*32*84.
        ##-- then flatten the result -> 1*86016: all channel values of the first position are written out first, then those of the next position, and so on.
        ##-- the final layout is [first pixel's first anchor's class-0 score, ..class 1.., second anchor, ..., second pixel]
        ##-- with a for anchor, p for pixel, c for class: [p0a0c0, p0a0c1, ...]
        cls_preds = [F.flatten(F.transpose(cp(feat), (0, 2, 3, 1)))
                     for feat, cp in zip(features, self.class_predictors)]
        ##-- likewise run box-offset prediction on each feature map: bp(feat) -> 1*16*32*32 (each pixel has 4 anchors and each anchor has 4 offset values, so C = 16; the spatial size is unchanged).
        ##-- then move the channel dimension to the end -> 1*32*32*16.
        ##-- then flatten the result -> 1*16384: all channel values of the first position first, then the next position.
        box_preds = [F.flatten(F.transpose(bp(feat), (0, 2, 3, 1)))
                     for feat, bp in zip(features, self.box_predictors)]
        ##-- each feature map yields its own set of anchors.
        ##-- the coordinates are collapsed into one dimension, 32*32*4*4 values, so the first entry of the list has shape (1, 16384).
        anchors = [F.reshape(ag(feat), shape=(1, -1))
                   for feat, ag in zip(features, self.anchor_generators)]
        ##-- concat joins all dim-1 data together -> (1, 86016 + ...) -> [f0p0a0c0, f0p0a0c1, ...]
        ##-- reshape groups them into chunks of num_classes+1 -> (1, 6132, 21) -> [f0p0a0[c0-c20], ..., f0p0a1[c0-c20]]
        cls_preds = F.concat(*cls_preds, dim=1).reshape((0, -1, self.num_classes + 1))
        ##-- concat -> (1, 16384 + ...) -> [f0p0a0b0, f0p0a0b1, ...] -> reshape -> [f0p0a0b[0-3], f0p0a1b[0-3], ...] -> (1, 6132, 4)
        box_preds = F.concat(*box_preds, dim=1).reshape((0, -1, 4))
        ##-- anchors are handled exactly like boxes: an anchor has four coordinates, a box prediction has four offsets.
        anchors = F.concat(*anchors, dim=1).reshape((1, -1, 4))
        ##-- during training we can return right away; the data is already arranged.
        if autograd.is_training():
            return [cls_preds, box_preds, anchors]
        ##-- anchors + predicted offsets give the final bounding boxes: (1, 6132, 4).
        bboxes = self.bbox_decoder(box_preds, anchors)
        ##-- first softmax the class predictions into a probability distribution.
        ##-- the decoded output drops the background class, so it is (1, 6132, 20).
        cls_ids, scores = self.cls_decoder(F.softmax(cls_preds, axis=-1))
        results = []
        for i in range(self.num_classes):
            ##-- gather all predictions of one class -> (1, 6132, 1).
            cls_id = cls_ids.slice_axis(axis=-1, begin=i, end=i+1)
            ##-- gather that class's prediction scores -> (1, 6132, 1).
            score = scores.slice_axis(axis=-1, begin=i, end=i+1)
            # per class results
            ##-- combine them with the corresponding bbox coordinates -> (1, 6132, 6).
            per_result = F.concat(*[cls_id, score, bboxes], dim=-1)
            results.append(per_result)
        ##-- concatenate all 20 classes together -> (1, 6132*20, 6).
        result = F.concat(*results, dim=1)
        if self.nms_thresh > 0 and self.nms_thresh < 1:
            ##-- apply non-maximum suppression to drop overlapping boxes; the results also come out sorted by score in descending order.
            result = F.contrib.box_nms(
                result, overlap_thresh=self.nms_thresh, topk=self.nms_topk, valid_thresh=0.01,
                id_index=0, score_index=1, coord_start=2, force_suppress=False)
            ##-- keep only the top post_nms results by score.
            if self.post_nms > 0:
                result = result.slice_axis(axis=1, begin=0, end=self.post_nms)
        ##-- split the result back into ids, scores, and bboxes.
        ids = F.slice_axis(result, axis=2, begin=0, end=1)
        scores = F.slice_axis(result, axis=2, begin=1, end=2)
        bboxes = F.slice_axis(result, axis=2, begin=2, end=6)
        return ids, scores, bboxes

    def reset_class(self, classes, reuse_weights=None):
        """Reset class categories and class predictors.

        Parameters
        ----------
        classes : iterable of str
            The new categories. ['apple', 'orange'] for example.
        reuse_weights : dict
            A {new_integer : old_integer} mapping dict or a {new_name : old_name} mapping dict,
            or a list of [name0, name1, ...] if class names don't change.
            This allows the new predictor to reuse the
            previously trained weights specified.

        Example
        -------
        >>> net = gluoncv.model_zoo.get_model('ssd_512_resnet50_v1_voc', pretrained=True)
        >>> # use direct name to name mapping to reuse weights
        >>> net.reset_class(classes=['person'], reuse_weights={'person':'person'})
        >>> # or use integer mapping, person is the 14th category in VOC
        >>> net.reset_class(classes=['person'], reuse_weights={0:14})
        >>> # you can even mix them
        >>> net.reset_class(classes=['person'], reuse_weights={'person':14})
        >>> # or use a list of strings if class names don't change
        >>> net.reset_class(classes=['person'], reuse_weights=['person'])

        """
        self._clear_cached_op()
        old_classes = self.classes
        self.classes = classes
        # trying to reuse weights by mapping old and new classes
        if isinstance(reuse_weights, (dict, list)):
            if isinstance(reuse_weights, dict):
                # trying to replace str with indices
                new_keys = []
                new_vals = []
                for k, v in reuse_weights.items():
                    if isinstance(v, str):
                        try:
                            new_vals.append(old_classes.index(v))  # raise ValueError if not found
                        except ValueError:
                            raise ValueError(
                                "{} not found in old class names {}".format(v, old_classes))
                    else:
                        if v < 0 or v >= len(old_classes):
                            raise ValueError(
                                "Index {} out of bounds for old class names".format(v))
                        new_vals.append(v)
                    if isinstance(k, str):
                        try:
                            new_keys.append(self.classes.index(k))  # raise ValueError if not found
                        except ValueError:
                            raise ValueError(
                                "{} not found in new class names {}".format(k, self.classes))
                    else:
                        if k < 0 or k >= len(self.classes):
                            raise ValueError(
                                "Index {} out of bounds for new class names".format(k))
                        new_keys.append(k)
                reuse_weights = dict(zip(new_keys, new_vals))
            else:
                new_map = {}
                for x in reuse_weights:
                    try:
                        new_idx = self.classes.index(x)
                        old_idx = old_classes.index(x)
                        new_map[new_idx] = old_idx
                    except ValueError:
                        warnings.warn("{} not found in old: {} or new class names: {}".format(
                            x, old_classes, self.classes))
                reuse_weights = new_map
        # replace class predictors
        with self.name_scope():
            class_predictors = nn.HybridSequential(prefix=self.class_predictors.prefix)
            for i, ag in zip(range(len(self.class_predictors)), self.anchor_generators):
                # Re-use the same prefix and ctx_list as used by the current ConvPredictor
                prefix = self.class_predictors[i].prefix
                old_pred = self.class_predictors[i].predictor
                ctx = list(old_pred.params.values())[0].list_ctx()
                # to avoid deferred init, number of in_channels must be defined
                in_channels = list(old_pred.params.values())[0].shape[1]
                new_cp = ConvPredictor(ag.num_depth * (self.num_classes + 1),
                                       in_channels=in_channels, prefix=prefix)
                new_cp.collect_params().initialize(ctx=ctx)
                if reuse_weights:
                    assert isinstance(reuse_weights, dict)
                    for old_params, new_params in zip(old_pred.params.values(),
                                                      new_cp.predictor.params.values()):
                        old_data = old_params.data()
                        new_data = new_params.data()

                        for k, v in reuse_weights.items():
                            if k >= len(self.classes) or v >= len(old_classes):
                                warnings.warn("reuse mapping {}/{} -> {}/{} out of range".format(
                                    k, self.classes, v, old_classes))
                                continue
                            # always increment k and v (background is always the 0th)
                            new_data[k+1::len(self.classes)+1] = old_data[v+1::len(old_classes)+1]
                        # reuse background weights as well
                        new_data[0::len(self.classes)+1] = old_data[0::len(old_classes)+1]
                        # set data to new conv layers
                        new_params.set_data(new_data)
                class_predictors.add(new_cp)
            self.class_predictors = class_predictors
            self.cls_decoder = MultiPerClassDecoder(len(self.classes) + 1, thresh=0.01)
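
Putting it all together, a hedged usage sketch (assuming a working GluonCV installation; the model name comes from the reset_class docstring above, and 'test.jpg' is a placeholder path):

import gluoncv as gcv

# load a pretrained VOC SSD model from the model zoo
net = gcv.model_zoo.get_model('ssd_512_resnet50_v1_voc', pretrained=True)

# tune the NMS behaviour discussed above: stricter overlap threshold,
# fewer NMS candidates, at most 50 returned boxes
net.set_nms(nms_thresh=0.4, nms_topk=200, post_nms=50)

# preset transform: resize the short side to 512 and normalize
x, img = gcv.data.transforms.presets.ssd.load_test('test.jpg', short=512)

# inference branch of hybrid_forward: class ids, scores, and corner boxes,
# already NMS-ed and sorted by score
ids, scores, bboxes = net(x)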
