












Anchor-free methods (not covered in this section) are an alternative family of detectors. Two references saved for later:

CenterNet: Keypoint Triplets for Object Detection (Kaiwen Duan et al., Chinese Academy of Sciences and Huawei Noah's Ark Lab), paper: https://arxiv.org/abs/1904.08189, notes: https://blog.youkuaiyun.com/qq_39273781/article/details/97250649
Overview of anchor-based vs. anchor-free object detection: https://blog.youkuaiyun.com/qq_42722197/article/details/116725878


a[:ed] takes elements 0 through ed-1 (Python slicing).
The output of a convolutional layer is a feature map.
Typical anchor aspect ratios: 1:1, 2:1, 1:2.
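A quick check of the slicing rule (made-up values):

a = [10, 20, 30, 40]
ed = 3
print(a[:ed])  # [10, 20, 30]: elements 0 through ed-1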



13.7 SSD (Single Shot Multibox Detection)
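The code in this section assumes the standard imports and the two per-stage prediction layers defined earlier in the d2l SSD chapter; for reference, they are roughly:

import torch
from torch import nn
from d2l import torch as d2l

def cls_predictor(num_inputs, num_anchors, num_classes):
    # one score per (anchor, class incl. background) at every spatial position
    return nn.Conv2d(num_inputs, num_anchors * (num_classes + 1),
                     kernel_size=3, padding=1)

def bbox_predictor(num_inputs, num_anchors):
    # four offsets per anchor at every spatial position
    return nn.Conv2d(num_inputs, num_anchors * 4, kernel_size=3, padding=1)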



def flatten_pred(pred):
    # 4D -> 2D: permute moves channels to the last dim, then
    # start_dim=1 flattens the remaining three dims into one
    return torch.flatten(pred.permute(0, 2, 3, 1), start_dim=1)

def concat_preds(preds):
    # concatenate along dim 1: everything except the batch dim
    # ends up in one long vector per example
    return torch.cat([flatten_pred(p) for p in preds], dim=1)
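A quick shape check (shapes chosen for illustration, matching the d2l example):

Y1 = torch.zeros((2, 55, 20, 20))    # a 20x20 feature map with 55 prediction channels
Y2 = torch.zeros((2, 33, 10, 10))    # a 10x10 feature map with 33 prediction channels
print(flatten_pred(Y1).shape)        # torch.Size([2, 22000]) = 20*20*55
print(concat_preds([Y1, Y2]).shape)  # torch.Size([2, 25300]) = 22000 + 10*10*33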

def down_sample_blk(in_channels, out_channels):
    # halves height/width: two (conv 3x3 -> BN -> ReLU) layers, then 2x2 max-pool
    blk = []
    for _ in range(2):  # repeat twice
        blk.append(nn.Conv2d(in_channels, out_channels,
                             kernel_size=3, padding=1))
        blk.append(nn.BatchNorm2d(out_channels))
        blk.append(nn.ReLU())
        in_channels = out_channels
    blk.append(nn.MaxPool2d(2))
    return nn.Sequential(*blk)

On a (2, 3, 20, 20) input, down_sample_blk(3, 10) gives (2, 10, 10, 10): channels 3 -> 10, height and width halved.
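To verify that shape claim, using the tiny forward helper from the d2l chapter (it just applies a block to an input):

def forward(x, block):
    return block(x)

print(forward(torch.zeros((2, 3, 20, 20)), down_sample_blk(3, 10)).shape)
# torch.Size([2, 10, 10, 10])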

def base_net():
    # extracts features from the raw image, down to the first feature map
    # on which anchors are generated
    blk = []
    num_filters = [3, 16, 32, 64]  # channels: 3 -> 16, then doubled per block
    for i in range(len(num_filters) - 1):
        # three down-sample blocks in total
        blk.append(down_sample_blk(num_filters[i], num_filters[i + 1]))
    return nn.Sequential(*blk)

forward(torch.zeros((2, 3, 256, 256)), base_net()).shape
# torch.Size([2, 64, 32, 32]): channels double up to 64, and height/width
# are halved three times (256 -> 32) by the three down-sample blocks

def get_blk(i):
    if i == 0:
        blk = base_net()
    elif i == 1:
        blk = down_sample_blk(64, 128)  # channels increase to 128
    elif i == 4:
        blk = nn.AdaptiveMaxPool2d((1, 1))  # squeeze the feature map to 1x1
    else:
        blk = down_sample_blk(128, 128)  # blocks 2 and 3 keep 128 channels
    return blk

def blk_forward(X, blk, size, ratio, cls_predictor, bbox_predictor):
    Y = blk(X)  # Y is this stage's feature map
    # generate anchors on Y at the given scales and ratios
    anchors = d2l.multibox_prior(Y, sizes=size, ratios=ratio)
    cls_preds = cls_predictor(Y)    # class predictions (the anchors themselves are not an input)
    bbox_preds = bbox_predictor(Y)  # offset predictions
    return (Y, anchors, cls_preds, bbox_preds)
sizes = [[0.2, 0.272],   # stage 1, the bottom 32x32 feature map: 0.2 means the anchor covers 20% of the image
         [0.37, 0.447],  # each stage adds roughly 0.17, spreading the scales over (0, 1]
         [0.54, 0.619],  # shallower, larger feature maps use small sizes to catch small objects;
         [0.71, 0.79],   # deeper, smaller feature maps use large sizes for big objects
         [0.88, 0.961]]  # 0.961: covers about 96% of the image
ratios = [[1, 2, 0.5]] * 5  # the same three ratios for all 5 stages
num_anchors = len(sizes[0]) + len(ratios[0]) - 1  # 2 + 3 - 1 = 4 anchors per pixel
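Where the 5444 anchors seen later come from: for a 256x256 input the 5 stages produce 32x32, 16x16, 8x8, 4x4, and 1x1 feature maps, each contributing num_anchors = 4 anchors per pixel:

fmap_sizes = [32, 16, 8, 4, 1]             # spatial sizes of the 5 stages
print(sum(s * s * 4 for s in fmap_sizes))  # 5444 = (1024 + 256 + 64 + 16 + 1) * 4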

class TinySSD(nn.Module):
    def __init__(self, num_classes, **kwargs):
        super(TinySSD, self).__init__(**kwargs)
        self.num_classes = num_classes
        idx_to_in_channels = [64, 128, 128, 128, 128]  # output channels of each of the 5 blocks
        for i in range(5):
            # equivalent to the assignment self.blk_i = get_blk(i);
            # accessed later through getattr(self, f'blk_{i}')
            setattr(self, f'blk_{i}', get_blk(i))
            # per-stage predictors for classes and offsets
            setattr(self, f'cls_{i}', cls_predictor(idx_to_in_channels[i],
                                                    num_anchors, num_classes))
            setattr(self, f'bbox_{i}', bbox_predictor(idx_to_in_channels[i],
                                                      num_anchors))

    def forward(self, X):
        anchors, cls_preds, bbox_preds = [None] * 5, [None] * 5, [None] * 5
        for i in range(5):
            # getattr(self, 'blk_%d' % i) accesses self.blk_i
            X, anchors[i], cls_preds[i], bbox_preds[i] = blk_forward(
                X, getattr(self, f'blk_{i}'), sizes[i], ratios[i],
                getattr(self, f'cls_{i}'), getattr(self, f'bbox_{i}'))
        anchors = torch.cat(anchors, dim=1)  # all anchors from all stages together
        cls_preds = concat_preds(cls_preds)  # all class predictions in one matrix
        cls_preds = cls_preds.reshape(       # pull the class scores out as the last axis
            cls_preds.shape[0], -1, self.num_classes + 1)
        bbox_preds = concat_preds(bbox_preds)
        return anchors, cls_preds, bbox_preds

net = TinySSD(num_classes=1)  # create an instance
X = torch.zeros((32, 3, 256, 256))  # (batch size, channels, height, width)
anchors, cls_preds, bbox_preds = net(X)
print('output anchors:', anchors.shape)
print('output class preds:', cls_preds.shape)
print('output bbox preds:', bbox_preds.shape)

Output:

output anchors: torch.Size([1, 5444, 4])       # each anchor is defined by 4 coordinates
output class preds: torch.Size([32, 5444, 2])  # 1 object class + background = 2 scores per anchor
output bbox preds: torch.Size([32, 21776])     # 5444 * 4: four offsets per anchor, predicted
                                               # relative to that anchor (not raw corner coordinates)

(Recent PyTorch also prints a harmless torch.meshgrid "indexing argument" UserWarning here.)


cls_loss = nn.CrossEntropyLoss(reduction='none')
bbox_loss = nn.L1Loss(reduction='none')

def calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks):
    batch_size, num_classes = cls_preds.shape[0], cls_preds.shape[2]
    # reshape(-1, num_classes) merges the batch and anchor dims,
    # so the class loss is computed per anchor
    cls = cls_loss(cls_preds.reshape(-1, num_classes),
                   cls_labels.reshape(-1)).reshape(batch_size, -1).mean(dim=1)
    # bbox_masks zeros out background anchors, so no offset loss
    # is computed for anchors that match no object
    bbox = bbox_loss(bbox_preds * bbox_masks,
                     bbox_labels * bbox_masks).mean(dim=1)
    return cls + bbox
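A toy illustration of the masking (made-up numbers; one background anchor and one object anchor, 4 offsets each):

bbox_preds  = torch.tensor([[0.1, 0.2, 0.3, 0.4, 0.5, 0.5, 0.5, 0.5]])
bbox_labels = torch.tensor([[0.0, 0.0, 0.0, 0.0, 0.6, 0.6, 0.6, 0.6]])
bbox_masks  = torch.tensor([[0., 0., 0., 0., 1., 1., 1., 1.]])  # first anchor is background
print(bbox_loss(bbox_preds * bbox_masks,
                bbox_labels * bbox_masks).mean(dim=1))  # tensor([0.0500])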

def cls_eval(cls_preds, cls_labels):
    # class predictions sit on the last dim, so argmax needs dim=-1
    return float((cls_preds.argmax(dim=-1).type(
        cls_labels.dtype) == cls_labels).sum())

def bbox_eval(bbox_preds, bbox_labels, bbox_masks):
    return float((torch.abs((bbox_labels - bbox_preds) * bbox_masks)).sum())
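Toy check of cls_eval (hypothetical scores):

cls_preds  = torch.tensor([[[0.9, 0.1], [0.2, 0.8]]])  # (batch=1, anchors=2, classes=2)
cls_labels = torch.tensor([[0, 1]])
print(cls_eval(cls_preds, cls_labels))  # 2.0: both anchors classified correctly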

num_epochs, timer = 20, d2l.Timer()
animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                        legend=['class error', 'bbox mae'])
net = net.to(device)
for epoch in range(num_epochs):
    # sum of training accuracy, number of examples in that sum,
    # sum of absolute error, number of examples in that sum
    metric = d2l.Accumulator(4)
    net.train()
    for features, target in train_iter:
        timer.start()
        trainer.zero_grad()
        X, Y = features.to(device), target.to(device)
        # generate multiscale anchors; predict class and offset for each
        anchors, cls_preds, bbox_preds = net(X)
        # label class and offset for each anchor:
        # matches every anchor to a ground-truth box (or background)
        bbox_labels, bbox_masks, cls_labels = d2l.multibox_target(anchors, Y)
        # compute the loss from the predictions and the labels
        l = calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels,
                      bbox_masks)
        l.mean().backward()
        trainer.step()
        metric.add(cls_eval(cls_preds, cls_labels), cls_labels.numel(),
                   bbox_eval(bbox_preds, bbox_labels, bbox_masks),
                   bbox_labels.numel())
    cls_err, bbox_mae = 1 - metric[0] / metric[1], metric[2] / metric[3]
    animator.add(epoch + 1, (cls_err, bbox_mae))
print(f'class err {cls_err:.2e}, bbox mae {bbox_mae:.2e}')
print(f'{len(train_iter.dataset) / timer.stop():.1f} examples/sec on '
      f'{str(device)}')
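The loop above assumes trainer, train_iter, and device were set up beforehand; in the d2l chapter the setup is roughly:

batch_size = 32
train_iter, _ = d2l.load_data_bananas(batch_size)  # banana detection dataset
device, net = d2l.try_gpu(), TinySSD(num_classes=1)
trainer = torch.optim.SGD(net.parameters(), lr=0.2, weight_decay=5e-4)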



Dataset used above (banana detection): https://zh.d2l.ai/chapter_computer-vision/object-detection-dataset.html





