参考来源(本文主要做笔记使用)
锚框
简单理解,框住我们感兴趣物体的框被称为真实边界框(ground-truth bounding box),而用来预测边界框的候选框被称为锚框(anchor box)。这里我们介绍一种生成锚框的方法:以每个像素为中心,生成多个缩放比和宽高比不同的锚框。
生成多个锚框
假设输入图像的高度为 $h$,宽度为 $w$,我们以每个像素点为中心生成不同形状的锚框:缩放比 $s\in(0, 1]$,宽高比 $r > 0$。从而锚框的宽 $w'$ 和高 $h'$ 分别为 $hs\sqrt{r}$ 和 $hs/\sqrt{r}$(注:根据本人理解,由 $w'/h'=rw/h$ 和 $w'h'=whs^2$ 计算出来的是 $w'=ws\sqrt{r}$,不过这只是一种生成方法,故以下代码依旧使用老师原来的代码,ps:也可能是我理解有问题)。
我们设置许多缩放比取值 $s_1,\ldots,s_n$ 和宽高比取值 $r_1,\ldots,r_m$。如果以图片的每个像素点为中心使用所有这些组合,整张图片总共会生成 $whnm$ 个锚框。考虑到计算复杂度,我们只考虑包含 $s_1$ 或 $r_1$ 的组合,即
$$(s_1,r_1),(s_1, r_2), \ldots,(s_1, r_m),(s_2, r_1),(s_3, r_1),\ldots,(s_n, r_1)$$
从而对于任意一个像素点,会生成 $n+m-1$ 个锚框。对于整张输入图像而言,将生成 $wh(n+m-1)$ 个锚框,生成锚框的代码如下:
def multibox_prior(data, sizes, ratios):
    """Generate anchor boxes of several shapes centred on every pixel.

    Args:
        data: Tensor whose last two dimensions are (height, width). Only its
            shape and device are read.
        sizes: Sequence of scales.
        ratios: Sequence of width-to-height aspect ratios.

    Returns:
        Tensor of shape
        (1, height * width * (len(sizes) + len(ratios) - 1), 4). Each row is
        (xmin, ymin, xmax, ymax), expressed as fractions of the image width
        and height.
    """
    height, width = data.shape[-2:]
    dev = data.device
    # Only the pairs (s_1..s_n, r_1) and (s_1, r_2..r_m) are generated, which
    # gives n + m - 1 anchors per pixel instead of n * m.
    per_pixel = len(sizes) + len(ratios) - 1
    scales = torch.tensor(sizes, device=dev)
    aspect = torch.tensor(ratios, device=dev)
    sqrt_first = torch.sqrt(aspect[0])
    sqrt_rest = torch.sqrt(aspect[1:])

    # Pixel centres in normalised coordinates: (index + 0.5) / extent. The 0.5
    # moves the anchor point from the pixel corner to the pixel centre.
    rows = (torch.arange(height, device=dev) + 0.5) * (1.0 / height)
    cols = (torch.arange(width, device=dev) + 0.5) * (1.0 / width)
    grid_y, grid_x = torch.meshgrid(rows, cols, indexing="ij")
    flat_y = grid_y.reshape(-1)
    flat_x = grid_x.reshape(-1)

    # Normalised anchor widths and heights, `per_pixel` of each. The width is
    # rescaled by height / width so the ratio refers to pixels, not to the
    # normalised axes.
    widths = torch.cat((scales * sqrt_first, sizes[0] * sqrt_rest)) * height / width
    heights = torch.cat((scales / sqrt_first, sizes[0] / sqrt_rest))

    # Half extents as (-w/2, -h/2, +w/2, +h/2), tiled once per pixel.
    half_extents = torch.stack((-widths, -heights, widths, heights)).T
    half_extents = half_extents.repeat(height * width, 1) / 2

    # Every centre is repeated `per_pixel` times in a row so that it lines up
    # with the tiled half extents.
    centres = torch.stack([flat_x, flat_y, flat_x, flat_y], dim=1)
    centres = centres.repeat_interleave(per_pixel, dim=0)
    return (centres + half_extents).unsqueeze(0)
# Load the sample picture. The sizes quoted in the comments assume the
# 561 x 728 catdog.jpg image.
img = d2l.plt.imread("../img/catdog.jpg")
h, w = img.shape[:2]
print(h, w, img.shape)  # expected: 561 728 (561, 728, 3)

# multibox_prior only reads the spatial size of X, so random data is enough.
X = torch.rand(1, 3, h, w)
Y = multibox_prior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])
print(Y.shape)

# 3 sizes + 3 ratios - 1 = 5 anchors per pixel, so the flat anchor list can be
# viewed as (row, column, anchor, corner coordinates).
boxes = Y.reshape(h, w, 5, 4)
print(boxes[250, 250, 0])
显示以图像中以某个像素为中心的所有锚框
def show_bboxes(axes, bboxes, labels=None, colors=None):
    """Draw every bounding box in ``bboxes`` on ``axes``.

    Args:
        axes: Object providing ``add_patch`` and ``text`` (presumably a
            matplotlib Axes; confirm against callers).
        bboxes: Iterable of tensors, one box each; each is converted with
            ``detach().numpy()`` and passed to ``d2l.bbox_to_rect``.
        labels: Optional label or list/tuple of labels. Box ``i`` gets
            ``labels[i]`` when it exists.
        colors: Optional colour or list/tuple of colours, cycled over the
            boxes. Defaults to blue, green, red, magenta, cyan.
    """

    def _as_sequence(value, fallback=None):
        # None -> fallback, list/tuple -> unchanged, anything else -> [value].
        if value is None:
            return fallback
        if isinstance(value, (list, tuple)):
            return value
        return [value]

    labels = _as_sequence(labels)
    colors = _as_sequence(colors, ['b', 'g', 'r', 'm', 'c'])
    for idx, box in enumerate(bboxes):
        edge_color = colors[idx % len(colors)]
        patch = d2l.bbox_to_rect(box.detach().numpy(), edge_color)
        axes.add_patch(patch)
        if not labels or idx >= len(labels):
            continue
        # Use black text on a white box, white text on everything else.
        font_color = 'k' if edge_color == 'w' else 'w'
        axes.text(patch.xy[0], patch.xy[1], labels[idx],
                  va="center", ha="center", fontsize=9, color=font_color,
                  bbox=dict(facecolor=edge_color, lw=0))
d2l.set_figsize()
# Anchors are stored in normalised coordinates; multiplying by
# (w, h, w, h) converts them back to pixels for plotting.
bbox_scale = torch.tensor((w, h, w, h))
fig = d2l.plt.imshow(img)
# Draw the five anchors centred on pixel (250, 250).
show_bboxes(
    fig.axes,
    boxes[250, 250] * bbox_scale,
    ['s=0.75, r=1', 's=0.5, r=1', 's=0.25, r=1', 's=0.75, r=2', 's=0.75, r=0.5'],
)
d2l.plt.show()
交并比
为了衡量锚框与真实边界框的相似度,杰卡德系数(Jaccard)可以衡量两组之间的相似性。
给定集合 $\mathcal{A}$ 和 $\mathcal{B}$,它们的杰卡德系数是它们交集的大小除以它们并集的大小:
$$J(\mathcal{A},\mathcal{B}) = \frac{\left|\mathcal{A} \cap \mathcal{B}\right|}{\left| \mathcal{A} \cup \mathcal{B}\right|}$$
def box_iou(boxes1, boxes2):
    """Compute the pairwise intersection-over-union of two box lists.

    Args:
        boxes1: Tensor of shape (N, 4), rows are (xmin, ymin, xmax, ymax).
        boxes2: Tensor of shape (M, 4), same layout.

    Returns:
        Tensor of shape (N, M); entry (i, j) is the IoU of ``boxes1[i]`` and
        ``boxes2[j]``.
    """

    def _area(b):
        return (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])

    # Broadcasting (N, 1, 2) against (M, 2) gives the corners of every
    # pairwise intersection, shape (N, M, 2).
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])
    # Disjoint boxes give a negative extent; clamp it so their overlap is 0.
    overlap_wh = (bottom_right - top_left).clamp(min=0)
    overlap = overlap_wh[:, :, 0] * overlap_wh[:, :, 1]
    union = _area(boxes1)[:, None] + _area(boxes2) - overlap
    return overlap / union
在训练数据中标注锚框
在训练集中,我们将每个锚框视为一个训练对象。为了训练目标检测模型,我们需要获取每个锚框的类别(class)和偏移值(offset)。类别为与该锚框相关对象的类别,偏移值为锚框与该类对象真实边界框的偏移。
在预测时,我们为每个图像生成多个锚框,预测所有锚框的类别和偏移量,根据预测的偏移量调整它们的位置以获得预测的边界框,最后只输出符合特定条件的预测边界框。
为此,我们可以考虑把最接近的真实边界框分配给锚框。
def assign_anchor_to_bbox(ground_truth, anchors, device, iou_threshold=0.5):
    """Assign the closest ground-truth bounding box to each anchor.

    The assignment has two steps:
      1. Every anchor whose best IoU reaches ``iou_threshold`` is assigned
         the ground-truth box with that best IoU.
      2. The globally largest remaining IoU is then taken repeatedly. Its
         anchor is assigned its box, and that anchor row and box column are
         discarded. This gives each ground-truth box an anchor, as far as
         anchors are available.

    Args:
        ground_truth: Tensor of shape (num_gt_boxes, 4).
        anchors: Tensor of shape (num_anchors, 4).
        device: Device on which the returned map is allocated.
        iou_threshold: Minimum IoU for the threshold-based assignment.

    Returns:
        Long tensor of shape (num_anchors,). Entry ``i`` is the index of the
        ground-truth box assigned to anchor ``i``, or -1 if none is assigned.
    """
    num_anchors, num_gt_boxes = anchors.shape[0], ground_truth.shape[0]
    anchors_bbox_map = torch.full((num_anchors,), -1, dtype=torch.long, device=device)
    # Nothing to match: avoid reducing over an empty dimension below.
    if num_anchors == 0 or num_gt_boxes == 0:
        return anchors_bbox_map

    # jaccard[i, j] is the IoU of anchor i and ground-truth box j,
    # shape (num_anchors, num_gt_boxes).
    jaccard = box_iou(anchors, ground_truth)

    # Step 1: threshold-based assignment of each anchor's best box.
    max_ious, indices = torch.max(jaccard, dim=1)
    matched = max_ious >= iou_threshold
    anchors_bbox_map[matched] = indices[matched]

    # Step 2: greedy assignment. After min(num_anchors, num_gt_boxes) rounds
    # every row or every column has been discarded; further rounds would only
    # pick discarded (-1) cells and overwrite earlier assignments.
    for _ in range(min(num_anchors, num_gt_boxes)):
        max_idx = torch.argmax(jaccard)  # index into the flattened matrix
        box_idx = max_idx % num_gt_boxes
        # Integer floor division: a float division would lose precision once
        # the flat index exceeds 2**24.
        anc_idx = torch.div(max_idx, num_gt_boxes, rounding_mode="floor")
        anchors_bbox_map[anc_idx] = box_idx
        # IoU is never negative, so -1 marks a discarded row or column.
        jaccard[:, box_idx] = -1
        jaccard[anc_idx, :] = -1
    return anchors_bbox_map
计算锚框与其对应边界框的偏移值,计算公式如下
给定框 $A$ 和 $B$,中心坐标分别为 $(x_a, y_a)$ 和 $(x_b, y_b)$,宽度分别为 $w_a$ 和 $w_b$,高度分别为 $h_a$ 和 $h_b$,可以将 $A$ 的偏移量标记为:
$$\left( \frac{ \frac{x_b - x_a}{w_a} - \mu_x }{\sigma_x}, \frac{ \frac{y_b - y_a}{h_a} - \mu_y }{\sigma_y}, \frac{ \log \frac{w_b}{w_a} - \mu_w }{\sigma_w}, \frac{ \log \frac{h_b}{h_a} - \mu_h }{\sigma_h}\right),$$
其中常量的默认值为 $\mu_x = \mu_y = \mu_w = \mu_h = 0$,$\sigma_x=\sigma_y=0.1$,$\sigma_w=\sigma_h=0.2$。
def offset_boxes(anchors, assigned_bb, eps=1e-6):
    """Encode assigned boxes as offsets relative to their anchors.

    Both inputs are converted with ``d2l.box_corner_to_center``; the result
    is read as columns (x, y, w, h).

    Args:
        anchors: Tensor of shape (num_anchors, 4) in corner format.
        assigned_bb: Tensor of the same shape holding the box assigned to
            each anchor.
        eps: Small constant added inside the logarithm so a zero-sized
            assigned box does not produce ``log(0)``.

    Returns:
        Tensor of shape (num_anchors, 4) with columns (dx, dy, dw, dh).
    """
    anchor_c = d2l.box_corner_to_center(anchors)
    target_c = d2l.box_corner_to_center(assigned_bb)
    anchor_xy, anchor_wh = anchor_c[:, :2], anchor_c[:, 2:]
    target_xy, target_wh = target_c[:, :2], target_c[:, 2:]
    # The factors 10 and 5 are 1 / sigma with sigma_xy = 0.1 and
    # sigma_wh = 0.2; the means mu are all 0.
    xy_offset = 10 * (target_xy - anchor_xy) / anchor_wh
    wh_offset = 5 * torch.log(eps + target_wh / anchor_wh)
    return torch.cat([xy_offset, wh_offset], axis=1)
如果一个锚框没有被分配真实边界框,我们只需将锚框的类别标记为背景(background)。
背景类别的锚框通常被称为负类锚框,其余的被称为正类锚框。
我们使用真实边界框(labels参数)实现以下multibox_target函数,来标记锚框的类别和偏移量(anchors参数)。
此函数将背景类别的索引设置为零,然后将新类别的整数索引递增一。
def multibox_target(anchors, labels):
    """Label anchors with classes and offsets from ground-truth boxes.

    Args:
        anchors: Tensor of shape (1, num_anchors, 4); the leading dimension
            is squeezed away and the same anchors are used for every sample.
        labels: Tensor of shape (batch_size, num_gt_boxes, 5). Column 0 is
            the class id, columns 1-4 are the box coordinates.

    Returns:
        Tuple ``(bbox_offset, bbox_mask, class_labels)``:
          * bbox_offset: (batch_size, num_anchors * 4) offsets from each
            anchor to its assigned box, zero for background anchors.
          * bbox_mask: (batch_size, num_anchors * 4), 1 for anchors with an
            assigned box and 0 for background anchors.
          * class_labels: (batch_size, num_anchors), 0 for background,
            otherwise the class id plus one.
    """
    anchors = anchors.squeeze(0)
    device = anchors.device
    num_anchors = anchors.shape[0]
    offsets_out, masks_out, classes_out = [], [], []
    for label in labels:  # one sample's ground-truth boxes at a time
        anchors_bbox_map = assign_anchor_to_bbox(label[:, 1:], anchors, device)
        assigned = anchors_bbox_map >= 0
        # (num_anchors, 4): all ones for matched anchors, all zeros otherwise.
        bbox_mask = assigned.float().unsqueeze(-1).repeat(1, 4)

        # Start with everything as background (class 0, zero box).
        class_labels = torch.zeros(num_anchors, dtype=torch.long, device=device)
        assigned_bb = torch.zeros((num_anchors, 4), dtype=torch.float32, device=device)

        positive = torch.nonzero(assigned).reshape(-1)  # matched anchor ids
        gt_rows = anchors_bbox_map[positive]  # their ground-truth rows
        # Shift class ids by one because 0 is reserved for background.
        class_labels[positive] = label[gt_rows, 0].long() + 1
        assigned_bb[positive] = label[gt_rows, 1:]

        # The mask zeroes the meaningless offsets of background anchors.
        offset = offset_boxes(anchors, assigned_bb) * bbox_mask
        offsets_out.append(offset.reshape(-1))
        masks_out.append(bbox_mask.reshape(-1))
        classes_out.append(class_labels)
    return (torch.stack(offsets_out), torch.stack(masks_out), torch.stack(classes_out))
样例测试
# Two ground-truth boxes as (class, xmin, ymin, xmax, ymax): class 0 is the
# dog, class 1 is the cat.
ground_truth = torch.tensor([
    [0, 0.1, 0.08, 0.52, 0.92],
    [1, 0.55, 0.2, 0.9, 0.88],
])
# Five hand-picked anchors in normalised corner coordinates.
anchors = torch.tensor([
    [0, 0.1, 0.2, 0.3],
    [0.15, 0.2, 0.4, 0.4],
    [0.63, 0.05, 0.88, 0.98],
    [0.66, 0.45, 0.8, 0.8],
    [0.57, 0.3, 0.92, 0.9],
])

fig = d2l.plt.imshow(img)
show_bboxes(fig.axes, ground_truth[:, 1:] * bbox_scale, ['dog', 'cat'], 'k')
show_bboxes(fig.axes, anchors * bbox_scale, ['0', '1', '2', '3', '4'])
d2l.plt.show()

# Add a leading batch dimension to both inputs.
labels = multibox_target(anchors.unsqueeze(dim=0), ground_truth.unsqueeze(dim=0))
print(labels[2])  # class labels
print(labels[1])  # mask
print(labels[0])  # offsets
输出结果
tensor([[0, 1, 2, 0, 2]])
tensor([[0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1.,
1., 1.]])
tensor([[-0.00e+00, -0.00e+00, -0.00e+00, -0.00e+00, 1.40e+00, 1.00e+01,
2.59e+00, 7.18e+00, -1.20e+00, 2.69e-01, 1.68e+00, -1.57e+00,
-0.00e+00, -0.00e+00, -0.00e+00, -0.00e+00, -5.71e-01, -1.00e+00,
4.17e-06, 6.26e-01]])
使用非极大值抑制预测边界框
在预测时,我们先为图像生成多个锚框,再为这些锚框预测类别和偏移量。我们通过offset_inverse函数,将锚框和偏移量预测作为输入,应用逆偏移变换来返回预测的边界框坐标。
def offset_inverse(anchors, offset_preds):
    """Decode predicted offsets back into bounding boxes.

    This inverts the offset encoding: the (dx, dy) columns are divided by 10
    and scaled by the anchor size, the (dw, dh) columns are divided by 5 and
    exponentiated.

    Args:
        anchors: Tensor of shape (num_anchors, 4) in corner format.
        offset_preds: Tensor of shape (num_anchors, 4), columns
            (dx, dy, dw, dh).

    Returns:
        Tensor of shape (num_anchors, 4), the result of
        ``d2l.box_center_to_corner`` applied to the decoded boxes.
    """
    anchor_c = d2l.box_corner_to_center(anchors)
    anchor_xy, anchor_wh = anchor_c[:, :2], anchor_c[:, 2:]
    centre = (offset_preds[:, :2] * anchor_wh / 10) + anchor_xy
    extent = torch.exp(offset_preds[:, 2:] / 5) * anchor_wh
    decoded = torch.cat((centre, extent), axis=1)
    return d2l.box_center_to_corner(decoded)
当有许多锚框时,可能会输出许多相似的具有明显重叠的预测边界框,都围绕着同一目标。 为了简化输出,我们使用贪心算法,即非极大值抑制(non-maximum suppression,NMS)合并属于同一目标的类似的预测边界框。
def nms(boxes, scores, iou_threshold):
    """Greedy non-maximum suppression.

    The highest-scoring remaining box is kept, and every remaining box whose
    IoU with it exceeds ``iou_threshold`` is dropped. This repeats until no
    boxes remain.

    Args:
        boxes: Tensor of shape (num_boxes, 4) in corner format.
        scores: Tensor of shape (num_boxes,) with one confidence per box.
        iou_threshold: Boxes overlapping a kept box by more than this IoU
            are suppressed.

    Returns:
        Long tensor with the indices of the kept boxes, ordered by
        descending score. It is an empty long tensor when there are no boxes.
    """
    order = torch.argsort(scores, dim=-1, descending=True)  # best first
    keep = []
    while order.numel() > 0:
        best = order[0]
        keep.append(int(best))
        if order.numel() == 1:
            break
        iou = box_iou(boxes[best, :].reshape(-1, 4),
                      boxes[order[1:], :].reshape(-1, 4)).reshape(-1)
        survivors = torch.nonzero(iou <= iou_threshold).reshape(-1)
        # `iou` was computed against order[1:], so shift by one to index
        # back into `order`.
        order = order[survivors + 1]
    # An explicit dtype keeps the result usable as an index tensor even when
    # `keep` is empty (torch.tensor([]) would otherwise be float32).
    return torch.tensor(keep, dtype=torch.long, device=boxes.device)
将非极大值抑制应用于预测边界框
def multibox_detection(cls_probs, offset_preds, anchors, nms_threshold=0.5, pos_threshold=0.009999999):
    """Predict bounding boxes using non-maximum suppression.

    Args:
        cls_probs: Tensor of shape (batch_size, num_classes + 1, num_anchors).
            Row 0 of each sample is the background probability and is ignored.
        offset_preds: Tensor of shape (batch_size, num_anchors * 4).
        anchors: Tensor of shape (1, num_anchors, 4); the leading dimension
            is squeezed away and the same anchors are used for every sample.
        nms_threshold: IoU threshold passed to ``nms``.
        pos_threshold: Predictions with a confidence below this value are
            marked as background.

    Returns:
        Tensor of shape (batch_size, num_anchors, 6). Each row is
        (class_id, confidence, 4 box coordinates). ``class_id`` is -1 for
        background or suppressed boxes. Kept boxes come first.
    """
    device = cls_probs.device
    anchors = anchors.squeeze(0)
    num_anchors = cls_probs.shape[2]
    every_index = torch.arange(num_anchors, dtype=torch.long, device=device)
    results = []
    for sample in range(cls_probs.shape[0]):
        probs = cls_probs[sample]
        deltas = offset_preds[sample].reshape(-1, 4)
        # Best non-background class per anchor; ids are counted from row 1.
        conf, class_id = torch.max(probs[1:], 0)
        predicted_bb = offset_inverse(anchors, deltas)
        keep = nms(predicted_bb, conf, nms_threshold)

        # Indices that occur only once in (keep + all indices) were not kept.
        values, occurrences = torch.cat((keep, every_index)).unique(return_counts=True)
        suppressed = values[occurrences == 1]
        order = torch.cat((keep, suppressed))
        class_id[suppressed] = -1
        class_id = class_id[order]
        conf = conf[order]
        predicted_bb = predicted_bb[order]

        # Low-confidence predictions become background; their confidence is
        # replaced by 1 - conf.
        too_weak = conf < pos_threshold
        class_id[too_weak] = -1
        conf[too_weak] = 1 - conf[too_weak]
        results.append(
            torch.cat((class_id.unsqueeze(1), conf.unsqueeze(1), predicted_bb), dim=1)
        )
    return torch.stack(results)
调用实例
# Four anchors; the first three overlap heavily around the same object.
anchors = torch.tensor([
    [0.1, 0.08, 0.52, 0.92],
    [0.08, 0.2, 0.56, 0.95],
    [0.15, 0.3, 0.62, 0.91],
    [0.55, 0.2, 0.9, 0.88],
])
# All predicted offsets are zero, so the predicted boxes equal the anchors.
offset_preds = torch.tensor([0] * anchors.numel())
cls_probs = torch.tensor([
    [0] * 4,               # background probability
    [0.9, 0.8, 0.7, 0.1],  # dog probability
    [0.1, 0.2, 0.3, 0.9],  # cat probability
])

fig = d2l.plt.imshow(img)
show_bboxes(fig.axes, anchors * bbox_scale,
            ['dog=0.9', 'dog=0.8', 'dog=0.7', 'cat=0.9'])
d2l.plt.show()

# Add a leading batch dimension to every input.
output = multibox_detection(cls_probs.unsqueeze(dim=0),
                            offset_preds.unsqueeze(dim=0),
                            anchors.unsqueeze(dim=0),
                            nms_threshold=0.5)
print(output)

fig = d2l.plt.imshow(img)
for row in output[0].detach().numpy():
    # Rows are (class_id, confidence, box); class -1 means suppressed.
    if row[0] != -1:
        label = ('dog=', 'cat=')[int(row[0])] + str(row[1])
        show_bboxes(fig.axes, [torch.tensor(row[2:]) * bbox_scale], label)
d2l.plt.show()
全部代码如下:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import torch
from d2l import torch as d2l
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# error that can occur when several libraries bundle their own copy; confirm.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
# Print tensors with two decimal places to keep the output compact.
torch.set_printoptions(precision=2)
def multibox_prior(data, sizes, ratios):
    """Generate anchor boxes of several shapes centred on every pixel.

    Args:
        data: Tensor whose last two dimensions are (height, width). Only its
            shape and device are read.
        sizes: Sequence of scales.
        ratios: Sequence of width-to-height aspect ratios.

    Returns:
        Tensor of shape
        (1, height * width * (len(sizes) + len(ratios) - 1), 4). Each row is
        (xmin, ymin, xmax, ymax), expressed as fractions of the image width
        and height.
    """
    height, width = data.shape[-2:]
    dev = data.device
    # Only the pairs (s_1..s_n, r_1) and (s_1, r_2..r_m) are generated, which
    # gives n + m - 1 anchors per pixel instead of n * m.
    per_pixel = len(sizes) + len(ratios) - 1
    scales = torch.tensor(sizes, device=dev)
    aspect = torch.tensor(ratios, device=dev)
    sqrt_first = torch.sqrt(aspect[0])
    sqrt_rest = torch.sqrt(aspect[1:])

    # Pixel centres in normalised coordinates: (index + 0.5) / extent. The 0.5
    # moves the anchor point from the pixel corner to the pixel centre.
    rows = (torch.arange(height, device=dev) + 0.5) * (1.0 / height)
    cols = (torch.arange(width, device=dev) + 0.5) * (1.0 / width)
    grid_y, grid_x = torch.meshgrid(rows, cols, indexing="ij")
    flat_y = grid_y.reshape(-1)
    flat_x = grid_x.reshape(-1)

    # Normalised anchor widths and heights, `per_pixel` of each. The width is
    # rescaled by height / width so the ratio refers to pixels, not to the
    # normalised axes.
    widths = torch.cat((scales * sqrt_first, sizes[0] * sqrt_rest)) * height / width
    heights = torch.cat((scales / sqrt_first, sizes[0] / sqrt_rest))

    # Half extents as (-w/2, -h/2, +w/2, +h/2), tiled once per pixel.
    half_extents = torch.stack((-widths, -heights, widths, heights)).T
    half_extents = half_extents.repeat(height * width, 1) / 2

    # Every centre is repeated `per_pixel` times in a row so that it lines up
    # with the tiled half extents.
    centres = torch.stack([flat_x, flat_y, flat_x, flat_y], dim=1)
    centres = centres.repeat_interleave(per_pixel, dim=0)
    return (centres + half_extents).unsqueeze(0)
# Load the sample picture. The sizes quoted in the comments assume the
# 561 x 728 catdog.jpg image.
img = d2l.plt.imread("../img/catdog.jpg")
h, w = img.shape[:2]
print(h, w, img.shape)  # expected: 561 728 (561, 728, 3)

# multibox_prior only reads the spatial size of X, so random data is enough.
X = torch.rand(1, 3, h, w)
Y = multibox_prior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])
print(Y.shape)

# 3 sizes + 3 ratios - 1 = 5 anchors per pixel, so the flat anchor list can be
# viewed as (row, column, anchor, corner coordinates).
boxes = Y.reshape(h, w, 5, 4)
print(boxes[250, 250, 0])
def show_bboxes(axes, bboxes, labels=None, colors=None):
    """Draw every bounding box in ``bboxes`` on ``axes``.

    Args:
        axes: Object providing ``add_patch`` and ``text`` (presumably a
            matplotlib Axes; confirm against callers).
        bboxes: Iterable of tensors, one box each; each is converted with
            ``detach().numpy()`` and passed to ``d2l.bbox_to_rect``.
        labels: Optional label or list/tuple of labels. Box ``i`` gets
            ``labels[i]`` when it exists.
        colors: Optional colour or list/tuple of colours, cycled over the
            boxes. Defaults to blue, green, red, magenta, cyan.
    """

    def _as_sequence(value, fallback=None):
        # None -> fallback, list/tuple -> unchanged, anything else -> [value].
        if value is None:
            return fallback
        if isinstance(value, (list, tuple)):
            return value
        return [value]

    labels = _as_sequence(labels)
    colors = _as_sequence(colors, ['b', 'g', 'r', 'm', 'c'])
    for idx, box in enumerate(bboxes):
        edge_color = colors[idx % len(colors)]
        patch = d2l.bbox_to_rect(box.detach().numpy(), edge_color)
        axes.add_patch(patch)
        if not labels or idx >= len(labels):
            continue
        # Use black text on a white box, white text on everything else.
        font_color = 'k' if edge_color == 'w' else 'w'
        axes.text(patch.xy[0], patch.xy[1], labels[idx],
                  va="center", ha="center", fontsize=9, color=font_color,
                  bbox=dict(facecolor=edge_color, lw=0))
d2l.set_figsize()
# Anchors are stored in normalised coordinates; multiplying by
# (w, h, w, h) converts them back to pixels for plotting.
bbox_scale = torch.tensor((w, h, w, h))
fig = d2l.plt.imshow(img)
# Draw the five anchors centred on pixel (250, 250).
show_bboxes(
    fig.axes,
    boxes[250, 250] * bbox_scale,
    ['s=0.75, r=1', 's=0.5, r=1', 's=0.25, r=1', 's=0.75, r=2', 's=0.75, r=0.5'],
)
d2l.plt.show()
def box_iou(boxes1, boxes2):
    """Compute the pairwise intersection-over-union of two box lists.

    Args:
        boxes1: Tensor of shape (N, 4), rows are (xmin, ymin, xmax, ymax).
        boxes2: Tensor of shape (M, 4), same layout.

    Returns:
        Tensor of shape (N, M); entry (i, j) is the IoU of ``boxes1[i]`` and
        ``boxes2[j]``.
    """

    def _area(b):
        return (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])

    # Broadcasting (N, 1, 2) against (M, 2) gives the corners of every
    # pairwise intersection, shape (N, M, 2).
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])
    # Disjoint boxes give a negative extent; clamp it so their overlap is 0.
    overlap_wh = (bottom_right - top_left).clamp(min=0)
    overlap = overlap_wh[:, :, 0] * overlap_wh[:, :, 1]
    union = _area(boxes1)[:, None] + _area(boxes2) - overlap
    return overlap / union
def assign_anchor_to_bbox(ground_truth, anchors, device, iou_threshold=0.5):
    """Assign the closest ground-truth bounding box to each anchor.

    The assignment has two steps:
      1. Every anchor whose best IoU reaches ``iou_threshold`` is assigned
         the ground-truth box with that best IoU.
      2. The globally largest remaining IoU is then taken repeatedly. Its
         anchor is assigned its box, and that anchor row and box column are
         discarded. This gives each ground-truth box an anchor, as far as
         anchors are available.

    Args:
        ground_truth: Tensor of shape (num_gt_boxes, 4).
        anchors: Tensor of shape (num_anchors, 4).
        device: Device on which the returned map is allocated.
        iou_threshold: Minimum IoU for the threshold-based assignment.

    Returns:
        Long tensor of shape (num_anchors,). Entry ``i`` is the index of the
        ground-truth box assigned to anchor ``i``, or -1 if none is assigned.
    """
    num_anchors, num_gt_boxes = anchors.shape[0], ground_truth.shape[0]
    anchors_bbox_map = torch.full((num_anchors,), -1, dtype=torch.long, device=device)
    # Nothing to match: avoid reducing over an empty dimension below.
    if num_anchors == 0 or num_gt_boxes == 0:
        return anchors_bbox_map

    # jaccard[i, j] is the IoU of anchor i and ground-truth box j,
    # shape (num_anchors, num_gt_boxes).
    jaccard = box_iou(anchors, ground_truth)

    # Step 1: threshold-based assignment of each anchor's best box.
    max_ious, indices = torch.max(jaccard, dim=1)
    matched = max_ious >= iou_threshold
    anchors_bbox_map[matched] = indices[matched]

    # Step 2: greedy assignment. After min(num_anchors, num_gt_boxes) rounds
    # every row or every column has been discarded; further rounds would only
    # pick discarded (-1) cells and overwrite earlier assignments.
    for _ in range(min(num_anchors, num_gt_boxes)):
        max_idx = torch.argmax(jaccard)  # index into the flattened matrix
        box_idx = max_idx % num_gt_boxes
        # Integer floor division: a float division would lose precision once
        # the flat index exceeds 2**24.
        anc_idx = torch.div(max_idx, num_gt_boxes, rounding_mode="floor")
        anchors_bbox_map[anc_idx] = box_idx
        # IoU is never negative, so -1 marks a discarded row or column.
        jaccard[:, box_idx] = -1
        jaccard[anc_idx, :] = -1
    return anchors_bbox_map
def offset_boxes(anchors, assigned_bb, eps=1e-6):
    """Encode assigned boxes as offsets relative to their anchors.

    Both inputs are converted with ``d2l.box_corner_to_center``; the result
    is read as columns (x, y, w, h).

    Args:
        anchors: Tensor of shape (num_anchors, 4) in corner format.
        assigned_bb: Tensor of the same shape holding the box assigned to
            each anchor.
        eps: Small constant added inside the logarithm so a zero-sized
            assigned box does not produce ``log(0)``.

    Returns:
        Tensor of shape (num_anchors, 4) with columns (dx, dy, dw, dh).
    """
    anchor_c = d2l.box_corner_to_center(anchors)
    target_c = d2l.box_corner_to_center(assigned_bb)
    anchor_xy, anchor_wh = anchor_c[:, :2], anchor_c[:, 2:]
    target_xy, target_wh = target_c[:, :2], target_c[:, 2:]
    # The factors 10 and 5 are 1 / sigma with sigma_xy = 0.1 and
    # sigma_wh = 0.2; the means mu are all 0.
    xy_offset = 10 * (target_xy - anchor_xy) / anchor_wh
    wh_offset = 5 * torch.log(eps + target_wh / anchor_wh)
    return torch.cat([xy_offset, wh_offset], axis=1)
def multibox_target(anchors, labels):
    """Label anchors with classes and offsets from ground-truth boxes.

    Args:
        anchors: Tensor of shape (1, num_anchors, 4); the leading dimension
            is squeezed away and the same anchors are used for every sample.
        labels: Tensor of shape (batch_size, num_gt_boxes, 5). Column 0 is
            the class id, columns 1-4 are the box coordinates.

    Returns:
        Tuple ``(bbox_offset, bbox_mask, class_labels)``:
          * bbox_offset: (batch_size, num_anchors * 4) offsets from each
            anchor to its assigned box, zero for background anchors.
          * bbox_mask: (batch_size, num_anchors * 4), 1 for anchors with an
            assigned box and 0 for background anchors.
          * class_labels: (batch_size, num_anchors), 0 for background,
            otherwise the class id plus one.
    """
    anchors = anchors.squeeze(0)
    device = anchors.device
    num_anchors = anchors.shape[0]
    offsets_out, masks_out, classes_out = [], [], []
    for label in labels:  # one sample's ground-truth boxes at a time
        anchors_bbox_map = assign_anchor_to_bbox(label[:, 1:], anchors, device)
        assigned = anchors_bbox_map >= 0
        # (num_anchors, 4): all ones for matched anchors, all zeros otherwise.
        bbox_mask = assigned.float().unsqueeze(-1).repeat(1, 4)

        # Start with everything as background (class 0, zero box).
        class_labels = torch.zeros(num_anchors, dtype=torch.long, device=device)
        assigned_bb = torch.zeros((num_anchors, 4), dtype=torch.float32, device=device)

        positive = torch.nonzero(assigned).reshape(-1)  # matched anchor ids
        gt_rows = anchors_bbox_map[positive]  # their ground-truth rows
        # Shift class ids by one because 0 is reserved for background.
        class_labels[positive] = label[gt_rows, 0].long() + 1
        assigned_bb[positive] = label[gt_rows, 1:]

        # The mask zeroes the meaningless offsets of background anchors.
        offset = offset_boxes(anchors, assigned_bb) * bbox_mask
        offsets_out.append(offset.reshape(-1))
        masks_out.append(bbox_mask.reshape(-1))
        classes_out.append(class_labels)
    return (torch.stack(offsets_out), torch.stack(masks_out), torch.stack(classes_out))
def offset_inverse(anchors, offset_preds):
    """Decode predicted offsets back into bounding boxes.

    This inverts the offset encoding: the (dx, dy) columns are divided by 10
    and scaled by the anchor size, the (dw, dh) columns are divided by 5 and
    exponentiated.

    Args:
        anchors: Tensor of shape (num_anchors, 4) in corner format.
        offset_preds: Tensor of shape (num_anchors, 4), columns
            (dx, dy, dw, dh).

    Returns:
        Tensor of shape (num_anchors, 4), the result of
        ``d2l.box_center_to_corner`` applied to the decoded boxes.
    """
    anchor_c = d2l.box_corner_to_center(anchors)
    anchor_xy, anchor_wh = anchor_c[:, :2], anchor_c[:, 2:]
    centre = (offset_preds[:, :2] * anchor_wh / 10) + anchor_xy
    extent = torch.exp(offset_preds[:, 2:] / 5) * anchor_wh
    decoded = torch.cat((centre, extent), axis=1)
    return d2l.box_center_to_corner(decoded)
def nms(boxes, scores, iou_threshold):
    """Greedy non-maximum suppression.

    The highest-scoring remaining box is kept, and every remaining box whose
    IoU with it exceeds ``iou_threshold`` is dropped. This repeats until no
    boxes remain.

    Args:
        boxes: Tensor of shape (num_boxes, 4) in corner format.
        scores: Tensor of shape (num_boxes,) with one confidence per box.
        iou_threshold: Boxes overlapping a kept box by more than this IoU
            are suppressed.

    Returns:
        Long tensor with the indices of the kept boxes, ordered by
        descending score. It is an empty long tensor when there are no boxes.
    """
    order = torch.argsort(scores, dim=-1, descending=True)  # best first
    keep = []
    while order.numel() > 0:
        best = order[0]
        keep.append(int(best))
        if order.numel() == 1:
            break
        iou = box_iou(boxes[best, :].reshape(-1, 4),
                      boxes[order[1:], :].reshape(-1, 4)).reshape(-1)
        survivors = torch.nonzero(iou <= iou_threshold).reshape(-1)
        # `iou` was computed against order[1:], so shift by one to index
        # back into `order`.
        order = order[survivors + 1]
    # An explicit dtype keeps the result usable as an index tensor even when
    # `keep` is empty (torch.tensor([]) would otherwise be float32).
    return torch.tensor(keep, dtype=torch.long, device=boxes.device)
def multibox_detection(cls_probs, offset_preds, anchors, nms_threshold=0.5, pos_threshold=0.009999999):
    """Predict bounding boxes using non-maximum suppression.

    Args:
        cls_probs: Tensor of shape (batch_size, num_classes + 1, num_anchors).
            Row 0 of each sample is the background probability and is ignored.
        offset_preds: Tensor of shape (batch_size, num_anchors * 4).
        anchors: Tensor of shape (1, num_anchors, 4); the leading dimension
            is squeezed away and the same anchors are used for every sample.
        nms_threshold: IoU threshold passed to ``nms``.
        pos_threshold: Predictions with a confidence below this value are
            marked as background.

    Returns:
        Tensor of shape (batch_size, num_anchors, 6). Each row is
        (class_id, confidence, 4 box coordinates). ``class_id`` is -1 for
        background or suppressed boxes. Kept boxes come first.
    """
    device = cls_probs.device
    anchors = anchors.squeeze(0)
    num_anchors = cls_probs.shape[2]
    every_index = torch.arange(num_anchors, dtype=torch.long, device=device)
    results = []
    for sample in range(cls_probs.shape[0]):
        probs = cls_probs[sample]
        deltas = offset_preds[sample].reshape(-1, 4)
        # Best non-background class per anchor; ids are counted from row 1.
        conf, class_id = torch.max(probs[1:], 0)
        predicted_bb = offset_inverse(anchors, deltas)
        keep = nms(predicted_bb, conf, nms_threshold)

        # Indices that occur only once in (keep + all indices) were not kept.
        values, occurrences = torch.cat((keep, every_index)).unique(return_counts=True)
        suppressed = values[occurrences == 1]
        order = torch.cat((keep, suppressed))
        class_id[suppressed] = -1
        class_id = class_id[order]
        conf = conf[order]
        predicted_bb = predicted_bb[order]

        # Low-confidence predictions become background; their confidence is
        # replaced by 1 - conf.
        too_weak = conf < pos_threshold
        class_id[too_weak] = -1
        conf[too_weak] = 1 - conf[too_weak]
        results.append(
            torch.cat((class_id.unsqueeze(1), conf.unsqueeze(1), predicted_bb), dim=1)
        )
    return torch.stack(results)
if __name__ == "__main__":
    # ---- Example 1: label anchors against ground-truth boxes ----
    # Ground truth rows are (class, xmin, ymin, xmax, ymax): class 0 is the
    # dog, class 1 is the cat.
    ground_truth = torch.tensor([
        [0, 0.1, 0.08, 0.52, 0.92],
        [1, 0.55, 0.2, 0.9, 0.88],
    ])
    anchors = torch.tensor([
        [0, 0.1, 0.2, 0.3],
        [0.15, 0.2, 0.4, 0.4],
        [0.63, 0.05, 0.88, 0.98],
        [0.66, 0.45, 0.8, 0.8],
        [0.57, 0.3, 0.92, 0.9],
    ])
    fig = d2l.plt.imshow(img)
    show_bboxes(fig.axes, ground_truth[:, 1:] * bbox_scale, ['dog', 'cat'], 'k')
    show_bboxes(fig.axes, anchors * bbox_scale, ['0', '1', '2', '3', '4'])
    d2l.plt.show()

    # Add a leading batch dimension to both inputs.
    labels = multibox_target(anchors.unsqueeze(dim=0), ground_truth.unsqueeze(dim=0))
    print(labels[2])  # class labels
    print(labels[1])  # mask
    print(labels[0])  # offsets

    # ---- Example 2: predict boxes with non-maximum suppression ----
    anchors = torch.tensor([
        [0.1, 0.08, 0.52, 0.92],
        [0.08, 0.2, 0.56, 0.95],
        [0.15, 0.3, 0.62, 0.91],
        [0.55, 0.2, 0.9, 0.88],
    ])
    # All predicted offsets are zero, so the predicted boxes equal the anchors.
    offset_preds = torch.tensor([0] * anchors.numel())
    cls_probs = torch.tensor([
        [0] * 4,               # background probability
        [0.9, 0.8, 0.7, 0.1],  # dog probability
        [0.1, 0.2, 0.3, 0.9],  # cat probability
    ])
    fig = d2l.plt.imshow(img)
    show_bboxes(fig.axes, anchors * bbox_scale,
                ['dog=0.9', 'dog=0.8', 'dog=0.7', 'cat=0.9'])
    d2l.plt.show()

    output = multibox_detection(cls_probs.unsqueeze(dim=0),
                                offset_preds.unsqueeze(dim=0),
                                anchors.unsqueeze(dim=0),
                                nms_threshold=0.5)
    print(output)

    fig = d2l.plt.imshow(img)
    for row in output[0].detach().numpy():
        # Rows are (class_id, confidence, box); class -1 means suppressed.
        if row[0] != -1:
            label = ('dog=', 'cat=')[int(row[0])] + str(row[1])
            show_bboxes(fig.axes, [torch.tensor(row[2:]) * bbox_scale], label)
    d2l.plt.show()