An annotated walkthrough of the YOLO11 code: comments are added directly on the source to make it easier to follow.
3.1 The Detect class
ultralytics/nn/modules/head.py

```python
class Detect(nn.Module):
    """Same detection head as the one used by YOLOv8."""

    # Basic attribute definitions
    dynamic = False  # force grid reconstruction (not used here)
    export = False  # export mode (not used here)
    end2end = False  # end2end (not used here)
    max_det = 300  # max_det (not used here)
    shape = None
    anchors = torch.empty(0)  # init
    strides = torch.empty(0)  # init

    def __init__(self, nc=80, ch=()):
        """Initialize the detection head."""
        super().__init__()
        self.nc = nc  # number of classes nc
        self.nl = len(ch)  # number of detection layers nl; 3 levels in the FPN structure
        self.reg_max = 16  # number of DFL channels reg_max
        # Size of each prediction per anchor: no = nc + 4 * reg_max.
        # The 4 ltrb offsets are the distances from the grid-cell center to the
        # left, top, right and bottom edges of the predicted box (on the feature map).
        self.no = nc + self.reg_max * 4
        self.stride = torch.zeros(self.nl)  # strides of the 3 feature maps w.r.t. the input image, initialized to 0
        # The head is decoupled into box regression (cv2) and classification (cv3).
        # Build cv2 and cv3 below.
        c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100))  # channels
        self.cv2 = nn.ModuleList(
            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
        )
        self.cv3 = nn.ModuleList(
            nn.Sequential(
                nn.Sequential(DWConv(x, x, 3), Conv(x, c3, 1)),
                nn.Sequential(DWConv(c3, c3, 3), Conv(c3, c3, 1)),
                nn.Conv2d(c3, self.nc, 1),
            )
            for x in ch
        )
        # DFL module, used when the number of DFL channels is > 1
        self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()

        if self.end2end:  # not used here
            self.one2one_cv2 = copy.deepcopy(self.cv2)
            self.one2one_cv3 = copy.deepcopy(self.cv3)

    def forward(self, x):
        """Forward pass of the detection head."""
        if self.end2end:  # not used here
            return self.forward_end2end(x)

        # Run the cv2 (box) and cv3 (cls) branches to get the model output x.
        # x has nl=3 levels:
        #   level 1 shape=(nb, no, 80, 80), level 2 shape=(nb, no, 40, 40), level 3 shape=(nb, no, 20, 20)
        # nb is the batch size for training/inference.
        # Each prediction has no = nc + 4*reg_max values:
        #   nc class scores;
        #   4 ltrb offsets, i.e. distances from the grid-cell center to the left/top/right/bottom
        #   edges of the predicted box (on the feature map);
        #   reg_max is the number of DFL channels: each of the 4 ltrb offsets is a distribution
        #   over reg_max values.
        for i in range(self.nl):
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
        if self.training:  # during training, return the raw output x directly
            return x

        # Otherwise run the DFL computation and decode the predicted boxes.
        # y contains:
        #   dbox: shape=(nb, 4, 8400), xyxy coordinates on the input image
        #   cls:  shape=(nb, nc, 8400), nc class probabilities
        y = self._inference(x)
        # When not training, return (y, x) (or just y in export mode)
        return y if self.export else (y, x)

    def forward_end2end(self, x):
        """Not used here."""

    def _inference(self, x):
        """DFL computation and box decoding."""
        # Concatenate the 3 levels of x into x_cat, shape=(nb, no, 8400), where 8400 = 80*80 + 40*40 + 20*20
        shape = x[0].shape  # (nb, no, h, w)
        x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
        # self.shape is None on the first call, so the block below runs.
        # dynamic is not used here.
        # self.anchors holds the center coordinates of every grid cell, shape=(2, 8400)
        # self.strides holds the stride of every grid cell w.r.t. the input image, shape=(1, 8400)
        if self.dynamic or self.shape != shape:
            self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
            self.shape = shape

        if self.export and self.format in {"saved_model", "pb", "tflite", "edgetpu", "tfjs"}:  # avoid TF FlexSplitV ops
            # not used here
            box = x_cat[:, : self.reg_max * 4]
            cls = x_cat[:, self.reg_max * 4 :]
        else:
            # Split x_cat into box and cls: box shape=(nb, 4*reg_max, 8400), cls shape=(nb, nc, 8400)
            box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)

        if self.export and self.format in {"tflite", "edgetpu"}:  # not used here
            # Precompute normalization factor to increase numerical stability
            # See https://github.com/ultralytics/ultralytics/issues/7371
            grid_h = shape[2]
            grid_w = shape[3]
            grid_size = torch.tensor([grid_w, grid_h, grid_w, grid_h], device=box.device).reshape(1, 4, 1)
            norm = self.strides / (self.stride[0] * grid_size)
            dbox = self.decode_bboxes(self.dfl(box) * norm, self.anchors.unsqueeze(0) * norm[:, :2])
        else:
            # self.dfl(box) runs the DFL computation: box goes from shape=(nb, 4*reg_max, 8400)
            # to shape=(nb, 4, 8400).
            # self.decode_bboxes decodes the boxes, converting ltrb offsets to xyxy coordinates:
            #   ltrb offsets: distances from the grid-cell center to the left/top/right/bottom edges of the box
            #   xyxy coordinates: the box's xmin, ymin, xmax, ymax
            # Multiplying by self.strides maps the xyxy coordinates from the feature map to the input image.
            dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides

        # dbox: shape=(nb, 4, 8400), xyxy coordinates on the input image
        # cls:  shape=(nb, nc, 8400), converted to probabilities by sigmoid
        return torch.cat((dbox, cls.sigmoid()), 1)

    def decode_bboxes(self, bboxes, anchors):
        """Decode predicted boxes."""
        # Call dist2bbox to convert ltrb offsets to xyxy coordinates, returns shape=(nb, 4, 8400)
        # bboxes: the ltrb offsets of every grid cell, shape=(nb, 4, 8400)
        # anchors: the center coordinates of every grid cell, shape=(2, 8400)
        return dist2bbox(bboxes, anchors, xywh=not self.end2end, dim=1)
```
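The 8400 anchor points and per-cell strides cached in `_inference` come from `make_anchors` (ultralytics/utils/tal.py). As a sanity check on the shapes in the comments above, here is a minimal toy re-implementation for a 640x640 input; `toy_make_anchors` is a name made up for this sketch, not an ultralytics API:

```python
import torch

def toy_make_anchors(shapes=((80, 80), (40, 40), (20, 20)), strides=(8, 16, 32), offset=0.5):
    """Toy sketch: grid-cell centers and per-cell strides for 3 FPN levels."""
    anchor_points, stride_tensor = [], []
    for (h, w), s in zip(shapes, strides):
        sx = torch.arange(w, dtype=torch.float) + offset  # x centers of one row
        sy = torch.arange(h, dtype=torch.float) + offset  # y centers of one column
        sy, sx = torch.meshgrid(sy, sx, indexing="ij")
        anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2))
        stride_tensor.append(torch.full((h * w, 1), s, dtype=torch.float))
    return torch.cat(anchor_points), torch.cat(stride_tensor)

anchors, strides = toy_make_anchors()
print(anchors.shape, strides.shape)  # torch.Size([8400, 2]) torch.Size([8400, 1])
# 8400 = 80*80 + 40*40 + 20*20; Detect._inference stores the transposed (2, 8400)/(1, 8400) views
```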
3.2 The DFL class
ultralytics/nn/modules/block.py

```python
class DFL(nn.Module):
    """
    Integral module of Distribution Focal Loss (DFL).

    Proposed in "Generalized Focal Loss: Towards Efficient Representation Learning
    for Dense Object Detection" (available on IEEE Xplore).
    """

    def __init__(self, c1=16):
        """Initialize DFL."""
        super().__init__()
        # DFL is implemented as a convolution with reg_max input channels,
        # 1 output channel and a 1x1 kernel
        self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
        # The kernel weights w_i are fixed to 0, 1, ..., reg_max - 1
        x = torch.arange(c1, dtype=torch.float)
        self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1))
        self.c1 = c1

    def forward(self, x):
        """Forward pass of DFL."""
        b, _, a = x.shape
        # DFL computation: box goes from x with shape=(nb, 4*reg_max, 8400) to shape=(nb, 4, 8400).
        # The softmax outputs p_i over the reg_max values are convolved with the fixed kernel
        # weights w_i, i.e. Σ p_i * w_i for i = 0, 1, ..., reg_max - 1
        return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
```
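To see the Σ p_i · w_i computation in isolation, here is a hand-rolled version of the same expectation on a single offset distribution (plain PyTorch, no ultralytics imports; the logit values are made up for illustration):

```python
import torch

reg_max = 16
logits = torch.zeros(reg_max)
logits[3], logits[4] = 10.0, 10.0             # distribution sharply peaked between bins 3 and 4
p = logits.softmax(0)                         # p_i: softmax over the reg_max values
w = torch.arange(reg_max, dtype=torch.float)  # w_i = 0, 1, ..., reg_max - 1
print((p * w).sum())                          # ≈ 3.5: the expected ltrb offset in grid units
```

This is exactly what the frozen 1x1 convolution in `DFL.forward` computes, just for all 4 offsets and all 8400 anchors at once.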
3.3 The dist2bbox function
ultralytics/utils/tal.py

```python
def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
    """Convert box format from ltrb offsets to xywh or xyxy coordinates."""
    lt, rb = distance.chunk(2, dim)  # left/top offsets, right/bottom offsets
    x1y1 = anchor_points - lt  # xmin/ymin = center - left/top offsets
    x2y2 = anchor_points + rb  # xmax/ymax = center + right/bottom offsets
    if xywh:  # for xywh format, also compute center and size
        c_xy = (x1y1 + x2y2) / 2
        wh = x2y2 - x1y1
        return torch.cat((c_xy, wh), dim)  # xywh format
    return torch.cat((x1y1, x2y2), dim)  # xyxy format
```
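A minimal usage example with hand-picked numbers, assuming the ultralytics package is installed: an anchor centered at (10, 10) with ltrb offsets (2, 3, 4, 5) decodes to xyxy (8, 7, 14, 15), or equivalently xywh (11, 11, 6, 8):

```python
import torch
from ultralytics.utils.tal import dist2bbox  # assumes ultralytics is installed

anchor = torch.tensor([[10.0, 10.0]])            # grid-cell center
distance = torch.tensor([[2.0, 3.0, 4.0, 5.0]])  # ltrb offsets: left=2, top=3, right=4, bottom=5
print(dist2bbox(distance, anchor, xywh=False))   # tensor([[ 8.,  7., 14., 15.]])  xyxy
print(dist2bbox(distance, anchor, xywh=True))    # tensor([[11., 11.,  6.,  8.]])  xywh
```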
3.4 The Loss class
ultralytics/utils/loss.py

```python
class v8DetectionLoss:
    """Detection loss."""

    def __init__(self, model, tal_topk=10):
        """Initialize the loss class."""
        device = next(model.parameters()).device  # get model device
        h = model.args  # hyperparameters

        m = model.model[-1]  # Detect() module
        self.bce = nn.BCEWithLogitsLoss(reduction="none")  # BCE loss
        self.hyp = h  # hyperparameters
        self.stride = m.stride  # model strides: 8, 16, 32 w.r.t. the input image
        self.nc = m.nc  # number of classes
        self.no = m.nc + m.reg_max * 4  # size of each prediction per anchor
        self.reg_max = m.reg_max  # number of DFL channels
        self.device = device

        self.use_dfl = m.reg_max > 1  # DFL mode

        # Dynamic positive/negative sample assignment
        self.assigner = TaskAlignedAssigner(topk=tal_topk, num_classes=self.nc, alpha=0.5, beta=6.0)
        self.bbox_loss = BboxLoss(m.reg_max).to(device)  # box loss
        self.proj = torch.arange(m.reg_max, dtype=torch.float, device=device)  # 0, 1, ..., reg_max - 1

    def preprocess(self, targets, batch_size, scale_tensor):
        # Preprocess targets: pad to the maximum count per image, scale the box coordinates
        # to the input image, and convert xywh to xyxy.
        # targets: shape=(ng, 6), ng is the total number of ground-truth boxes in the batch.
        #   6: [0] batch_idx, [1] cls, [2-5] bboxes (xywh)
        # Example: with at most mg ground-truth boxes per image in a batch of nb images,
        # padding yields shape=(nb, mg, 5); after the xywh -> xyxy conversion,
        #   5: [0] cls, [1-4] bboxes (xyxy)
        nl, ne = targets.shape  # nl=ng, ne=6
        if nl == 0:  # no ground-truth boxes
            out = torch.zeros(batch_size, 0, ne - 1, device=self.device)
        else:  # nl=ng ground-truth boxes
            i = targets[:, 0]  # batch_idx
            _, counts = i.unique(return_counts=True)
            counts = counts.to(dtype=torch.int32)
            # padding
            out = torch.zeros(batch_size, counts.max(), ne - 1, device=self.device)
            for j in range(batch_size):
                matches = i == j
                n = matches.sum()
                if n:
                    out[j, :n] = targets[matches, 1:]
            # xywh -> xyxy, with coordinates scaled to the input image
            out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor))
        return out

    def bbox_decode(self, anchor_points, pred_dist):
        """DFL computation and box decoding."""
        if self.use_dfl:
            b, a, c = pred_dist.shape  # pred_dist: shape=(nb, 8400, 4*reg_max)
            # DFL computation. The method differs from the DFL class above but is equivalent:
            #   softmax(3) is applied over the c // 4 = reg_max dimension;
            #   self.proj is 0, 1, ..., reg_max - 1;
            #   matmul takes the dot product of the reg_max values with self.proj, shape=(nb, 8400, 4)
            pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
        # Call dist2bbox to convert ltrb offsets to xyxy coordinates, returns shape=(nb, 8400, 4)
        # pred_dist: the ltrb offsets of every grid cell, shape=(nb, 8400, 4)
        # anchor_points: the center coordinates of every grid cell, shape=(8400, 2)
        return dist2bbox(pred_dist, anchor_points, xywh=False)

    def __call__(self, preds, batch):
        """Compute the box, cls and dfl losses."""
        # preds: the model output x, with nl=3 levels:
        #   level 1 shape=(nb, no, 80, 80), level 2 shape=(nb, no, 40, 40), level 3 shape=(nb, no, 20, 20)
        # nb is the batch size for training/inference.
        # Each prediction has no = nc + 4*reg_max values:
        #   nc class scores;
        #   4 ltrb offsets, i.e. distances from the grid-cell center to the left/top/right/bottom
        #   edges of the predicted box (on the feature map);
        #   reg_max is the number of DFL channels: each of the 4 ltrb offsets is a distribution
        #   over reg_max values.
        # batch: batch_idx (image index of every ground-truth box),
        #   cls (class of every ground-truth box),
        #   bboxes (xywh coordinates of every ground-truth box),
        #   plus, for every image: img, ori_shape, resize_shape and im_file.

        # Initialize the loss: box, cls, dfl
        loss = torch.zeros(3, device=self.device)
        # During training, feats = preds = x.
        # Otherwise preds = (y, x) and feats = preds[1] = x, where y contains:
        #   dbox: shape=(nb, 4, 8400), xyxy coordinates on the input image
        #   cls:  shape=(nb, nc, 8400), nc class probabilities
        feats = preds[1] if isinstance(preds, tuple) else preds
        # Split feats into pred_distri and pred_scores:
        #   pred_distri: shape=(nb, 4*reg_max, 8400), the 4 groups of reg_max distribution values per grid cell
        #   pred_scores: shape=(nb, nc, 8400), the nc class scores per grid cell
        pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
            (self.reg_max * 4, self.nc), 1
        )

        # pred_distri: shape=(nb, 8400, 4*reg_max)
        # pred_scores: shape=(nb, 8400, nc)
        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
        pred_distri = pred_distri.permute(0, 2, 1).contiguous()

        dtype = pred_scores.dtype  # dtype of the class scores
        batch_size = pred_scores.shape[0]  # nb
        imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # input image size
        # anchor_points: center coordinates of every grid cell, shape=(8400, 2)
        # stride_tensor: stride of every grid cell w.r.t. the input image, shape=(8400, 1)
        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)

        # Convert batch into targets:
        # targets: shape=(ng, 6), ng is the total number of ground-truth boxes in the batch.
        #   [0] batch_idx, [1] cls, [2-5] bboxes (xywh)
        targets = torch.cat((batch["batch_idx"].view(-1, 1), batch["cls"].view(-1, 1), batch["bboxes"]), 1)
        # Preprocess targets: pad to the maximum count, scale coordinates to the input image, xywh -> xyxy.
        # Example: with at most mg ground-truth boxes per image in a batch of nb images, padding yields
        # shape=(nb, mg, 5); after xywh -> xyxy conversion, 5: [0] cls, [1-4] bboxes (xyxy)
        targets = self.preprocess(targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
        # Split targets into gt_labels: shape=(nb, mg, 1) and gt_bboxes: shape=(nb, mg, 4), i.e. cls and bboxes
        gt_labels, gt_bboxes = targets.split((1, 4), 2)
        # Ground-truth mask: which entries are real boxes and which are padding, mask_gt: shape=(nb, mg, 1)
        mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0.0)

        # DFL computation and box decoding:
        #   DFL: pred_distri goes from shape=(nb, 8400, 4*reg_max) to shape=(nb, 8400, 4)
        #   decoding: ltrb offsets -> xyxy coordinates
        #   ltrb offsets: distances from the grid-cell center to the left/top/right/bottom edges of the box
        #   xyxy coordinates: the box's xmin, ymin, xmax, ymax
        # pred_bboxes: shape=(nb, 8400, 4), on the feature map
        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)
        # dfl_conf = pred_distri.view(batch_size, -1, 4, self.reg_max).detach().softmax(-1)
        # dfl_conf = (dfl_conf.amax(-1).mean(-1) + dfl_conf.amax(-1).amin(-1)) / 2

        # Dynamic positive/negative sample assignment:
        #   first, for each ground-truth box, take every predicted box's classification score
        #   for that ground-truth class;
        #   then compute the IoU between predicted and ground-truth boxes, and combine the
        #   classification score and IoU into an alignment metric linking classification and regression;
        #   finally, the top-K predictions by alignment metric become positives, the rest negatives.
        # Assigned ground-truth coordinates           target_bboxes: shape=(nb, 8400, 4)
        # Assigned ground-truth class scores          target_scores: shape=(nb, 8400, nc)
        # Assigned ground-truth mask                  fg_mask: shape=(nb, 8400)
        # Grid-cell centers on the input image        anchor_points * stride_tensor: shape=(8400, 2)
        # Predicted xyxy boxes on the input image     pred_bboxes * stride_tensor: shape=(nb, 8400, 4)
        # Predicted class probabilities               pred_scores.sigmoid(): shape=(nb, 8400, nc)
        # Ground-truth xyxy boxes on the input image  gt_bboxes: shape=(nb, mg, 4)
        # Ground-truth classes                        gt_labels: shape=(nb, mg, 1)
        # Ground-truth mask (real boxes vs padding)   mask_gt: shape=(nb, mg, 1)
        _, target_bboxes, target_scores, fg_mask, _ = self.assigner(
            # pred_scores.detach().sigmoid() * 0.8 + dfl_conf.unsqueeze(-1) * 0.2,
            pred_scores.detach().sigmoid(),
            (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
            anchor_points * stride_tensor,
            gt_labels,
            gt_bboxes,
            mask_gt,
        )

        # Sum of the assigned ground-truth class scores (at least 1)
        target_scores_sum = max(target_scores.sum(), 1)

        # cls loss
        # loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum  # VFL way
        loss[1] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum  # BCE

        # box loss
        if fg_mask.sum():
            target_bboxes /= stride_tensor
            loss[0], loss[2] = self.bbox_loss(
                pred_distri, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask
            )

        loss[0] *= self.hyp.box  # box gain
        loss[1] *= self.hyp.cls  # cls gain
        loss[2] *= self.hyp.dfl  # dfl gain

        return loss.sum() * batch_size, loss.detach()  # loss(box, cls, dfl)
```
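To illustrate the padding that `preprocess` performs, here is a stand-alone toy run (hand-written tensors; the scaling and xywh2xyxy steps are omitted, so this is a sketch of the padding loop, not the ultralytics code path): a batch of nb=2 images with 1 and 2 ground-truth boxes respectively yields shape (2, 2, 5), with the first image padded by an all-zero row that `mask_gt` later filters out:

```python
import torch

# (ng=3, 6): [batch_idx, cls, x, y, w, h] in normalized xywh, as produced by the dataloader
targets = torch.tensor([
    [0.0, 1.0, 0.50, 0.50, 0.20, 0.20],
    [1.0, 0.0, 0.25, 0.25, 0.10, 0.10],
    [1.0, 2.0, 0.75, 0.75, 0.30, 0.30],
])
batch_size, mg = 2, 2  # nb images, at most mg boxes per image
out = torch.zeros(batch_size, mg, 5)
for j in range(batch_size):
    rows = targets[targets[:, 0] == j, 1:]  # [cls, x, y, w, h] rows of image j
    out[j, : rows.shape[0]] = rows
print(out.shape)  # torch.Size([2, 2, 5])
print(out[0, 1])  # tensor([0., 0., 0., 0., 0.]) -> padding row, masked out by mask_gt
```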
3.5 The TaskAlignedAssigner class
ultralytics/utils/tal.py

```python
class TaskAlignedAssigner(nn.Module):
    # Partially annotated; not all methods are shown.

    def __init__(self, topk=13, num_classes=80, alpha=1.0, beta=6.0, eps=1e-9):
        """Initialize."""
        super().__init__()
        self.topk = topk
        self.num_classes = num_classes
        self.bg_idx = num_classes
        self.alpha = alpha
        self.beta = beta
        self.eps = eps

    @torch.no_grad()
    def forward(self, pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt):
        # Predicted class probabilities per grid cell  pd_scores: shape=(nb, 8400, nc)
        # Predicted xyxy boxes on the input image      pd_bboxes: shape=(nb, 8400, 4)
        # Grid-cell centers on the input image         anc_points: shape=(8400, 2)
        # Ground-truth classes                         gt_labels: shape=(nb, mg, 1)
        # Ground-truth xyxy boxes on the input image   gt_bboxes: shape=(nb, mg, 4)
        # Ground-truth mask (real boxes vs padding)    mask_gt: shape=(nb, mg, 1)
        self.bs = pd_scores.shape[0]  # bs = nb
        self.n_max_boxes = gt_bboxes.shape[1]  # n_max_boxes = mg, max ground-truth boxes per image

        if self.n_max_boxes == 0:  # no ground-truth boxes
            device = gt_bboxes.device
            return (
                torch.full_like(pd_scores[..., 0], self.bg_idx).to(device),
                torch.zeros_like(pd_bboxes).to(device),
                torch.zeros_like(pd_scores).to(device),
                torch.zeros_like(pd_scores[..., 0]).to(device),
                torch.zeros_like(pd_scores[..., 0]).to(device),
            )

        # mask_pos: shape=(nb, mg, 8400), anchors that lie inside a ground-truth box, are among
        #   that box's top-k best-aligned candidates, and satisfy mask_gt
        # align_metric: shape=(nb, mg, 8400), an anchor's class probability for a ground-truth
        #   box's class multiplied by the overlap between the anchor's predicted box and that box
        # overlaps: shape=(nb, mg, 8400), overlap between every ground-truth box and every anchor
        mask_pos, align_metric, overlaps = self.get_pos_mask(
            pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, mask_gt
        )

        # target_gt_idx: shape=(nb, 8400), which gt each anchor is assigned to
        # fg_mask: shape=(nb, 8400), whether each anchor has an assigned gt
        # mask_pos: shape=(nb, mg, 8400), one-hot version of target_gt_idx
        target_gt_idx, fg_mask, mask_pos = self.select_highest_overlaps(mask_pos, overlaps, self.n_max_boxes)

        # Assigned targets: assign ground truths to their anchor points
        # target_labels: shape=(nb, 8400)
        # target_bboxes: shape=(nb, 8400, 4)
        # target_scores: shape=(nb, 8400, nc)
        target_labels, target_bboxes, target_scores = self.get_targets(gt_labels, gt_bboxes, target_gt_idx, fg_mask)

        # Normalize
        # Multiply by mask_pos to zero out anchors that do not satisfy a ground-truth box
        align_metric *= mask_pos
        # pos_align_metrics: shape=(nb, mg, 1), the maximum alignment metric per ground-truth box
        pos_align_metrics = align_metric.amax(dim=-1, keepdim=True)  # b, max_num_obj
        # pos_overlaps: shape=(nb, mg, 1), the maximum overlap per ground-truth box
        pos_overlaps = (overlaps * mask_pos).amax(dim=-1, keepdim=True)  # b, max_num_obj
        # Multiply each gt-anchor alignment metric by the maximum overlap, then divide by the maximum metric
        norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).amax(-2).unsqueeze(-1)
        # target_scores serve as the normalized labels
        target_scores = target_scores * norm_align_metric

        return target_labels, target_bboxes, target_scores, fg_mask.bool(), target_gt_idx

    def get_pos_mask(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, mask_gt):
        # pd_scores: shape=(nb, 8400, nc), pd_bboxes: shape=(nb, 8400, 4), anc_points: shape=(8400, 2)
        # gt_bboxes: shape=(nb, mg, 4), mask_gt: shape=(nb, mg, 1)
        # mask_in_gts: shape=(nb, mg, 8400); a value of 1 means that grid cell lies inside the ground-truth box
        mask_in_gts = self.select_candidates_in_gts(anc_points, gt_bboxes)
        # mask_in_gts * mask_gt: drop padded boxes from the in-box grid-cell marks
        # overlaps: shape=(nb, mg, 8400), CIoU between ground-truth and predicted boxes
        # align_metric: shape=(nb, mg, 8400), a computed cost value: an anchor's class probability
        #   for a ground-truth box's class multiplied by the overlap between the anchor's
        #   predicted box and that ground-truth box
        align_metric, overlaps = self.get_box_metrics(pd_scores, pd_bboxes, gt_labels, gt_bboxes, mask_in_gts * mask_gt)
        # A positive anchor must simultaneously:
        #   lie inside a ground-truth box;
        #   be among that box's top-k best-aligned candidates;
        #   satisfy mask_gt.
        # mask_topk: shape=(nb, mg, 8400), marks anchors in a ground-truth box's top-k
        mask_topk = self.select_topk_candidates(align_metric, topk_mask=mask_gt.expand(-1, -1, self.topk).bool())
        # mask_pos: shape=(nb, mg, 8400), with padded boxes removed
        mask_pos = mask_topk * mask_in_gts * mask_gt

        return mask_pos, align_metric, overlaps

    def get_box_metrics(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, mask_gt):
        # pd_scores: shape=(nb, 8400, nc), pd_bboxes: shape=(nb, 8400, 4)
        # gt_labels: shape=(nb, mg, 1), gt_bboxes: shape=(nb, mg, 4)
        # mask_gt: shape=(nb, mg, 8400), marks of non-padded grid cells (anchors) inside ground-truth boxes
        na = pd_bboxes.shape[-2]  # 8400
        mask_gt = mask_gt.bool()  # convert the anchor marks to bool
        # overlaps: shape=(nb, mg, 8400), overlap values
        # bbox_scores: shape=(nb, mg, 8400), class scores of the boxes
        overlaps = torch.zeros([self.bs, self.n_max_boxes, na], dtype=pd_bboxes.dtype, device=pd_bboxes.device)
        bbox_scores = torch.zeros([self.bs, self.n_max_boxes, na], dtype=pd_scores.dtype, device=pd_scores.device)

        # ind: shape=(2, nb, mg)
        ind = torch.zeros([2, self.bs, self.n_max_boxes], dtype=torch.long)  # 2, b, max_num_obj
        # ind[0]: shape=(nb, mg), image index within the batch; row 0 holds image index 0 for mg boxes, ...
        ind[0] = torch.arange(end=self.bs).view(-1, 1).expand(-1, self.n_max_boxes)
        # ind[1]: shape=(nb, mg), ground-truth classes; row 0 holds the classes of image 0's mg boxes, ...
        ind[1] = gt_labels.squeeze(-1)
        # pd_scores: shape=(nb, 8400, nc)
        # bbox_scores: shape=(nb, mg, 8400): for every image and every ground-truth box, the
        #   predicted probability of that box's class at each of the 8400 grid cells;
        #   bbox_scores[mask_gt] fills only the marked grid cells
        bbox_scores[mask_gt] = pd_scores[ind[0], :, ind[1]][mask_gt]

        # pd_bboxes: shape=(nb, 8400, 4), expanded to shape=(nb, mg, 8400, 4), then indexed by mask_gt.
        # mask_gt: shape=(nb, mg, 8400), with mmg marked entries out of nb*mg*8400
        # pd_boxes: shape=(mmg, 4)
        pd_boxes = pd_bboxes.unsqueeze(1).expand(-1, self.n_max_boxes, -1, -1)[mask_gt]
        # gt_bboxes: shape=(nb, mg, 4), expanded to shape=(nb, mg, 8400, 4), then indexed by mask_gt
        # gt_boxes: shape=(mmg, 4)
        gt_boxes = gt_bboxes.unsqueeze(2).expand(-1, -1, na, -1)[mask_gt]
        # CIoU between ground-truth and predicted boxes
        # overlaps: shape=(nb, mg, 8400)
        overlaps[mask_gt] = self.iou_calculation(gt_boxes, pd_boxes)

        # align_metric: shape=(nb, mg, 8400), a computed cost value: an anchor's class probability
        #   for a ground-truth box's class multiplied by the overlap between the anchor's
        #   predicted box and that ground-truth box
        align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta)
        return align_metric, overlaps

    @staticmethod
    def select_candidates_in_gts(xy_centers, gt_bboxes, eps=1e-9):
        # Grid-cell centers on the input image  xy_centers: shape=(8400, 2)
        # Ground-truth xyxy boxes               gt_bboxes: shape=(nb, mg, 4)
        n_anchors = xy_centers.shape[0]  # number of anchor points, 8400
        bs, n_boxes, _ = gt_bboxes.shape  # bs=nb, n_boxes=mg
        lt, rb = gt_bboxes.view(-1, 1, 4).chunk(2, 2)  # lt=(xmin, ymin), rb=(xmax, ymax)
        # Distances from the 4 edges of each ground-truth box to each grid-cell center (anchor point).
        # If the box encloses the anchor point, all 4 distances are positive.
        # bbox_deltas: shape=(nb, mg, 8400, 4): the 4 distances between nb*mg boxes and 8400 anchor points
        bbox_deltas = torch.cat((xy_centers[None] - lt, rb - xy_centers[None]), dim=2).view(bs, n_boxes, n_anchors, -1)
        # bbox_deltas.amin(3): shape=(nb, mg, 8400), minimum of the 4 distances.
        # If even the minimum distance exceeds eps (i.e. > 0), the anchor point lies inside the box.
        # Returns shape=(nb, mg, 8400); a value of 1 means that grid cell lies inside the ground-truth box
        return bbox_deltas.amin(3).gt_(eps)
```
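Finally, a small check of `select_candidates_in_gts` on hand-made data, assuming the ultralytics package is installed: with two anchor centers and one ground-truth box, only the center that falls inside the box is marked (the method returns a 0/1 float mask because of the in-place `gt_`):

```python
import torch
from ultralytics.utils.tal import TaskAlignedAssigner  # assumes ultralytics is installed

xy_centers = torch.tensor([[4.0, 4.0], [60.0, 60.0]])  # (na=2, 2) anchor centers
gt_bboxes = torch.tensor([[[0.0, 0.0, 10.0, 10.0]]])   # (nb=1, mg=1, 4) xyxy box
mask = TaskAlignedAssigner.select_candidates_in_gts(xy_centers, gt_bboxes)
print(mask)  # tensor([[[1., 0.]]]): only the (4, 4) center lies inside the box
```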