detectron2, the detection library released by facebookresearch, covers nearly all of the current mainstream detection models. This post walks through how a model is built from its config.
1. Building the model
def build_model(cfg):
"""
Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``.
The meta-architecture name in the config is looked up in the registry, and the matching class is called to create the model.
"""
meta_arch = cfg.MODEL.META_ARCHITECTURE
model = META_ARCH_REGISTRY.get(meta_arch)(cfg)
model.to(torch.device(cfg.MODEL.DEVICE))
_log_api_usage("modeling.meta_arch." + meta_arch)
return model
Printing META_ARCH_REGISTRY shows that it is a Registry object. It behaves like a dictionary: the get function looks up a value by its key.
Registry of META_ARCH:
╒═══════════════════╤════════════════════════════════════════════════════════════════════════╕
│ Names │ Objects │
╞═══════════════════╪════════════════════════════════════════════════════════════════════════╡
│ GeneralizedRCNN │ <class 'detectron2.modeling.meta_arch.rcnn.GeneralizedRCNN'> │
├───────────────────┼────────────────────────────────────────────────────────────────────────┤
│ ProposalNetwork │ <class 'detectron2.modeling.meta_arch.rcnn.ProposalNetwork'> │
├───────────────────┼────────────────────────────────────────────────────────────────────────┤
│ SemanticSegmentor │ <class 'detectron2.modeling.meta_arch.semantic_seg.SemanticSegmentor'> │
├───────────────────┼────────────────────────────────────────────────────────────────────────┤
│ PanopticFPN │ <class 'detectron2.modeling.meta_arch.panoptic_fpn.PanopticFPN'> │
├───────────────────┼────────────────────────────────────────────────────────────────────────┤
│ RetinaNet │ <class 'detectron2.modeling.meta_arch.retinanet.RetinaNet'> │
╘═══════════════════╧════════════════════════════════════════════════════════════════════════╛
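The same look-up-then-instantiate pattern can be reproduced with a tiny registry of our own. Below is a minimal sketch; MY_ARCH_REGISTRY and MyToyArch are made-up names for illustration, but Registry itself is detectron2's real class:
from detectron2.utils.registry import Registry

MY_ARCH_REGISTRY = Registry("MY_ARCH")  # a registry is essentially a name -> object mapping

@MY_ARCH_REGISTRY.register()
class MyToyArch:
    def __init__(self, cfg):
        self.device = cfg["device"]

cfg = {"device": "cpu"}
model = MY_ARCH_REGISTRY.get("MyToyArch")(cfg)  # look up the class by name, then instantiate it
print(type(model).__name__)  # MyToyArch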
2. Configuration files
The config file used here is as follows:
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50
RPN:
BBOX_REG_LOSS_TYPE: "giou"
BBOX_REG_LOSS_WEIGHT: 2.0
ROI_BOX_HEAD:
BBOX_REG_LOSS_TYPE: "giou"
BBOX_REG_LOSS_WEIGHT: 10.0
The base file Base-RCNN-FPN.yaml looks like this:
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
BACKBONE:
NAME: "build_resnet_fpn_backbone"
RESNETS:
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
FPN:
IN_FEATURES: ["res2", "res3", "res4", "res5"]
ANCHOR_GENERATOR:
SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
RPN:
IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
PRE_NMS_TOPK_TEST: 1000 # Per FPN level
# Detectron1 uses 2000 proposals per-batch,
# (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
# which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
POST_NMS_TOPK_TRAIN: 1000
POST_NMS_TOPK_TEST: 1000
ROI_HEADS:
NAME: "StandardROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
ROI_BOX_HEAD:
NAME: "FastRCNNConvFCHead"
NUM_FC: 2
POOLER_RESOLUTION: 7
ROI_MASK_HEAD:
NAME: "MaskRCNNConvUpsampleHead"
NUM_CONV: 4
POOLER_RESOLUTION: 14
DATASETS:
TRAIN: ("coco_2017_train",)
TEST: ("coco_2017_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.02
STEPS: (60000, 80000)
MAX_ITER: 90000
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2
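A minimal sketch of how such a config is loaded: merge_from_file resolves the _BASE_ field first, so Base-RCNN-FPN.yaml is merged before the child file. The model-zoo path below is one standard Mask R-CNN config; it is only an assumption about which child yaml you want to load:
from detectron2 import model_zoo
from detectron2.config import get_cfg

cfg = get_cfg()  # start from detectron2's defaults
# _BASE_ is resolved recursively, so the base yaml is merged before the child yaml
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
print(cfg.MODEL.META_ARCHITECTURE)  # GeneralizedRCNN
print(cfg.MODEL.BACKBONE.NAME)      # build_resnet_fpn_backbone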
3. GeneralizedRCNN
Since META_ARCHITECTURE is "GeneralizedRCNN", the registry lookup
model = META_ARCH_REGISTRY.get(meta_arch)(cfg)
is equivalent to calling GeneralizedRCNN(cfg).
Printing the model gives the result below; it is made up of three sub-modules:
- backbone
- proposal_generator
- roi_heads
GeneralizedRCNN(
(backbone): FPN(...)
(proposal_generator): RPN(...)
(roi_heads): StandardROIHeads(
(box_pooler): ROIPooler(...)
(box_head): FastRCNNConvFCHead(...)
(box_predictor): FastRCNNOutputLayers(
(cls_score): Linear(in_features=1024, out_features=81, bias=True)
(bbox_pred): Linear(in_features=1024, out_features=320, bias=True)
)
)
)
The GeneralizedRCNN class is defined as:
class GeneralizedRCNN(nn.Module):
"""
Generalized R-CNN. Any model that contains the following three components:
1. Per-image feature extraction (aka backbone)
2. Region proposal generation
3. Per-region feature extraction and prediction
"""
@configurable
def __init__(
self,
*,
backbone: Backbone,
proposal_generator: nn.Module,
roi_heads: nn.Module,
pixel_mean: Tuple[float],
pixel_std: Tuple[float],
input_format: Optional[str] = None,
vis_period: int = 0,
):
"""
Args:
backbone: a backbone module, must follow detectron2's backbone interface
proposal_generator: a module that generates proposals using backbone features
roi_heads: a ROI head that performs per-region computation
pixel_mean, pixel_std: list or tuple with #channels element, representing
the per-channel mean and std to be used to normalize the input image
input_format: describe the meaning of channels of input. Needed by visualization
vis_period: the period to run visualization. Set to 0 to disable.
"""
super().__init__()
self.backbone = backbone
self.proposal_generator = proposal_generator
self.roi_heads = roi_heads
self.input_format = input_format
self.vis_period = vis_period
if vis_period > 0:
assert input_format is not None, "input_format is required for visualization!"
self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
assert (
self.pixel_mean.shape == self.pixel_std.shape
), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
4. backbone
When the backbone is built, it inherits from the Backbone base class, which only declares the names of a few basic methods.
class Backbone(nn.Module, metaclass=ABCMeta):
"""
Abstract base class for network backbones.
"""
def __init__(self):
"""
The `__init__` method of any subclass can specify its own set of arguments.
"""
super().__init__()
@abstractmethod
def forward(self):
"""
Subclasses must override this method, but adhere to the same return type.
Returns:
dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor
"""
pass
@property
def size_divisibility(self) -> int:
"""
Some backbones require the input height and width to be divisible by a
specific integer. This is typically true for encoder / decoder type networks
with lateral connection (e.g., FPN) for which feature maps need to match
dimension in the "bottom up" and "top down" paths. Set to 0 if no specific
input size divisibility is required.
"""
return 0
def output_shape(self):
"""
Returns:
dict[str->ShapeSpec]
"""
# this is a backward-compatible default
return {
name: ShapeSpec(
channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
)
for name in self._out_features
}
When the from_config function is called, the build_backbone function is used to construct the backbone. The builder to use is given by:
backbone_name = cfg.MODEL.BACKBONE.NAME
Printing backbone_name gives
'build_resnet_fpn_backbone'
As the name suggests, this builds both the ResNet network and the FPN network on top of it.
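build_backbone performs a registry dispatch very similar to the meta-architecture one. A sketch of that lookup, using the cfg loaded as in section 2 (the model-zoo path is the same assumption as before):
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.layers import ShapeSpec
from detectron2.modeling import BACKBONE_REGISTRY

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
backbone_name = cfg.MODEL.BACKBONE.NAME                      # 'build_resnet_fpn_backbone'
input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))  # 3-channel input
backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape)
print(type(backbone).__name__)                               # FPN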
4.1 BasicStem
The most basic unit of ResNet: it contains one conv layer, one activation, and one max-pooling layer.
class BasicStem(CNNBlockBase):
"""
The standard ResNet stem (layers before the first residual block),
with a conv, relu and max_pool.
"""
def __init__(self, in_channels=3, out_channels=64, norm="BN"):
"""
Args:
norm (str or callable): norm after the first conv layer.
See :func:`layers.get_norm` for supported format.
"""
super().__init__(in_channels, out_channels, 4)
self.in_channels = in_channels
self.conv1 = Conv2d(
in_channels,
out_channels,
kernel_size=7,
stride=2,
padding=3,
bias=False,
norm=get_norm(norm, out_channels),
)
weight_init.c2_msra_fill(self.conv1)
def forward(self, x):
x = self.conv1(x)
x = F.relu_(x)
x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
return x
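A quick sketch to check the stem's total stride (BasicStem is importable from detectron2.modeling.backbone.resnet): the 7x7 conv with stride 2 followed by the stride-2 max pooling gives an overall downsampling factor of 4.
import torch
from detectron2.modeling.backbone.resnet import BasicStem

stem = BasicStem(in_channels=3, out_channels=64, norm="BN")
x = torch.rand(2, 3, 224, 224)
y = stem(x)
print(y.shape)  # torch.Size([2, 64, 56, 56]) -- 224 / 4 = 56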
4.2 BasicBlock
A basic residual block. BasicBlock contains two 3x3 conv layers plus a residual connection.
In particular, in the first BasicBlock of each stage, a 1x1 conv layer is added on the shortcut path so that the number of input channels matches the number of output channels.
class BasicBlock(CNNBlockBase):
"""
The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`,
with two 3x3 conv layers and a projection shortcut if needed.
"""
def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"):
"""
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
stride (int): Stride for the first conv.
norm (str or callable): normalization for all conv layers.
See :func:`layers.get_norm` for supported format.
"""
super().__init__(in_channels, out_channels, stride)
# In the first BasicBlock of each stage the input and output channel counts differ,
# so a 1x1 conv is added on the shortcut path to match them
if in_channels != out_channels:
self.shortcut = Conv2d(
in_channels,
out_channels,
kernel_size=1,
stride=stride,
bias=False,
norm=get_norm(norm, out_channels),
)
else:
self.shortcut = None
self.conv1 = Conv2d(
in_channels,
out_channels,
kernel_size=3,
stride=stride,
padding=1,
bias=False,
norm=get_norm(norm, out_channels),
)
self.conv2 = Conv2d(
out_channels,
out_channels,
kernel_size=3,
stride=1,
padding=1,
bias=False,
norm=get_norm(norm, out_channels),
)
for layer in [self.conv1, self.conv2, self.shortcut]:
if layer is not None: # shortcut can be None
weight_init.c2_msra_fill(layer)
def forward(self, x):
out = self.conv1(x)
out = F.relu_(out)
out = self.conv2(out)
# apply the 1x1 conv on the shortcut path if one was created
if self.shortcut is not None:
shortcut = self.shortcut(x)
else:
shortcut = x
out += shortcut
out = F.relu_(out)
return out
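A small sketch showing when the 1x1 shortcut conv is created: it appears only when in_channels and out_channels differ (note that stride is a keyword-only argument):
import torch
from detectron2.modeling.backbone.resnet import BasicBlock

blk_same = BasicBlock(64, 64, stride=1, norm="BN")
blk_down = BasicBlock(64, 128, stride=2, norm="BN")
print(blk_same.shortcut)        # None -- identity shortcut
print(type(blk_down.shortcut))  # a 1x1 Conv2d that also applies the stride

x = torch.rand(2, 64, 56, 56)
print(blk_down(x).shape)        # torch.Size([2, 128, 28, 28])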
4.3 make_stage
Repeat BasicBlock num_blocks times to build each stage (res2 to res5) of the network:
def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs):
"""
Create a list of blocks of the same type that forms one ResNet stage.
Args:
block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this
stage. A module of this type must not change spatial resolution of inputs unless its
stride != 1.
num_blocks (int): number of blocks in this stage
in_channels (int): input channels of the entire stage.
out_channels (int): output channels of **every block** in the stage.
kwargs: other arguments passed to the constructor of
`block_class`. If the argument name is "xx_per_block", the
argument is a list of values to be passed to each block in the
stage. Otherwise, the same argument is passed to every block
in the stage.
Returns:
list[CNNBlockBase]: a list of block module.
"""
blocks = []
# append num_blocks blocks of block_class
for i in range(num_blocks):
curr_kwargs = {}
for k, v in kwargs.items():
if k.endswith("_per_block"):
assert len(v) == num_blocks, (
f"Argument '{k}' of make_stage should have the "
f"same length as num_blocks={num_blocks}."
)
newk = k[: -len("_per_block")]
assert newk not in kwargs, f"Cannot call make_stage with both {k} and {newk}!"
curr_kwargs[newk] = v[i]
else:
curr_kwargs[k] = v
blocks.append(
block_class(in_channels=in_channels, out_channels=out_channels, **curr_kwargs)
)
in_channels = out_channels
return blocks
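A sketch of how this is called in practice. ResNet.make_stage exposes the same logic as a static method, and any "xx_per_block" argument supplies one value per block:
from detectron2.modeling.backbone.resnet import BasicBlock, ResNet

blocks = ResNet.make_stage(
    BasicBlock,
    num_blocks=2,
    in_channels=64,
    out_channels=64,
    stride_per_block=[1, 1],  # one stride per block; only the first block of a stage downsamples
    norm="BN",
)
print(len(blocks), type(blocks[0]).__name__)  # 2 BasicBlock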
4.4 ResNet
The stages are then assembled into the complete ResNet network. Stages res2 to res5 are built in build_resnet_backbone:
def build_resnet_backbone(cfg, input_shape):
"""
Create a ResNet instance from config.
Returns:
ResNet: a :class:`ResNet` instance.
"""
# need registration of new blocks/stems?
norm = cfg.MODEL.RESNETS.NORM
stem = BasicStem(
in_channels=input_shape.channels,
out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS,
norm=norm,
)
# fmt: off
freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT
out_features = cfg.MODEL.RESNETS.OUT_FEATURES
depth = cfg.MODEL.RESNETS.DEPTH
num_groups = cfg.MODEL.RESNETS.NUM_GROUPS
width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
bottleneck_channels = num_groups * width_per_group
in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS
out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1
res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION
deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE
deform_modulated = cfg.MODEL.RESNETS.DEFORM_MODULATED
deform_num_groups = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS
# fmt: on
assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)
num_blocks_per_stage = {
18: [2, 2, 2, 2],
34: [3, 4, 6, 3],
50: [3, 4, 6, 3],
101: [3, 4, 23, 3],
152: [3, 8, 36, 3],
}[depth]
stages = []
for idx, stage_idx in enumerate(range(2, 6)):
# res5_dilation is used this way as a convention in R-FCN & Deformable Conv paper
dilation = res5_dilation if stage_idx == 5 else 1
first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2
stage_kargs = {
"num_blocks": num_blocks_per_stage[idx],
"stride_per_block": [first_stride] + [1] * (num_blocks_per_stage[idx] - 1),
"in_channels": in_channels,
"out_channels": out_channels,
"norm": norm,
}
# Use BasicBlock for R18 and R34.
if depth in [18, 34]:
stage_kargs["block_class"] = BasicBlock
else:
stage_kargs["bottleneck_channels"] = bottleneck_channels
stage_kargs["stride_in_1x1"] = stride_in_1x1
stage_kargs["dilation"] = dilation
stage_kargs["num_groups"] = num_groups
if deform_on_per_stage[idx]:
stage_kargs["block_class"] = DeformBottleneckBlock
stage_kargs["deform_modulated"] = deform_modulated
stage_kargs["deform_num_groups"] = deform_num_groups
else:
stage_kargs["block_class"] = BottleneckBlock
blocks = ResNet.make_stage(**stage_kargs)
in_channels = out_channels
out_channels *= 2
bottleneck_channels *= 2
stages.append(blocks)
return ResNet(stem, stages, out_features=out_features, freeze_at=freeze_at)
The forward function:
def forward(self, x):
"""
Args:
x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
Returns:
dict[str->Tensor]: names and the corresponding features
"""
assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!"
outputs = {}
# the stem (first conv layer)
x = self.stem(x)
if "stem" in self._out_features:
outputs["stem"] = x
# stages res2 to res5
for name, stage in zip(self.stage_names, self.stages):
x = stage(x)
if name in self._out_features:
outputs[name] = x
# average pooling and fully-connected layer (only used when ResNet is built as a classifier)
if self.num_classes is not None:
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.linear(x)
if "linear" in self._out_features:
outputs["linear"] = x
return outputs
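A sketch of building a plain ResNet backbone and inspecting the dict returned by forward; build_resnet_backbone is the registered builder for the ResNet-only backbone, and the cfg is loaded as in section 2 (same model-zoo path assumption):
import torch
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.layers import ShapeSpec
from detectron2.modeling.backbone.resnet import build_resnet_backbone

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
resnet = build_resnet_backbone(cfg, ShapeSpec(channels=3))
feats = resnet(torch.rand(1, 3, 256, 256))
for name, f in feats.items():
    print(name, tuple(f.shape))  # res2..res5 with strides 4, 8, 16, 32 and growing channel counts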
5. FPN
In the FPN implementation, lateral_convs are the 1x1 conv layers that reduce the channel dimension of each backbone feature map, and output_convs are the 3x3 conv layers applied after the upsampled top-down feature has been fused with the lateral one.
class FPN(Backbone):
"""
This module implements :paper:`FPN`.
It creates pyramid features built on top of some input feature maps.
"""
_fuse_type: torch.jit.Final[str]
def __init__(
self, bottom_up, in_features, out_channels, norm="", top_block=None, fuse_type="sum"
):
"""
Args:
bottom_up (Backbone): module representing the bottom up subnetwork.
Must be a subclass of :class:`Backbone`. The multi-scale feature
maps generated by the bottom up network, and listed in `in_features`,
are used to generate FPN levels.
in_features (list[str]): names of the input feature maps coming
from the backbone to which FPN is attached. For example, if the
backbone produces ["res2", "res3", "res4"], any *contiguous* sublist
of these may be used; order must be from high to low resolution.
out_channels (int): number of channels in the output feature maps.
norm (str): the normalization to use.
top_block (nn.Module or None): if provided, an extra operation will
be performed on the output of the last (smallest resolution)
FPN output, and the result will extend the result list. The top_block
further downsamples the feature map. It must have an attribute
"num_levels", meaning the number of extra FPN levels added by
this block, and "in_feature", which is a string representing
its input feature (e.g., p5).
fuse_type (str): types for fusing the top down features and the lateral
ones. It can be "sum" (default), which sums up element-wise; or "avg",
which takes the element-wise mean of the two.
"""
super(FPN, self).__init__()
assert isinstance(bottom_up, Backbone)
assert in_features, in_features
# feature map strides and channels of the bottom-up network (e.g., ResNet)
input_shapes = bottom_up.output_shape()
strides = [input_shapes[f].stride for f in in_features]
in_channels_per_feature = [input_shapes[f].channels for f in in_features]
_assert_strides_are_log2_contiguous(strides)
lateral_convs = []
output_convs = []
use_bias = norm == ""
for idx, in_channels in enumerate(in_channels_per_feature):
lateral_norm = get_norm(norm, out_channels)
output_norm = get_norm(norm, out_channels)
# 1x1 conv that reduces the channel dimension of the backbone feature map
lateral_conv = Conv2d(
in_channels, out_channels, kernel_size=1, bias=use_bias, norm=lateral_norm
)
# 3x3 output conv applied after the top-down feature is fused with the lateral one
output_conv = Conv2d(
out_channels,
out_channels,
kernel_size=3,
stride=1,
padding=1,
bias=use_bias,
norm=output_norm,
)
weight_init.c2_xavier_fill(lateral_conv)
weight_init.c2_xavier_fill(output_conv)
stage = int(math.log2(strides[idx]))
self.add_module("fpn_lateral{}".format(stage), lateral_conv)
self.add_module("fpn_output{}".format(stage), output_conv)
lateral_convs.append(lateral_conv)
output_convs.append(output_conv)
# Place convs into top-down order (from low to high resolution)
# to make the top-down computation in forward clearer.
self.lateral_convs = lateral_convs[::-1]
self.output_convs = output_convs[::-1]
# top_block: in the original FPN, generates the extra downsampled P6 feature from P5
self.top_block = top_block
# names of the multi-scale feature maps taken from the backbone
self.in_features = tuple(in_features)
# the ResNet backbone built earlier (bottom-up pathway)
self.bottom_up = bottom_up
# remaining attributes omitted
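A sketch of the assembled ResNet+FPN backbone in action, again assuming the cfg from section 2; the input height and width should be divisible by the backbone's size_divisibility:
import torch
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.modeling import build_backbone

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
fpn = build_backbone(cfg)             # dispatches to build_resnet_fpn_backbone
print(fpn.size_divisibility)          # input H and W should be divisible by this
feats = fpn(torch.rand(1, 3, 256, 256))
for name, f in feats.items():
    print(name, tuple(f.shape))       # p2..p6, all with 256 channels, strides 4 to 64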
6. proposal_generator
Print the proposal generator specified in the config file:
name = cfg.MODEL.PROPOSAL_GENERATOR.NAME
PROPOSAL_GENERATOR_REGISTRY.get(name)(cfg, input_shape)
which gives an RPN:
RPN(
(rpn_head): StandardRPNHead(
(conv): Conv2d(
256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
(activation): ReLU()
)
(objectness_logits): Conv2d(256, 3, kernel_size=(1, 1), stride=(1, 1))
(anchor_deltas): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))
)
(anchor_generator): DefaultAnchorGenerator(
(cell_anchors): BufferList()
)
)
6.1 anchor_generator
Anchors of different sizes and shapes are generated from the area (size) and the aspect ratio (aspect_ratio):
def generate_cell_anchors(self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)):
"""
Generate a tensor storing canonical anchor boxes, which are all anchor boxes of different sizes and aspect_ratios centered at (0, 0).
We can later build the set of anchors for a full feature map by shifting and tiling these tensors (see `meth:_grid_anchors`).
Args:
sizes (tuple[float]):
aspect_ratios (tuple[float]]):
Returns:
Tensor of shape (len(sizes) * len(aspect_ratios), 4) storing anchor boxes in XYXY format.
"""
anchors = []
for size in sizes:
area = size ** 2.0
for aspect_ratio in aspect_ratios:
# compute w and h from the area (size**2) and the aspect ratio
w = math.sqrt(area / aspect_ratio)
h = aspect_ratio * w
x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0
anchors.append([x0, y0, x1, y1])
return torch.tensor(anchors)
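A worked example of the formula above: with size 32 and aspect_ratio 0.5 (interpreted as h/w), the anchor keeps an area of 32*32 but is wider than it is tall:
import math

size, aspect_ratio = 32, 0.5
area = size ** 2.0
w = math.sqrt(area / aspect_ratio)  # ~45.25
h = aspect_ratio * w                # ~22.63, so h / w == aspect_ratio
print(round(w, 2), round(h, 2), round(w * h, 1))  # 45.25 22.63 1024.0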
The generated anchors are stored in a buffer list:
class BufferList(nn.Module):
"""
Similar to nn.ParameterList, but for buffers
"""
def __init__(self, buffers):
super().__init__()
for i, buffer in enumerate(buffers):
# Use non-persistent buffer so the values are not saved in checkpoint
self.register_buffer(str(i), buffer, persistent=False)
def __len__(self):
return len(self._buffers)
def __iter__(self):
return iter(self._buffers.values())
6.2 rpn_head
The feature maps produced by the backbone first pass through a 3x3 conv layer:
def _get_rpn_conv(self, in_channels, out_channels):
return Conv2d(
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=1,
activation=nn.ReLU(),
)
Two 1x1 conv layers then predict the objectness logits and the bounding-box regression deltas, respectively (the classification and box losses are computed from these outputs):
class StandardRPNHead(nn.Module):
"""
Standard RPN classification and regression heads described in :paper:`Faster R-CNN`.
Uses a 3x3 conv to produce a shared hidden state from which one 1x1 conv predicts
objectness logits for each anchor and a second 1x1 conv predicts bounding-box deltas
specifying how to deform each anchor into an object proposal.
"""
@configurable
def __init__(
self, *, in_channels: int, num_anchors: int, box_dim: int = 4, conv_dims: List[int] = (-1,)
):
super().__init__()
cur_channels = in_channels
# Keeping the old variable names and structure for backwards compatibility.
# Otherwise the old checkpoints will fail to load.
if len(conv_dims) == 1:
out_channels = cur_channels if conv_dims[0] == -1 else conv_dims[0]
# 3x3 conv for the hidden representation
self.conv = self._get_rpn_conv(cur_channels, out_channels)
cur_channels = out_channels
else:
self.conv = nn.Sequential()
for k, conv_dim in enumerate(conv_dims):
out_channels = cur_channels if conv_dim == -1 else conv_dim
if out_channels <= 0:
raise ValueError(
f"Conv output channels should be greater than 0. Got {out_channels}"
)
conv = self._get_rpn_conv(cur_channels, out_channels)
self.conv.add_module(f"conv{k}", conv)
cur_channels = out_channels
# 1x1 conv for predicting objectness logits
self.objectness_logits = nn.Conv2d(cur_channels, num_anchors, kernel_size=1, stride=1)
# 1x1 conv for predicting box2box transform deltas
self.anchor_deltas = nn.Conv2d(cur_channels, num_anchors * box_dim, kernel_size=1, stride=1)
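A sketch of what the head produces per feature level. StandardRPNHead is a @configurable class, so it can also be instantiated directly with explicit arguments; the feature-map sizes below are made up:
import torch
from detectron2.modeling.proposal_generator.rpn import StandardRPNHead

head = StandardRPNHead(in_channels=256, num_anchors=3)                # 3 anchors per location, box_dim=4
features = [torch.rand(1, 256, 32, 32), torch.rand(1, 256, 16, 16)]   # two fake FPN levels
logits, deltas = head(features)
print([tuple(t.shape) for t in logits])   # [(1, 3, 32, 32), (1, 3, 16, 16)]
print([tuple(t.shape) for t in deltas])   # [(1, 12, 32, 32), (1, 12, 16, 16)]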
7. ROIHeads
roi_heads contains three parts:
- box_pooler
- box_head
- box_predictor
(roi_heads): StandardROIHeads(
(box_pooler): ROIPooler(
(level_poolers): ModuleList(
(0): ROIAlign(output_size=(7, 7), spatial_scale=0.25, sampling_ratio=0, aligned=True)
(1): ROIAlign(output_size=(7, 7), spatial_scale=0.125, sampling_ratio=0, aligned=True)
(2): ROIAlign(output_size=(7, 7), spatial_scale=0.0625, sampling_ratio=0, aligned=True)
(3): ROIAlign(output_size=(7, 7), spatial_scale=0.03125, sampling_ratio=0, aligned=True)
)
)
(box_head): FastRCNNConvFCHead(
(flatten): Flatten(start_dim=1, end_dim=-1)
(fc1): Linear(in_features=12544, out_features=1024, bias=True)
(fc_relu1): ReLU()
(fc2): Linear(in_features=1024, out_features=1024, bias=True)
(fc_relu2): ReLU()
)
(box_predictor): FastRCNNOutputLayers(
(cls_score): Linear(in_features=1024, out_features=81, bias=True)
(bbox_pred): Linear(in_features=1024, out_features=320, bias=True)
)
)
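A sketch of what box_pooler does: it assigns each proposal box to an FPN level based on its size and applies ROIAlign there, returning a fixed 7x7 feature per box. ROIPooler and Boxes are real detectron2 classes; the feature maps and the box below are made up:
import torch
from detectron2.modeling.poolers import ROIPooler
from detectron2.structures import Boxes

pooler = ROIPooler(
    output_size=7,
    scales=(0.25, 0.125, 0.0625, 0.03125),  # p2..p5 (strides 4, 8, 16, 32)
    sampling_ratio=0,
    pooler_type="ROIAlignV2",
)
# fake p2..p5 feature maps for one 256x256 image
features = [torch.rand(1, 256, 256 // s, 256 // s) for s in (4, 8, 16, 32)]
boxes = [Boxes(torch.tensor([[10.0, 10.0, 80.0, 120.0]]))]  # one box for one image
out = pooler(features, boxes)
print(out.shape)  # torch.Size([1, 256, 7, 7])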