Paper: BEVDepth: Acquisition of Reliable Depth for Multi-view 3D Object Detection
Project: https://github.com/Megvii-BaseDetection/BEVDepth
First, clone the code and set up the required environment.
Step 1: clone the repo, create and activate a conda environment, and cd into the project directory
# Clone the repo
git clone https://github.com/Megvii-BaseDetection/BEVDepth.git
# Create a conda environment
conda create -n bevdepth python=3.7
# Activate it
conda activate bevdepth
# Enter the project directory
cd BEVDepth/
Dataset layout:
# The last three files are produced by running the gen_info.py script
data
├── nuScenes
│ ├── maps
│ ├── samples
│ ├── sweeps
│ ├── v1.0-test
│ ├── v1.0-trainval
│ ├── nuscenes_infos_test.pkl
│ ├── nuscenes_infos_train.pkl
│ ├── nuscenes_infos_val.pkl
Step 2: set up the environment following README.md and pre-process the data
# Install the required PyTorch version with pip (its CUDA build must not exceed the locally installed CUDA version)
pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
# Install mmcv-full; only the CUDA and PyTorch versions in the URL need to match your setup (here cu111 / torch1.9.0, matching the install above)
pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
# Install mmdet; no version pin is needed, a compatible version is resolved automatically
pip install mmdet
# Install mmsegmentation
pip install mmsegmentation
# Clone mmdetection3d
git clone https://github.com/open-mmlab/mmdetection3d.git
# Enter mmdetection3d
cd mmdetection3d
# Install it in editable mode
pip install -v -e .
# Go back to the parent directory
cd ..
# Install the remaining dependencies
pip install -r requirements.txt
# setup: from a quick look, this compiles the parallel (CUDA) op used by voxel pooling
python setup.py develop
# The symlink approach did not work for me, so I skipped this step and arranged the dataset according to the directory layout above instead
# ln -s [nuscenes root] ./data/
# Pre-process nuScenes
# I modified this script slightly so that the dataset path points to my own location
python scripts/gen_info.py
# Depth GT can now be generated online, so this step can be skipped
# python scripts/gen_depth_gt.py
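Before moving on, here is a quick sanity check of my own (not part of the repo's workflow) that the info files generated by gen_info.py are loadable; the path assumes the dataset layout shown above:
import pickle

# Load one of the generated info files and take a rough look at what was produced.
with open('data/nuScenes/nuscenes_infos_train.pkl', 'rb') as f:
    infos = pickle.load(f)
print(type(infos), len(infos))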
Reading the source code: the whole implementation is wrapped with pytorch_lightning.
First, sort out the inheritance relationships:
bevdepth/exps/fusion
bevdepth/exps/mv
These two folders contain experiment scripts for models with different configurations.
I picked bev_depth_fusion_lss_r50_256x704_128x128_24e.py for breakpoint debugging.
The module nesting is as follows:
BEVDepthLightningModel
├── FusionBEVDepth
│   ├── FusionLSSFPN
│   │   ├── ResNet + SECONDFPN (image backbone + neck)
│   │   ├── DepthNet
│   ├── BEVDepthHead
│   │   ├── ResNet + SECONDFPN (BEV trunk + neck)
│   │   ├── GaussianFocalLoss + L1Loss
BEV feature extraction:
The relevant configs are shown below (bev_backbone and bev_neck, consumed later by BEVDepthHead)
bev_backbone = dict(
    type='ResNet',
    in_channels=80,
    depth=18,
    num_stages=3,
    strides=(1, 2, 2),
    dilations=(1, 1, 1),
    out_indices=[0, 1, 2],
    norm_eval=False,
    base_channels=160,
)
bev_neck = dict(type='SECONDFPN',
                in_channels=[80, 160, 320, 640],
                upsample_strides=[1, 2, 4, 8],
                out_channels=[64, 64, 64, 64])
Step 1: model construction
Start with bev_depth_fusion_lss_r50_256x704_128x128_24e.py:
from bevdepth.exps.base_exp import BEVDepthLightningModel as BaseBEVDepthLightningModel
from bevdepth.models.fusion_bev_depth import FusionBEVDepth


class BEVDepthLightningModel(BaseBEVDepthLightningModel):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = FusionBEVDepth(self.backbone_conf,
                                    self.head_conf,
                                    is_train_depth=False)
        self.use_fusion = True

    def forward(self, sweep_imgs, mats, lidar_depth):
        # Pass sweep_imgs, mats and lidar_depth into FusionBEVDepth
        # and return its result
        return self.model(sweep_imgs, mats, lidar_depth)
This part is simple; it mainly involves BEVDepthLightningModel and FusionBEVDepth.
Let's dig into FusionBEVDepth first.
1. FusionBEVDepth:
from bevdepth.layers.backbones.fusion_lss_fpn import FusionLSSFPN
from bevdepth.layers.heads.bev_depth_head import BEVDepthHead

from .base_bev_depth import BaseBEVDepth


class FusionBEVDepth(BaseBEVDepth):
    # Inherits BaseBEVDepth from base_bev_depth
    """Source code of `BEVDepth`, `https://arxiv.org/abs/2112.11790`.

    Args:
        backbone_conf (dict): Config of backbone.
        head_conf (dict): Config of head.
        is_train_depth (bool): Whether to return depth.
            Default: False.
    """

    # TODO: Reduce grid_conf and data_aug_conf
    def __init__(self, backbone_conf, head_conf, is_train_depth=False):
        # The configs are forwarded to FusionLSSFPN and BEVDepthHead.
        # __init__ is overridden mainly to swap self.backbone from BaseLSSFPN to FusionLSSFPN.
        # FusionLSSFPN itself inherits from BaseLSSFPN; this kind of subclassing
        # is usually done to override a few methods.
        super(BaseBEVDepth, self).__init__()
        self.backbone = FusionLSSFPN(**backbone_conf)
        self.head = BEVDepthHead(**head_conf)
        self.is_train_depth = is_train_depth
    def forward(
        self,
        x,
        mats_dict,
        lidar_depth,
        timestamps=None,
    ):
        """Forward function for BEVDepth

        Args:
            x (Tensor): Input feature map.
            mats_dict(dict):
                sensor2ego_mats(Tensor): Transformation matrix from
                    camera to ego with shape of (B, num_sweeps,
                    num_cameras, 4, 4).
                intrin_mats(Tensor): Intrinsic matrix with shape
                    of (B, num_sweeps, num_cameras, 4, 4).
                ida_mats(Tensor): Transformation matrix for ida with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                sensor2sensor_mats(Tensor): Transformation matrix
                    from key frame camera to sweep frame camera with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                bda_mat(Tensor): Rotation matrix for bda with shape
                    of (B, 4, 4).
            lidar_depth (Tensor): Depth generated by lidar.
            timestamps (long): Timestamp.
                Default: None.

        Returns:
            tuple(list[dict]): Output results for tasks.
        """
        # The forward pass is straightforward: input -> backbone -> head -> output.
        # When depth supervision is enabled during training, the backbone is also
        # asked to return its depth prediction.
        if self.is_train_depth and self.training:
            x, depth_pred = self.backbone(x,
                                          mats_dict,
                                          lidar_depth,
                                          timestamps,
                                          is_return_depth=True)
            preds = self.head(x)
            return preds, depth_pred
        else:
            x = self.backbone(x, mats_dict, lidar_depth, timestamps)
            preds = self.head(x)
            return preds
This mainly involves BaseBEVDepth, FusionLSSFPN and BEVDepthHead.
1.1. FusionLSSFPN:
class DepthNet(nn.Module):

    def __init__(self, in_channels, mid_channels, context_channels,
                 depth_channels):
        super(DepthNet, self).__init__()
        self.reduce_conv = nn.Sequential(
            nn.Conv2d(in_channels,
                      mid_channels,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
        )
        self.context_conv = nn.Conv2d(mid_channels,
                                      context_channels,
                                      kernel_size=1,
                                      stride=1,
                                      padding=0)
        self.mlp = Mlp(1, mid_channels, mid_channels)
        self.se = SELayer(mid_channels)  # NOTE: add camera-aware
        self.depth_gt_conv = nn.Sequential(
            nn.Conv2d(1, mid_channels, kernel_size=1, stride=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, mid_channels, kernel_size=1, stride=1),
        )
        self.depth_conv = nn.Sequential(
            BasicBlock(mid_channels, mid_channels),
            BasicBlock(mid_channels, mid_channels),
            BasicBlock(mid_channels, mid_channels),
        )
        self.aspp = ASPP(mid_channels, mid_channels)
        self.depth_pred = nn.Conv2d(mid_channels,
                                    depth_channels,
                                    kernel_size=1,
                                    stride=1,
                                    padding=0)
class FusionLSSFPN(BaseLSSFPN):
    # Inherits from BaseLSSFPN and does not override __init__.
    # The depth-net configuration method is already defined in BaseLSSFPN, but the two
    # scripts define different DepthNet classes, so the method is re-implemented here
    # to build the model with the DepthNet class defined in this file.
    def _configure_depth_net(self, depth_net_conf):
        return DepthNet(
            depth_net_conf['in_channels'],
            depth_net_conf['mid_channels'],
            self.output_channels,
            self.depth_channels,
        )
This mainly involves the BaseLSSFPN class and the custom DepthNet class.
1.1.1 BaseLSSFPN:
This is the LSS (Lift-Splat-Shoot) implementation with some modifications; I have not read the forward method yet, so I don't know where the changes are.
class BaseLSSFPN(nn.Module):

    def __init__(self, x_bound, y_bound, z_bound, d_bound, final_dim,
                 downsample_factor, output_channels, img_backbone_conf,
                 img_neck_conf, depth_net_conf):
        """Modified from `https://github.com/nv-tlabs/lift-splat-shoot`.

        Args:
            x_bound (list): Boundaries for x.
            y_bound (list): Boundaries for y.
            z_bound (list): Boundaries for z.
            d_bound (list): Boundaries for d.
            final_dim (list): Dimension for input images.
            downsample_factor (int): Downsample factor between feature map
                and input image.
            output_channels (int): Number of channels for the output
                feature map.
            img_backbone_conf (dict): Config for image backbone.
            img_neck_conf (dict): Config for image neck.
            depth_net_conf (dict): Config for depth net.
        """
        super(BaseLSSFPN, self).__init__()
        self.downsample_factor = downsample_factor
        self.d_bound = d_bound
        self.final_dim = final_dim
        self.output_channels = output_channels

        self.register_buffer(
            'voxel_size',
            torch.Tensor([row[2] for row in [x_bound, y_bound, z_bound]]))
        self.register_buffer(
            'voxel_coord',
            torch.Tensor([
                row[0] + row[2] / 2.0 for row in [x_bound, y_bound, z_bound]
            ]))
        self.register_buffer(
            'voxel_num',
            torch.LongTensor([(row[1] - row[0]) / row[2]
                              for row in [x_bound, y_bound, z_bound]]))
        self.register_buffer('frustum', self.create_frustum())
        self.depth_channels, _, _, _ = self.frustum.shape

        self.img_backbone = build_backbone(img_backbone_conf)
        self.img_neck = build_neck(img_neck_conf)
        self.depth_net = self._configure_depth_net(depth_net_conf)

        self.img_neck.init_weights()
        self.img_backbone.init_weights()
1.2. BEVDepthHead:
The head is built with the OpenMMLab toolboxes; the CenterHead it inherits from comes from mmdetection3d.
class BEVDepthHead(CenterHead):
    """Head for BevDepth.

    Args:
        in_channels(int): Number of channels after bev_neck.
        tasks(dict): Tasks for head.
        bbox_coder(dict): Config of bbox coder.
        common_heads(dict): Config of head for each task.
        loss_cls(dict): Config of classification loss.
        loss_bbox(dict): Config of regression loss.
        gaussian_overlap(float): Gaussian overlap used for `get_targets`.
        min_radius(int): Min radius used for `get_targets`.
        train_cfg(dict): Config used in the training process.
        test_cfg(dict): Config used in the test process.
        bev_backbone_conf(dict): Config of bev_backbone.
        bev_neck_conf(dict): Config of bev_neck.
    """

    def __init__(
        self,
        in_channels=256,
        tasks=None,
        bbox_coder=None,
        common_heads=dict(),
        loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
        loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
        gaussian_overlap=0.1,
        min_radius=2,
        train_cfg=None,
        test_cfg=None,
        bev_backbone_conf=bev_backbone_conf,
        bev_neck_conf=bev_neck_conf,
        separate_head=dict(type='SeparateHead',
                           init_bias=-2.19,
                           final_kernel=3),
    ):
        super(BEVDepthHead, self).__init__(
            in_channels=in_channels,
            tasks=tasks,
            bbox_coder=bbox_coder,
            common_heads=common_heads,
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            separate_head=separate_head,
        )
        self.trunk = build_backbone(bev_backbone_conf)
        self.trunk.init_weights()
        self.neck = build_neck(bev_neck_conf)
        self.neck.init_weights()
        del self.trunk.maxpool
        self.gaussian_overlap = gaussian_overlap
        self.min_radius = min_radius
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
1.3. BaseBEVDepth:
Not much to say here; FusionBEVDepth inherits from BaseBEVDepth and overrides a few methods.
class BaseBEVDepth(nn.Module):
    """Source code of `BEVDepth`, `https://arxiv.org/abs/2112.11790`.

    Args:
        backbone_conf (dict): Config of backbone.
        head_conf (dict): Config of head.
        is_train_depth (bool): Whether to return depth.
            Default: False.
    """

    # TODO: Reduce grid_conf and data_aug_conf
    def __init__(self, backbone_conf, head_conf, is_train_depth=False):
        # This __init__ is overridden by the subclass; see FusionBEVDepth for details.
        super(BaseBEVDepth, self).__init__()
        self.backbone = BaseLSSFPN(**backbone_conf)
        self.head = BEVDepthHead(**head_conf)
        self.is_train_depth = is_train_depth
Step 2: forward propagation
FusionLSSFPN runs first, followed by BEVDepthHead.
class BEVDepthLightningModel(BaseBEVDepthLightningModel):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = FusionBEVDepth(self.backbone_conf,
                                    self.head_conf,
                                    is_train_depth=False)
        self.use_fusion = True

    def forward(self, sweep_imgs, mats, lidar_depth):
        return self.model(sweep_imgs, mats, lidar_depth)


class FusionBEVDepth(BaseBEVDepth):

    def __init__(self, backbone_conf, head_conf, is_train_depth=False):
        super(BaseBEVDepth, self).__init__()
        self.backbone = FusionLSSFPN(**backbone_conf)
        self.head = BEVDepthHead(**head_conf)
        self.is_train_depth = is_train_depth

    def forward(self, x, mats_dict, lidar_depth, timestamps=None):
        if self.is_train_depth and self.training:
            x, depth_pred = self.backbone(x, mats_dict, lidar_depth,
                                          timestamps, is_return_depth=True)
            preds = self.head(x)
            return preds, depth_pred
        else:
            x = self.backbone(x, mats_dict, lidar_depth, timestamps)
            preds = self.head(x)
            return preds
2.1 Start with the backbone -> the forward function of the FusionLSSFPN class:
    def forward(self,
                sweep_imgs,
                mats_dict,
                lidar_depth,
                timestamps=None,
                is_return_depth=False):
        """Forward function.

        Args:
            sweep_imgs(Tensor): Input images with shape of (B, num_sweeps,
                num_cameras, 3, H, W).
            mats_dict(dict):
                sensor2ego_mats(Tensor): Transformation matrix from
                    camera to ego with shape of (B, num_sweeps,
                    num_cameras, 4, 4).
                intrin_mats(Tensor): Intrinsic matrix with shape
                    of (B, num_sweeps, num_cameras, 4, 4).
                ida_mats(Tensor): Transformation matrix for ida with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                sensor2sensor_mats(Tensor): Transformation matrix
                    from key frame camera to sweep frame camera with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                bda_mat(Tensor): Rotation matrix for bda with shape
                    of (B, 4, 4).
            lidar_depth (Tensor): Depth generated by lidar.
            timestamps(Tensor): Timestamp for all images with the shape of (B,
                num_sweeps, num_cameras).

        Return:
            Tensor: bev feature map.
        """
        # (batch_size, num_sweeps, num_cams, num_channels, img_height, img_width)
        # (1, 1, 6, 3, 256, 704)
        batch_size, num_sweeps, num_cams, num_channels, img_height, \
            img_width = sweep_imgs.shape
        # lidar_depth has shape (1, 1, 6, 256, 704)
        lidar_depth = self.get_downsampled_lidar_depth(lidar_depth)
        key_frame_res = self._forward_single_sweep(
            0,
            sweep_imgs[:, 0:1, ...],
            mats_dict,
            lidar_depth[:, 0, ...],
            is_return_depth=is_return_depth)  # output is (1, 80, 128, 128)
        # Since num_sweeps == 1 here, key_frame_res is returned directly
        if num_sweeps == 1:
            return key_frame_res
This mainly involves self.get_downsampled_lidar_depth() and self._forward_single_sweep().
2.1.1 The generated lidar_depth is processed by the self.get_downsampled_lidar_depth() method:
    def get_downsampled_lidar_depth(self, lidar_depth):
        # (batch_size, num_sweeps, num_cams, height, width)
        # (1, 1, 6, 256, 704)
        batch_size, num_sweeps, num_cams, height, width = lidar_depth.shape
        # Reshape lidar_depth with view();
        # downsample_factor is defined in base_exp.py and equals 16
        lidar_depth = lidar_depth.view(
            batch_size * num_sweeps * num_cams,  # flatten the first three dims
            height // self.downsample_factor,    # downsample the image height
            self.downsample_factor,
            width // self.downsample_factor,     # downsample the image width
            self.downsample_factor,
            1,
        )  # output is (6, 16, 16, 44, 16, 1)
        # Permute lidar_depth
        lidar_depth = lidar_depth.permute(0, 1, 3, 5, 2, 4).contiguous()  # output is (6, 16, 44, 1, 16, 16)
        # View lidar_depth again
        lidar_depth = lidar_depth.view(
            -1, self.downsample_factor * self.downsample_factor)  # output is (4224, 256)
        # Merge based on whether each element of lidar_depth is 0; see the torch.where note below.
        # Every 0 in lidar_depth is replaced with lidar_depth.max(); the result is gt_depths_tmp
        gt_depths_tmp = torch.where(lidar_depth == 0.0, lidar_depth.max(),
                                    lidar_depth)  # output is (4224, 256)
        # Take the values field of the returned namedtuple, i.e. the minimum of each row
        lidar_depth = torch.min(gt_depths_tmp, dim=-1).values
        # View lidar_depth back into per-camera maps
        lidar_depth = lidar_depth.view(batch_size, num_sweeps, num_cams, 1,
                                       height // self.downsample_factor,
                                       width // self.downsample_factor)  # output is (1, 1, 6, 1, 16, 44)
        # Divide every value of lidar_depth by self.d_bound[1]
        lidar_depth = lidar_depth / self.d_bound[1]  # output is (1, 1, 6, 1, 16, 44)
        return lidar_depth
torch.where() merges two tensors element-wise according to a condition:
torch.where(condition, a, b)
For each element, a is chosen where the condition holds, otherwise b.
The torch.min() method:
torch.min(input, dim)
It returns a namedtuple (values, indices), where values holds the minimum of input along the given dim and indices holds the positions of those minima.
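As a quick check of how these two calls implement the min-pooling above, here is a minimal, self-contained sketch (toy shapes, not the real ones) that downsamples a sparse depth map by taking the nearest non-zero depth in each patch:
import torch

# Toy sparse depth map: one camera, 4x4 pixels, downsample factor 2.
depth = torch.tensor([[0., 10., 0., 0.],
                      [5., 0., 0., 8.],
                      [0., 0., 3., 0.],
                      [0., 7., 0., 0.]])
factor = 2

# Split into 2x2 patches: (H/2, 2, W/2, 2) -> (H/2, W/2, 2, 2) -> (N_patches, 4)
patches = depth.view(2, factor, 2, factor).permute(0, 2, 1, 3).reshape(-1, factor * factor)

# Replace zeros (pixels with no lidar hit) with the global max so they never win the min.
tmp = torch.where(patches == 0.0, patches.max(), patches)

# Per-patch minimum = nearest valid depth in each patch.
pooled = torch.min(tmp, dim=-1).values.view(2, 2)
print(pooled)  # tensor([[5., 8.], [7., 3.]])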
2.1.2 The _forward_single_sweep() method:
It returns the BEV feature map.
    def _forward_single_sweep(self,
                              sweep_index,
                              sweep_imgs,
                              mats_dict,          # {dict:5}
                              sweep_lidar_depth,  # shape: (1, 6, 1, 16, 44)
                              is_return_depth=False):
        """Forward function for single sweep.

        Args:
            sweep_index (int): Index of sweeps.
            sweep_imgs (Tensor): Input images.
            mats_dict (dict):
                sensor2ego_mats(Tensor): Transformation matrix from
                    camera to ego with shape of (B, num_sweeps,
                    num_cameras, 4, 4).
                intrin_mats(Tensor): Intrinsic matrix with shape
                    of (B, num_sweeps, num_cameras, 4, 4).
                ida_mats(Tensor): Transformation matrix for ida with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                sensor2sensor_mats(Tensor): Transformation matrix
                    from key frame camera to sweep frame camera with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                bda_mat(Tensor): Rotation matrix for bda with shape
                    of (B, 4, 4).
            sweep_lidar_depth (Tensor): Depth generated by lidar.
            is_return_depth (bool, optional): Whether to return depth.
                Default: False.

        Returns:
            Tensor: BEV feature map.
        """
        # As in get_downsampled_lidar_depth(), read off the shape first.
        # (batch_size, num_sweeps, num_cams, num_channels, img_height, img_width)
        # (1, 1, 6, 3, 256, 704)
        batch_size, num_sweeps, num_cams, num_channels, img_height, \
            img_width = sweep_imgs.shape
        # Pass the pre-processed sweep_imgs to get_cam_feats() to obtain img_feats,
        # whose shape is (batch_size, num_sweeps, num_cams, num_channels, H, W)
        img_feats = self.get_cam_feats(sweep_imgs)  # output is (1, 1, 6, 512, 16, 44)
        # Reshape the incoming sweep_lidar_depth.
        # It was passed in as lidar_depth[:, 0, ...], i.e. sweep 0 only, so that dim is already gone.
        sweep_lidar_depth = sweep_lidar_depth.reshape(
            batch_size * num_cams, *sweep_lidar_depth.shape[2:])  # output is (6, 1, 16, 44)
        # Take sweep 0 of the image features as well, dropping that dim;
        # the shape is (batch_size, num_cams, num_channels, H, W)
        source_features = img_feats[:, 0, ...]  # output is (1, 6, 512, 16, 44)
        # Flatten the first two dims, then feed into _forward_depth_net().
        # In the resulting depth_feature, the first self.depth_channels (112) channels
        # along dim=1 carry the depth information.
        depth_feature = self._forward_depth_net(
            source_features.reshape(batch_size * num_cams,
                                    source_features.shape[2],
                                    source_features.shape[3],
                                    source_features.shape[4]), mats_dict,
            sweep_lidar_depth)  # output is (6, 192, 16, 44)
        # Depth distribution over the 112 bins
        depth = depth_feature[:, :self.depth_channels].softmax(1)  # output is (6, 112, 16, 44)
        # Outer product of depth and context:
        # depth goes from (6, 112, 16, 44) -> (6, 1, 112, 16, 44);
        # the context slice of depth_feature goes from (6, 192, 16, 44) -> (6, 80, 16, 44) -> (6, 80, 1, 16, 44)
        img_feat_with_depth = depth.unsqueeze(
            1) * depth_feature[:, self.depth_channels:(
                self.depth_channels + self.output_channels)].unsqueeze(2)  # output is (6, 80, 112, 16, 44)
        img_feat_with_depth = self._forward_voxel_net(img_feat_with_depth)  # output is (6, 80, 112, 16, 44)
        img_feat_with_depth = img_feat_with_depth.reshape(
            batch_size,
            num_cams,
            img_feat_with_depth.shape[1],
            img_feat_with_depth.shape[2],
            img_feat_with_depth.shape[3],
            img_feat_with_depth.shape[4],
        )  # output is (1, 6, 80, 112, 16, 44)
        geom_xyz = self.get_geometry(
            mats_dict['sensor2ego_mats'][:, sweep_index, ...],
            mats_dict['intrin_mats'][:, sweep_index, ...],
            mats_dict['ida_mats'][:, sweep_index, ...],
            mats_dict.get('bda_mat', None),
        )  # output is (1, 6, 112, 16, 44, 3)
        img_feat_with_depth = img_feat_with_depth.permute(0, 1, 3, 4, 5, 2)  # output is (1, 6, 112, 16, 44, 80)
        geom_xyz = ((geom_xyz - (self.voxel_coord - self.voxel_size / 2.0)) /
                    self.voxel_size).int()  # output is (1, 6, 112, 16, 44, 3)
        feature_map = voxel_pooling(geom_xyz, img_feat_with_depth.contiguous(),
                                    self.voxel_num.cuda())  # output is (1, 80, 128, 128)
        if is_return_depth:
            return feature_map.contiguous(), depth
        return feature_map.contiguous()
This method mainly involves self.get_cam_feats(), self._forward_depth_net(), self.get_geometry() and voxel_pooling.
A note on contiguous() in PyTorch: is_contiguous() checks whether a tensor is contiguous, and contiguous() makes it so (elements that are adjacent semantically are also adjacent in memory).
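A minimal illustration of these two calls (my own, not from the repo): after permute() a tensor is usually no longer contiguous, and view() then fails until contiguous() is called:
import torch

x = torch.arange(24).view(2, 3, 4)        # freshly created: contiguous
print(x.is_contiguous())                  # True

y = x.permute(0, 2, 1)                    # same data, different stride order
print(y.is_contiguous())                  # False
# y.view(2, 12) would raise a RuntimeError here, because view() needs contiguous memory

z = y.contiguous()                        # copies the data into a contiguous layout
print(z.is_contiguous(), z.view(2, 12).shape)  # True torch.Size([2, 12])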
2.1.2.1 The get_cam_feats() method:
It simply feeds the images through backbone + neck and returns the feature map img_feats.
    def get_cam_feats(self, imgs):
        """Get feature maps from images."""
        # imgs has shape (batch_size, num_sweeps, num_cams, num_channels, imH, imW)
        # (1, 1, 6, 3, 256, 704)
        batch_size, num_sweeps, num_cams, num_channels, imH, imW = imgs.shape
        # Flatten imgs first, then rearrange the shape
        imgs = imgs.flatten().view(batch_size * num_sweeps * num_cams,
                                   num_channels, imH, imW)  # output is (6, 3, 256, 704)
        img_feats = self.img_neck(self.img_backbone(imgs))[0]  # output is (6, 512, 16, 44)
        # Reshape img_feats back to (batch_size, num_sweeps, num_cams, num_channels, H, W)
        img_feats = img_feats.reshape(batch_size, num_sweeps, num_cams,
                                      img_feats.shape[1], img_feats.shape[2],
                                      img_feats.shape[3])  # output is (1, 1, 6, 512, 16, 44)
        return img_feats
2.1.2.2 The self._forward_depth_net() method:
# It takes the feature map, the dict of transformation matrices, and the lidar depth (sweep_lidar_depth)
# The feature map has shape (6, 512, 16, 44)
depth_feature = self._forward_depth_net(
    source_features.reshape(batch_size * num_cams, source_features.shape[2],
                            source_features.shape[3], source_features.shape[4]),
    mats_dict, sweep_lidar_depth)
The DepthNet used here corresponds to the architecture diagram in the paper (called Depth Correction there).
The overall forward flow is:
1. The feature map goes through two convolution branches: one yields an 80-channel context, the other (3x3 kernel) keeps the original 512 channels.
2. The camera intrinsics are expanded, square-rooted, etc., then mapped by an MLP to (6, 512, 1, 1) and fused with the 512-channel feature map from step 1 via the SELayer, giving the fused feature map.
3. The lidar depth is processed by a convolution that expands its channel dim (dim=1) to 512 (mid_channels), added to the fused feature map from step 2, and passed through 3 Residual Blocks.
4. The result goes through what the paper's figure labels a DCN, implemented here as ASPP (Atrous Spatial Pyramid Pooling); finally a Conv2d(512, 112) produces the depth output with shape (num_cameras, channels, H, W), concretely (6, 112, 16, 44).

# self.depth_net is created in BaseLSSFPN.__init__ (base_lss_fpn.py)
self.depth_net = self._configure_depth_net(depth_net_conf)

# fusion_lss_fpn.py overrides both self._forward_depth_net() and self._configure_depth_net()
def _configure_depth_net(self, depth_net_conf):
    return DepthNet(
        depth_net_conf['in_channels'],
        depth_net_conf['mid_channels'],
        self.output_channels,
        self.depth_channels,
    )

# feat={Tensor:(6, 512, 16, 44)}, mats_dict={dict:5}, lidar_depth={Tensor:(6, 1, 16, 44)}
def _forward_depth_net(self, feat, mats_dict, lidar_depth):
    return self.depth_net(feat, mats_dict, lidar_depth)
# Forward function of the DepthNet class.
# fusion_lss_fpn.py redefines DepthNet, so this is the DepthNet from fusion_lss_fpn.py.
def forward(self, x, mats_dict, lidar_depth, scale_depth_factor=1000.0):
    x = self.reduce_conv(x)  # output is (6, 512, 16, 44)
    context = self.context_conv(x)  # output is (6, 80, 16, 44)
    # intrin_mats={Tensor:(B, num_sweeps, num_cameras, 4, 4)}
    # The slice takes the intrinsics of the first sweep; the shape stays (B, num_sweeps, num_cameras, 4, 4)
    inv_intrinsics = torch.inverse(mats_dict['intrin_mats'][:, 0:1, ...])  # output is (1, 1, 6, 4, 4)
    # First index out the two focal terms, each of shape (B, num_sweeps, num_cameras),
    # then stack() them, giving (1, 1, 6, 2),
    # then norm() (Frobenius by default) over the last dim, giving (1, 1, 6),
    # and finally reshape(-1, 1): the last dim is 1, the first dim is inferred.
    pixel_size = torch.norm(torch.stack(
        [inv_intrinsics[..., 0, 0], inv_intrinsics[..., 1, 1]], dim=-1),
                            dim=-1).reshape(-1, 1)  # output is (6, 1)
    # ida_mats={Tensor:(B, num_sweeps, num_cameras, 4, 4)}, i.e. (1, 1, 6, 4, 4)
    # Take the square root, then reshape(-1, 1)
    aug_scale = torch.sqrt(mats_dict['ida_mats'][:, 0, :, 0, 0]**2 +
                           mats_dict['ida_mats'][:, 0, :, 0,
                                                 0]**2).reshape(-1, 1)  # output is (6, 1)
    scaled_pixel_size = pixel_size * scale_depth_factor / aug_scale  # output is (6, 1)
    # self.mlp(scaled_pixel_size) outputs (6, 512)
    x_se = self.mlp(scaled_pixel_size)[..., None, None]  # output is (6, 512, 1, 1)
    # x={Tensor:(6, 512, 16, 44)}, x_se={Tensor:(6, 512, 1, 1)}
    x = self.se(x, x_se)  # output is (6, 512, 16, 44)
    # lidar_depth={Tensor:(6, 1, 16, 44)}
    depth = self.depth_gt_conv(lidar_depth)  # output is (6, 512, 16, 44)
    # self.depth_conv is 3 Residual Blocks
    depth = self.depth_conv(x + depth)  # output is (6, 512, 16, 44)
    # ASPP, corresponding to the DCN block in the figure
    depth = self.aspp(depth)  # output is (6, 512, 16, 44)
    # self.depth_pred is Conv2d(512, 112)
    depth = self.depth_pred(depth)  # output is (6, 112, 16, 44)
    # depth={Tensor:(6, 112, 16, 44)}, context={Tensor:(6, 80, 16, 44)}
    return torch.cat([depth, context], dim=1)  # output is (6, 192, 16, 44)
torch.norm() computes and returns a norm over the specified dimension(s).
torch.stack() concatenates tensors of the same shape along a new dim.
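A tiny, self-contained example of these two calls (toy values, not from the repo), mirroring how pixel_size is computed above:
import torch

# Pretend these are the (0, 0) and (1, 1) entries of two inverse intrinsics.
fx_inv = torch.tensor([0.5, 0.25])
fy_inv = torch.tensor([0.5, 0.75])

stacked = torch.stack([fx_inv, fy_inv], dim=-1)   # new last dim: shape (2, 2)
norms = torch.norm(stacked, dim=-1)               # L2 norm over the last dim: shape (2,)
print(stacked.shape, norms)                       # torch.Size([2, 2]) tensor([0.7071, 0.7906])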
Details of the DepthNet implementation:
Judging from the code, the fusion of intrinsics and feature map is done by the SELayer class, with channels=mid_channels=512 passed in.
class SELayer(nn.Module):

    def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid):
        super().__init__()
        self.conv_reduce = nn.Conv2d(channels, channels, 1, bias=True)
        self.act1 = act_layer()
        self.conv_expand = nn.Conv2d(channels, channels, 1, bias=True)
        self.gate = gate_layer()

    def forward(self, x, x_se):
        # x={Tensor:(6, 512, 16, 44)}, x_se={Tensor:(6, 512, 1, 1)}
        x_se = self.conv_reduce(x_se)
        x_se = self.act1(x_se)
        x_se = self.conv_expand(x_se)  # output is (6, 512, 1, 1)
        return x * self.gate(x_se)  # returns (6, 512, 16, 44)
Tensor multiplication here relies on PyTorch broadcasting: the (6, 512, 1, 1) gate is broadcast against (6, 512, 16, 44) and the corresponding elements are multiplied one by one.
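A minimal broadcasting example (toy shapes, not the real ones) showing the same pattern as x * self.gate(x_se):
import torch

x = torch.randn(2, 3, 4, 4)       # feature map: (B, C, H, W)
gate = torch.rand(2, 3, 1, 1)     # per-channel gate: (B, C, 1, 1)

out = x * gate                    # gate is broadcast over H and W
print(out.shape)                  # torch.Size([2, 3, 4, 4])
# equivalent to expanding the gate explicitly:
print(torch.allclose(out, x * gate.expand_as(x)))  # True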
2.1.2.3 self.get_geometry():
On the meanings of ida and bda: ida stands for image data augmentation, bda for BEV data augmentation.
    def get_geometry(self, sensor2ego_mat, intrin_mat, ida_mat, bda_mat):
        """Transfer points from camera coord to ego coord.

        Args:
            rots(Tensor): Rotation matrix from camera to ego.
            trans(Tensor): Translation matrix from camera to ego.
            intrins(Tensor): Intrinsic matrix.
            post_rots_ida(Tensor): Rotation matrix for ida.
            post_trans_ida(Tensor): Translation matrix for ida
            post_rot_bda(Tensor): Rotation matrix for bda.

        Returns:
            Tensors: points ego coord.
        """
        # Shapes of the inputs:
        # sensor2ego_mat (1, 6, 4, 4), intrin_mat (1, 6, 4, 4), ida_mat (1, 6, 4, 4), bda_mat (1, 4, 4)
        batch_size, num_cams, _, _ = sensor2ego_mat.shape

        # undo post-transformation
        # B x N x D x H x W x 3
        points = self.frustum  # output is (112, 16, 44, 4)
        ida_mat = ida_mat.view(batch_size, num_cams, 1, 1, 1, 4, 4)  # output is (1, 6, 1, 1, 1, 4, 4)
        # points is first unsqueezed to (112, 16, 44, 4, 1)
        points = ida_mat.inverse().matmul(points.unsqueeze(-1))  # output is (1, 6, 112, 16, 44, 4, 1)
        # cam_to_ego
        # points = frustum = torch.stack((x_coords, y_coords, d_coords, paddings))
        # First multiply [x, y] by [d], then concatenate the result with [d, paddings] along the same dim
        points = torch.cat(
            (points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3],
             points[:, :, :, :, :, 2:]), 5)  # output is (1, 6, 112, 16, 44, 4, 1)
        # Invert intrin_mat, then left-multiply by sensor2ego_mat
        combine = sensor2ego_mat.matmul(torch.inverse(intrin_mat))  # output is (1, 6, 4, 4)
        # Reshape combine to (1, 6, 1, 1, 1, 4, 4), then matrix-multiply with points
        points = combine.view(batch_size, num_cams, 1, 1, 1, 4,
                              4).matmul(points)  # output is (1, 6, 112, 16, 44, 4, 1)
        if bda_mat is not None:
            # bda_mat: (1, 4, 4) -> (1, 1, 4, 4) -> (1, 6, 4, 4) -> (1, 6, 1, 1, 1, 4, 4)
            bda_mat = bda_mat.unsqueeze(1).repeat(1, num_cams, 1, 1).view(
                batch_size, num_cams, 1, 1, 1, 4, 4)
            points = (bda_mat @ points).squeeze(-1)  # output is (1, 6, 112, 16, 44, 4)
        else:
            points = points.squeeze(-1)
        # The last dim is (x_coords, y_coords, d_coords, paddings), so only the first three are returned
        return points[..., :3]  # output is (1, 6, 112, 16, 44, 3)
self.frustum is registered via self.register_buffer() from the output of self.create_frustum():
# Register the frustum returned by self.create_frustum() as self.frustum
self.register_buffer('frustum', self.create_frustum())
The register_buffer() method, self.register_buffer('name', tensor), defines a group of parameters that are not updated during training (i.e. they do not change after optimizer.step() and can only be changed manually), but that are still an integral part of the model's state.
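A minimal illustration of my own (not from the repo) of what register_buffer gives you: the tensor moves with the module and is saved in the state_dict, but it gets no gradient and the optimizer never touches it:
import torch
import torch.nn as nn

class WithBuffer(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer('offsets', torch.arange(3, dtype=torch.float))
        self.linear = nn.Linear(3, 1)

    def forward(self, x):
        return self.linear(x + self.offsets)    # the buffer participates in forward

m = WithBuffer()
print([n for n, _ in m.named_parameters()])      # only linear.weight / linear.bias
print([n for n, _ in m.named_buffers()])         # ['offsets']
print('offsets' in m.state_dict())               # True: saved and loaded with the model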
    def create_frustum(self):
        """Generate frustum"""
        # make grid in image plane
        # final_dim={tuple:2} (256, 704)
        ogfH, ogfW = self.final_dim
        # self.downsample_factor=16, so (fH, fW)=(16, 44); self.d_bound={list:3} [2.0, 58.0, 0.5]
        fH, fW = ogfH // self.downsample_factor, ogfW // self.downsample_factor
        # Use self.d_bound to build a Tensor of shape (112,) ((58.0 - 2.0) / 0.5 = 112 depth bins),
        # view it as (112, 1, 1), then expand it to (112, 16, 44)
        d_coords = torch.arange(*self.d_bound,
                                dtype=torch.float).view(-1, 1,
                                                        1).expand(-1, fH, fW)  # output is (112, 16, 44)
        # D=112
        D, _, _ = d_coords.shape
        # Build a 1-D tensor of 44 values from 0 to 703, i.e. (44,), view it as (1, 1, 44), then expand to (112, 16, 44)
        x_coords = torch.linspace(0, ogfW - 1, fW, dtype=torch.float).view(
            1, 1, fW).expand(D, fH, fW)  # output is (112, 16, 44)
        y_coords = torch.linspace(0, ogfH - 1, fH,
                                  dtype=torch.float).view(1, fH,
                                                          1).expand(D, fH, fW)  # output is (112, 16, 44)
        # Generate paddings, (112, 16, 44)
        paddings = torch.ones_like(d_coords)
        # D x H x W x 3 (comment from the original source; it looks wrong, the last dim is 4)
        # The four tensors are stacked along a new last dim, so the output is (112, 16, 44, 4)
        frustum = torch.stack((x_coords, y_coords, d_coords, paddings), -1)
        return frustum
The expand() method broadcasts a tensor to a larger size along its singleton dims.
torch.ones_like() creates an all-ones tensor with the same shape as its input.
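A small example of the two helpers used above (toy sizes, not the real 112 x 16 x 44 grid):
import torch

d = torch.arange(2.0, 5.0, 1.0)           # like torch.arange(*d_bound): tensor([2., 3., 4.])
d = d.view(-1, 1, 1).expand(-1, 2, 3)     # expand repeats the singleton dims without copying data
print(d.shape)                            # torch.Size([3, 2, 3])

pad = torch.ones_like(d)                  # all-ones tensor with the same shape (and dtype/device)
grid = torch.stack((d, pad), -1)          # stack along a new last dim
print(pad.shape, grid.shape)              # torch.Size([3, 2, 3]) torch.Size([3, 2, 3, 2])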
2.1.2.4 The voxel_pooling() method:
class VoxelPooling(Function):

    @staticmethod
    def forward(ctx, geom_xyz: torch.Tensor, input_features: torch.Tensor,
                voxel_num: torch.Tensor):
        """Forward function for `voxel pooling`.

        Args:
            geom_xyz (Tensor): xyz coord for each voxel with the shape
                of [B, N, 3].
            input_features (Tensor): feature for each voxel with the
                shape of [B, N, C].
            voxel_num (Tensor): Number of voxels for each dim with the
                shape of [3].

        Returns:
            Tensor: (B, C, H, W) bev feature map.
        """
        # geom_xyz={Tensor:(1, 6, 112, 16, 44, 3)}, input_features={Tensor:(1, 6, 112, 16, 44, 80)}, voxel_num={Tensor:(3,)}
        assert geom_xyz.is_contiguous()
        assert input_features.is_contiguous()
        # no gradient for input_features and geom_feats
        # (this does not change the shape of geom_xyz)
        ctx.mark_non_differentiable(geom_xyz)
        # grad_input_features: an all-zero tensor with the same shape as input_features
        grad_input_features = torch.zeros_like(input_features)
        # Flatten every dim except dim=0 and dim=-1
        geom_xyz = geom_xyz.reshape(geom_xyz.shape[0], -1, geom_xyz.shape[-1])  # output is (1, 473088, 3)
        input_features = input_features.reshape(
            (geom_xyz.shape[0], -1, input_features.shape[-1]))  # output is (1, 473088, 80)
        assert geom_xyz.shape[1] == input_features.shape[1]
        batch_size = input_features.shape[0]
        num_points = input_features.shape[1]
        num_channels = input_features.shape[2]
        output_features = input_features.new_zeros(batch_size, voxel_num[1],
                                                   voxel_num[0], num_channels)  # output is (1, 128, 128, 80)
        # Save the position of bev_feature_map for each input point.
        pos_memo = geom_xyz.new_ones(batch_size, num_points, 3) * -1  # output is (1, 473088, 3)
        voxel_pooling_ext.voxel_pooling_forward_wrapper(
            batch_size,
            num_points,
            num_channels,
            voxel_num[0],
            voxel_num[1],
            voxel_num[2],
            geom_xyz,
            input_features,
            output_features,
            pos_memo,
        )  # fills output_features, shape (1, 128, 128, 80)
        # save grad_input_features and pos_memo for backward
        ctx.save_for_backward(grad_input_features, pos_memo)
        return output_features.permute(0, 3, 1, 2)
In the static methods, ctx is used to stash variables so they can be retrieved in backward.
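A minimal custom autograd Function of my own (unrelated to the repo's CUDA op) showing the same ctx pattern: save tensors in forward, read them back in backward:
import torch
from torch.autograd import Function

class ScaleBySum(Function):
    @staticmethod
    def forward(ctx, x, w):
        ctx.save_for_backward(x, w)      # stash the inputs for the backward pass
        return x * w.sum()

    @staticmethod
    def backward(ctx, grad_out):
        x, w = ctx.saved_tensors         # retrieve what forward saved
        grad_x = grad_out * w.sum()
        grad_w = (grad_out * x).sum().expand_as(w)
        return grad_x, grad_w

x = torch.randn(3, requires_grad=True)
w = torch.ones(2, requires_grad=True)
ScaleBySum.apply(x, w).sum().backward()
print(x.grad.shape, w.grad.shape)        # torch.Size([3]) torch.Size([2])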
2.2 The forward function of the BEVDepthHead class:
    def forward(self, x):
        """Forward pass.

        Args:
            feats (list[torch.Tensor]): Multi-level features, e.g.,
                features produced by FPN.

        Returns:
            tuple(list[dict]): Output results for tasks.
        """
        # FPN
        trunk_outs = [x]
        if self.trunk.deep_stem:
            x = self.trunk.stem(x)
        else:
            x = self.trunk.conv1(x)  # output is (1, 160, 64, 64)
            x = self.trunk.norm1(x)  # output is (1, 160, 64, 64)
            x = self.trunk.relu(x)
        for i, layer_name in enumerate(self.trunk.res_layers):
            res_layer = getattr(self.trunk, layer_name)
            x = res_layer(x)
            if i in self.trunk.out_indices:
                trunk_outs.append(x)
        fpn_output = self.neck(trunk_outs)
        ret_values = super().forward(fpn_output)
        return ret_values