def extract_camera_features(
self,
x,
points,
radar_points,
camera2ego,
lidar2ego,
lidar2camera,
lidar2image,
camera_intrinsics,
camera2lidar,
img_aug_matrix,
lidar_aug_matrix,
img_metas,
gt_depths=None,
    ) -> torch.Tensor:  # returns a PyTorch tensor
        B, N, C, H, W = x.size()  # batch size, number of cameras, channels, height, width of the input images
        x = x.view(B * N, C, H, W)  # fold the camera dimension into the batch so the backbone sees ordinary 4-D input
        # extract features with the camera backbone
        x = self.encoders["camera"]["backbone"](x)
        # refine the multi-scale features with the neck
        x = self.encoders["camera"]["neck"](x)
        # some necks return multiple outputs; in that case keep only the first feature map
        if not isinstance(x, torch.Tensor):
            x = x[0]
        BN, C, H, W = x.size()  # shape of the features after backbone and neck
        x = x.view(B, int(BN / B), C, H, W)  # restore (B*N, C, H, W) back to (B, N, C, H, W)
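        # Illustrative shape walk-through (the concrete numbers are assumptions,
        # not taken from any config): with B = 2 samples and N = 6 cameras, an
        # input of (2, 6, 3, 256, 704) becomes (12, 3, 256, 704) for the
        # backbone; if the neck outputs (12, 256, 32, 88), the view() above
        # restores (2, 6, 256, 32, 88) before the view transform lifts it to BEV.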
        # process the features with the vtransform (view-transform) module,
        # which fuses information from the different sensors (point cloud, radar, camera)
x = self.encoders["camera"]["vtransform"](
x,
points,
radar_points,
camera2ego,
lidar2ego,
lidar2camera,
lidar2image,
camera_intrinsics,
camera2lidar,
img_aug_matrix,
lidar_aug_matrix,
img_metas,
depth_loss=self.use_depth_loss,
            gt_depths=gt_depths,  # ground-truth depth labels (if available)
)
        return x  # return the fused camera feature tensor
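    # Note (inferred from the call site in forward_single below): when
    # self.use_depth_loss is True the vtransform also returns the depth-loss
    # term, so the result of extract_camera_features is indexable, with the BEV
    # feature as its first element and the depth loss as its last element.
    # Rough usage sketch (hypothetical shapes):
    #   feat = self.extract_camera_features(img, points, radar, ...)  # e.g. (B, C_bev, H_bev, W_bev)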
def voxelize(self, points, sensor):
        # collect per-sample voxel features (feats), voxel coordinates (coords) and voxel sizes (sizes)
        feats, coords, sizes = [], [], []
        for k, res in enumerate(points):  # voxelize the point cloud of each sample in the batch
            # run the voxelization layer of the given sensor (LiDAR or radar)
            ret = self.encoders[sensor]["voxelize"](res)
            if len(ret) == 3:
                # hard voxelization: returns voxel features, voxel coordinates and per-voxel point counts
                f, c, n = ret
            else:  # otherwise it is a "soft" voxelization without per-voxel point counts
                assert len(ret) == 2
                f, c = ret
                n = None
            feats.append(f)  # store this sample's voxel features
            # prepend the sample index k to every voxel coordinate with F.pad,
            # so each voxel's batch membership is preserved after concatenation
            coords.append(F.pad(c, (1, 0), mode="constant", value=k))
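            # Example with hypothetical values: for the sample with k = 1, a
            # coordinate row (assumed (z, y, x) order) of [5, 120, 80] becomes
            # [1, 5, 120, 80] after F.pad(c, (1, 0), value=1), i.e. (batch_idx, z, y, x).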
            if n is not None:  # only hard voxelization provides per-voxel point counts
                sizes.append(n)
        feats = torch.cat(feats, dim=0)  # concatenate the voxel features of all samples along dim 0
        coords = torch.cat(coords, dim=0)  # concatenate the voxel coordinates of all samples along dim 0
        if len(sizes) > 0:  # concatenate the per-voxel point counts if they exist
            sizes = torch.cat(sizes, dim=0)
            if self.voxelize_reduce:
                feats = feats.sum(dim=1, keepdim=False) / sizes.type_as(feats).view(
                    -1, 1
                )
                feats = feats.contiguous()
        return feats, coords, sizes  # voxel features, voxel coordinates, per-voxel point counts
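    # Rough shape sketch (assuming hard voxelization with M voxels in total,
    # at most P points per voxel and point features of dimension D):
    #   feats  : (M, P, D) before reduction, (M, D) after voxelize_reduce averaging
    #   coords : (M, 4) as (batch_idx, z, y, x)
    #   sizes  : (M,) number of real points in each voxel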
def extract_features(self, x, sensor) -> torch.Tensor:
        # voxelize the input data (LiDAR or radar points):
        # returns voxel features (feats), voxel coordinates (coords) and per-voxel point counts (sizes)
        feats, coords, sizes = self.voxelize(x, sensor)
        # derive the batch size from the voxel coordinates:
        # coords[-1, 0] is the batch index of the last voxel, so adding 1 gives the batch size
        batch_size = coords[-1, 0] + 1
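        # Note: this relies on voxelize() appending samples in order, so the last
        # row of coords carries the largest batch index; the result is a 0-d
        # tensor rather than a Python int.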
        # encode the voxel features with the backbone of the given sensor
        x = self.encoders[sensor]["backbone"](feats, coords, batch_size, sizes=sizes)
        return x  # encoded feature tensor
def forward_single(
self,
img,
points,
camera2ego,
lidar2ego,
lidar2camera,
lidar2image,
camera_intrinsics,
camera2lidar,
img_aug_matrix,
lidar_aug_matrix,
metas,
depths=None,
radar=None,
gt_masks_bev=None,
gt_bboxes_3d=None,
gt_labels_3d=None,
**kwargs,
):
        features = []  # features from the individual sensors
        auxiliary_losses = {}  # auxiliary losses (e.g. the depth loss)
        for sensor in (  # iterate over all sensors; the order is reversed at inference time
            self.encoders if self.training else list(self.encoders.keys())[::-1]
        ):
            if sensor == "camera":
                # extract camera features and lift them into BEV using the cross-sensor calibration
feature = self.extract_camera_features(
img,
points,
radar,
camera2ego,
lidar2ego,
lidar2camera,
lidar2image,
camera_intrinsics,
camera2lidar,
img_aug_matrix,
lidar_aug_matrix,
metas,
gt_depths=depths,
)
                if self.use_depth_loss:  # split the depth loss off from the camera feature
                    feature, auxiliary_losses['depth'] = feature[0], feature[-1]
            elif sensor == "lidar":
                feature = self.extract_features(points, sensor)
            elif sensor == "radar":
                feature = self.extract_features(radar, sensor)
            else:
                raise ValueError(f"unsupported sensor: {sensor}")
            features.append(feature)  # collect the feature of each sensor
        # at inference the encoders were traversed in reverse order to keep peak
        # memory low, so reverse the feature list back to the original order
        if not self.training:
            # avoid OOM
            features = features[::-1]
        # fuse the per-sensor features if a fuser module is configured
        if self.fuser is not None:
            _, x, _, _ = self.fuser(features)
        else:  # without a fuser, exactly one sensor feature is expected
            assert len(features) == 1, features
            x = features[0]
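        # The fuser used here returns four values; only the second one is kept
        # as the fused BEV feature for the decoder. Without a fuser, the single
        # sensor feature is used directly.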
        batch_size = x.shape[0]  # batch size of the fused BEV feature
        # refine the BEV feature with the decoder backbone and neck
        x = self.decoder["backbone"](x)
        x = self.decoder["neck"](x)
        # in training mode, compute the losses
if self.training:
            outputs = {}  # training outputs
            for type, head in self.heads.items():
                if type == "object":  # 3-D object detection head
                    pred_dict = head(x, metas)  # generate predictions
                    losses = head.loss(gt_bboxes_3d, gt_labels_3d, pred_dict)  # compute the detection losses
                elif type == "map":  # BEV map segmentation head
                    losses = head(x, gt_masks_bev)  # the segmentation head returns its losses directly
                else:  # unsupported head type
                    raise ValueError(f"unsupported head: {type}")
                for name, val in losses.items():  # store losses and statistics in the output dict
if val.requires_grad:
outputs[f"loss/{type}/{name}"] = val * self.loss_scale[type]
else:
outputs[f"stats/{type}/{name}"] = val
if self.use_depth_loss:
if 'depth' in auxiliary_losses:
outputs["loss/depth"] = auxiliary_losses['depth']
else:
raise ValueError('Use depth loss is true, but depth loss not found')
return outputs # 返回训练输出的损失字典
        # in inference mode, produce the prediction results
        else:
            outputs = [{} for _ in range(batch_size)]  # one empty dict per sample
            for type, head in self.heads.items():
                if type == "object":
                    pred_dict = head(x, metas)  # generate predictions
                    bboxes = head.get_bboxes(pred_dict, metas)  # decode the predicted 3-D bounding boxes
                    # store each sample's 3-D box results in its output dict
for k, (boxes, scores, labels) in enumerate(bboxes):
outputs[k].update(
{
"boxes_3d": boxes.to("cpu"),
"scores_3d": scores.cpu(),
"labels_3d": labels.cpu(),
}
)
elif type == "map":
                    logits = head(x)  # predicted BEV segmentation masks
                    for k in range(batch_size):  # store each sample's BEV mask results in its output dict
                        outputs[k].update(
                            {
                                "masks_bev": logits[k].cpu(),
                                "gt_masks_bev": gt_masks_bev[k].cpu(),
                            }
                        )
                else:  # unsupported head type
                    raise ValueError(f"unsupported head: {type}")
            return outputs  # list of per-sample prediction dicts
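    # At inference, forward_single returns one dict per sample; the keys that
    # actually appear depend on the configured heads, e.g.:
    #   {
    #       "boxes_3d": predicted 3-D boxes moved to CPU,
    #       "scores_3d": detection scores,
    #       "labels_3d": class indices,
    #       "masks_bev": predicted BEV segmentation logits,
    #       "gt_masks_bev": ground-truth BEV masks,
    #   }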
def forward(
self,
img,
points,
        camera2ego,  # camera-to-ego transformation matrices
        lidar2ego,  # LiDAR-to-ego transformation matrices
        lidar2camera,  # LiDAR-to-camera transformation matrices
        lidar2image,  # LiDAR-to-image transformation matrices (project points onto the images)
        camera_intrinsics,  # camera intrinsic matrices, mapping 3-D points onto the image plane
        camera2lidar,  # camera-to-LiDAR transformation matrices
        img_aug_matrix,  # image augmentation matrices (transforms applied to the images)
        lidar_aug_matrix,  # point-cloud augmentation matrices (transforms applied to the points)
        metas,  # meta information, e.g. sample IDs and timestamps
        depths,  # depth information (depth maps) associated with the camera images
        radar=None,  # optional radar points for multi-sensor fusion
        gt_masks_bev=None,  # optional ground-truth BEV masks for segmentation supervision
        gt_bboxes_3d=None,  # optional ground-truth 3-D bounding boxes for training supervision
        gt_labels_3d=None,  # optional class labels of the 3-D bounding boxes
**kwargs,
):
        if isinstance(img, list):  # image lists are not supported
raise NotImplementedError
else:
outputs = self.forward_single(
img,
points,
camera2ego,
lidar2ego,
lidar2camera,
lidar2image,
camera_intrinsics,
camera2lidar,
img_aug_matrix,
lidar_aug_matrix,
metas,
depths,
radar,
gt_masks_bev,
gt_bboxes_3d,
gt_labels_3d,
**kwargs,
)
return outputs
def get_voxel_and_fused_feature_teacher(
self,
img,
points,
camera2ego,
lidar2ego,
lidar2camera,
lidar2image,
        camera_intrinsics,  # camera intrinsic matrices, used to project 3-D points onto the image plane
camera2lidar,
img_aug_matrix,
lidar_aug_matrix,
metas,
depths=None,
radar=None,
**kwargs,
):
features = []
for sensor in (
self.encoders if self.training else list(self.encoders.keys())[::-1]
):
if sensor == "camera":
feature = self.extract_camera_features(
img,
points,
radar,
camera2ego,
lidar2ego,
lidar2camera,
lidar2image,
camera_intrinsics,
camera2lidar,
img_aug_matrix,
lidar_aug_matrix,
metas,
gt_depths=depths,
)
if self.use_depth_loss:
feature = feature[0]
elif sensor == "lidar":
                # feature = self.extract_features(points, sensor)
                # voxelize the LiDAR point cloud
                feats_tea, coords_tea, sizes_tea = self.voxelize(points, sensor)
                batch_size_tea = coords_tea[-1, 0] + 1  # batch size from the batch index of the last voxel
                feature = self.encoders[sensor]["backbone"](feats_tea, coords_tea, batch_size_tea, sizes=sizes_tea)  # encode the teacher voxels with the LiDAR backbone
elif sensor == "radar":
feature = self.extract_features(radar, sensor)
else:
raise ValueError(f"unsupported sensor: {sensor}")
            features.append(feature)  # collect the feature of the current sensor
if not self.training:
# avoid OOM
features = features[::-1]
        if self.fuser is not None:  # fuse the multi-sensor features if a fuser module exists
            _, x, _, _ = self.fuser(features)
        else:  # otherwise there must be exactly one sensor feature, which is used directly
            assert len(features) == 1, features
            x = features[0]
        # return the fused feature together with the LiDAR voxelization results (for later use)
return x, feats_tea, coords_tea, sizes_tea
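    # Sketch of how this teacher pathway is typically consumed (an assumption
    # based on the "teacher" / "_tea" naming, not stated in this file): the
    # fused BEV feature x can supervise a student model, while feats_tea /
    # coords_tea / sizes_tea expose the raw LiDAR voxelization so voxel-level
    # distillation losses can be computed against the student. Note that these
    # variables are only defined when a "lidar" encoder is present, which this
    # method implicitly assumes.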