CenterFusion训练过程中数据处理

原创已于 2023-04-16 17:32:33 修改 · 1.8k 阅读

27 ·

CC 4.0 BY-SA版权

文章标签：

#python #开发语言 #计算机视觉 #自动驾驶

于 2023-02-21 22:43:04 首次发布

毫米波雷达和图像融合的3D目标检测专栏收录该内容

7 篇文章

订阅专栏

部署运行你感兴趣的模型镜像

数据处理文件在centerfusion/src/lib/dataset/datasets/generic_dataset.py下GenericDataset类中
首先在经过convert_nuScenes.py后会生成coco格式的json文件，分别为train.json,val.json,mini_train.json,mini_val.json,test.json这五个文件（这里顺便提一下由于生成的train.json和val.json比较大所以在训练的时候需要比较多的内存来加载这两个数据集，如果是申请的云服务器或者docker需要多申请一些内存，应该在80g以上可以满足需求。这里调试过程使用的是mini_train.json和mini_val.json）
接下里主要分析__getitem__()这个类大约在99行左右

  def __getitem__(self, index):
    opt = self.opt
    img, anns, img_info, img_path = self._load_data(index)
    height, width = img.shape[0], img.shape[1]

首先介绍__load_data(self,index)函数，该函数根据传入的index来获取coco实例中的数据，在GenericDataset类的初始化中，已经用train.json和val.json初始化一个coco数据集对象。

 def _load_data(self, index):
    coco = self.coco                   #初始化的coco数据集对象，传入的就是生成的json文件后获得self.coco对象
    img_dir = self.img_dir             #nuScenes数据集存放的位置
    img_id = self.images[index]        #self.images保存着所有的图像等数据的索引，这里根据传入的索引用于之后获取图像等数据
    img, anns, img_info, img_path = self._load_image_anns(img_id, coco, img_dir)
    #调用函数来得到image,anns等信息
    return img, anns, img_info, img_path

def _load_image_anns(self, img_id, coco, img_dir):
    img_info = coco.loadImgs(ids=[img_id])[0] #根据上边的image_id获取coco实例中的一个样本,
     '''
      img_info的格式和意义如下：
      参考: https://blog.youkuaiyun.com/ssj925319/article/details/124391041
      'id'：图片的id（基于全部的相机图片）
      'file_name'：图片的文件名
      'calib'：当前传感器的内参 3×4 矩阵
      'video_id'：自定义的图片所属场景 scene 的 id （外键）
      'frame_id'：将当前相机传感器所拍摄的图片数量作为自定义 id
      'sensor_id'：自定义的传感器外键
      'sample_token'：sample 外键
      'trans_matrix'：传感器相对于全局坐标系姿态复合变化矩阵列表化
      'velocity_trans_matrix'：传感器相对于全局坐标系旋转矩阵列表化
      'width'：图片的宽度
      'height'：图片的高度
      'pose_record_trans'：车辆的原始 translation 数据
      'pose_record_rot'：车辆的原始四元素 rotation 数据
      'cs_record_trans'：传感器的原始 translation 数据
      'cs_record_rot'：传感器的原始 rotation 数据
      'radar_pc'：当前相机传感器所对应的完整雷达点云图列表化
      'camera_intrinsic'：box 在当前相机传感器的坐标框架列表化
    '''
    file_name = img_info['file_name']
    img_path = os.path.join(img_dir, file_name)
    ann_ids = coco.getAnnIds(imgIds=[img_id])
    anns = copy.deepcopy(coco.loadAnns(ids=ann_ids))
    '''
      ann 的格式和意义如下：
      参考: https://blog.youkuaiyun.com/ssj925319/article/details/124391041
      'id': 有效 box 的 id
      'image_id': 当前 box 所属的图片 id
      'category_id': box 所属类别的 id
      'dim': box 的长宽高
      'location': box 的中心点
      'depth': box 的深度
      'occluded': 暂时不知道
      'truncated': 暂时不知道
      'rotation_y': 绕相机坐标系 y 轴旋转（观测角）
      'amodel_center': box 在图像中的中心点位置
      'iscrowd': 道路是否拥挤，0 为不拥挤 1 为拥挤
      'track_id': 被检测目标物体 box 的 id
      'attributes': 目标物体状态的 id
      'velocity': 目标物体在全局坐标系的速度
      'velocity_cam': box 在相机坐标系中的速度
    '''
    img = cv2.imread(img_path)
    return img, anns, img_info, img_path

目前获得了img,anns,image_info(convent_nuscense.py生成内容),该index对应的图像的图像位置。
代码继续

    c = np.array([img.shape[1] / 2., img.shape[0] / 2.], dtype=np.float32)
    s = max(img.shape[0], img.shape[1]) * 1.0 if not self.opt.not_max_crop \
      else np.array([img.shape[1], img.shape[0]], np.float32)
    aug_s, rot, flipped = 1, 0, 0

    ## data augmentation for training set
    if 'train' in self.split:
      c, aug_s, rot = self._get_aug_param(c, s, width, height)
      s = s * aug_s
      if np.random.random() < opt.flip: #判断是否要对图像进行垂直翻转
        flipped = 1
        img = img[:, ::-1, :]
        anns = self._flip_anns(anns, width)

对于训练数据进行数据增强

def _get_aug_param(self, c, s, width, height, disturb=False):
    if (not self.opt.not_rand_crop) and not disturb:          
      aug_s = np.random.choice(np.arange(0.6, 1.4, 0.1))             
      w_border = self._get_border(128, width)
      h_border = self._get_border(128, height)
      c[0] = np.random.randint(low=w_border, high=width - w_border)
      c[1] = np.random.randint(low=h_border, high=height - h_border)
      '''
      如果对图像进行裁剪的话，首先生成一个新的aug_s（比例是一个小数），之后分别在宽高方向生成裁剪的大小，最后在裁剪的区域内生成新的中心点
      '''
    else:
      sf = self.opt.scale
      cf = self.opt.shift
      # if type(s) == float:
      #   s = [s, s]
      temp = np.random.randn()*cf
      c[0] += s * np.clip(temp, -2*cf, 2*cf)
      c[1] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
      aug_s = np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
	  '''
	  如果不进行裁剪的话，直接在图像本身上生成新的中心点，以及aug_s（比例是一个小数）
	  '''
    if np.random.random() < self.opt.aug_rot:          #是否要进行旋转
      rf = self.opt.rotate
      rot = np.clip(np.random.randn()*rf, -rf*2, rf*2)
    else:
      rot = 0
    
    return c, aug_s, rot

通过上述获得了新的参数来对图像进行仿射变换
代码继续

	trans_input = get_affine_transform(
      c, s, rot, [opt.input_w, opt.input_h])
    trans_output = get_affine_transform(
      c, s, rot, [opt.output_w, opt.output_h])
    inp = self._get_input(img, trans_input)
    ret = {'image': inp}
    gt_det = {'bboxes': [], 'scores': [], 'clses': [], 'cts': []}

def get_affine_transform(center,
                         scale,
                         rot,
                         output_size,
                         shift=np.array([0, 0], dtype=np.float32),
                         inv=0):
    if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
        scale = np.array([scale, scale], dtype=np.float32)

    scale_tmp = scale
    src_w = scale_tmp[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    rot_rad = np.pi * rot / 180
    src_dir = get_dir([0, src_w * -0.5], rot_rad)                  #调用旋转矩阵，但是由于旋转角度为0所以不发生变化
    dst_dir = np.array([0, dst_w * -0.5], np.float32)

    src = np.zeros((3, 2), dtype=np.float32)
    dst = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale_tmp * shift                         #第一个点 中心点加上平移
    src[1, :] = center + src_dir + scale_tmp * shift               #第二个点 中心点加上平移后在h方向向上平移
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5], np.float32) + dst_dir

    src[2:, :] = get_3rd_point(src[0, :], src[1, :])               #第三个点 由第一和第二个点生成
    dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])
    '''
    在原图和输入网络的图像中找三对对应点，从而生成仿射变换的矩阵，用于原始图像的数据增强
    首先对原图的中心点进行平移，记作c'
    第一个点：s'和目标图形的中心点d',坐标分别记为(x1,y1),(x2,y2),
    第二个点：第二个点的坐标为(x1,y1-800),(x2,y2-400)，由于目标图像相对于原始图像缩小了一倍所以这里对应的减去了一半所产生的点为(a1,b1)(a2,b2)
    第三个点：首先根据第一个点和第二个点两个向量相减生成一个方向，之后用第二个点的向量加上该方向生成第三个点
             对于原始图像，第三个点首先生成方向向量(a1-x1,b1-y1),之后再用第二个点加上上述方向向量调整后的值(a1-b1+y1,b1+a1-x1)
             对于目标图像也进行相同的操作
    之后根据上述三对点生成对应的仿射变换矩阵用于输入图像的数据增强
    '''
    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return trans

旋转矩阵可以参考：https://blog.youkuaiyun.com/ahelloyou/article/details/108903506
讲述的三维的，代码中使用的是二维的，只需要把上述文章中的第三列和第三章去掉就是代码中的公式
仿射变换函数可以参考：https://blog.youkuaiyun.com/weixin_45335726/article/details/122531876
需要输入原图像和目标图像的三对点就可以得到一个图像的变化函数

def _get_input(self, img, trans_input):
    inp = cv2.warpAffine(img, trans_input, 
                        (self.opt.input_w, self.opt.input_h),
                        flags=cv2.INTER_LINEAR)
    
    inp = (inp.astype(np.float32) / 255.)
    if 'train' in self.split and not self.opt.no_color_aug:
      color_aug(self._data_rng, inp, self._eig_val, self._eig_vec)
    inp = (inp - self.mean) / self.std
    inp = inp.transpose(2, 0, 1)
    '''
    对原始图像利用生成的仿射变换函数进行变换得到输入图像(800*448*3)
    之后对图像进行颜色增强在亮度，对比度，和饱和度上进行增强：
    1、首先用图像生成对应的灰度图
    2、用通过处理灰度图来和原始图像来对图像进行颜色增强，具体可以参考代码中的颜色增强函数
    最后将图像进行均值方差归一化后在将图像维度转换成(C,H,W)作为网络的输入
    '''
    return inp

cv2.warpAffine可以参考：https://blog.youkuaiyun.com/m0_51545690/article/details/123959995
至此，完成了对图像部分数据处理主要有一下几个步骤
1、通过利用coco实例化对象获取图像以及相关数据的信息
2、获取图像数据增强的相关参数：中心点c，尺度scale，旋转rotia和翻转flip
3、根据上述参数生成仿射矩阵来对图像进行仿射变换
4、生成的放射图像进行颜色增强

代码继续
接下来介绍点云处理部分

    #  load point cloud data
    if opt.pointcloud:
      pc_2d, pc_N, pc_dep, pc_3d = self._load_pc_data(img, img_info, 
        trans_input, trans_output, flipped)
      ret.update({ 'pc_2d': pc_2d,
                   'pc_3d': pc_3d,
                   'pc_N': pc_N,
                   'pc_dep': pc_dep })

首先对点与数据进行加载并处理

  def _load_pc_data(self, img, img_info, inp_trans, out_trans, flipped=0):
    #点云原始特征一共有18维
    #x y z dyn_prop id rcs vx vy vx_comp vy_comp is_quality_valid ambig_state x_rms y_rms invalid_state pdh0 vx_rms vy_rms
    #这里在加载点云数据的时候已经转换到了相机坐标系下，所以此时的第三维是深度信息，xy分别是相机坐标系下的坐标
    img_height, img_width = img.shape[0], img.shape[1]
    radar_pc = np.array(img_info.get('radar_pc', None)) #加载点云数据
    if radar_pc is None:
      return None, None, None, None

    # calculate distance to points
    depth = radar_pc[2,:]
    
    # filter points by distance
    if self.opt.max_pc_dist > 0:                  #过滤点云中距离过近的点
      mask = (depth <= self.opt.max_pc_dist)
      radar_pc = radar_pc[:,mask]
      depth = depth[mask]

    # add z offset to radar points
    if self.opt.pc_z_offset != 0:                #原始设置没有执行这一步
      radar_pc[1,:] -= self.opt.pc_z_offset
    
    # map points to the image and filter ones outside
    pc_2d, mask = map_pointcloud_to_image(radar_pc, np.array(img_info['camera_intrinsic']), 
                              img_shape=(img_info['width'],img_info['height']))
    '''
    这里主要注释一下，因为论文作者在进行数据转换的时候（convert_Nuscenes.py）就已经将雷达点转换到向相机坐标系中
    所以在这里只是需要把相机坐标系根据相机的内参转换到图像坐标系就可以了
    同时再转换过后有可能有的点不在图像范围内上述函数也将这些点给去掉了
    '''
    pc_3d = radar_pc[:,mask]

    # sort points by distance
    ind = np.argsort(pc_2d[2,:])
    pc_2d = pc_2d[:,ind]
    pc_3d = pc_3d[:,ind]

    # flip points if image is flipped
    if flipped:
      pc_2d = self._flip_pc(pc_2d,  img_width)
      pc_3d[0,:] *= -1  # flipping the x dimension
      pc_3d[8,:] *= -1  # flipping x velocity (x is right, z is front)

    pc_2d, pc_3d, pc_dep = self._process_pc(pc_2d, pc_3d, img, inp_trans, out_trans, img_info)
    pc_N = np.array(pc_2d.shape[1])

def _process_pc(self, pc_2d, pc_3d, img, inp_trans, out_trans, img_info):    
    img_height, img_width = img.shape[0], img.shape[1]

    # transform points
    mask = None
    if len(self.opt.pc_feat_lvl) > 0:
      pc_feat, mask = self._transform_pc(pc_2d, out_trans, self.opt.output_w, self.opt.output_h)
      '''
      这里由于作者论文中是将雷达处理后的到的特征与图像中用于回归的特则进行拼接，
      所以这里的数据转换就是对投影到图像上的雷达点转换到输出特征维度上，生成与图像特征相同大小的雷达特征
      同时将转换后的雷达点不在特征宽高内的点进行过滤
      '''
      pc_hm_feat = np.zeros((len(self.opt.pc_feat_lvl), self.opt.output_h, self.opt.output_w), np.float32)
      #用于保存生成的雷达特征
    if mask is not None:
      pc_N = np.array(sum(mask))
      pc_2d = pc_2d[:,mask]
      pc_3d = pc_3d[:,mask]
    else:
      pc_N = pc_2d.shape[1]

    # create point cloud pillars
    if self.opt.pc_roi_method == "pillars":
      pillar_wh = self.create_pc_pillars(img, img_info, pc_2d, pc_3d, inp_trans, out_trans)

create_pc_pillars用于生成论文论文中的柱状

  def create_pc_pillars(self, img, img_info, pc_2d, pc_3d, inp_trans, out_trans):
    pillar_wh = np.zeros((2, pc_3d.shape[1]))
    boxes_2d = np.zeros((0,8,2))
    pillar_dim = self.opt.pillar_dims                 #保存柱状图形的长宽高 0 1 2分别表示h,w,l
    v = np.dot(np.eye(3), np.array([1,0,0]))
    ry = -np.arctan2(v[2], v[0])

    for i, center in enumerate(pc_3d[:3,:].T):
      # Create a 3D pillar at pc location for the full-size image
      box_3d = compute_box_3d(dim=pillar_dim, location=center, rotation_y=ry)
      '''
      为每个雷达点生成柱状体
      1、首先计算物体的在照相机坐标系下，物体的全局方向角，用于表示物体的前进方向
      （我感觉这里是想每个生成的柱状体根据这个角度进行旋转，但是貌似作者这里根据角度生成的是一个单位阵，并没有发生变化）
      2、根据该方位角生成一个三维的旋转矩阵
      3、根据柱状体每个维度上的长度分别生成xyz三个坐标的8个点,这里为每个点生成的柱状体都是一样的
      4、将生成的柱状体移动到雷达点上，生成柱状体的坐标与雷达的坐标相加
      通过上述过程得到了每个雷达点柱状体
      '''
      box_2d = project_to_image(box_3d, img_info['calib']).T  # [2x8]        
      '''
      由于生成柱状体的过程都是基于雷达三位点生成的也就是摄像机的坐标系下不是图像的坐标系，同时生成的是3d柱状体
      所以这里将3d柱状体投影到图像坐标系下，以上述雷达投影方式相同
      由于3d的柱状体是8个点所以投影下来就是2*8大小的矩阵
      '''
      ## save the box for debug plots
      if self.opt.debug:
        box_2d_img, m = self._transform_pc(box_2d, inp_trans, self.opt.input_w, 
                                            self.opt.input_h, filter_out=False)
        boxes_2d = np.concatenate((boxes_2d, np.expand_dims(box_2d_img.T,0)),0)

      # transform points
      box_2d_t, m = self._transform_pc(box_2d, out_trans, self.opt.output_w, self.opt.output_h)
      #将生成的柱状体转换和图像特征相同的维度
      if box_2d_t.shape[1] <= 1:
        continue

      # get the bounding box in [xyxy] format
      bbox = [np.min(box_2d_t[0,:]), 
              np.min(box_2d_t[1,:]), 
              np.max(box_2d_t[0,:]), 
              np.max(box_2d_t[1,:])] # format: xyxy
	 #这里由于投影下来是8个点，但是对于2d来说只需要四个点就够了，所以就去了最外围的点围成的最大的框来表示柱状体的投影特征
      # store height and width of the 2D box
      pillar_wh[0,i] = bbox[2] - bbox[0]
      pillar_wh[1,i] = bbox[3] - bbox[1]
    #这里删除了debug部分
    return pillar_wh

生成的部分结果图如下
在进行模型训练的时候加上–debug 4就可以显示所有的结果图

图1. 0pc_pillar_2d_ori

在这里插入图片描述

图2. 0pc_pillar_3d

代码接_process_pc函数

    # generate point cloud channels
    for i in range(pc_N-1, -1, -1):
      for feat in self.opt.pc_feat_lvl:
        point = pc_feat[:,i]
        depth = point[2]
        ct = np.array([point[0], point[1]])
        ct_int = ct.astype(np.int32)

        if self.opt.pc_roi_method == "pillars":
          wh = pillar_wh[:,i]
          b = [max(ct[1]-wh[1], 0), 
              ct[1], 
              max(ct[0]-wh[0]/2, 0), 
              min(ct[0]+wh[0]/2, self.opt.output_w)]
          b = np.round(b).astype(np.int32)
        
        elif self.opt.pc_roi_method == "hm":
          radius = (1.0 / depth) * self.opt.r_a + self.opt.r_b
          radius = gaussian_radius((radius, radius))
          radius = max(0, int(radius))
          x, y = ct_int[0], ct_int[1]
          height, width = pc_hm_feat.shape[1:3]
          left, right = min(x, radius), min(width - x, radius + 1)
          top, bottom = min(y, radius), min(height - y, radius + 1)
          b = np.array([y - top, y + bottom, x - left, x + right])
          b = np.round(b).astype(np.int32)
        
        if feat == 'pc_dep':
          channel = self.opt.pc_feat_channels['pc_dep']
          pc_hm_feat[channel, b[0]:b[1], b[2]:b[3]] = depth
        
        if feat == 'pc_vx':
          vx = pc_3d[8,i]
          channel = self.opt.pc_feat_channels['pc_vx']
          pc_hm_feat[channel, b[0]:b[1], b[2]:b[3]] = vx
        
        if feat == 'pc_vz':
          vz = pc_3d[9,i]
          channel = self.opt.pc_feat_channels['pc_vz']
          pc_hm_feat[channel, b[0]:b[1], b[2]:b[3]] = vz
	'''
	这里用于生成特征，特征的维度是3*112*200的，其中3个通道使用的雷达特征分别为pc_dep，pc_vx，pc_vz
	同时将框对应的位置在每个通道上用对应的属性进行赋值得到pc_hm_feat
	'''
    return pc_2d, pc_3d, pc_hm_feat

代码接_load_pc_data函数

    # pad point clouds with zero to avoid size mismatch error in dataloader
    n_points = min(self.opt.max_pc, pc_2d.shape[1])
    pc_z = np.zeros((pc_2d.shape[0], self.opt.max_pc))
    pc_z[:, :n_points] = pc_2d[:, :n_points]
    pc_3dz = np.zeros((pc_3d.shape[0], self.opt.max_pc))
    pc_3dz[:, :n_points] = pc_3d[:, :n_points]
	'''
	这里是为了防止在输入的时候尺度不一致导致出现问题，所以在这里将所有雷达点的个数都变成1000，
	前N个都是雷达点本身的特征，后边都是用0补齐的
	'''
    return pc_z, pc_N, pc_dep, pc_3dz

到这里完成了点云数据的处理，主要包括以下几个部分
1、从image_info中加载点云数据（这里的数据已经转换到相机坐标系）
2、将点云都应到图像坐标系，同时对图像外的点进行过滤
3、将对投影到图像上的雷达点转换到输出特征维度上
4、生成柱状体，同时将柱状体投影到输出维度上
5、根据柱状体生成雷达特征
代码继续
接下来是对anns的处理

    pre_cts, track_ids = None, None
    if opt.tracking:
      pre_image, pre_anns, frame_dist, pre_img_info = self._load_pre_data(
        img_info['video_id'], img_info['frame_id'], 
        img_info['sensor_id'] if 'sensor_id' in img_info else 1)
      if flipped:
        pre_image = pre_image[:, ::-1, :].copy()
        pre_anns = self._flip_anns(pre_anns, width)
        if pc_2d is not None:
          pc_2d = self._flip_pc(pc_2d,  width)
      if opt.same_aug_pre and frame_dist != 0:
        trans_input_pre = trans_input 
        trans_output_pre = trans_output
      else:
        c_pre, aug_s_pre, _ = self._get_aug_param(
          c, s, width, height, disturb=True)
        s_pre = s * aug_s_pre
        trans_input_pre = get_affine_transform(
          c_pre, s_pre, rot, [opt.input_w, opt.input_h])
        trans_output_pre = get_affine_transform(
          c_pre, s_pre, rot, [opt.output_w, opt.output_h])
      pre_img = self._get_input(pre_image, trans_input_pre)
      pre_hm, pre_cts, track_ids = self._get_pre_dets(
        pre_anns, trans_input_pre, trans_output_pre)
      ret['pre_img'] = pre_img
      if opt.pre_hm:
        ret['pre_hm'] = pre_hm
      if opt.pointcloud:
        pre_pc_2d, pre_pc_N, pre_pc_hm, pre_pc_3d = self._load_pc_data(pre_img, pre_img_info, 
            trans_input_pre, trans_output_pre, flipped)
        ret['pre_pc_2d'] = pre_pc_2d
        ret['pre_pc_3d'] = pre_pc_3d
        ret['pre_pc_N'] = pre_pc_N
        ret['pre_pc_hm'] = pre_pc_hm
    '''
    这部分针对追踪做的，目前还没做这个
    '''
    ### init samples
    self._init_ret(ret, gt_det)
    '''
    初始化一些运行网络需要的东西，一方面用于网络的输入，另一方面用于计算损失
    输入的ret中包含的数据为：
    image:仿射变换后的图像     3*448*800
    pc_2d:投影到图像后的雷达点 3*1000(3表示xyz三个方向的特征)
    pc_3d:原始的雷达特征       18*1000(18表示原始的雷达特征)
    pc_N:1000点中有效雷达点的数量
    pc_dep:根据柱状体生成的雷达特征图3*112*200
    输出后增加的雷达特征：
    hm:	在对应类别通道赋值bbox中心点的高斯特征图         10*112*200
    ind: ct[1]*w+ct[0]                    128
    表示把特征变成一维向量后该类别对应的索引
    比如此时的点的坐标是(2,3),w,h分别是3，4；那么ind表示3*3+2
    cat：bbox的类别                  128
    mask：1表示该位置有框            128
    pc_hm：雷达特征                  3*112*200
    该特征是由pc_dep转换生成的，根据视锥体模型来获取到最相关的雷达点，将该雷达点的特征值赋值给对应的bbox来生成pc_hm
    reg：最后预测的为整数，该部分用于保存根据hw计算出来的bbox中心点小数偏置       128*2
    比如：bbox中心点为(22.4,45.6),保存的为(0.4,0.6)
    reg_mask: 表示对应位置有值         128*2
    wh：bbox对应的wh                   128*2
    wh_mask：这个位置有wh              128*2
    nuscenes_att：表示物体的状态       128*8
    一共定义了物体的8种状态，把bbox的对应状态赋值为1
    nuscenes_att_mask：表示该位置有值      128*8
    velocity：box 在相机坐标系中的速度     128*3
    velocity_mask：表示该位置有值          128*3
    dep：Z值*aug_s            128*1
    dep_mask：有无z           128*1
    dim：bbox的lhw            128*3
    dim_mask：有数值          128*3
    amodel_offset：保存根据实际中心点bbox中心点小数偏置   128*2
    amodel_offset_mask：有值   128*2
    rosbin：a小于pi/6或者大于5pi/6第一个位置为0，-pi/6到-5pi/6第二个位置为0      128*2
    rosres：a大于-pi/6或者小于-5pi/6第一个位置为a-(-0.5*pi),-pi/6到-5pi/6第二个位置为a-(0.5*np.pi)      128*2
    ros_mask：表示物体是否旋转             128
    
    其中用于网络输入的部分为：
    image:应用仿射变换后的图像 3*448*800
    pc_dep:根据柱状体生成的雷达特征图，在进行Secondary Regression Heads用于和图像特征进行拼接
    计算损失的部分：
    
    '''
    calib = self._get_calib(img_info, width, height)

    # get velocity transformation matrix
    if "velocity_trans_matrix" in img_info:
      velocity_mat = np.array(img_info['velocity_trans_matrix'], dtype=np.float32)
    else:
      velocity_mat = np.eye(4)
    
    num_objs = min(len(anns), self.max_objs)
    for k in range(num_objs):
      ann = anns[k]
      cls_id = int(self.cat_ids[ann['category_id']])
      if cls_id > self.opt.num_classes or cls_id <= -999:
        continue
      bbox, bbox_amodal = self._get_bbox_output(
        ann['bbox'], trans_output, height, width)
      '''
      ann['bbox']中保存的box的是xyhw,xy表示bbox的左上角坐标
      该函数首先将xyhw转换成xyxy
      之后将bbox框应用和图像一样的仿射变换映射输出维度大小
      最后将bbox限定在图像内，图像外的部分以图象的边缘赋值
      '''
      if cls_id <= 0 or ('iscrowd' in ann and ann['iscrowd'] > 0):
        self._mask_ignore_or_crowd(ret, cls_id, bbox)
        continue
      self._add_instance(
        ret, gt_det, k, cls_id, bbox, bbox_amodal, ann, trans_output, aug_s, 
        calib, pre_cts, track_ids)

  def _add_instance(
    self, ret, gt_det, k, cls_id, bbox, bbox_amodal, ann, trans_output,
    aug_s, calib, pre_cts=None, track_ids=None):
    h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
    if h <= 0 or w <= 0:
      return
    radius = gaussian_radius((math.ceil(h), math.ceil(w)))
    '''
    分为三种情况
    参考：https://zhuanlan.zhihu.com/p/452632600
    1、预测框与gt box的两个角点是以r半径的圆外切，gt小于pred
    2、预测的框和GTbox两个角点以r为半径的圆内切，gt大于pred
    3、预测的框和GTbox两个角点以r为半径的圆一个边内切，一个边外切
    考虑真实的框在预测框的内部
    iou=H*W/((H+2R)*(W+2R))从而计算出来R
    最后取上述三钟情况r算出来的最小值
    '''
    radius = max(0, int(radius)) 
    ct = np.array(
      [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
    ct_int = ct.astype(np.int32)
    ret['cat'][k] = cls_id - 1
    ret['mask'][k] = 1
    if 'wh' in ret:
      ret['wh'][k] = 1. * w, 1. * h
      ret['wh_mask'][k] = 1
    ret['ind'][k] = ct_int[1] * self.opt.output_w + ct_int[0]
    ret['reg'][k] = ct - ct_int
    ret['reg_mask'][k] = 1
    draw_umich_gaussian(ret['hm'][cls_id - 1], ct_int, radius)

    gt_det['bboxes'].append(
      np.array([ct[0] - w / 2, ct[1] - h / 2,
                ct[0] + w / 2, ct[1] + h / 2], dtype=np.float32))
    gt_det['scores'].append(1)
    gt_det['clses'].append(cls_id - 1)
    gt_det['cts'].append(ct)

    if 'tracking' in self.opt.heads:
      if ann['track_id'] in track_ids:
        pre_ct = pre_cts[track_ids.index(ann['track_id'])]
        ret['tracking_mask'][k] = 1
        ret['tracking'][k] = pre_ct - ct_int
        gt_det['tracking'].append(ret['tracking'][k])
      else:
        gt_det['tracking'].append(np.zeros(2, np.float32))

    if 'ltrb' in self.opt.heads:
      ret['ltrb'][k] = bbox[0] - ct_int[0], bbox[1] - ct_int[1], \
        bbox[2] - ct_int[0], bbox[3] - ct_int[1]
      ret['ltrb_mask'][k] = 1

    ## ltrb_amodal is to use the left, top, right, bottom bounding box representation 
    # to enable detecting out-of-image bounding box (important for MOT datasets)
    if 'ltrb_amodal' in self.opt.heads:
      ret['ltrb_amodal'][k] = \
        bbox_amodal[0] - ct_int[0], bbox_amodal[1] - ct_int[1], \
        bbox_amodal[2] - ct_int[0], bbox_amodal[3] - ct_int[1]
      ret['ltrb_amodal_mask'][k] = 1
      gt_det['ltrb_amodal'].append(bbox_amodal)

    if 'nuscenes_att' in self.opt.heads:
      if ('attributes' in ann) and ann['attributes'] > 0:
        att = int(ann['attributes'] - 1)
        ret['nuscenes_att'][k][att] = 1
        ret['nuscenes_att_mask'][k][self.nuscenes_att_range[att]] = 1
      gt_det['nuscenes_att'].append(ret['nuscenes_att'][k])

    if 'velocity' in self.opt.heads:
      if ('velocity_cam' in ann) and min(ann['velocity_cam']) > -1000:
        ret['velocity'][k] = np.array(ann['velocity_cam'], np.float32)[:3]
        ret['velocity_mask'][k] = 1
      gt_det['velocity'].append(ret['velocity'][k])

    if 'hps' in self.opt.heads:
      self._add_hps(ret, k, ann, gt_det, trans_output, ct_int, bbox, h, w)

    if 'rot' in self.opt.heads:
      self._add_rot(ret, ann, k, gt_det)

    if 'dep' in self.opt.heads:
      if 'depth' in ann:
        ret['dep_mask'][k] = 1
        ret['dep'][k] = ann['depth'] * aug_s
        gt_det['dep'].append(ret['dep'][k])
      else:
        gt_det['dep'].append(2)

    if 'dim' in self.opt.heads:
      if 'dim' in ann:
        ret['dim_mask'][k] = 1
        ret['dim'][k] = ann['dim']
        gt_det['dim'].append(ret['dim'][k])
      else:
        gt_det['dim'].append([1,1,1])
    
    if 'amodel_offset' in self.opt.heads:
      if 'amodel_center' in ann:
        amodel_center = affine_transform(ann['amodel_center'], trans_output)
        ret['amodel_offset_mask'][k] = 1
        ret['amodel_offset'][k] = amodel_center - ct_int
        gt_det['amodel_offset'].append(ret['amodel_offset'][k])
      else:
        gt_det['amodel_offset'].append([0, 0])
    
    if self.opt.pointcloud:
      ## get pointcloud heatmap
      if self.opt.disable_frustum:
      '''
      对于训练阶段上述是False，测试的时候为True
      由于不知道实际的bbox所以之类直接使用上述生成的雷达特征
      '''
        ret['pc_hm'] = ret['pc_dep']
        if opt.normalize_depth:
          ret['pc_hm'][self.opt.pc_feat_channels['pc_dep']] /= opt.max_pc_dist
      else:
        dist_thresh = get_dist_thresh(calib, ct, ann['dim'], ann['alpha'])
        '''
        首先根据ann['dim']中的lwh生成一个3d的box(用8个点的三维坐标来表示)
        之后根据alpha来计算出全局观测角，之后对上述生成的3dbox进行旋转，来获得真实框的坐标
        最后通过计算8个坐标中z方向的最大值和最小值，利用而这插值的重点来作为阈值(用于之后的视锥体中雷达点的过滤)
        '''
        pc_dep_to_hm(ret['pc_hm'], ret['pc_dep'], ann['depth'], bbox, dist_thresh, self.opt)

在训练过程中，利用视锥体来过滤不需要的雷达点，同时将最小距离的雷达点当作是与物体最相关的来作为最后的雷达特征。

def pc_dep_to_hm(pc_hm, pc_dep, dep, bbox, dist_thresh, opt):
    if isinstance(dep, list) and len(dep) > 0:
      dep = dep[0]
    ct = np.array(
      [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
    bbox_int = np.array([np.floor(bbox[0]), 
                         np.floor(bbox[1]), 
                         np.ceil(bbox[2]), 
                         np.ceil(bbox[3])], np.int32)# format: xyxy

    roi = pc_dep[:, bbox_int[1]:bbox_int[3]+1, bbox_int[0]:bbox_int[2]+1]
    pc_dep = roi[opt.pc_feat_channels['pc_dep']]
    pc_vx = roi[opt.pc_feat_channels['pc_vx']]
    pc_vz = roi[opt.pc_feat_channels['pc_vz']]

    nonzero_inds = np.nonzero(pc_dep)  #返回数组a中非零元素的索引值数组，该部分表示返回含有雷达特征的矩阵坐标信息
    
    if len(nonzero_inds[0]) > 0:
    #   nonzero_pc_dep = np.exp(-pc_dep[nonzero_inds])
      nonzero_pc_dep = pc_dep[nonzero_inds]
      nonzero_pc_vx = pc_vx[nonzero_inds]
      nonzero_pc_vz = pc_vz[nonzero_inds]

      ## Get points within dist threshold
      within_thresh = (nonzero_pc_dep < dep+dist_thresh) \
              & (nonzero_pc_dep > max(0, dep-dist_thresh))
      
      pc_dep_match = nonzero_pc_dep[within_thresh]
      pc_vx_match = nonzero_pc_vx[within_thresh]
      pc_vz_match = nonzero_pc_vz[within_thresh]
	  '''
	  这里首先根据包含雷达点的坐标来获得所有雷达点的深度信息保存在nonzero_pc_dep中，
	  之后根据上述得到的阈值将真实深度信息加减阈值以为的雷达点过滤掉，
	  其中dep是anns['dep']表示该anns的深度信息，dis_thresh是上文中获取得到的框高度的一半（理性这么理解），
	  通过这两个坐标可以得到框在z方向的两个阈值，将该范围外的点进行过滤，分别对depth，vx，vz这三个特征来进行过滤，
	  上述过程就是作者提到的利用视锥体来过滤掉一些无关的雷达点，
	  结合作者论文中给出的图，我是这么理解代码的，首先这里的深度信息是表示是物体到摄像机的距离，之后阈值的两端分别用的是
	  dep-dis-dis_thresh和dep+dis_thresh，同时roi对应着图像中的Image Plane中的红色框，
	  所以此时根据现有的深度信息和图像平面的举行就可以构建一个视锥体模型，来过滤点深度信息在视锥体之外的点
	  '''
      if len(pc_dep_match) > 0:
        arg_min = np.argmin(pc_dep_match)
        dist = pc_dep_match[arg_min]
        vx = pc_vx_match[arg_min]
        vz = pc_vz_match[arg_min]
        if opt.normalize_depth:
          dist /= opt.max_pc_dist

        w = bbox[2] - bbox[0]
        w_interval = opt.hm_to_box_ratio*(w)
        w_min = int(ct[0] - w_interval/2.)
        w_max = int(ct[0] + w_interval/2.)
        
        h = bbox[3] - bbox[1]
        h_interval = opt.hm_to_box_ratio*(h)
        h_min = int(ct[1] - h_interval/2.)
        h_max = int(ct[1] + h_interval/2.)

        pc_hm[opt.pc_feat_channels['pc_dep'],
               h_min:h_max+1, 
               w_min:w_max+1+1] = dist
        pc_hm[opt.pc_feat_channels['pc_vx'],
               h_min:h_max+1, 
               w_min:w_max+1+1] = vx
        pc_hm[opt.pc_feat_channels['pc_vz'],
               h_min:h_max+1, 
               w_min:w_max+1+1] = vz
        '''
        这里是找出论文中所说的和物体最对应的雷达的点，由于视锥体中有可能含有多个雷达点，作者取得是最近的雷达点，
        同时我感觉是由于雷达本身构造的特征不能太大，所以作者再利用bbox的时候设置类一个缩放因子，
        将原始的2d的bbox来进行缩放来得到最后的雷达区域，同时雷达区域的值是所有的雷达点中深度信息最小的那个点所对应的特征，
        最后分别对三个特征都进行上述的赋值过程来的到最后的输入雷达特征，
        上述过程针对的是作者所说的训练阶段，使用的用bbox来对雷达特征进行操作的。
        '''

代码继续

    if self.opt.debug > 0 or self.enable_meta:
      gt_det = self._format_gt_det(gt_det)
      meta = {'c': c, 's': s, 'gt_det': gt_det, 'img_id': img_info['id'],
              'img_path': img_path, 'calib': calib,
              'img_width': img_info['width'], 'img_height': img_info['height'],
              'flipped': flipped, 'velocity_mat':velocity_mat}
      ret['meta'] = meta
    ret['calib'] = calib
    return ret