import json
import os
from collections import defaultdict
import jsonlines
def _load_records(input_file):
    """Read the JSONL input and return the usable records.

    Every returned record is the parsed JSON object with an extra
    ``cam_tag`` key: the second-to-last ``/``-separated component of its
    ``image_path``.

    Blank lines are skipped silently. Lines that are not valid JSON, and
    records from which no ``cam_tag`` can be derived (no ``image_path``, a
    path without a parent component, or a non-object JSON value), are
    reported on stdout and skipped so one bad line does not abort the run.
    """
    records = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                print(f"JSON解析错误: {line}")
                continue
            try:
                item['cam_tag'] = item['image_path'].split('/')[-2]
            except (KeyError, IndexError, TypeError, AttributeError):
                print(f"缺少有效的 image_path,已跳过: {line}")
                continue
            records.append(item)
    return records


def _label_camera_frames(scene_id, cam_type, frames):
    """Write a 'yes'/'no' tag into every frame that has ``mop_pnc_info``.

    ``frames`` must already be in processing order (the caller sorts them
    newest first). The tag is stored in ``frame['mop_pnc_info'][0]['tag']``.

    Rules:
      * frame 0 is always 'yes';
      * frame 1 is 'yes' when its ``roadPoints`` equal the reference points,
        otherwise 'no';
      * every later frame inherits the previous tag when its ``roadPoints``
        equal the reference points, otherwise it gets the opposite tag.

    The reference is always the most recent frame that carried
    ``mop_pnc_info``; frames without it are reported and skipped without
    disturbing the reference. Before any frame has been labelled the
    reference points are the empty tuple.

    Returns a ``(yes_count, no_count)`` tuple.
    """
    yes_count = 0
    no_count = 0
    prev_points = ()
    prev_tag = None

    for i, frame in enumerate(frames):
        info = frame.get('mop_pnc_info')
        if not info:
            ordinal = {0: "第一帧", 1: "第二帧"}.get(i, f"第{i + 1}帧")
            print(f"警告: scene_id={scene_id}, cam_type={cam_type} 的{ordinal}没有 mop_pnc_info 数据")
            continue

        entry = info[0]
        # ``or ()`` also covers an explicit ``"roadPoints": null``.
        points = tuple(entry.get('roadPoints') or ())
        same = points == prev_points

        if i == 0:
            tag = 'yes'
            print(f"场景 {scene_id} 相机 {cam_type}: 帧0 (最新) 标签设置为 'yes'")
        elif i == 1:
            tag = 'yes' if same else 'no'
            relation = "相同" if same else "不同"
            print(f"场景 {scene_id} 相机 {cam_type}: 帧1 与帧0{relation} -> 标签设置为 '{tag}'")
        elif prev_tag is None:
            # No earlier frame was labelled, so there is nothing to inherit
            # or flip. Use 'yes' rather than propagating a None tag.
            tag = 'yes'
            print(f"场景 {scene_id} 相机 {cam_type}: 帧{i} 之前没有已标注的帧 -> 标签设置为 'yes'")
        elif same:
            tag = prev_tag
            print(f"场景 {scene_id} 相机 {cam_type}: 帧{i} 与帧{i - 1}相同 -> 继承标签 '{tag}'")
        else:
            tag = 'no' if prev_tag == 'yes' else 'yes'
            print(f"场景 {scene_id} 相机 {cam_type}: 帧{i} 与帧{i - 1}不同 -> 反转标签为 '{tag}'")

        entry['tag'] = tag
        if tag == 'yes':
            yes_count += 1
        else:
            no_count += 1

        # Single source of truth for the next comparison.
        prev_points = points
        prev_tag = tag

    return yes_count, no_count


def _add_ratios(counts):
    """Set ``yes_ratio``/``no_ratio`` on ``counts`` from its counters (0.0 when empty)."""
    total = counts["total_frames"]
    counts["yes_ratio"] = counts["yes_count"] / total if total > 0 else 0.0
    counts["no_ratio"] = counts["no_count"] / total if total > 0 else 0.0


def process_collision_data(input_file, output_file, stats_output_file):
    """Label collision frames per scene and camera, and write data plus statistics.

    Records are grouped by ``scene_id`` and by ``cam_tag`` (derived from
    ``image_path``). Each group is sorted by ``timestamp`` descending and
    tagged 'yes'/'no' in ``mop_pnc_info[0]['tag']``. The labelled records are
    written as JSONL ordered by ``scene_id``, ``cam_tag`` and descending
    ``timestamp``; the statistics are written as indented JSON.

    :param input_file: path of the input JSONL file
    :param output_file: path of the output JSONL file
    :param stats_output_file: path of the output statistics JSON file
    :return: the statistics dict (``total_scenes``, per-scene ``scenes`` and
        ``overall`` counters with yes/no ratios). ``total_frames`` counts
        every frame, including frames that carry no ``mop_pnc_info``.
    """
    records = _load_records(input_file)

    # scene_id -> cam_tag -> list of records
    grouped_data = defaultdict(lambda: defaultdict(list))
    for item in records:
        grouped_data[item['scene_id']][item['cam_tag']].append(item)

    stats = {
        "total_scenes": len(grouped_data),
        "scenes": {},
        "overall": {
            "total_frames": 0,
            "yes_count": 0,
            "no_count": 0,
            "yes_ratio": 0.0,
            "no_ratio": 0.0
        }
    }
    overall = stats["overall"]

    processed_data = []
    for scene_id, cam_groups in grouped_data.items():
        scene_stats = {
            "scene_id": scene_id,
            "total_cam_tags": len(cam_groups),
            "cam_tags": {},
            "total_frames": 0,
            "yes_count": 0,
            "no_count": 0
        }

        for cam_type, cam_data in cam_groups.items():
            # Newest frame first; the labelling rules depend on this order.
            cam_data.sort(key=lambda x: x["timestamp"], reverse=True)
            yes_count, no_count = _label_camera_frames(scene_id, cam_type, cam_data)

            cam_stats = {
                "cam_tag": cam_type,
                "total_frames": len(cam_data),
                "yes_count": yes_count,
                "no_count": no_count
            }
            _add_ratios(cam_stats)

            scene_stats["cam_tags"][cam_type] = cam_stats
            scene_stats["total_frames"] += cam_stats["total_frames"]
            scene_stats["yes_count"] += yes_count
            scene_stats["no_count"] += no_count
            processed_data.extend(cam_data)

        _add_ratios(scene_stats)
        stats["scenes"][scene_id] = scene_stats
        overall["total_frames"] += scene_stats["total_frames"]
        overall["yes_count"] += scene_stats["yes_count"]
        overall["no_count"] += scene_stats["no_count"]

    _add_ratios(overall)

    # Each (scene, camera) group is already in descending timestamp order and
    # list.sort is stable, so sorting on the first two keys alone yields the
    # scene_id / cam_tag / timestamp-descending order without negating the
    # timestamp (which would fail for non-numeric timestamps).
    processed_data.sort(key=lambda x: (x['scene_id'], x['cam_tag']))

    # One JSON object per line, UTF-8, non-ASCII characters kept as-is.
    with open(output_file, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in processed_data
        )

    with open(stats_output_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=4, ensure_ascii=False)

    return stats
# Usage example
if __name__ == "__main__":
    # Bind the paths to names so the completion message reports the file
    # that was actually written.
    input_path = "./task4/basicData_historyDepth_v2_img106k_250723__huanshi_6ver_4fish_large_model_900_792874_10frames.jsonl"
    output_path = "processed_106k_0723_1.jsonl"
    stats_path = "collision_stats_0723_1.json"

    stats = process_collision_data(input_path, output_path, stats_path)

    print(f"处理完成!统计信息已保存到 {stats_path}")
    print(f"总场景数: {stats['total_scenes']}")
    print(f"总帧数: {stats['overall']['total_frames']}")
    print(f"YES占比: {stats['overall']['yes_ratio']:.2%}")
    print(f"NO占比: {stats['overall']['no_ratio']:.2%}")
修改mask打标逻辑:每组数据排序后,第一条数据默认标记为yes;判断第二条数据时,取出第二条数据的roadPoints中的每个点,在第一条数据的roadPoints中逐一查找,如果全部都在第一条数据中则标记为yes,否则标记为no;对于每个场景下的每个相机,一旦mask从yes转变为no,则该组的后续数据全部标记为no。
最新发布