#!/usr/bin/python3.10
# -*- coding: utf-8 -*-
# Copyright (c) Shenzhen Yinwang Intelligent Technologies Co., Ltd. 2024. All rights reserved.
import shutil
from dataclasses import dataclass, field
import os
import re
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Tuple, Any, Union, Set, Optional
import heapq
from loguru import logger
from app.constants import MAX_WORKERS
from app.handler.event_config_handler import EventConfigHandler
from app.utils.slice_tool.src import slice_bag_tools
from app.utils.slice_tool.src.models import RosbagSliceParam, UdpBagSliceParam
from app.utils.utils import tar_folder
# Matches a '<17-digit timestamp>_<5-digit event id>' tag inside a file name,
# e.g. '20250723060930410_10000'.
pattern = re.compile(r'([0-9]{17}_[0-9]{5})')


def copy_and_rename_files(event_ids, output_dir, file_list: list[str]):
    """Copy auxiliary files into one event's output directory.

    File names may carry a ``<timestamp>_<event_id>`` tag
    (e.g. ``customized_data.20250723060930410_10000.bin``).  A tagged file
    is copied only when its tag resolves to an entry of ``event_ids``
    (entries have the form ``"<event_id>_<%Y_%m_%d_%H_%M_%S>"``, the same
    event-info keys built by SlicerHandler); the tag and its neighbouring
    dot separator are stripped from the copied name
    (-> ``customized_data.bin``).  Tagged files that belong to a different
    event are skipped.  Untagged files (e.g. ``meta.yaml``) are always
    copied, unchanged.

    Args:
        event_ids: Event-info strings served by ``output_dir``.
        output_dir: Existing destination directory.
        file_list: Paths of the candidate files.
    """
    for file_path in file_list:
        file_name = os.path.basename(file_path)
        match = pattern.search(file_name)
        if not match:
            # No event tag: the file is shared by every event folder.
            shutil.copy(file_path, os.path.join(output_dir, file_name))
            print(f"Copied {file_name} to {output_dir}")
            continue
        tag = match.group(1)  # '<timestamp>_<event_id>'
        time_part, id_part = tag.split('_')
        event_time = datetime.strptime(time_part, '%Y%m%d%H%M%S%f').strftime('%Y_%m_%d_%H_%M_%S')
        event_info = f"{id_part}_{event_time}"
        if event_info not in event_ids:
            # Tagged for a different event: do not copy it here.
            continue
        # Strip the tag (and an adjacent dot separator, if any) from the name.
        new_name = file_name.replace(f".{tag}", "").replace(f"{tag}.", "").replace(tag, "")
        shutil.copy(file_path, os.path.join(output_dir, new_name))
        print(f"Copied {file_name} to {output_dir} as {new_name}")
@dataclass
class SlicerRes:
    """Result record for one sliced event: the produced artifact and its status."""
    # Path to the produced artifact handed to the caller for upload.
    upload_path: Path
    # Size of the artifact in bytes (os.path.getsize of upload_path).
    file_size: int
    # Event timestamp; callers currently pass 0 — TODO confirm intended value.
    event_time: int
    # Error messages collected from failed slice tasks (empty when all succeeded).
    error_code: list = field(default_factory=list)
def convert_datetime_to_timestamp(
        datetime_str: str,
        format_output: bool = False
) -> Union[int, str]:
    """Parse a '%Y%m%d%H%M%S%f' string into a timestamp or display string.

    Args:
        datetime_str: Datetime string; only the first 17 characters
            (year through milliseconds) are parsed.
        format_output: When True, return a '%Y_%m_%d_%H_%M_%S' string
            instead of a timestamp.

    Returns:
        Milliseconds since the UTC epoch (int), or the formatted string.
    """
    parsed = datetime.strptime(datetime_str[:17], '%Y%m%d%H%M%S%f')
    if format_output:
        return parsed.strftime('%Y_%m_%d_%H_%M_%S')
    # The parsed value is naive; interpret it as UTC before taking the epoch.
    return int(parsed.replace(tzinfo=timezone.utc).timestamp() * 1000)
def convert_topic_keys(input_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Rewrite ROS-style topic keys as flat '<parts>_lite' identifiers.

    Example: '/topic/name' -> 'topic_name_lite'.

    Args:
        input_dict: Mapping keyed by original topic names.

    Returns:
        New mapping with the same values under the converted keys.
    """
    converted = {}
    for topic, value in input_dict.items():
        flat = topic.lstrip('/').replace('/', '_')
        converted[f"{flat}_lite"] = value
    return converted
class SlicerHandler:
    """Handler for slicing rosbag/zbag/gbag files based on event configurations."""

    def __init__(
        self,
        bag_side: str,
        data_name: str,
        side_data: str,
        bag_dir: str,
        temp_dir_global: str,
        rest_path: list,
        need_copy_files: list
    ):
        """Initialize SlicerHandler.

        Args:
            bag_side: Side identifier for bags
            data_name: Name of the data
            side_data: Side data identifier
            bag_dir: Directory containing bag files
            temp_dir_global: Global temporary directory
            rest_path: Directories copied verbatim into every event result
            need_copy_files: Collector for non-bag files to copy per event
        """
        self.bag_dir = Path(bag_dir)
        self.err_list: List[str] = []
        self.data_name = data_name
        self.side_data = side_data
        self.bag_side = bag_side
        self.temp_dir_global = Path(temp_dir_global)
        self.rest_paths = rest_path
        self.need_copy_files = need_copy_files
        # Create slicer result directory; 0o700 keeps the results private
        # to the service user.
        self.slicer_res_dir = self.temp_dir_global / "slicer_res"
        self.slicer_res_dir.mkdir(mode=0o700, exist_ok=True)

    def parse(self) -> "dict[str, SlicerRes]":
        """Main parsing method that coordinates the slicing process.

        Returns:
            Mapping of event info ("<event_id>_<time>") to its SlicerRes.
        """
        bags, zbags, gbags, event_yaml_files, bin_files = self.classify_files()
        config_dict = self.process_yaml_configs(event_yaml_files)
        bin_events = self.process_bin_files(bin_files)
        filtered_events = self.filter_and_adjust_events(bags, zbags, gbags, config_dict, bin_events)
        results = self.process_slice_tasks(filtered_events, bags, zbags, gbags)
        return results

    def classify_files(self) -> Tuple[
        Dict[str, List[str]],
        Dict[str, Dict[str, List[str]]],
        Dict[str, Dict[str, List[str]]],
        List[str],
        List[str]
    ]:
        """Classify files in the bag directory by type.

        Returns:
            Tuple containing:
                - bags: bag name -> heap of .bag file names
                - zbags: {"ddi": ..., "time_list": ...}, keyed by base name
                  (with the "-ddi"/"-timelist" suffix stripped)
                - gbags: same layout as zbags
                - yaml_files: event config YAML file paths
                - bin_files: customized header binary file paths
        """
        bags = defaultdict(list)
        zbags = {"ddi": defaultdict(list), "time_list": defaultdict(list)}
        gbags = {"ddi": defaultdict(list), "time_list": defaultdict(list)}
        yaml_files = []
        bin_files = []
        for filename in os.listdir(self.bag_dir):
            filepath = self.bag_dir / filename
            if filename.lower().endswith('.bag'):
                bag_name = filename.split('.')[0]
                # heappush keeps each bag's chunk list sorted by file name.
                heapq.heappush(bags[bag_name], filename)
            elif filename.lower().endswith('.yaml') and "event_config" in filename:
                yaml_files.append(str(filepath))
                self.need_copy_files.append(str(filepath))
            elif filename.lower().endswith('.bin') and "customized_header" in filename:
                bin_files.append(str(filepath))
                self.need_copy_files.append(str(filepath))
            elif filename.lower().endswith("zbag") and filename.split(".")[0].endswith("-ddi"):
                bag_name = filename.split('.')[0][:-4]  # drop "-ddi"
                heapq.heappush(zbags["ddi"][bag_name], str(filename))
            elif filename.lower().endswith("zbag") and filename.split(".")[0].endswith("-timelist"):
                bag_name = filename.split('.')[0][:-9]  # drop "-timelist"
                # Store bare file names (was str(filepath)) so _submit_slice_tasks
                # can join them onto self.bag_dir uniformly with the "ddi" entries;
                # the old full-path form only worked when bag_dir was absolute.
                heapq.heappush(zbags["time_list"][bag_name], str(filename))
            elif filename.lower().endswith("gbag") and filename.split(".")[0].endswith("-ddi"):
                # Strip "-ddi" to mirror the zbag handling; previously the raw stem
                # was kept, so gbags["time_list"] lookups (keyed differently) never hit.
                bag_name = filename.split('.')[0][:-4]
                heapq.heappush(gbags["ddi"][bag_name], str(filename))
            elif filename.lower().endswith("gbag") and filename.split(".")[0].endswith("-timelist"):
                bag_name = filename.split('.')[0][:-9]  # drop "-timelist"
                heapq.heappush(gbags["time_list"][bag_name], str(filename))
            else:
                # Anything unrecognised is copied alongside the sliced bags later.
                self.need_copy_files.append(str(filepath))
        return bags, zbags, gbags, yaml_files, bin_files

    @staticmethod
    def process_yaml_configs(yaml_files: List[str]) -> Dict[str, Dict[str, Any]]:
        """Process YAML configuration files.

        Args:
            yaml_files: List of paths to YAML config files

        Returns:
            Dictionary mapping event info ("<event_id>_<formatted time>")
            to topic configs with keys converted via convert_topic_keys.
        """
        config_dict = {}
        for file_path in yaml_files:
            config = EventConfigHandler(file_path)
            file_name = Path(file_path).name
            # File name layout: ..._<timestamp>_<event_id>.yaml
            parts = file_name.split('.')[0].split('_')
            event_id = parts[-1]
            event_time = convert_datetime_to_timestamp(parts[-2], format_output=True)
            event_info = f"{event_id}_{event_time}"
            config_dict[event_info] = convert_topic_keys(config.event_config)
        return config_dict

    @staticmethod
    def process_bin_files(bin_files: List[str]) -> Dict[str, int]:
        """Process binary files to extract event timestamps.

        Args:
            bin_files: List of paths to binary files

        Returns:
            Dictionary mapping event info to epoch-millisecond timestamps.
        """
        bin_events = {}
        for file_path in bin_files:
            filename = Path(file_path).name
            # File name layout: <prefix>.<timestamp>_<event_id>.bin
            time_part, event_id = filename.split('.')[1].split('_')
            event_time = convert_datetime_to_timestamp(time_part, format_output=True)
            timestamp = convert_datetime_to_timestamp(time_part)
            event_info = f"{event_id}_{event_time}"
            bin_events[event_info] = timestamp
        return bin_events

    @staticmethod
    def _match_bag_topic(topic: str, available_topics: Set[str]) -> Optional[str]:
        """Match a topic with available bag topics using multiple strategies.

        Args:
            topic: Topic to match (typically carries a "_lite" suffix)
            available_topics: Set of available bag topics

        Returns:
            Matched topic name if found, otherwise None
        """
        # 1. Exact match first.
        if topic in available_topics:
            return topic
        # 2. Fall back to a prefix match with the "_lite" suffix removed.
        #    NOTE: the previous rstrip("_lite") stripped any trailing characters
        #    from the set {_,l,i,t,e} (not the literal suffix), which could
        #    reduce the topic to "" and match an arbitrary bag.
        base = topic.removesuffix("_lite")
        matching_topics = [t for t in available_topics if t.startswith(base)]
        if matching_topics:
            return matching_topics[0]
        return None

    def filter_and_adjust_events(
        self,
        bags: Dict[str, List[str]],
        zbags: Dict[str, Dict[str, List[str]]],
        gbags: Dict[str, Dict[str, List[str]]],
        config_dict: Dict[str, Dict[str, Any]],
        bin_events: Dict[str, int]
    ) -> Dict[str, Dict[str, Dict[str, Union[Tuple[int, int], str]]]]:
        """Filter and adjust event time ranges based on bin events.

        Args:
            bags: Dictionary of available bags
            zbags: Dictionary of zbag files ("ddi" and "time_list")
            gbags: Dictionary of gbag files ("ddi" and "time_list")
            config_dict: Event configurations (mutated in place in the first pass)
            bin_events: Event timestamps from binary files

        Returns:
            Filtered and adjusted event configurations; topics with no
            backing bag/zbag/gbag file are dropped.
        """
        bag_topics = set(bags.keys())
        zbag_ddi_topics = set(zbags["ddi"].keys())
        gbag_ddi_topics = set(gbags["ddi"].keys())
        filtered_events = {}
        # First pass: turn the configured (pre, post) second offsets into
        # absolute millisecond (start, end) ranges around the bin event time.
        for event_id, config in config_dict.items():
            if event_id in bin_events:
                timestamp = bin_events[event_id]
                for topic, (value1, value2) in config.items():
                    config[topic] = (
                        timestamp - value1 * 1000,
                        timestamp + value2 * 1000
                    )
        # Second pass: keep only topics backed by an actual file.
        for event_id, config in config_dict.items():
            filtered_config = {}
            for topic, time_range in config.items():
                # Try plain rosbag first.
                matched_topic = self._match_bag_topic(topic, bag_topics)
                if matched_topic:
                    filtered_config[topic] = {
                        "time_range": time_range,
                        "match_key": f"{matched_topic}",
                        "type": "bag"
                    }
                    continue
                # Then zbag "ddi" streams.
                matched_zbag = self._match_bag_topic(topic, zbag_ddi_topics)
                if matched_zbag:
                    filtered_config[topic] = {
                        "time_range": time_range,
                        "match_key": f"{matched_zbag}-ddi",
                        "type": "zbag"
                    }
                    continue
                # Finally gbag "ddi" streams; match_key mirrors the zbag form
                # ("<base>-ddi") now that classify_files strips the suffix.
                matched_gbag = self._match_bag_topic(topic, gbag_ddi_topics)
                if matched_gbag:
                    filtered_config[topic] = {
                        "time_range": time_range,
                        "match_key": f"{matched_gbag}-ddi",
                        "type": "gbag"
                    }
            filtered_events[event_id] = filtered_config
        return filtered_events

    def process_slice_tasks(
        self,
        events: Dict[str, Dict[str, Dict[str, Union[Tuple[int, int], str]]]],
        bags: Dict[str, List[str]],
        zbags: Dict[str, Dict[str, List[str]]],
        gbags: Dict[str, Dict[str, List[str]]],
    ) -> "dict[str, SlicerRes]":
        """Process all slice tasks using thread pool.

        Args:
            events: Dictionary of events to process
            bags: Dictionary of available bags
            zbags: Dictionary of available zbags
            gbags: Dictionary of available gbags

        Returns:
            Dictionary containing processing results per event
        """
        results = {}
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            for event_id, config in events.items():
                err_list = []
                output_dir = self._prepare_output_directory(event_id)
                copy_and_rename_files([event_id], output_dir, self.need_copy_files)
                futures = self._submit_slice_tasks(executor, config, bags, zbags, gbags, output_dir)
                self._wait_for_tasks_completion(futures, err_list)
                output_dir = self._create_result_tarball(output_dir.parent)
                # NOTE(review): upload_path/file_size point at the result
                # directory, not the .tar written to slicer_res_dir — confirm
                # which artifact downstream consumers expect.
                results[event_id] = SlicerRes(output_dir, os.path.getsize(output_dir), 0, err_list)
        return results

    def _prepare_output_directory(self, event_id: str) -> Path:
        """Prepare output directory for sliced bags.

        Layout: <bag_dir>/C_<side>_<event>_<bag_side>/C_<side>_<event>.

        Args:
            event_id: Event identifier

        Returns:
            Path to the inner output directory
        """
        outer_dir = self.bag_dir / f"C_{self.side_data}_{event_id}_{self.bag_side}"
        output_dir = outer_dir / f"C_{self.side_data}_{event_id}"
        outer_dir.mkdir(exist_ok=True)
        output_dir.mkdir(exist_ok=True)
        return output_dir

    def _submit_slice_tasks(
        self,
        executor: ThreadPoolExecutor,
        config: Dict[str, Dict[str, Union[Tuple[int, int], str]]],
        bags: Dict[str, List[str]],
        zbags: Dict[str, Dict[str, List[str]]],
        gbags: Dict[str, Dict[str, List[str]]],
        output_dir: Path
    ) -> Dict[str, Dict[str, Any]]:
        """Submit slice tasks to executor.

        Args:
            executor: Thread pool executor
            config: Event configuration (from filter_and_adjust_events)
            bags: Available bags
            zbags: Available zbags
            gbags: Available gbags
            output_dir: Output directory

        Returns:
            Dictionary containing futures and their associated metadata
        """
        task_info = {}
        for topic, config_data in config.items():
            time_range = config_data["time_range"]
            match_key = config_data["match_key"]
            bag_type = config_data["type"]
            if bag_type == "bag":
                # bags is keyed by the matched bag name, not the (possibly
                # "_lite"-suffixed) config topic: indexing with `topic` produced
                # empty source lists for prefix matches.
                bag_paths = [self.bag_dir / bag_path for bag_path in bags[match_key]]
                param = RosbagSliceParam(
                    name=f"{match_key}.bag",
                    sources=bag_paths,
                    output=output_dir / f"{match_key}.bag",
                    start=time_range[0],
                    end=time_range[1],
                    compression="lz4",
                )
                future = executor.submit(slice_bag_tools.process_ros_bag, param)
                task_info[match_key] = {
                    "future": future,
                    "type": "bag",
                    "param": param,
                    "topic": topic
                }
            elif bag_type == "zbag":
                # match_key is "<base>-ddi"; [:-4] recovers the base name.
                zbag_files = [self.bag_dir / f for f in zbags["ddi"].get(match_key[:-4], [])]
                time_files = [self.bag_dir / f for f in zbags["time_list"].get(match_key[:-4], [])]
                param = UdpBagSliceParam(
                    name=f"{match_key}.zbag",
                    sources=zbag_files,
                    times=time_files,
                    output=output_dir / f"{match_key}.zbag",
                    start=time_range[0],
                    end=time_range[1],
                )
                future = executor.submit(slice_bag_tools.process_udp_bag, param)
                task_info[match_key] = {
                    "future": future,
                    "type": "zbag",
                    "param": param,
                    "topic": topic
                }
            elif bag_type == "gbag":
                # Mirrors the zbag branch: match_key is "<base>-ddi".
                gbag_files = [self.bag_dir / f for f in gbags["ddi"].get(match_key[:-4], [])]
                time_files = [self.bag_dir / f for f in gbags["time_list"].get(match_key[:-4], [])]
                param = UdpBagSliceParam(
                    name=f"{match_key}.gbag",
                    sources=gbag_files,
                    times=time_files,
                    output=output_dir / f"{match_key}.gbag",
                    start=time_range[0],
                    end=time_range[1],
                )
                future = executor.submit(slice_bag_tools.process_udp_bag, param)
                task_info[match_key] = {
                    "future": future,
                    "type": "gbag",
                    "param": param,
                    "topic": topic
                }
        return task_info

    def _wait_for_tasks_completion(self, task_info: Dict[str, Dict[str, Any]], err_list: List[str]) -> None:
        """Wait for slice tasks to complete and handle errors.

        Args:
            task_info: Dictionary containing task futures and metadata
            err_list: List to collect error messages
        """
        for match_key, task_data in task_info.items():
            try:
                task_data["future"].result()
            except Exception as e:
                logger.error(
                    f"Slice task failed for {match_key} (type: {task_data['type']}, topic: {task_data['topic']}): "
                    f"{e.__class__.__name__}"
                )
                # TODO: decide on a proper fallback for failed slices
                err_list.append(str(e))

    def _create_result_tarball(self, output_dir: Path) -> Path:
        """Copy the extra directories in, then archive the results.

        Args:
            output_dir: Directory containing results to be archived

        Returns:
            The (unarchived) result directory.  NOTE(review): the .tar is
            written to slicer_res_dir but never returned — confirm whether
            callers should receive tar_path instead.
        """
        for rest_path in self.rest_paths:
            # rest_path is expected to be Path-like (has .name) — TODO confirm.
            shutil.copytree(rest_path, output_dir / rest_path.name)
        tar_name = output_dir.name + ".tar"
        tar_path = self.slicer_res_dir / tar_name
        tar_folder(output_dir, tar_path)
        return output_dir
if __name__ == '__main__':
    # Ad-hoc manual run against a local test capture.
    test_bag_dir = r"\opt\service\tmp\test\_2169b559-599e-4578-951c-785d949ad590\bag_dir"
    slice_handler = SlicerHandler("bag_side", "", "", test_bag_dir, r"", [], [])
    slice_handler.parse()
# NOTE(review): the two lines that followed were a change request pasted into
# the source as plain (non-Python) text, which broke parsing. Preserved here as
# a comment. Requirement for copy_and_rename_files: file names may carry a
# '<17-digit timestamp>_<5-digit event id>' tag (e.g.
# customized_data.20250723060930410_10000.bin); a tagged file must be copied
# only into the folder of the matching event, with the timestamp+event-id tag
# stripped from the copied file's name, and tagged files of other events must
# be skipped. Untagged files (e.g. meta.yaml) must be copied into every event
# folder unchanged.