一、PASCAL_VOC形式的数据集文件组织形式
VOCdevkit/
|__ VOC2007/
    |__ Annotations/ (这里存放着每一张图片的XML注解文件,如 000001.xml)
    |__ ImageSets/ (这里包含不同任务或不同类别的图像文件名列表)
    |   |__ Main/ (例如,train.txt, val.txt, trainval.txt, test.txt)
    |   |__ Layout/ (布局任务的图像集,例如,train.txt, val.txt, trainval.txt, test.txt)
    |   |__ Segmentation/ (分割任务的图像集,例如,train.txt, val.txt, trainval.txt, test.txt)
    |__ JPEGImages/ (这里存放着数据集中的所有JPEG图像,如 000001.jpg)
    |__ SegmentationClass/ (这里包含分割类别标签的图像)
    |__ SegmentationObject/ (这里包含分割目标标签的图像)
二、VOC2COCO.py 数据转换
这里以遥感中的DIOR数据集为例(DIOR采用的就是PASCAL_VOC数据集的文件组织形式)
一般VOC数据集都提前划分好了train.txt,val.txt, test.txt,所以以下代码是直接按照这三个文件list来读取的。代码如下:
import os
import json
import xml.etree.ElementTree as ET
from typing import List, Dict
def get_all_categories(voc_root: str, splits: List[str]) -> Dict[str, int]:
    """Scan the annotation XMLs of every split and build a category mapping.

    Ids are 1-based and assigned in first-seen order, so calling this once
    over all splits guarantees a single consistent mapping.

    Args:
        voc_root: Root directory of the VOC-style dataset.
        splits: Split names to scan, e.g. ['train', 'val', 'test'].

    Returns:
        Dict mapping category name -> 1-based integer category id.
    """
    category_dict: Dict[str, int] = {}
    # NOTE: this layout is DIOR-specific (Annotations/Annotations/HBB);
    # adjust for other VOC-style datasets. Hoisted out of the loop — it
    # does not depend on the split.
    annotation_dir = os.path.join(voc_root, 'Annotations', 'Annotations', 'HBB')
    for split in splits:
        print(f'scanning {split}')
        image_ids = get_image_ids_from_file(os.path.join(voc_root, 'ImageSets', 'Main', f'{split}.txt'))
        for image_id in image_ids:
            tree = ET.parse(os.path.join(annotation_dir, f'{image_id}.xml'))
            root = tree.getroot()
            for obj in root.iter('object'):
                category = obj.find('name').text
                # Assign the next free id the first time a category appears.
                if category not in category_dict:
                    category_dict[category] = len(category_dict) + 1
    return category_dict
def get_image_ids_from_file(file_path: str) -> List[str]:
    """Read a split file (one image id per line) and return the ids.

    Leading/trailing whitespace of the whole file is trimmed before
    splitting, so a trailing newline does not yield an empty id.
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()
    return contents.strip().split('\n')
def voc_to_coco(voc_root: str, split: str, category_dict: Dict[str, int]) -> Dict:
    """Convert one VOC-style split into a COCO-format dictionary.

    Args:
        voc_root: Root directory of the VOC-style dataset.
        split: Split name ('train', 'val' or 'test').
        category_dict: Mapping from category name to COCO category id,
            e.g. as produced by get_all_categories().

    Returns:
        A dict with the standard COCO top-level keys: info, licenses,
        images, annotations, categories.
    """
    images = []
    annotations = []
    # Image ids belonging to this split.
    image_ids = get_image_ids_from_file(os.path.join(voc_root, 'ImageSets', 'Main', f'{split}.txt'))
    # Adjust for your own layout: DIOR keeps its HBB XMLs under
    # Annotations/Annotations/HBB.
    annotation_dir = os.path.join(voc_root, 'Annotations', 'Annotations', 'HBB')
    for image_id in image_ids:
        file = f'{image_id}.xml'
        print(f'Processing {file}')
        tree = ET.parse(os.path.join(annotation_dir, file))
        root = tree.getroot()
        # Image dimensions come from the XML <size> header.
        size = root.find('size')
        width = int(size.find('width').text)
        height = int(size.find('height').text)
        images.append({
            "file_name": f'{image_id}.jpg',
            "height": height,
            "width": width,
            "id": image_id
        })
        # One COCO annotation per <object>.
        for obj in root.iter('object'):
            category = obj.find('name').text
            # category_dict is assumed complete: get_all_categories() has
            # already scanned every split.
            bndbox = obj.find('bndbox')
            xmin = int(bndbox.find('xmin').text)
            ymin = int(bndbox.find('ymin').text)
            xmax = int(bndbox.find('xmax').text)
            ymax = int(bndbox.find('ymax').text)
            annotations.append({
                "segmentation": [],  # detection-only data: no mask labels
                "area": (xmax - xmin) * (ymax - ymin),
                "iscrowd": 0,
                "image_id": image_id,
                "bbox": [xmin, ymin, xmax - xmin, ymax - ymin],  # COCO [x, y, w, h]
                "category_id": category_dict[category],
                "id": len(annotations) + 1
            })
    # Categories section is identical for every split.
    categories = [
        {
            'id': category_id,
            'name': category,
            'supercategory': 'none',  # adjust to your taxonomy if needed
        }
        for category, category_id in category_dict.items()
    ]
    # "info" and "licenses" are part of the COCO format; tools that read
    # these keys unconditionally would otherwise fail on our output.
    return {
        "info": {"description": f"{split} split converted from PASCAL VOC"},
        "licenses": [],
        "images": images,
        "annotations": annotations,
        "categories": categories
    }
def main():
    """Convert the pre-split DIOR dataset to one COCO json per split."""
    voc_root = './DIOR/'  # dataset root directory
    split_names = ['train', 'val', 'test']
    # Build the category mapping once over all splits so ids stay
    # consistent between the generated json files.
    categories = get_all_categories(voc_root, split_names)
    for name in split_names:
        converted = voc_to_coco(voc_root, name, categories)
        with open(f'{name}.json', 'w') as out_file:
            json.dump(converted, out_file)


if __name__ == "__main__":
    main()
然而,如果数据集没有提前分割,可以直接转换为一个完整的all.json。代码小改一下就好。
三、划分完整的coco数据集为训练、验证、测试集
读取一个包含所有数据的coco形式的json文件,按自定义比例分割。
import json
import numpy as np
def split_dataset(json_file, ratios, names):
    """Randomly split one COCO-format json file into several subset files.

    Images are shuffled and partitioned by the given ratios; each subset is
    written to '<name>.json' in the current working directory together with
    the annotations of exactly its images.

    Args:
        json_file: Path to the full COCO json (e.g. 'all.json').
        ratios: One fraction per subset; must sum to 1.0 (within float
            tolerance).
        names: One output name per subset.

    Raises:
        ValueError: If the ratios do not sum to 1.0 or if len(ratios)
            differs from len(names).
    """
    # Tolerant comparison: sums such as 0.7 + 0.3 are not exactly 1.0 in
    # binary floating point, so an exact == check rejects valid input.
    # Raise (not assert): asserts are stripped under `python -O`.
    if not np.isclose(sum(ratios), 1.0):
        raise ValueError("Ratios must sum to 1.0")
    if len(ratios) != len(names):
        raise ValueError("Must provide name for each split")
    # Load the full dataset.
    with open(json_file, "r") as read_file:
        data = json.load(read_file)
    # Shuffle the image ids, then cut them into consecutive chunks.
    image_ids = [image["id"] for image in data["images"]]
    np.random.shuffle(image_ids)
    num_images = len(image_ids)
    splits = [int(ratio * num_images) for ratio in ratios]
    splits[-1] = num_images - sum(splits[:-1])  # ensure the chunks cover every image
    split_ids = np.split(image_ids, np.cumsum(splits[:-1]))

    def create_subset(ids, name):
        """Write the subset json containing exactly the images in `ids`."""
        # Set membership is O(1) per lookup; testing `in` against a numpy
        # array inside the comprehensions below would be O(n) each time.
        id_set = set(ids.tolist())
        subset = {
            # info/licenses are optional in many COCO files; default them so
            # inputs produced without these keys still split cleanly.
            "info": data.get("info", {}),
            "licenses": data.get("licenses", []),
            "categories": data["categories"],
            "images": [image for image in data["images"] if image["id"] in id_set],
            "annotations": [ann for ann in data["annotations"] if ann["image_id"] in id_set],
        }
        with open(f"{name}.json", "w") as write_file:
            json.dump(subset, write_file)

    # Create every subset.
    for ids, name in zip(split_ids, names):
        create_subset(ids, name)
# Example usage — guarded so that importing this module as a library does
# not immediately overwrite train/val/test json files in the cwd.
if __name__ == "__main__":
    split_dataset("all.json", [0.75, 0.25, 0.0], ["train", "val", "test"])