为避免每次都调用 bash 脚本,这里写了一段较为简短的数据集切分代码:
import os
import random
# ====== Paths and split ratios ======
dataset_dir = './data/your_dir' # dataset root; the train/val/test .txt lists are also written here
image_dir = os.path.join(dataset_dir, 'images')
ann_dir = os.path.join(dataset_dir, 'annotations')
train_ratio = 0.6
val_ratio = 0.2 # whatever remains (0.2 with these values) becomes the test split
def split_dataset(seed=None):
    """Randomly split the dataset into train/val/test list files.

    The ``*.png`` files in ``ann_dir`` define the sample set. Their base
    names are shuffled and cut at ``train_ratio`` and
    ``train_ratio + val_ratio``; everything after the second cut is the
    test split. For each split, ``<dataset_dir>/<split>.txt`` is written
    with one ``images/<name><ext> annotations/<name>.png`` line per sample.
    A sample whose image cannot be found in ``image_dir`` (``.jpg``,
    ``.png``, ``.jpeg`` are tried in that order) triggers a warning and is
    left out of the list.

    The printed summary reports the number of entries actually written to
    each list, plus the number of skipped samples when there are any.

    Args:
        seed: Optional seed for a reproducible shuffle. With ``None`` (the
            default) the module-level ``random`` state is used, so an
            earlier ``random.seed(...)`` call is still honoured.
    """
    # Probe order matters: the first extension that exists wins.
    image_exts = ('.jpg', '.png', '.jpeg')

    # Annotations drive the split. Sorting removes any dependence on the
    # order os.listdir happens to return, so a fixed seed yields the same
    # split on every run.
    ann_files = sorted(f for f in os.listdir(ann_dir) if f.endswith('.png'))
    base_names = [os.path.splitext(f)[0] for f in ann_files]

    if seed is None:
        random.shuffle(base_names)
    else:
        # A private generator keeps the global random state untouched.
        random.Random(seed).shuffle(base_names)

    total = len(base_names)
    train_end = int(total * train_ratio)
    val_end = int(total * (train_ratio + val_ratio))
    splits = {
        'train': base_names[:train_end],
        'val': base_names[train_end:val_end],
        'test': base_names[val_end:]
    }

    # Entries actually written per split (samples with a missing image are
    # excluded), so the summary matches the list files on disk.
    written = {}
    for split, names in splits.items():
        count = 0
        with open(os.path.join(dataset_dir, f'{split}.txt'), 'w', encoding='utf-8') as f:
            for name in names:
                found_ext = None
                for ext in image_exts:
                    if os.path.exists(os.path.join(image_dir, name + ext)):
                        found_ext = ext
                        break
                if found_ext is None:
                    print(f"⚠️ 警告:找不到对应图像文件:{name}")
                    continue
                img_path = f'images/{name}{found_ext}'
                ann_path = f'annotations/{name}.png'
                f.write(f'{img_path} {ann_path}\n')
                count += 1
        written[split] = count

    skipped = total - sum(written.values())
    print(f"✅ 数据划分完成!共 {total} 个标注")
    print(f" 训练集: {written['train']}")
    print(f" 验证集: {written['val']}")
    print(f" 测试集: {written['test']}")
    if skipped:
        print(f" 跳过(缺少图像): {skipped}")
# Run the split only when this file is executed as a script, not on import.
if __name__ == '__main__':
    split_dataset()
该代码会按比例随机切分数据集。如需复现同一划分结果,可在调用 `split_dataset()` 之前用 `random.seed(固定值)` 设置随机种子。

被折叠的 条评论
为什么被折叠?



