为了将一个数据集分为训练集、验证集和测试集,并生成适合YOLOv8模型训练的格式,你需要以下几个步骤:
1. 准备数据集:假设你的数据集包括图像和标注文件。YOLO的标注文件通常是文本文件(.txt),每个文件包含对应图像中各个目标的类别和位置信息(通过边界框表示)。
2. 数据集拆分:我们需要将图像和相应的标注文件拆分为训练集、验证集和测试集。
3. 生成YOLOv8所需的目录结构:YOLOv8要求特定的目录结构,如下:
data
├── train
│   ├── images
│   └── labels
├── val
│   ├── images
│   └── labels
└── test
    ├── images
    └── labels
4. 创建YOLO格式的标签文件:每个标签文件中,图像里的每个物体占一行,格式如下:
<class_id> <x_center> <y_center> <width> <height>
其中 x_center、y_center、width、height 均为相对于图像宽高归一化到 0~1 之间的值。
以下是一个Python脚本,用于创建YOLOv8训练所需的目录结构,并将图像及其已有的标签文件随机拆分、复制到训练集、验证集和测试集目录中(脚本不会生成标签文件,标签需事先准备好):
import os
import random
import shutil
def create_dir_structure(base_path):
    """Create the train/val/test directory tree under ``base_path``.

    For each of the ``train``, ``val`` and ``test`` splits an ``images`` and
    a ``labels`` sub-directory is created. Directories that already exist are
    left untouched, so the function can be called repeatedly.

    Args:
        base_path: Root directory under which the split folders are created.
            Missing parent directories are created as well.

    Raises:
        FileExistsError: If one of the target paths exists but is not a
            directory.
    """
    for split in ('train', 'val', 'test'):
        for kind in ('images', 'labels'):
            # exist_ok=True replaces the separate os.path.exists() check and
            # so avoids the race between checking and creating.
            os.makedirs(os.path.join(base_path, split, kind), exist_ok=True)
def split_dataset(images_path, labels_path, output_dir, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """Randomly split an image/label dataset into train, val and test folders.

    Every ``.jpg`` / ``.png`` file found directly in ``images_path`` is copied
    to ``<output_dir>/<split>/images``. The label file with the same stem and
    a ``.txt`` extension is copied from ``labels_path`` to
    ``<output_dir>/<split>/labels``. The destination directories are created
    if they do not exist yet.

    Args:
        images_path: Directory containing the source images.
        labels_path: Directory containing the ``.txt`` label files.
        output_dir: Root directory that receives the ``train``, ``val`` and
            ``test`` sub-trees.
        train_ratio: Fraction of the images assigned to the training split.
        val_ratio: Fraction of the images assigned to the validation split.
        test_ratio: Not used in the computation; the test split receives
            every image left over after the train and val splits. Kept so
            existing callers keep working.

    Notes:
        * Shuffling uses the module-level ``random`` generator, so calling
          ``random.seed(...)`` beforehand makes the split reproducible.
        * An image whose label file is missing is copied without a label
          instead of aborting the whole split half-way.
        * An empty ``images_path`` yields three empty splits.
    """
    # Sort first: os.listdir() returns entries in arbitrary order, which would
    # make the shuffle result differ between machines even with a fixed seed.
    image_files = sorted(
        f for f in os.listdir(images_path) if f.endswith(('.jpg', '.png'))
    )
    random.shuffle(image_files)

    # Index boundaries of the three consecutive slices of the shuffled list.
    total = len(image_files)
    train_end = int(total * train_ratio)
    val_end = int(total * (train_ratio + val_ratio))

    # Make sure every destination exists so the function works on its own.
    split_dirs = {}
    for split in ('train', 'val', 'test'):
        image_dir = os.path.join(output_dir, split, 'images')
        label_dir = os.path.join(output_dir, split, 'labels')
        os.makedirs(image_dir, exist_ok=True)
        os.makedirs(label_dir, exist_ok=True)
        split_dirs[split] = (image_dir, label_dir)

    for i, image_file in enumerate(image_files):
        if i < train_end:
            split = 'train'
        elif i < val_end:
            split = 'val'
        else:
            split = 'test'
        image_dir, label_dir = split_dirs[split]

        shutil.copy(os.path.join(images_path, image_file),
                    os.path.join(image_dir, image_file))

        # Swap only the final extension; str.replace() would also rewrite
        # '.jpg' / '.png' occurring earlier in the file name.
        label_file = os.path.splitext(image_file)[0] + '.txt'
        label_path = os.path.join(labels_path, label_file)
        if os.path.isfile(label_path):
            shutil.copy(label_path, os.path.join(label_dir, label_file))

    print(f"Dataset split into {train_end} training, {val_end - train_end} validation, and {total - val_end} test samples.")
def main():
    """Build the output directory tree and split the dataset into it.

    The three paths below are placeholders; edit them to point at the real
    image directory, label directory and output directory before running.
    """
    output_dir = '/path/to/output/directory'  # where train/val/test are written
    images_path = '/path/to/your/images'  # directory holding the source images
    labels_path = '/path/to/your/labels'  # directory holding the .txt labels

    # The destination folders are created first, then filled by the split.
    create_dir_structure(output_dir)
    split_dataset(images_path, labels_path, output_dir)
# Run the split only when executed as a script, not when imported.
if __name__ == '__main__':
    main()