存一下数据集转换的脚本

在处理HRSC2016数据集时遇到了一些困难,比如将标签转换成YOLO格式、划分数据集、查询目标类别是否修改成功、将33个类别重新映射为指定个数的类别等。

VOC转YOLO_obb:

import os
import xml.etree.ElementTree as ET
import numpy as np


def parse_voc_xml(xml_path):
    """Parse a PASCAL VOC style XML file that stores rotated boxes as polygons.

    Args:
        xml_path: Path of the XML annotation file.

    Returns:
        A ``(width, height, objects)`` tuple. ``width`` and ``height`` are the
        integers read from the ``<size>`` element. ``objects`` is a list of
        dicts with the keys ``'class'`` (text of ``<name>``) and ``'points'``
        (a ``(4, 2)`` float array built from ``x1..x4`` / ``y1..y4`` of the
        ``<polygon>`` element, in the units used by the file).
    """
    root = ET.parse(xml_path).getroot()

    # Image size
    size = root.find('size')
    width = int(size.find('width').text)
    height = int(size.find('height').text)

    # Collect the four polygon vertices of every annotated object
    objects = []
    for obj in root.iter('object'):
        polygon = obj.find('polygon')
        if polygon is None:
            # An object without polygon vertices cannot be expressed as an
            # oriented box; skip it instead of crashing on ``None.find``.
            continue
        points = [
            [float(polygon.find(f'x{i}').text), float(polygon.find(f'y{i}').text)]
            for i in range(1, 5)
        ]
        objects.append({'class': obj.find('name').text, 'points': np.array(points)})

    return width, height, objects


def normalize_points(points, img_w, img_h):
    """Return a copy of ``points`` with x divided by ``img_w`` and y by ``img_h``.

    Args:
        points: Array-like of shape ``(N, 2)`` holding ``[x, y]`` rows.
        img_w: Image width used to scale column 0.
        img_h: Image height used to scale column 1.

    Returns:
        A new floating-point array; the input is never modified.
    """
    points_norm = np.array(points)  # np.array copies by default
    if not np.issubdtype(points_norm.dtype, np.floating):
        # Assigning a quotient back into an integer array truncates it
        # (e.g. 0.5 -> 0), so promote integer input to float first.
        points_norm = points_norm.astype(float)
    points_norm[:, 0] /= img_w  # normalise x
    points_norm[:, 1] /= img_h  # normalise y
    return points_norm


def convert_voc_to_yolo_obb(xml_path, txt_dir, class_dict):
    """Convert one VOC XML file into a YOLO-OBB label file (vertex format).

    Each written line is ``class x1 y1 x2 y2 x3 y3 x4 y4`` with the
    coordinates normalised by the image width/height.

    Args:
        xml_path: Path of the source XML annotation.
        txt_dir: Directory that receives the ``.txt`` file; created if missing.
        class_dict: Mapping from class name to class index. Objects whose
            class is absent from the mapping (or mapped to ``-1``) are skipped.

    Returns:
        The path of the written label file.
    """
    img_w, img_h, objects = parse_voc_xml(xml_path)

    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(txt_dir, exist_ok=True)

    # Swap only the real extension: str.replace('.xml', '.txt') would also
    # rewrite '.xml' inside the stem and would miss an upper-case '.XML'.
    stem = os.path.splitext(os.path.basename(xml_path))[0]
    txt_path = os.path.join(txt_dir, stem + '.txt')

    with open(txt_path, 'w') as f:
        for obj in objects:
            cls_idx = class_dict.get(obj['class'], -1)
            if cls_idx == -1:
                continue  # class not defined in the mapping

            points_norm = normalize_points(obj['points'], img_w, img_h)

            # The vertex order found in the XML is written unchanged; it is
            # assumed to already be a consistent clockwise/counter-clockwise
            # order, which is not verified here.
            points_flat = points_norm.flatten()  # [x1, y1, x2, y2, x3, y3, x4, y4]

            f.write(f"{cls_idx} {' '.join(map(str, points_flat))}\n")

    return txt_path


# ---------------------- 使用示例 ----------------------
if __name__ == "__main__":
    # Class-name -> class-index mapping (adapt to the actual dataset)
    CLASS_DICT = {'ship': 0, 'airplane': 1}

    # Raw strings keep the Windows backslashes literal; in a normal string
    # sequences such as '\d' or '\S' are invalid escapes and trigger a
    # SyntaxWarning on current Python versions.
    XML_DIR = r'F:\dataset\SSDD(+)\Annotations(SSDD+)'  # input XML directory
    TXT_DIR = r'F:\dataset\SSDD(+)\labels'  # output TXT directory

    # Convert every XML file found in the input directory
    for xml_file in os.listdir(XML_DIR):
        if xml_file.endswith('.xml'):
            xml_path = os.path.join(XML_DIR, xml_file)
            txt_path = convert_voc_to_yolo_obb(xml_path, TXT_DIR, CLASS_DICT)
            print(f"Converted: {xml_path} -> {txt_path}")

划分数据集:

import os
import shutil
import random
from pathlib import Path


def split_dataset(
        data_dir="dataset",
        output_dir="splitted_dataset",
        ratios=(0.7, 0.2, 0.1),  # train, val, test
        img_exts=('.jpg', '.jpeg', '.png'),
        anno_exts=('.txt', '.xml'),
        seed=42
):
    """Copy image/annotation pairs into train, val and test folders.

    Args:
        data_dir: Source dataset; must contain ``images`` and ``labels``.
        output_dir: Destination root; ``<split>/images`` and
            ``<split>/labels`` are created below it.
        ratios: ``(train, val, test)`` fractions that must sum to 1.0.
            Train and val sizes are rounded down; test takes the remainder.
        img_exts: Lower-case image suffixes that are accepted.
        anno_exts: Annotation suffixes, tried in order for each image.
        seed: Seed of the shuffle that precedes the split.

    Raises:
        ValueError: If ``ratios`` does not hold three values summing to 1.0.
        FileNotFoundError: If ``images`` or ``labels`` is missing.
    """
    print(f"验证比例参数 | 输入值: {ratios} | 类型: {type(ratios)} | 总和: {sum(ratios):.16f}")
    # Explicit raises instead of assert: asserts vanish under ``python -O``.
    if len(ratios) != 3:
        raise ValueError("需要三个比例值(train, val, test)")
    if abs(sum(ratios) - 1.0) >= 1e-6:
        raise ValueError(f"比例总和必须为1.0,当前总和为{sum(ratios):.16f}")

    # Validate the input layout before anything is created on disk
    img_dir = Path(data_dir) / 'images'
    label_dir = Path(data_dir) / 'labels'
    if not img_dir.exists():
        raise FileNotFoundError(f"图像目录不存在: {img_dir}")
    if not label_dir.exists():
        raise FileNotFoundError(f"标注目录不存在: {label_dir}")

    # Create the output directory tree
    output_path = Path(output_dir)
    for split_name in ('train', 'val', 'test'):
        for sub_dir in ('images', 'labels'):
            (output_path / split_name / sub_dir).mkdir(parents=True, exist_ok=True)

    # Pair every image with the first annotation file that exists for it.
    # iterdir() yields entries in arbitrary order, so sort first; otherwise
    # the same seed could give different splits on different machines.
    file_pairs = []
    for img_file in sorted(img_dir.iterdir()):
        if not img_file.is_file() or img_file.suffix.lower() not in img_exts:
            continue
        stem = img_file.stem
        for ext in anno_exts:
            anno_file = label_dir / f"{stem}{ext}"
            if anno_file.exists():
                file_pairs.append((img_file, anno_file))
                break
        else:
            print(f"警告:未找到 {stem} 的标注文件,已跳过")

    # A private generator gives the same shuffle as random.seed(seed) +
    # random.shuffle() without reseeding the global RNG of the caller.
    random.Random(seed).shuffle(file_pairs)

    # Split boundaries
    total = len(file_pairs)
    train_end = int(total * ratios[0])
    val_end = train_end + int(total * ratios[1])

    splits = {
        'train': file_pairs[:train_end],
        'val': file_pairs[train_end:val_end],
        'test': file_pairs[val_end:]
    }

    # Copy the files of every split into place
    for split_name, pairs in splits.items():
        print(f"正在处理 {split_name} 集 ({len(pairs)} 个样本)")

        for img_src, anno_src in pairs:
            shutil.copy(img_src, output_path / split_name / 'images' / img_src.name)
            shutil.copy(anno_src, output_path / split_name / 'labels' / anno_src.name)

    print("\n数据集划分完成!")
    print(f"总样本数: {total}")
    for label, split_name in (("训练集", 'train'), ("验证集", 'val'), ("测试集", 'test')):
        count = len(splits[split_name])
        # Guard the percentage against an empty dataset (total == 0)
        share = count / total if total else 0.0
        print(f"{label}: {count} ({share:.1%})")


if __name__ == "__main__":
    # Example usage - replace the two placeholder paths with real locations.
    # They are defined here so the script does not stop with a NameError.
    PATH_TO_YOUR_DATASET = "dataset"  # folder that contains images/ and labels/
    PATH_TO_SPLIT_OUTPUT = "splitted_dataset"  # folder that receives the splits

    split_dataset(
        data_dir=PATH_TO_YOUR_DATASET,  # source dataset path
        output_dir=PATH_TO_SPLIT_OUTPUT,  # output path
        ratios=(0.7, 0.2, 0.1),  # train / val / test fractions
        img_exts=('.jpg', '.png'),  # accepted image suffixes
        anno_exts=('.txt', '.xml'),  # accepted annotation suffixes
        seed=42  # shuffle seed
    )

查询yolo格式下的类别标签是否存在:

import os

def find_files_with_category_in_multiple_folders(folder_paths, target_category):
    """Report which ``.txt`` label files contain a given category id.

    For every folder, each ``.txt`` file is scanned line by line and counts
    as a match when the first whitespace-separated field of any line equals
    ``target_category``. The result is printed per folder.

    Args:
        folder_paths: Iterable of folder paths to scan; folders that do not
            exist are reported and skipped.
        target_category: Integer category id to search for.
    """
    for folder_path in folder_paths:
        if not os.path.exists(folder_path):
            print(f"Error: The folder {folder_path} does not exist.")
            continue

        files_with_target_category = []

        # Sorted so the printed file list is stable between runs
        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.endswith('.txt'):  # label files are assumed to be .txt
                continue
            file_path = os.path.join(folder_path, file_name)
            try:
                with open(file_path, 'r') as file:
                    for line in file:
                        fields = line.split()
                        if not fields:
                            # A blank line used to raise IndexError, which
                            # aborted the scan of the rest of the file.
                            continue
                        if int(fields[0]) == target_category:
                            files_with_target_category.append(file_name)
                            break  # one hit is enough for this file
            except (OSError, ValueError) as e:
                # Unreadable file, undecodable text or a non-integer class id
                print(f"Error reading file {file_name} in folder {folder_path}: {e}")

        file_count = len(files_with_target_category)
        print(f"\nFolder: {folder_path}")
        if file_count > 0:
            print(f"Files containing category {target_category}: {files_with_target_category}")
            print(f"Total number of files containing category {target_category} in this folder: {file_count}")
        else:
            print(f"No files containing category {target_category} were found in this folder.")

# Example usage: list every label folder that should be scanned.
# Further folders that can be added to the list:
#     r'F:\yolov8\ultralytics-main\dataset\HRSC_val\labels'
#     r'F:\yolov8\ultralytics-main\dataset\HRSC_test\labels'
folder_paths = [r'F:\yolov8_test\ultralytics-main\dataset\HRSC_train\labels_obb_originalname33']

# Category id to search for (replace as needed)
target_category = 6

find_files_with_category_in_multiple_folders(
    folder_paths,
    target_category,
)

将标签中的某类别修改成目标类别:

import os


def replace_category_in_files(folder_path, target_categories, new_category):
    """Rewrite the category id of matching lines in every ``.txt`` label file.

    The first whitespace-separated field of each line is read as an integer
    category id. When it is one of ``target_categories`` it is replaced by
    ``new_category``. Rewritten lines are re-joined with single spaces and
    end with a newline; blank lines are kept as they are.

    Args:
        folder_path: Folder containing the label files. A missing folder is
            reported and nothing is changed.
        target_categories: Iterable of integer category ids to replace.
        new_category: Category id written in their place.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    if not os.path.exists(folder_path):
        print(f"Error: The folder {folder_path} does not exist.")
        return

    targets = set(target_categories)  # O(1) membership per line
    replacement = str(new_category)

    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.txt'):  # label files are assumed to be .txt
            continue
        file_path = os.path.join(folder_path, file_name)

        with open(file_path, 'r') as file:
            lines = file.readlines()

        new_lines = []
        for line in lines:
            elements = line.split()
            if not elements:
                # A blank line has no category field; indexing it used to
                # raise IndexError and stop the whole run.
                new_lines.append(line)
                continue

            if int(elements[0]) in targets:
                elements[0] = replacement

            new_lines.append(" ".join(elements) + "\n")

        # Skip the write when the content would be identical
        if new_lines != lines:
            with open(file_path, 'w') as file:
                file.writelines(new_lines)

    print(f"All occurrences of categories {target_categories} have been replaced with {new_category}.")


# Example usage. Label folders used in earlier runs:
#     r'F:\yolov8\ultralytics-main\dataset\HRSC_train\labels'
#     r'F:\yolov8\ultralytics-main\dataset\HRSC_val\labels'
folder_path = r'F:\yolov8_test\ultralytics-main\dataset\HRSC_val\labels_obb_originalname33'  # label folder to rewrite
new_category = 3  # id written in place of every listed id
target_categories = [17, 19, 21, 22, 23, 24, 25, 28, 29]  # ids to be replaced

replace_category_in_files(
    folder_path,
    target_categories,
    new_category,
)
# Merge log (new id: original ids, status):
#   1: 4, 5, 11, 12, 30, 31 (done)
#   2: 6, 7, 8, 9, 10, 13, 14, 16, 18, 20, 27 (done)
#   3: 17, 19, 21, 22, 23, 24, 25, 28, 29 (done)
#   0: 26, 25, 23 (done)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值