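# Common pitfalls:
# 1. Category names that do not match the actual folder names (letter case, underscores).
# 2. Path problems (wrong or non-existent dataset root / output location).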
import os
import json
import random
def generate_aid_dataset(base_dir, output_filename=None, train_ratio=0.5, seed=None):
    """Split the per-category image folders into train/test and save as JSON.

    For every known category, the ``.jpg`` files found in
    ``base_dir/<category>`` are shuffled and divided into a training part
    and a test part. Each image becomes one entry of the form
    ``[relative_path, category_id, category_name]`` where ``relative_path``
    is ``<category>/<filename>`` joined with the OS path separator and
    ``category_id`` is the index of the category in the category list.
    The resulting ``{"train": [...], "test": [...]}`` dictionary is written
    to ``output_filename`` and a short summary is printed.

    Args:
        base_dir: Directory that contains one sub-folder per category.
        output_filename: Where to write the JSON file. ``None`` (default)
            keeps the historical hard-coded Windows location.
        train_ratio: Fraction of each category's images assigned to the
            training split (default 0.5). The split point is truncated
            towards zero, so any odd image goes to the test split.
        seed: Optional seed for a reproducible shuffle. ``None`` (default)
            uses the global ``random`` state, as before.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
        OSError: If a category folder cannot be listed (e.g. the folder
            name does not match the category name) or the output file
            cannot be written.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio!r}")

    if output_filename is None:
        output_filename = r"C:\Users\Ai_Studio\Desktop\remoteclip\remote dataset\EuroSAT\eurosat_dataset.json"

    # Every category exactly once: a duplicated name would emit its images
    # twice under two different ids and shift the ids of all later classes.
    categories = [
        "AnnualCrop", "Forest", "HerbaceousVegetation", "Highway", "Industrial",
        "Pasture", "PermanentCrop", "Residential", "River", "SeaLake",
    ]

    # A private generator keeps a seeded run from disturbing global state;
    # without a seed the module-level generator is used, as before.
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    dataset = {"train": [], "test": []}
    # (category_name, train_count, test_count), recorded while splitting so
    # the summary does not need to rescan the whole dataset per category.
    split_counts = []

    for category_id, category_name in enumerate(categories):
        category_dir = os.path.join(base_dir, category_name)
        # os.listdir returns names in arbitrary order; sorting first makes
        # the shuffle result depend only on the random state.
        image_files = sorted(f for f in os.listdir(category_dir) if f.endswith('.jpg'))
        shuffle(image_files)

        train_split = int(len(image_files) * train_ratio)
        parts = (
            ("train", image_files[:train_split]),
            ("test", image_files[train_split:]),
        )
        for split_name, filenames in parts:
            dataset[split_name].extend(
                [os.path.join(category_name, filename), category_id, category_name]
                for filename in filenames
            )
        split_counts.append(
            (category_name, train_split, len(image_files) - train_split)
        )

    with open(output_filename, 'w', encoding="utf-8") as f:
        json.dump(dataset, f, indent=2)

    # Overall summary.
    print(f"JSON文件 {output_filename} 已生成")
    print(f"训练集图像数量: {len(dataset['train'])}")
    print(f"测试集图像数量: {len(dataset['test'])}")

    # Per-category summary.
    for category_name, train_count, test_count in split_counts:
        print(f"{category_name}: 训练集 {train_count} 张, 测试集 {test_count} 张")
# Example usage: root directory of the EuroSAT dataset.
base_directory = r"C:\Users\Ai_Studio\Desktop\remoteclip\remote dataset\EuroSAT\2750"

# Only generate the dataset when this file is executed as a script, so that
# importing the module does not touch the file system.
if __name__ == "__main__":
    generate_aid_dataset(base_directory)