数据集处理_windflower,54, yellow iris,4-CSDN博客

Cars196

import scipy.io
import os
import shutil

# Load the cars_annos.mat annotation file.
mat = scipy.io.loadmat('cars/cars_annos.mat')
annotations = mat['annotations'][0]
class_names = mat['class_names'][0]

# Create the train and test output roots.
train_dir = 'cars/train'
test_dir = 'cars/test'
for split_root in (train_dir, test_dir):
    os.makedirs(split_root, exist_ok=True)

train_num = 0
test_num = 0
# Walk every annotation record and copy its image into <split>/<NNN_class_name>/.
for anno in annotations:
    # Field 0 holds a relative path such as car_ims/011204.jpg; keep only the file name.
    file_name = anno[0][0].split('/')[-1]
    class_id = int(anno[5][0][0])  # 1-based class id (it indexes class_names with -1 below)
    is_test = int(anno[6][0][0])  # split flag: 0 = train, 1 = test

    # Class name made safe for use as a directory name.
    readable_name = class_names[class_id - 1][0].replace(' ', '_').replace('/', '_')

    # Folder name is the zero-padded id followed by the class name.
    folder_name = f"{class_id:03d}_{readable_name}"

    # Pick the split root and update the matching counter.
    if is_test:
        split_root = test_dir
        test_num += 1
    else:
        split_root = train_dir
        train_num += 1

    # Copy the image into its class folder,
    # e.g. cars/car_ims/016185.jpg -> cars/test/196_smart_fortwo_Convertible_2012
    src_img_path = os.path.join('cars/car_ims', file_name)
    dst_dir = os.path.join(split_root, folder_name)
    os.makedirs(dst_dir, exist_ok=True)
    print(f"Copying {src_img_path} to {dst_dir}")
    shutil.copy(src_img_path, dst_dir)

print(f"Train set: {train_num}, Test set: {test_num}")
print("Dataset split completed!")

PETs

import shutil
import os

# Annotation list files for the train(+val) and test splits.
train_val_file = "./annotations/trainval.txt"
test_file = "./annotations/test.txt"

# Source image folder and the two output roots.
processed_image_dir = "images"
train_dir = "train"
test_dir = "test"
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Read the train list.
with open(train_val_file, 'r') as f:
    train_list = f.readlines()

# Read the test list.
with open(test_file, 'r') as f:
    test_list = f.readlines()


def _copy_split(lines, split_dir):
    """Copy every image named in ``lines`` into ``split_dir/<category>/``.

    Args:
        lines: Lines of an annotation list file; the first whitespace-separated
            field of each line is the image name without extension.
        split_dir: Output root for this split.

    Blank lines are skipped. Images missing from ``processed_image_dir`` are
    reported on stdout and skipped rather than aborting the run.
    """
    for line in lines:
        fields = line.split()
        if not fields:
            # A blank/trailing line would otherwise raise IndexError on fields[0].
            continue
        image_name = fields[0] + ".jpg"
        # The category is everything before the last underscore of the file name.
        category = "_".join(image_name.split('_')[:-1])
        category_dir = os.path.join(split_dir, category)
        os.makedirs(category_dir, exist_ok=True)
        src_path = os.path.join(processed_image_dir, image_name)
        dst_path = os.path.join(category_dir, image_name)
        if os.path.exists(src_path):
            shutil.copy(src_path, dst_path)
        else:
            print(f"Image {src_path} not found")


# Process the train images, then the test images.
_copy_split(train_list, train_dir)
_copy_split(test_list, test_dir)

Flowers102

随机分为train（0.8），test（0.2），并以class name命名。

import scipy.io as sio
import os

# class_names = [
    # "pink primrose", "hard-leaved pocket orchid", "canterbury bells", "sweet pea",
    # "english marigold", "tiger lily", "moon orchid", "bird of paradise", "monkshood",
    # "globe thistle", "snapdragon", "colt's foot", "king protea", "spear thistle",
    # "yellow iris", "globe-flower", "purple coneflower", "peruvian lily",
    # "balloon flower", "giant white arum lily", "fire lily", "pincushion flower",
    # "fritillary", "red ginger", "grape hyacinth", "corn poppy", "prince of wales feathers",
    # "stemless gentian", "artichoke", "sweet william", "carnation", "garden phlox",
    # "love in the mist", "mexican aster", "alpine sea holly", "ruby-lipped cattleya",
    # "cape flower", "great masterwort", "siam tulip", "lenten rose", "barbeton daisy",
    # "daffodil", "sword lily", "poinsettia", "bolero deep blue", "wallflower",
    # "marigold", "buttercup", "oxeye daisy", "common dandelion", "petunia", "wild pansy",
    # "primula", "sunflower", "pelargonium", "bishop of llandaff", "gaura", "geranium",
    # "orange dahlia", "pink-yellow dahlia?", "cautleya spicata", "japanese anemone",
    # "black-eyed susan", "silverbush", "californian poppy", "osteospermum", "spring crocus",
    # "bearded iris", "windflower", "tree poppy", "gazania", "azalea", "water lily",
    # "rose", "thorn apple", "morning glory", "passion flower", "lotus", "toad lily",
    # "anthurium", "frangipani", "clematis", "hibiscus", "columbine", "desert-rose",
    # "tree mallow", "magnolia", "cyclamen", "watercress", "canna lily", "hippeastrum",
    # "bee balm", "pink quill", "foxglove", "bougainvillea", "camellia", "mallow",
    # "mexican petunia", "bromelia", "blanket flower", "trumpet creeper", "blackberry lily"]

# Lookup table from a Flowers102 label id (as a string, '1'..'102') to its class name.
# The split script further down uses it to build the per-class folder names.
# NOTE(review): a few entries differ from the commented-out class_names list above
# (e.g. '93' is 'ball moss' here, '78' is 'lotus lotus') — confirm which naming is intended.
label2name = {'21': 'fire lily',
 '3': 'canterbury bells',
 '45': 'bolero deep blue',
 '1': 'pink primrose',
 '34': 'mexican aster',
 '27': 'prince of wales feathers',
 '7': 'moon orchid',
 '16': 'globe-flower',
 '25': 'grape hyacinth',
 '26': 'corn poppy',
 '79': 'toad lily',
 '39': 'siam tulip',
 '24': 'red ginger',
 '67': 'spring crocus',
 '35': 'alpine sea holly',
 '32': 'garden phlox',
 '10': 'globe thistle',
 '6': 'tiger lily',
 '93': 'ball moss',
 '33': 'love in the mist',
 '9': 'monkshood',
 '102': 'blackberry lily',
 '14': 'spear thistle',
 '19': 'balloon flower',
 '100': 'blanket flower',
 '13': 'king protea',
 '49': 'oxeye daisy',
 '15': 'yellow iris',
 '61': 'cautleya spicata',
 '31': 'carnation',
 '64': 'silverbush',
 '68': 'bearded iris',
 '63': 'black-eyed susan',
 '69': 'windflower',
 '62': 'japanese anemone',
 '20': 'giant white arum lily',
 '38': 'great masterwort',
 '4': 'sweet pea',
 '86': 'tree mallow',
 '101': 'trumpet creeper',
 '42': 'daffodil',
 '22': 'pincushion flower',
 '2': 'hard-leaved pocket orchid',
 '54': 'sunflower',
 '66': 'osteospermum',
 '70': 'tree poppy',
 '85': 'desert-rose',
 '99': 'bromelia',
 '87': 'magnolia',
 '5': 'english marigold',
 '92': 'bee balm',
 '28': 'stemless gentian',
 '97': 'mallow',
 '57': 'gaura',
 '40': 'lenten rose',
 '47': 'marigold',
 '59': 'orange dahlia',
 '48': 'buttercup',
 '55': 'pelargonium',
 '36': 'ruby-lipped cattleya',
 '91': 'hippeastrum',
 '29': 'artichoke',
 '71': 'gazania',
 '90': 'canna lily',
 '18': 'peruvian lily',
 '98': 'mexican petunia',
 '8': 'bird of paradise',
 '30': 'sweet william',
 '17': 'purple coneflower',
 '52': 'wild pansy',
 '84': 'columbine',
 '12': "colt's foot",
 '11': 'snapdragon',
 '96': 'camellia',
 '23': 'fritillary',
 '50': 'common dandelion',
 '44': 'poinsettia',
 '53': 'primula',
 '72': 'azalea',
 '65': 'californian poppy',
 '80': 'anthurium',
 '76': 'morning glory',
 '37': 'cape flower',
 '56': 'bishop of llandaff',
 '60': 'pink-yellow dahlia',
 '82': 'clematis',
 '58': 'geranium',
 '75': 'thorn apple',
 '41': 'barbeton daisy',
 '95': 'bougainvillea',
 '43': 'sword lily',
 '83': 'hibiscus',
 '78': 'lotus lotus',
 '88': 'cyclamen',
 '94': 'foxglove',
 '81': 'frangipani',
 '74': 'rose',
 '89': 'watercress',
 '73': 'water lily',
 '46': 'wallflower',
 '77': 'passion flower',
 '51': 'petunia'}

# print(len(label2name))
# Location of the label file.
mat_file_path = '/grp01/cs_hszhao/cs002u03/dataset/Flower102/imagelabels.mat'

# Load the .mat file and dump its raw content for inspection.
mat_data = sio.loadmat(mat_file_path)

print(mat_data)
# The labels live under the 'labels' key as a 2-D array; take row 0 to get a flat vector.
image_labels = mat_data.get('labels', None)[0]

# Show the first 10 labels.
print("前 10 个图像标签：", image_labels[:10])

# Summary statistics of the label vector.
print(f"标签总数: {len(image_labels)}")
print(f"最小标签值: {min(image_labels)}")
print(f"最大标签值: {max(image_labels)}")
print("type: ", type(image_labels[0])) # <class 'numpy.uint8'>

# Map each image file name (image_00001.jpg, image_00002.jpg, ...) to its label string.
image_label_dict = {}
for position, raw_label in enumerate(image_labels, start=1):
    image_label_dict[f"image_{position:05d}.jpg"] = str(raw_label)

# Invert the mapping: label string -> list of image file names.
# Keys are created from set(image_labels) so the key order matches what the split step expects.
label_image_dict = {}
for raw_label in set(image_labels):
    label_image_dict[str(raw_label)] = []
for file_name, label_key in image_label_dict.items():
    label_image_dict[label_key].append(file_name) # ['image_06734.jpg', 'image_06735.jpg',...]

# randomly split into train(0.8) and test(0.2) and save to file named by category name
import random
import shutil
import os
# Fixed seed so the shuffles below are repeatable.
random.seed(0)

root = "/grp01/cs_hszhao/cs002u03/dataset/Flower102/jpg"
train_root = "/grp01/cs_hszhao/cs002u03/dataset/Flower102/train"
test_root = "/grp01/cs_hszhao/cs002u03/dataset/Flower102/test"

for split_root in (train_root, test_root):
    os.makedirs(split_root, exist_ok=True)

for label, images in label_image_dict.items():
    # Shuffle in place, then cut at 80%: the head is train, the tail is test.
    random.shuffle(images)
    train_num = int(len(images) * 0.8)

    # Per-class folder: "<label padded to 2 digits>_<class name with underscores>".
    class_folder = f"{label.zfill(2)}_{label2name[label].replace(' ', '_')}"
    train_root_perC = os.path.join(train_root, class_folder)
    test_root_perC = os.path.join(test_root, class_folder)
    os.makedirs(train_root_perC, exist_ok=True)
    os.makedirs(test_root_perC, exist_ok=True)

    # Copy the train part first, then the test part.
    split_plan = (
        (train_root_perC, images[:train_num]),
        (test_root_perC, images[train_num:]),
    )
    for dest_dir, subset in split_plan:
        for img in subset:
            shutil.copy(os.path.join(root, img), os.path.join(dest_dir, img))



#######################################################

Caltech101

去除google background，留下101个类别。

import os
import shutil
import random
# Fixed seed so the per-class shuffles are repeatable.
random.seed(0)

src_root = "/grp01/cs_hszhao/cs002u03/dataset/caltech-101/101_ObjectCategories"
dst_root = "/grp01/cs_hszhao/cs002u03/dataset/caltech-101/organize"

train_root = os.path.join(dst_root, "train")
test_root = os.path.join(dst_root, "test")
os.makedirs(train_root, exist_ok=True)
os.makedirs(test_root, exist_ok=True)

# Number of images per class that go to the train split; the rest go to test.
train_length = 30

# Keep only real class folders and drop the Google background clutter class, so
# that exactly the object categories remain. os.listdir() returns entries in
# arbitrary order, so the names are sorted: otherwise the seeded shuffles would
# be consumed in a filesystem-dependent order and the split would not be
# reproducible.
class_names = sorted(
    name
    for name in os.listdir(src_root)
    if name != "BACKGROUND_Google" and os.path.isdir(os.path.join(src_root, name))
)

for class_name in class_names:
    class_src = os.path.join(src_root, class_name)
    # Sort before shuffling so the shuffle starts from a known order.
    img_names = sorted(os.listdir(class_src))
    random.shuffle(img_names)
    train_paths = img_names[:train_length]
    test_paths = img_names[train_length:]

    class_train_dir = os.path.join(train_root, class_name)
    class_test_dir = os.path.join(test_root, class_name)
    os.makedirs(class_train_dir, exist_ok=True)
    os.makedirs(class_test_dir, exist_ok=True)

    for img_name in train_paths:
        shutil.copy(os.path.join(class_src, img_name), os.path.join(class_train_dir, img_name))
    for img_name in test_paths:
        shutil.copy(os.path.join(class_src, img_name), os.path.join(class_test_dir, img_name))

CUB

import os
import shutil

# Dataset locations.
data_dir = "CUB_200_2011"
images_dir = os.path.join(data_dir, "images")
train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")

# Create the output roots.
for split_root in (train_dir, test_dir):
    os.makedirs(split_root, exist_ok=True)

# images.txt: the second space-separated field of each line is the relative image path.
images = []
with open(os.path.join(data_dir, "images.txt"), "r") as f:
    for line in f:
        images.append(line.strip().split(" ")[1])

# train_test_split.txt: the second field is the split flag (1 is treated as train below).
train_test_split = []
with open(os.path.join(data_dir, "train_test_split.txt"), "r") as f:
    for line in f:
        train_test_split.append(int(line.strip().split(" ")[1]))

# The two files are paired line by line; copy every image into its split folder.
for img, is_train in zip(images, train_test_split):
    if is_train == 1:
        target_folder = train_dir
    else:
        target_folder = test_dir

    # The first path component of the image entry is the category folder.
    category_folder = os.path.join(target_folder, img.split("/")[0])
    os.makedirs(category_folder, exist_ok=True)

    # Copy the image, keeping its <category>/<file> relative path.
    source = os.path.join(images_dir, img)
    shutil.copy(source, os.path.join(target_folder, img))

print("数据集整理完成！")

Cifar100

import os
import pickle
import numpy as np
from PIL import Image

# Source files inside the extracted CIFAR-100 folder.
cifar_dir = 'cifar-100-python'
train_file = os.path.join(cifar_dir, 'train')
test_file = os.path.join(cifar_dir, 'test')
meta_file = os.path.join(cifar_dir, 'meta')

# Destination folders for the exported images.
output_dir = 'cifar-100-python/data'
train_output_dir = os.path.join(output_dir, 'train')
test_output_dir = os.path.join(output_dir, 'test')

# Create the output folders if they do not exist yet.
for split_output_dir in (train_output_dir, test_output_dir):
    os.makedirs(split_output_dir, exist_ok=True)

# Load the metadata pickle and decode the fine-grained class names from bytes to str.
with open(meta_file, 'rb') as f:
    meta = pickle.load(f, encoding='bytes')
fine_label_names = [raw_name.decode('utf-8') for raw_name in meta[b'fine_label_names']]

# Helper: write an image array to disk.
def save_image(image_array, filename):
    """Convert ``image_array`` (a numpy array) to a PIL image and save it to ``filename``."""
    Image.fromarray(image_array).save(filename)

# Helper: decode one CIFAR-100 batch file and export its images.
def decode_and_save(data_file, output_dir):
    """Export every image of a CIFAR-100 batch file into per-class folders.

    Args:
        data_file: Path of the pickled batch file (read with ``encoding='bytes'``).
        output_dir: Root folder; each image is written to
            ``output_dir/<fine class name>/<original file name>``.

    Relies on the module-level ``fine_label_names`` list and ``save_image`` helper.
    """
    with open(data_file, 'rb') as f:
        data_dict = pickle.load(f, encoding='bytes')

    # Pixel rows, fine label indices and original file names, all indexed alike.
    images = data_dict[b'data']
    labels = data_dict[b'fine_labels']
    filenames = data_dict[b'filenames']

    for idx, flat_pixels in enumerate(images):
        # Each row is reshaped to (3, 32, 32) and reordered to (32, 32, 3).
        planes = np.reshape(flat_pixels, (3, 32, 32))
        img_array = planes.transpose(1, 2, 0)

        class_name = fine_label_names[labels[idx]]
        filename = filenames[idx].decode('utf-8')

        # One folder per class.
        class_dir = os.path.join(output_dir, class_name)
        os.makedirs(class_dir, exist_ok=True)

        save_image(img_array, os.path.join(class_dir, filename))

# Export the training split first, then the test split, printing progress messages.
print("开始处理训练集...")
decode_and_save(train_file, train_output_dir)

print("开始处理测试集...")
decode_and_save(test_file, test_output_dir)

print("CIFAR-100 数据集整理完成！")

IN200-S

import os
import shutil
import random
# Fixed seed so the class and image sampling below is repeatable.
random.seed(0)

# Source ImageNet roots and the destination roots of the IN200-S subset.
source_dir = '/grp01/cs_hszhao/cs002u03/dataset/ImageNet-1k/train'
source_val_dir = '/grp01/cs_hszhao/cs002u03/dataset/ImageNet-1k/val_1k'
train_dest_dir = '/grp01/cs_hszhao/cs002u03/dataset/IN200_S/train'
val_dest_dir = '/grp01/cs_hszhao/cs002u03/dataset/IN200_S/val'

num_images_per_class = 100
num_classes = 200

os.makedirs(train_dest_dir, exist_ok=True)

# All classes are the sub-folders of the train root. os.listdir() returns
# entries in arbitrary order, so sort them: random.sample() with a fixed seed is
# only reproducible when its input order is fixed too.
all_classes = sorted(
    d for d in os.listdir(source_dir) if os.path.isdir(os.path.join(source_dir, d))
)

# Randomly pick the classes of the subset.
selected_classes = random.sample(all_classes, num_classes)

for class_name in selected_classes:
    # All image files of this class, sorted for the same reproducibility reason.
    class_dir = os.path.join(source_dir, class_name)
    all_images = sorted(
        f for f in os.listdir(class_dir) if os.path.isfile(os.path.join(class_dir, f))
    )

    # Randomly pick the training images; random.sample raises ValueError if the
    # class holds fewer than num_images_per_class files.
    selected_images = random.sample(all_images, num_images_per_class)

    # Create the per-class destination folders.
    train_class_dir = os.path.join(train_dest_dir, class_name)
    val_class_dir = os.path.join(val_dest_dir, class_name)
    os.makedirs(train_class_dir, exist_ok=True)
    os.makedirs(val_class_dir, exist_ok=True)

    # Copy (not move) the sampled training images.
    for image_name in selected_images:
        source_image_path = os.path.join(class_dir, image_name)
        target_image_path = os.path.join(train_class_dir, image_name)
        shutil.copy(source_image_path, target_image_path)

    # Copy every validation image of the class.
    source_val_class_dir = os.path.join(source_val_dir, class_name)
    for image_name in os.listdir(source_val_class_dir):
        source_image_path = os.path.join(source_val_class_dir, image_name)
        target_image_path = os.path.join(val_class_dir, image_name)
        shutil.copy(source_image_path, target_image_path)

    print(f'Copied {num_images_per_class} images for class {class_name} to {train_class_dir}')

# Report the destination root (the original message printed the source root here).
print(f'Successfully copied {num_classes} classes with {num_images_per_class} images each to {train_dest_dir}.')

IN100-S

生成train_100.json

# WordNet ids of the 100 ImageNet classes, position-aligned with IN100_words below.
wnids_IN100 = ['n01498041', 'n01514859', 'n01582220', 'n01608432', 'n01616318',
        'n01443537', 'n01776313', 'n01806567', 'n01833805', 'n01882714',
        'n01910747', 'n01944390', 'n01985128', 'n02007558', 'n02071294',
        'n02085620', 'n02114855', 'n02123045', 'n02128385', 'n02129165',
        'n02129604', 'n02165456', 'n02190166', 'n02219486', 'n02226429',
        'n02279972', 'n02317335', 'n02326432', 'n02342885', 'n02363005',
        'n02391049', 'n02395406', 'n02403003', 'n02422699', 'n02442845',
        'n02444819', 'n02480855', 'n02510455', 'n02640242', 'n02672831',
        'n02687172', 'n02701002', 'n02730930', 'n02769748', 'n02782093',
        'n02787622', 'n02793495', 'n02799071', 'n02802426', 'n02814860',
        'n02840245', 'n02906734', 'n02948072', 'n02980441', 'n02999410',
        'n03014705', 'n03028079', 'n03032252', 'n03125729', 'n03160309',
        'n03179701', 'n03220513', 'n03249569', 'n03291819', 'n03384352',
        'n03388043', 'n03450230', 'n03481172', 'n03594734', 'n03594945',
        'n03627232', 'n03642806', 'n03649909', 'n03661043', 'n03676483',
        'n03724870', 'n03733281', 'n03759954', 'n03761084', 'n03773504',
        'n03804744', 'n03916031', 'n03938244', 'n04004767', 'n04026417',
        'n04090263', 'n04133789', 'n04153751', 'n04296562', 'n04330267',
        'n04371774', 'n04404412', 'n04465501', 'n04485082', 'n04507155',
        'n04536866', 'n04579432', 'n04606251', 'n07714990', 'n07745940']
# Short class words, one per wnid above (same index).
IN100_words = ['stingray', 'hen', 'magpie', 'kite', 'vulture',
        'goldfish',   'tick', 'quail', 'hummingbird', 'koala',
        'jellyfish', 'snail', 'crawfish', 'flamingo', 'orca',
        'chihuahua', 'coyote', 'tabby', 'leopard', 'lion',
        'tiger','ladybug', 'fly' , 'ant', 'grasshopper',
        'monarch', 'starfish', 'hare', 'hamster', 'beaver',
        'zebra', 'pig', 'ox', 'impala',  'mink',
        'otter', 'gorilla', 'panda', 'sturgeon', 'accordion',
        'carrier', 'ambulance', 'apron', 'backpack', 'balloon',
        'banjo','barn','baseball', 'basketball', 'beacon',
        'binder', 'broom', 'candle', 'castle', 'chain',
        'chest', 'church', 'cinema', 'cradle', 'dam',
        'desk', 'dome', 'drum','envelope', 'forklift',
        'fountain', 'gown', 'hammer','jean', 'jeep',
        'knot', 'laptop', 'mower', 'library','lipstick',
        'mask', 'maze', 'microphone','microwave','missile',
        'nail', 'perfume','pillow','printer','purse',
        'rifle', 'sandal', 'screw','stage','stove',
        'swing','television','tractor','tripod','umbrella',
        'violin','whistle','wreck', 'broccoli', 'strawberry'
        ]

# Sort the (wnid, word) pairs once; both names are rebound to the sorted, still-aligned tuples.
_sorted_pairs = sorted(zip(wnids_IN100, IN100_words))
wnids_IN100, IN100_words = zip(*_sorted_pairs)

# categories holds (class index, wnid, word), with the index following the sorted wnid order.
categories = [
    (class_index, wnid_key, class_word)
    for class_index, (wnid_key, class_word) in enumerate(_sorted_pairs)
]

print(categories[:5])

# Generate train_100.json: one record per selected training image.
root_dir = "/mnt/petrelfs/share/imagenet/images/train"
import os
import json

# Number of images taken from each class.
images_per_class = 200

data = []
for index, wnid, word in categories:
    category_dir = os.path.join(root_dir, wnid)
    # os.listdir() returns entries in arbitrary order; sort first so that the
    # "first 200" images are the same on every run and every filesystem.
    images = sorted(os.listdir(category_dir))[:images_per_class]
    for image in images:
        data.append({
            "image_file": os.path.join(wnid, image),  # path relative to root_dir
            "label": index,
            "word": word
        })

with open("/mnt/petrelfs/yangshuo/IP-Adapter-main/data/ImageNet-1K/train_100.json", "w") as f:
    json.dump(data, f)

生成train文件夹，将train_100.json中的图片复制到train文件夹中

import json
import shutil
import os

# Source root of the images and destination root of the IN100 training subset.
root = "/grp01/cs_hszhao/cs002u03/dataset/IN100/train"
train_out_root = '/grp01/cs_hszhao/cs002u03/dataset/IN100_sub/train/'

# Load the image list; the with-block closes the file handle, which a bare
# json.load(open(...)) would leave open.
with open('/grp01/cs_hszhao/cs002u03/dataset/IN100_sub/train_100.json') as f:
    json_file = json.load(f)
# print(json_file[:5])

for item in json_file:
    # 'image_file' is "<category>/<file name>"; the first component is the class folder.
    category = item['image_file'].split('/')[0]
    image_path = os.path.join(root, item['image_file'])
    category_dir = os.path.join(train_out_root, category)
    os.makedirs(category_dir, exist_ok=True)
    shutil.copy(image_path, category_dir)

print('done')

生成val文件夹，并将图片复制到val文件夹中

import os
import shutil

# WordNet ids of the 100 classes whose validation images are copied by the loop below.
wnids = ['n01498041', 'n01514859', 'n01582220', 'n01608432', 'n01616318',
        'n01443537', 'n01776313', 'n01806567', 'n01833805', 'n01882714',
        'n01910747', 'n01944390', 'n01985128', 'n02007558', 'n02071294',
        'n02085620', 'n02114855', 'n02123045', 'n02128385', 'n02129165',
        'n02129604', 'n02165456', 'n02190166', 'n02219486', 'n02226429',
        'n02279972', 'n02317335', 'n02326432', 'n02342885', 'n02363005',
        'n02391049', 'n02395406', 'n02403003', 'n02422699', 'n02442845',
        'n02444819', 'n02480855', 'n02510455', 'n02640242', 'n02672831',
        'n02687172', 'n02701002', 'n02730930', 'n02769748', 'n02782093',
        'n02787622', 'n02793495', 'n02799071', 'n02802426', 'n02814860',
        'n02840245', 'n02906734', 'n02948072', 'n02980441', 'n02999410',
        'n03014705', 'n03028079', 'n03032252', 'n03125729', 'n03160309',
        'n03179701', 'n03220513', 'n03249569', 'n03291819', 'n03384352',
        'n03388043', 'n03450230', 'n03481172', 'n03594734', 'n03594945',
        'n03627232', 'n03642806', 'n03649909', 'n03661043', 'n03676483',
        'n03724870', 'n03733281', 'n03759954', 'n03761084', 'n03773504',
        'n03804744', 'n03916031', 'n03938244', 'n04004767', 'n04026417',
        'n04090263', 'n04133789', 'n04153751', 'n04296562', 'n04330267',
        'n04371774', 'n04404412', 'n04465501', 'n04485082', 'n04507155',
        'n04536866', 'n04579432', 'n04606251', 'n07714990', 'n07745940']

# Source validation root (one sub-folder per wnid) and destination root.
root = "/grp01/cs_hszhao/cs002u03/dataset/ImageNet-1k/val_1k"
output_path = "/grp01/cs_hszhao/cs002u03/dataset/IN100/val"

for wnid in wnids:
    class_src_dir = os.path.join(root, wnid)
    output_dir = os.path.join(output_path, wnid)
    # Create the class folder only when it is not there yet.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir, exist_ok=True)
    # Copy every validation image of this class.
    for filename in os.listdir(class_src_dir):
        shutil.copy(os.path.join(class_src_dir, filename), os.path.join(output_dir, filename))

MedMNIST

import os
import numpy as np
from PIL import Image

# 1. Choose the archive to export. The archive file name and the output folder
#    are both derived from this single name so they cannot drift apart.
#    Other names used with this script: "breastmnist_224", "organsmnist_224".
dataset_name = "pathmnist_224"

# 2. Extract the train / val / test arrays. The with-block closes the npz
#    archive once the arrays have been read into memory.
# NOTE: the train arrays are loaded even though the train split is disabled in step 4.
with np.load(f"{dataset_name}.npz") as data:
    train_images, train_labels = data['train_images'], data['train_labels']
    val_images, val_labels = data['val_images'], data['val_labels']
    test_images, test_labels = data['test_images'], data['test_labels']

# 3. Root folder for the exported images.
output_dir = f'MedMnist/{dataset_name}'
os.makedirs(output_dir, exist_ok=True)

# 4. Splits to export, keyed by the sub-folder name they are written to.
data_splits = {
    # 'train': (train_images, train_labels),
    'val': (val_images, val_labels),
    'test': (test_images, test_labels)
}

# 5. Save every image into <output_dir>/<split>/<label>/<index>.png
for split_name, (images, labels) in data_splits.items():
    # Root folder of this split (e.g. val/, test/).
    split_dir = os.path.join(output_dir, split_name)
    os.makedirs(split_dir, exist_ok=True)

    for idx, (image, label) in enumerate(zip(images, labels)):
        # Take the first element explicitly: calling int() on a 1-element array
        # (ndim > 0) is deprecated since NumPy 1.25.
        # assumes each label holds a single class id — TODO confirm for multi-label sets
        class_id = int(np.ravel(label)[0])

        # One sub-folder per label (e.g. val/0/, val/1/).
        label_dir = os.path.join(split_dir, str(class_id))
        os.makedirs(label_dir, exist_ok=True)

        # Target path, e.g. val/0/00001.png (idx is the position inside the split).
        image_filename = os.path.join(label_dir, f"{idx:05d}.png")

        # Image.fromarray is given uint8 data; cast when the array has another dtype.
        if image.dtype != np.uint8:
            image = image.astype(np.uint8)

        # Convert the array to a PIL image and write it out.
        pil_image = Image.fromarray(image)
        pil_image.save(image_filename)

    print(f"{split_name} images saved to {split_dir}")

print("All images have been saved successfully!")