KMNIST

原创已于 2023-07-01 15:42:14 修改 · 274 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#python

于 2023-07-01 15:29:12 首次发布

Kuzushiji-MNIST是一个28x28灰度图像的数据集，包含70,000张图片，作为MNIST数据集的替换。它选择了10个日文平假名字符来代表10个类别。文章提供了读取和保存数据集图像的代码，适用于图像处理和机器学习任务。

Kuzushiji-MNIST is a drop-in replacement for the MNIST dataset (28x28 grayscale, 70,000 images). Since MNIST restricts us to 10 classes, the authors chose one character to represent each of the 10 rows of Hiragana when creating Kuzushiji-MNIST. Kuzushiji is a Japanese cursive writing style.

下载链接：https://www.kaggle.com/datasets/anokas/kuzushiji

import numpy as np
import matplotlib.pyplot as plt
import os
def read_idx_file(filename):
    """Read an IDX image or label file into a uint8 NumPy array.

    The file starts with two big-endian 32-bit integers: a magic number and
    the item count. Magic number 2051 marks an image file, whose header
    continues with the row and column counts; magic number 2049 marks a
    label file. Everything after the header is read as unsigned bytes.

    Args:
        filename: Path of the IDX file to read.

    Returns:
        For an image file, an array of shape (num_items, num_rows, num_cols).
        For a label file, a 1-D array of length num_items. The array is a
        view over the bytes read from the file.

    Raises:
        ValueError: If the magic number is neither 2051 nor 2049, or if the
            amount of data in the file does not match the size declared in
            the header (truncated or corrupt file).
    """
    with open(filename, 'rb') as f:
        # Header: magic number, then the number of items in the file.
        magic_number = int.from_bytes(f.read(4), 'big')
        num_items = int.from_bytes(f.read(4), 'big')

        if magic_number == 2051:  # image file
            num_rows = int.from_bytes(f.read(4), 'big')
            num_cols = int.from_bytes(f.read(4), 'big')
            shape = (num_items, num_rows, num_cols)
            expected_size = num_items * num_rows * num_cols
        elif magic_number == 2049:  # label file
            shape = (num_items,)
            expected_size = num_items
        else:
            raise ValueError("Invalid file format")

        payload = f.read()

    data = np.frombuffer(payload, dtype=np.uint8)

    # Check the payload against the header so that a truncated or padded
    # file is reported clearly instead of yielding a wrong-sized label array
    # or an opaque reshape error.
    if data.size != expected_size:
        raise ValueError(
            f"{filename}: header declares {expected_size} data bytes "
            f"but the file contains {data.size}"
        )

    return data.reshape(shape)

def save_images(data, labels, save_dir):
    """Save every image as a PNG under a per-class sub-directory.

    Image number ``i`` is written to ``<save_dir>/<label>/<i>.png`` using the
    'gray' colormap, where ``<label>`` is ``str(labels[i])``.

    Args:
        data: Sequence of 2-D image arrays.
        labels: Sequence of class labels, one per image in ``data``.
        save_dir: Root output directory; created if it does not exist.

    Raises:
        ValueError: If ``data`` and ``labels`` have different lengths.
    """
    # Fail before writing anything rather than part-way through the export.
    if len(data) != len(labels):
        raise ValueError(
            f"data has {len(data)} items but labels has {len(labels)}"
        )

    os.makedirs(save_dir, exist_ok=True)

    # Always create the ten default class directories, plus one for any
    # other label that actually occurs, so no image lacks a target directory.
    class_names = {str(class_id) for class_id in range(10)}
    class_names.update(str(label) for label in labels)
    for class_name in class_names:
        os.makedirs(os.path.join(save_dir, class_name), exist_ok=True)

    for index, (image, label) in enumerate(zip(data, labels)):
        image_path = os.path.join(save_dir, str(label), f"{index}.png")

        # Without vmin/vmax, imsave stretches each image to its own min/max,
        # which alters pixel intensities. Pin the scale for 8-bit images;
        # other dtypes keep the automatic scaling.
        scale = {}
        if getattr(image, "dtype", None) == np.uint8:
            scale = {"vmin": 0, "vmax": 255}

        plt.imsave(image_path, image, cmap='gray', **scale)

# Load the training split: image file first, then its label file.
# Change the file names to absolute paths if needed.
train_images, train_labels = (
    read_idx_file(idx_name)
    for idx_name in ('train-images-idx3-ubyte', 'train-labels-idx1-ubyte')
)

# Load the test split in the same order.
test_images, test_labels = (
    read_idx_file(idx_name)
    for idx_name in ('t10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')
)

# Export the training images, then the test images, each into its own
# output directory.
save_images(data=train_images, labels=train_labels, save_dir='train_images')
save_images(data=test_images, labels=test_labels, save_dir='test_images')