Kuzushiji-MNIST is a drop-in replacement for the MNIST dataset (28x28 grayscale, 70,000 images). Since MNIST restricts us to 10 classes, the authors chose one character to represent each of the 10 rows of Hiragana when creating Kuzushiji-MNIST. Kuzushiji is a Japanese cursive writing style.
下载链接:https://www.kaggle.com/datasets/anokas/kuzushiji
https://www.kaggle.com/datasets/anokas/kuzushiji

import numpy as np
import matplotlib.pyplot as plt
import os
def read_idx_file(filename):
    """Load an MNIST-style IDX file into a NumPy ``uint8`` array.

    The header is a sequence of 4-byte big-endian integers: a magic
    number, the item count and, for image files only, the row and column
    counts. The rest of the file is the raw ``uint8`` payload.

    Args:
        filename: Path of the IDX file to read.

    Returns:
        For an image file (magic number 2051), an array of shape
        ``(num_items, num_rows, num_cols)``. For a label file (magic
        number 2049), a 1-D array of length ``num_items``.

    Raises:
        ValueError: If the magic number is neither 2051 nor 2049, or if
            the payload size does not match the sizes in the header.
    """
    with open(filename, 'rb') as f:
        # Read the file header.
        magic_number = int.from_bytes(f.read(4), 'big')
        num_items = int.from_bytes(f.read(4), 'big')
        if magic_number == 2051:  # image file
            num_rows = int.from_bytes(f.read(4), 'big')
            num_cols = int.from_bytes(f.read(4), 'big')
            shape = (num_items, num_rows, num_cols)
            expected_size = num_items * num_rows * num_cols
        elif magic_number == 2049:  # label file
            shape = (num_items,)
            expected_size = num_items
        else:
            raise ValueError("Invalid file format")
        # Read the payload.
        data = np.frombuffer(f.read(), dtype=np.uint8)

    # Reject truncated or padded files explicitly instead of failing with
    # an opaque reshape error or returning a mis-sized label array.
    if data.size != expected_size:
        raise ValueError(
            f"IDX payload size mismatch: header declares {expected_size} "
            f"bytes but file contains {data.size}"
        )
    return data.reshape(shape)
def save_images(data, labels, save_dir, num_classes=10):
    """Write every image to ``save_dir/<label>/<index>.png`` in grayscale.

    Args:
        data: Sequence of 2-D image arrays.
        labels: Sequence of class labels, one per image in ``data``.
        save_dir: Root output directory; created if it does not exist.
        num_classes: Number of class sub-directories (``0`` to
            ``num_classes - 1``) created up front, so that classes with
            no samples still get an empty folder. Defaults to 10.

    Raises:
        ValueError: If ``data`` and ``labels`` differ in length.
    """
    if len(data) != len(labels):
        raise ValueError(
            f"data and labels differ in length: {len(data)} != {len(labels)}"
        )

    # Create the output directory.
    os.makedirs(save_dir, exist_ok=True)

    # Pre-create one sub-directory per expected class.
    ready_dirs = set()
    for class_id in range(num_classes):
        class_dir = os.path.join(save_dir, str(class_id))
        os.makedirs(class_dir, exist_ok=True)
        ready_dirs.add(class_dir)

    # Save each image into the folder named after its label.
    for index, (image, label) in enumerate(zip(data, labels)):
        class_dir = os.path.join(save_dir, str(label))
        if class_dir not in ready_dirs:
            # A label outside range(num_classes) still gets a directory,
            # so imsave does not fail on a missing path.
            os.makedirs(class_dir, exist_ok=True)
            ready_dirs.add(class_dir)
        image_path = os.path.join(class_dir, f"{index}.png")
        plt.imsave(image_path, image, cmap='gray')
# Load the training split: the image array first, then its labels.
# NOTE: change these file names to absolute paths if needed.
train_images, train_labels = (
    read_idx_file(idx_name)
    for idx_name in ('train-images-idx3-ubyte', 'train-labels-idx1-ubyte')
)
# Load the test split in the same order.
test_images, test_labels = (
    read_idx_file(idx_name)
    for idx_name in ('t10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')
)
# Export the training images into per-class folders.
save_images(data=train_images, labels=train_labels, save_dir='train_images')
# Export the test images into per-class folders.
save_images(data=test_images, labels=test_labels, save_dir='test_images')
Kuzushiji-MNIST是一个28x28灰度图像的数据集,包含70,000张图片,作为MNIST数据集的替换。它选择了10个日文平假名字符来代表10个类别。文章提供了读取和保存数据集图像的代码,适用于图像处理和机器学习任务。
646

被折叠的 条评论
为什么被折叠?



