How to transform our data into TensorFlow TFRecord?
- Transform into TFRecord
- Read and decode
Next, we treat the notMNIST dataset as our own dataset and generate a TFRecord file from it.
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
import skimage.io as io
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# %%
def get_file(file_dir):
    '''Get the full path of every image and its corresponding label.
    Args:
        file_dir: file directory
    Returns:
        images: list of image paths, string
        labels: list of labels, int
    '''
    # paths of all images, e.g. ['./notMNIST_small/A/MDEtMDEtMDAudHRm.png', ...,
    # './notMNIST_small/G/R2FyYW1vbmRQcmVtclByby1NZWRJdERpc3Aub3Rm.png']
    images = []
    # paths of the sub-folders, e.g. ['./notMNIST_small/A', ..., './notMNIST_small/J']
    temp = []
    for root, sub_folders, files in os.walk(file_dir):
        # image paths
        for name in files:
            images.append(os.path.join(root, name))
        # the 10 sub-folder paths
        for name in sub_folders:
            temp.append(os.path.join(root, name))

    # assign 10 labels based on the folder names
    labels = []
    # loop over every sub-folder of the dataset
    for one_folder in temp:
        # number of images in the sub-folder; os.listdir() returns the names of
        # the files/folders contained in the given directory
        n_img = len(os.listdir(one_folder))
        # split the sub-folder path (e.g. './notMNIST_small/A') by '/';
        # the last element is the class letter, one of A..J
        letter = one_folder.split('/')[-1]
        # assign one of the 10 labels according to the folder name
        if letter == 'A':
            labels = np.append(labels, n_img * [1])
        elif letter == 'B':
            labels = np.append(labels, n_img * [2])
        elif letter == 'C':
            labels = np.append(labels, n_img * [3])
        elif letter == 'D':
            labels = np.append(labels, n_img * [4])
        elif letter == 'E':
            labels = np.append(labels, n_img * [5])
        elif letter == 'F':
            labels = np.append(labels, n_img * [6])
        elif letter == 'G':
            labels = np.append(labels, n_img * [7])
        elif letter == 'H':
            labels = np.append(labels, n_img * [8])
        elif letter == 'I':
            labels = np.append(labels, n_img * [9])
        else:
            labels = np.append(labels, n_img * [10])

    # shuffle
    # stack into [[path_1, ..., path_n], [1, 1, ..., 10, 10]]
    temp = np.array([images, labels])
    # transpose into [[path_1, 1], ..., [path_n, 10]]
    temp = temp.transpose()
    # shuffle the (path, label) pairs
    np.random.shuffle(temp)
    image_list = list(temp[:, 0])   # column 0: image paths
    label_list = list(temp[:, 1])   # column 1: labels
    label_list = [int(float(i)) for i in label_list]

    return image_list, label_list
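# %%
# Illustrative only: the if/elif chain above just maps the folder letter to a
# 1-based class index (A -> 1, ..., J -> 10), which is the same as
# ord(letter) - ord('A') + 1. A minimal check of that equivalence:
for idx, letter in enumerate('ABCDEFGHIJ'):
    assert ord(letter) - ord('A') + 1 == idx + 1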
# %%
# Both the image and its label are stored as features in the Example proto;
# because their types differ, the label is written as an int64 feature and the
# raw image bytes as a bytes feature.

# wrapper for an int64 feature
def int64_feature(value):
    """Wrapper for inserting int64 features into Example proto."""
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

# wrapper for a bytes (string) feature
def bytes_feature(value):
    """Wrapper for inserting bytes features into Example proto."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
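# %%
# Quick sanity check of the two wrappers (the byte string below is just a
# placeholder); each call returns a tf.train.Feature proto, shown in protobuf
# text format when printed.
print(int64_feature(3))        # prints something like: int64_list { value: 3 }
print(bytes_feature(b'\x00'))  # prints something like: bytes_list { value: "\000" }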
# %%
def convert_to_tfrecord(images, labels, save_dir, name):
    '''Convert all images and labels to one tfrecord file.
    Args:
        images: list of image paths, string type
        labels: list of labels, int type
        save_dir: the directory to save the tfrecord file, e.g.: '/home/folder1/'
        name: the name of the tfrecord file, string type, e.g.: 'train'
    Return:
        no return
    Note:
        converting needs some time, be patient...
    '''
    # path of the generated .tfrecords file
    filename = os.path.join(save_dir, name + '.tfrecords')
    n_samples = len(labels)
    if np.shape(images)[0] != n_samples:
        raise ValueError('Images size %d does not match label size %d.' %
                         (np.shape(images)[0], n_samples))

    # wait some time here; transforming takes a while depending on the size of your data
    # writer that outputs records to the file
    writer = tf.python_io.TFRecordWriter(filename)
    print('\nTransform start......')
    # loop over all images
    for i in np.arange(0, n_samples):
        try:
            # read the image; type(image) must be array!
            image = io.imread(images[i])
            # serialize the image array into a byte string
            image_raw = image.tostring()
            label = int(labels[i])
            # write each image and its label into one Example
            example = tf.train.Example(features=tf.train.Features(feature={
                'label': int64_feature(label),
                'image_raw': bytes_feature(image_raw)}))
            writer.write(example.SerializeToString())
        # if an image is corrupted, print the error message and skip it
        except IOError as e:
            print('Could not read:', images[i])
            print('error: %s' % e)
            print('Skip it!\n')
    writer.close()
    print('Transform done!')
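# %%
# Optional sanity check of a freshly written file: iterate over its serialized
# records and parse them back into tf.train.Example protos. This is only a
# verification sketch (not part of the original pipeline); pass it the path of
# the .tfrecords file generated further down in this post.
def inspect_tfrecord(tfrecords_file):
    for serialized in tf.python_io.tf_record_iterator(tfrecords_file):
        example = tf.train.Example.FromString(serialized)
        label = example.features.feature['label'].int64_list.value[0]
        raw = example.features.feature['image_raw'].bytes_list.value[0]
        # notMNIST images are 28*28 uint8, so len(raw) should be 784
        print('label: %d, image bytes: %d' % (label, len(raw)))
        break  # only look at the first record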
# %%
# read the tfrecord file and generate batches
def read_and_decode(tfrecords_file, batch_size):
    '''Read and decode a tfrecord file, generate (image, label) batches.
    Args:
        tfrecords_file: the path of the tfrecord file
        batch_size: number of images in each batch
    Returns:
        image_batch: 3D tensor - [batch_size, 28, 28] (grayscale, no channel dimension)
        label_batch: 1D tensor - [batch_size]
    '''
    # make an input queue from the tfrecord file
    filename_queue = tf.train.string_input_producer([tfrecords_file])
    # create a reader for the TFRecord file
    reader = tf.TFRecordReader()
    # read one example from the queue; read_up_to can read several examples at once
    _, serialized_example = reader.read(filename_queue)
    # parse a single example; use parse_example to parse several examples at once
    img_features = tf.parse_single_example(
        serialized_example,
        features={
            'label': tf.FixedLenFeature([], tf.int64),
            'image_raw': tf.FixedLenFeature([], tf.string),
        })
    # tf.decode_raw converts the byte string back into the pixel array
    image = tf.decode_raw(img_features['image_raw'], tf.uint8)

    ##########################################################
    # you can put data augmentation here, I didn't use it
    ##########################################################

    # all notMNIST images are 28*28; change the size if you use another dataset
    image = tf.reshape(image, [28, 28])
    label = tf.cast(img_features['label'], tf.int32)
    image_batch, label_batch = tf.train.batch([image, label],
                                              batch_size=batch_size,
                                              num_threads=64,
                                              capacity=2000)
    return image_batch, tf.reshape(label_batch, [batch_size])
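# %%
# For reference only: the same parsing can also be written with the tf.data API
# (available from TensorFlow 1.4 onwards). This sketch is not used by the rest
# of the post, which sticks to the queue-based read_and_decode above.
def read_with_dataset(tfrecords_file, batch_size):
    def _parse(serialized):
        features = tf.parse_single_example(
            serialized,
            features={
                'label': tf.FixedLenFeature([], tf.int64),
                'image_raw': tf.FixedLenFeature([], tf.string),
            })
        image = tf.reshape(tf.decode_raw(features['image_raw'], tf.uint8), [28, 28])
        label = tf.cast(features['label'], tf.int32)
        return image, label

    dataset = tf.data.TFRecordDataset([tfrecords_file])
    dataset = dataset.map(_parse).shuffle(buffer_size=2000).batch(batch_size).repeat()
    return dataset.make_one_shot_iterator().get_next()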
# %% Convert data to TFRecord
test_dir = 'F://TensorFlow--middleteach//TFRecord_notmnist//notMNIST_small//'
save_dir = 'F://TensorFlow--middleteach//TFRecord_notmnist//'
BATCH_SIZE = 25
# Convert test data: you just need to run it ONCE !
name_test = 'test'
images, labels = get_file(test_dir)
tfrecords_file_dir='F://TensorFlow--middleteach//TFRecord_notmnist//test.tfrecords'
if not os.path.exists(tfrecords_file_dir):
    convert_to_tfrecord(images, labels, save_dir, name_test)
# %% To test the test.tfrecords file

# one batch reads 25 images and displays them as a 5x5 grid
def plot_images(images, labels):
    '''Plot one batch of images with their labels as titles.'''
    for i in np.arange(0, BATCH_SIZE):
        plt.subplot(5, 5, i + 1)
        # hide the axes
        plt.axis('off')
        # ord() returns the ASCII code of a character and chr() returns the
        # character for a given ASCII code, e.g. ord('A') == 65 and chr(65) == 'A';
        # label 1 therefore maps back to 'A', label 2 to 'B', and so on
        plt.title(chr(ord('A') + labels[i] - 1), fontsize=14)
        # plt.subplots_adjust(top=0.5)
        plt.imshow(images[i])
    plt.show()
tfrecords_file = 'F://TensorFlow--middleteach//TFRecord_notmnist//test.tfrecords'
image_batch, label_batch = read_and_decode(tfrecords_file, batch_size=BATCH_SIZE)
with tf.Session() as sess:
    i = 0
    # start the queue-runner threads that feed the input pipeline
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    try:
        while not coord.should_stop() and i < 1:
            # just plot one batch
            image, label = sess.run([image_batch, label_batch])
            plot_images(image, label)
            i += 1
    except tf.errors.OutOfRangeError:
        print('done!')
    finally:
        coord.request_stop()
    coord.join(threads)