How to transform our data into TensorFlow TFRecord?
- Transform into TFRecord
- Read and decode
Next, we treat the notMNIST dataset as our own dataset and generate a TFRecord file from it.
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
import skimage.io as io
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# %%
def get_file(file_dir):
    '''Get the full path of every image and its corresponding label.
    Args:
        file_dir: file directory
    Returns:
        images: list of image paths, string
        labels: list of labels, int
    '''
    # paths of all images, e.g. ['./notMNIST_small/A/MDEtMDEtMDAudHRm.png', ...,
    # './notMNIST_small/G/R2FyYW1vbmRQcmVtclByby1NZWRJdERpc3Aub3Rm.png']
    images = []
    # paths of the sub-folders, e.g. ['./notMNIST_small/A', ..., './notMNIST_small/J']
    temp = []
    for root, sub_folders, files in os.walk(file_dir):
        # image paths
        for name in files:
            images.append(os.path.join(root, name))
        # the 10 sub-folder paths
        for name in sub_folders:
            temp.append(os.path.join(root, name))

    # assign 10 labels based on the folder names
    labels = []
    # loop over every sub-folder of the dataset
    for one_folder in temp:
        # number of images in the sub-folder; os.listdir() returns the names of
        # the files/folders contained in the given directory
        n_img = len(os.listdir(one_folder))
        # split the sub-folder path (e.g. './notMNIST_small/A') by '/';
        # the last element is the class letter, one of A..J
        letter = one_folder.split('/')[-1]
        # assign one of the 10 labels according to the folder name
        if letter == 'A':
            labels = np.append(labels, n_img * [1])
        elif letter == 'B':
            labels = np.append(labels, n_img * [2])
        elif letter == 'C':
            labels = np.append(labels, n_img * [3])
        elif letter == 'D':
            labels = np.append(labels, n_img * [4])
        elif letter == 'E':
            labels = np.append(labels, n_img * [5])
        elif letter == 'F':
            labels = np.append(labels, n_img * [6])
        elif letter == 'G':
            labels = np.append(labels, n_img * [7])
        elif letter == 'H':
            labels = np.append(labels, n_img * [8])
        elif letter == 'I':
            labels = np.append(labels, n_img * [9])
        else:
            labels = np.append(labels, n_img * [10])

    # shuffle
    # stack into [[path_1, ..., path_n], [1, 1, ..., 10, 10]]
    temp = np.array([images, labels])
    # transpose into [[path_1, 1], ..., [path_n, 10]]
    temp = temp.transpose()
    # shuffle the (path, label) pairs
    np.random.shuffle(temp)
    image_list = list(temp[:, 0])   # column 0: image paths
    label_list = list(temp[:, 1])   # column 1: labels
    label_list = [int(float(i)) for i in label_list]

    return image_list, label_list
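# %%
# Illustrative only: the if/elif chain above just maps the folder letter to a
# 1-based class index (A -> 1, ..., J -> 10), which is the same as
# ord(letter) - ord('A') + 1. A minimal check of that equivalence:
for idx, letter in enumerate('ABCDEFGHIJ'):
    assert ord(letter) - ord('A') + 1 == idx + 1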
# %%
# Both the image and its label are stored as features in the Example proto;
# because their types differ, the label is written as an int64 feature and the
# raw image bytes as a bytes feature.

# wrapper for an int64 feature
def int64_feature(value):
    """Wrapper for inserting int64 features into Example proto."""
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

# wrapper for a bytes (string) feature
def bytes_feature(value):
    """Wrapper for inserting bytes features into Example proto."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
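# %%
# Quick sanity check of the two wrappers (the byte string below is just a
# placeholder); each call returns a tf.train.Feature proto, shown in protobuf
# text format when printed.
print(int64_feature(3))        # prints something like: int64_list { value: 3 }
print(bytes_feature(b'\x00'))  # prints something like: bytes_list { value: "\000" }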
# %%
def convert_to_tfrecord(images, labels, save_dir, name):
    '''Convert all images and labels to one tfrecord file.
    Args:
        images: list of image paths, string type
        labels: list of labels, int type
        save_dir: the directory to save the tfrecord file, e.g.: '/home/folder1/'
        name: the name of the tfrecord file, string type, e.g.: 'train'
    Return:
        no return
    Note:
        converting needs some time, be patient...
    '''
    # path of the generated .tfrecords file
    filename = os.path.join(save_dir, name + '.tfrecords')
    n_samples = len(labels)
    if np.shape(images)[0] != n_samples:
        raise ValueError('Images size %d does not match label size %d.' %
                         (np.shape(images)[0], n_samples))

    # wait some time here; transforming takes a while depending on the size of your data
    # writer that outputs records to the file
    writer = tf.python_io.TFRecordWriter(filename)
    print('\nTransform start......')
    # loop over all images
    for i in np.arange(0, n_samples):
        try:
            # read the image; type(image) must be array!
            image = io.imread(images[i])
            # serialize the image array into a byte string
            image_raw = image.tostring()
            label = int(labels[i])
            # write each image and its label into one Example
            example = tf.train.Example(features=tf.train.Features(feature={
                'label': int64_feature(label),
                'image_raw': bytes_feature(image_raw)}))
            writer.write(example.SerializeToString())
        # if an image is corrupted, print the error message and skip it
        except IOError as e:
            print('Could not read:', images[i])
            print('error: %s' % e)
            print('Skip it!\n')
    writer.close()
    print('Transform done!')
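# %%
# Optional sanity check of a freshly written file: iterate over its serialized
# records and parse them back into tf.train.Example protos. This is only a
# verification sketch (not part of the original pipeline); pass it the path of
# the .tfrecords file generated further down in this post.
def inspect_tfrecord(tfrecords_file):
    for serialized in tf.python_io.tf_record_iterator(tfrecords_file):
        example = tf.train.Example.FromString(serialized)
        label = example.features.feature['label'].int64_list.value[0]
        raw = example.features.feature['image_raw'].bytes_list.value[0]
        # notMNIST images are 28*28 uint8, so len(raw) should be 784
        print('label: %d, image bytes: %d' % (label, len(raw)))
        break  # only look at the first record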
# %%
# read the tfrecord file and generate batches
def read_and_decode(tfrecords_file, batch_size):
    '''Read and decode a tfrecord file, generate (image, label) batches.
    Args:
        tfrecords_file: the path of the tfrecord file
        batch_size: number of images in each batch
    Returns:
        image_batch: 3D tensor - [batch_size, 28, 28] (grayscale, no channel dimension)
        label_batch: 1D tensor - [batch_size]
    '''
    # make an input queue from the tfrecord file
    filename_queue = tf.train.string_input_producer([tfrecords_file])
    # create a reader for the TFRecord file
    reader = tf.TFRecordReader()
    # read one example from the queue; read_up_to can read several examples at once
    _, serialized_example = reader.read(filename_queue)
    # parse a single example; use parse_example to parse several examples at once
    img_features = tf.parse_single_example(
        serialized_example,
        features={
            'label': tf.FixedLenFeature([], tf.int64),
            'image_raw': tf.FixedLenFeature([], tf.string),
        })
    # tf.decode_raw converts the byte string back into the pixel array
    image = tf.decode_raw(img_features['image_raw'], tf.uint8)

    ##########################################################
    # you can put data augmentation here, I didn't use it
    ##########################################################

    # all notMNIST images are 28*28; change the size if you use another dataset
    image = tf.reshape(image, [28, 28])
    label = tf.cast(img_features['label'], tf.int32)
    image_batch, label_batch = tf.train.batch([image, label],
                                              batch_size=batch_size,
                                              num_threads=64,
                                              capacity=2000)
    return image_batch, tf.reshape(label_batch, [batch_size])
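# %%
# For reference only: the same parsing can also be written with the tf.data API
# (available from TensorFlow 1.4 onwards). This sketch is not used by the rest
# of the post, which sticks to the queue-based read_and_decode above.
def read_with_dataset(tfrecords_file, batch_size):
    def _parse(serialized):
        features = tf.parse_single_example(
            serialized,
            features={
                'label': tf.FixedLenFeature([], tf.int64),
                'image_raw': tf.FixedLenFeature([], tf.string),
            })
        image = tf.reshape(tf.decode_raw(features['image_raw'], tf.uint8), [28, 28])
        label = tf.cast(features['label'], tf.int32)
        return image, label

    dataset = tf.data.TFRecordDataset([tfrecords_file])
    dataset = dataset.map(_parse).shuffle(buffer_size=2000).batch(batch_size).repeat()
    return dataset.make_one_shot_iterator().get_next()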
# %% Convert data to TFRecord
test_dir = 'F://TensorFlow--middleteach//TFRecord_notmnist//notMNIST_small//'
save_dir = 'F://TensorFlow--middleteach//TFRecord_notmnist//'
BATCH_SIZE = 25
# Convert test data: you just need to run it ONCE !
name_test = 'test'
images, labels = get_file(test_dir)
tfrecords_file_dir='F://TensorFlow--middleteach//TFRecord_notmnist//test.tfrecords'
if not os.path.exists(tfrecords_file_dir):
    convert_to_tfrecord(images, labels, save_dir, name_test)
# %% To test the test.tfrecords file

# one batch reads 25 images and displays them as a 5x5 grid
def plot_images(images, labels):
    '''Plot one batch of images with their labels as titles.'''
    for i in np.arange(0, BATCH_SIZE):
        plt.subplot(5, 5, i + 1)
        # hide the axes
        plt.axis('off')
        # ord() returns the ASCII code of a character and chr() returns the
        # character for a given ASCII code, e.g. ord('A') == 65 and chr(65) == 'A';
        # label 1 therefore maps back to 'A', label 2 to 'B', and so on
        plt.title(chr(ord('A') + labels[i] - 1), fontsize=14)
        # plt.subplots_adjust(top=0.5)
        plt.imshow(images[i])
    plt.show()
tfrecords_file = 'F://TensorFlow--middleteach//TFRecord_notmnist//test.tfrecords'
image_batch, label_batch = read_and_decode(tfrecords_file, batch_size=BATCH_SIZE)
with tf.Session() as sess:
    i = 0
    # start the queue-runner threads that feed the input pipeline
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    try:
        while not coord.should_stop() and i < 1:
            # just plot one batch
            image, label = sess.run([image_batch, label_batch])
            plot_images(image, label)
            i += 1
    except tf.errors.OutOfRangeError:
        print('done!')
    finally:
        coord.request_stop()
    coord.join(threads)