'''Creates a constant tensor from a tensor-like object.'''
features = tf.constant([[1,3],[2,1],[3,3]])
features
<tf.Tensor: shape=(3,2), dtype=int32, numpy=
array([[1,3],[2,1],[3,3]], dtype=int32)>

'''If `shape` is set, the `value` is reshaped to match. Scalars are expanded to fill the `shape`:'''
a = tf.constant(0, shape=(2,3))
a
<tf.Tensor: shape=(2,3), dtype=int32, numpy=
array([[0,0,0],[0,0,0]], dtype=int32)>

'''Passing a flat (one-dimensional) list of values together with `shape` is equivalent to a reshape:'''
b = tf.constant([1,2,3,4,5,6], shape=[2,3])
b
<tf.Tensor: shape=(2,3), dtype=int32, numpy=
array([[1,2,3],[4,5,6]], dtype=int32)>
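
As a small addition (a sketch, not from the original post; it assumes TF 2.x with numpy imported), `tf.constant` also accepts a numpy array and can cast and reshape it in one call via `dtype` and `shape`:
import numpy as np
import tensorflow as tf

# cast float64 -> float32 and reshape the 6 values into (2, 3)
v = tf.constant(np.arange(6.0), dtype=tf.float32, shape=[2, 3])
v
<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[0., 1., 2.],[3., 4., 5.]], dtype=float32)>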
2. tf.data.Dataset.from_tensor_slices
'''Creates a `Dataset` whose elements are slices of the given tensors.'''
# If from_tensor_slices is given a tuple, each element of the resulting dataset is also a tuple
test_2 = tf.data.Dataset.from_tensor_slices((data_add[['col_1','col_2']][:13].values, data_add[['label']][:13].values))
test_2
<TensorSliceDataset shapes: ((2,), (1,)), types: (tf.int64, tf.int64)>

# If it is given a single array, each element is a single tensor as well
tf.data.Dataset.from_tensor_slices(data_add[['label']][:13].values)
<TensorSliceDataset shapes: (1,), types: tf.int64>
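
`data_add` is a pandas DataFrame defined earlier in the post. For a self-contained version, here is a minimal sketch with a small made-up DataFrame (`df` and its values are assumptions, not the original data) that reproduces the tuple behaviour:
import pandas as pd
import tensorflow as tf

# hypothetical stand-in for data_add
df = pd.DataFrame({'col_1': [3, 2, 1], 'col_2': [1, 1, 0], 'label': [1, 0, 1]})
ds = tf.data.Dataset.from_tensor_slices((df[['col_1', 'col_2']].values, df[['label']].values))
ds
<TensorSliceDataset shapes: ((2,), (1,)), types: (tf.int64, tf.int64)>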
3. TensorSliceDataset.as_numpy_iterator
'''Returns an iterator which converts all elements of the dataset to numpy.'''
# Continuing from the previous section
# Without as_numpy_iterator, iterating shows each element's shape, dtype and value
for ele in test_2:
    ele
    type(ele)
    type(ele[0])
(<tf.Tensor: shape=(2,), dtype=int64, numpy=array([3, 1])>, <tf.Tensor: shape=(1,), dtype=int64, numpy=array([1])>)
tuple
tensorflow.python.framework.ops.EagerTensor
# With as_numpy_iterator, the dataset elements are printed directly as numpy values
for ele in test_2.as_numpy_iterator():
    ele
    type(ele)
    type(ele[0])
(array([3, 1]), array([1]))
tuple
numpy.ndarray
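
As a quick self-contained sketch (using a range dataset rather than `test_2`, and the same `tf` import as above), `as_numpy_iterator` also makes it easy to materialise a small dataset into a plain Python list:
ds = tf.data.Dataset.range(3)
list(ds.as_numpy_iterator())
[0, 1, 2]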
4. TensorSliceDataset.shuffle
'''Randomly shuffles the elements of this dataset.'''
# Continuing from section 2
# For a perfect shuffle, buffer_size must be greater than or equal to the full size of the dataset.
test_2 = test_2.shuffle(buffer_size=16, seed=2021)
for ele in test_2.as_numpy_iterator():
    ele
    type(ele)
    type(ele[0])
(array([0, 0]), array([1]))
tuple
numpy.ndarray
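
One related knob worth noting (a sketch under TF 2.x defaults, not from the original post): `shuffle` reshuffles on every pass over the data by default; passing `reshuffle_each_iteration=False` together with a seed fixes the order across iterations:
ds = tf.data.Dataset.range(5).shuffle(buffer_size=5, seed=2021, reshuffle_each_iteration=False)
first_pass = list(ds.as_numpy_iterator())
second_pass = list(ds.as_numpy_iterator())
# the two passes produce the same order because reshuffling was disabled
first_pass == second_pass
True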
5. TensorSliceDataset.batch
'''Combines consecutive elements of this dataset into batches.'''
'''drop_remainder: whether the last batch should be dropped if it has fewer than `batch_size` elements; if True, it is dropped.'''
# Continuing from section 4; the data comes out in a new order because shuffle reshuffles on each iteration by default before batching
for ele in test_2.batch(3, drop_remainder=False):
    ele
(<tf.Tensor: shape=(3,2), dtype=int64, numpy=
array([[1,0],[1,1],[2,1]])>, <tf.Tensor: shape=(3,1), dtype=int64, numpy=
array([[1],[1],[1]])>)
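
To isolate what `drop_remainder` does, a self-contained sketch on a range dataset (not part of the original post):
ds = tf.data.Dataset.range(10)
# 10 elements in batches of 3: the last, incomplete batch of size 1 is kept ...
list(ds.batch(3, drop_remainder=False).as_numpy_iterator())
[array([0, 1, 2]), array([3, 4, 5]), array([6, 7, 8]), array([9])]
# ... or dropped
list(ds.batch(3, drop_remainder=True).as_numpy_iterator())
[array([0, 1, 2]), array([3, 4, 5]), array([6, 7, 8])]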
6. TensorSliceDataset.take / .skip / .shard
'''Creates a `Dataset` with at most `count` elements from this dataset.'''
# Take the first k elements of an existing dataset
dataset = tf.data.Dataset.range(10)
dataset = dataset.take(3)
list(dataset.as_numpy_iterator())
[0, 1, 2]

'''Creates a `Dataset` that skips `count` elements from this dataset.'''
# Skip the first k elements of an existing dataset and keep everything after them
dataset = tf.data.Dataset.range(10)
dataset = dataset.skip(7)
list(dataset.as_numpy_iterator())
[7, 8, 9]

'''Creates a `Dataset` that includes only 1/`num_shards` of this dataset.'''
# Keep the elements of the existing dataset whose position satisfies position mod num_shards == index
# e.g. for [1, 10, 2, 7, 6, 8, 9], the positions with position mod 3 == 1 are 1 and 4, whose values are 10 and 6
A = tf.data.Dataset.from_tensor_slices([1,10,2,7,6,8,9])
C = A.shard(num_shards=3, index=1)
list(C.as_numpy_iterator())
[10, 6]
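
Putting the three together, a common pattern (a sketch, not from the original post) is a quick train/validation split with `take`/`skip`, while `shard` splits the same dataset across workers:
ds = tf.data.Dataset.range(10)
train = ds.take(8)   # first 8 elements
val = ds.skip(8)     # remaining 2 elements
list(train.as_numpy_iterator())
[0, 1, 2, 3, 4, 5, 6, 7]
list(val.as_numpy_iterator())
[8, 9]
# with 2 workers, worker 0 gets positions 0, 2, 4, ... and worker 1 gets 1, 3, 5, ...
list(ds.shard(num_shards=2, index=0).as_numpy_iterator())
[0, 2, 4, 6, 8]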