TensorFlow-Keras in Practice (7): tf.data.Dataset

This post walks through a hands-on example of dataset handling and model training with TensorFlow and Keras, covering data generation, preprocessing, model building, and evaluation.

Contents:

1. Basic APIs
2. tf_data_generate_csv: generating CSV files and training a Keras model on them

1. Basic APIs

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import os
import sklearn
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl,np,pd,sklearn,tf,keras:
    print(module.__name__, module.__version__)


dataset = tf.data.Dataset.from_tensor_slices(np.arange(10)) # build a dataset from in-memory data: a list, a NumPy array, or even a dict
print(dataset)
for item in dataset:
    print(item)



# 1. repeat: go over the data multiple times (one pass per epoch)
# 2. batch: group elements into batches
dataset = dataset.repeat(3).batch(7)
for item in dataset:
    print(item)
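
Ten values repeated three times give 30 elements, which do not divide evenly into batches of 7, so the last batch holds only 2 elements. A minimal sketch, not in the original, of forcing uniform batch shapes with the drop_remainder argument:

dataset_full = tf.data.Dataset.from_tensor_slices(np.arange(10)).repeat(3).batch(7, drop_remainder=True)
for item in dataset_full:
    print(item)  # four batches of exactly 7 elements; the trailing 2 are discarded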



# interleave: map each dataset element to a new dataset and interleave their contents
# typical case: a dataset of filenames -> the records inside those files

dataset2 = dataset.interleave(
    lambda v: tf.data.Dataset.from_tensor_slices(v), # map_fn
    cycle_length = 5, # how many input elements are processed concurrently
    block_length = 5, # how many items to take from each element before cycling to the next
)
for item in dataset2:
    print(item)  # the batched elements above, re-emitted as interleaved blocks of 5
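
A toy example, not in the original, that makes the cycle_length / block_length pattern easier to see: two source elements are consumed at a time, one item from each, before moving on to the next element.

toy = tf.data.Dataset.from_tensor_slices([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
interleaved = toy.interleave(
    lambda v: tf.data.Dataset.from_tensor_slices(v),
    cycle_length = 2,  # read from 2 source elements concurrently
    block_length = 1)  # emit 1 item from each before cycling to the next
print([int(x) for x in interleaved])  # [1, 2, 1, 2, 1, 2, 3, 3, 3]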




x = np.array([[1,2],[3,4],[5,6]])
y = np.array(['cat','dog','fox'])
dataset3 = tf.data.Dataset.from_tensor_slices((x,y))
print(dataset3)

for item_x,item_y in dataset3:
    #print(item_x,item_y)
    print(item_x.numpy(), item_y.numpy())  # .numpy() extracts the concrete value from the tensor



dataset4 = tf.data.Dataset.from_tensor_slices({"feature":x,"label":y})

for item in dataset4:
    #print(item)
    print(item["feature"].numpy(),item["label"].numpy())

2. tf_data_generate_csv: generating CSV files and training a Keras model on them

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import os
import sklearn
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl,np,pd,sklearn,tf,keras:
    print(module.__name__, module.__version__)


from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()



from sklearn.model_selection import train_test_split
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state = 7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state = 11)
print(x_valid.shape, y_valid.shape)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)


from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
# fit the scaler on the training set only, then reuse its statistics
# for the validation and test sets to avoid data leakage
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)




output_dir = "generate_csv"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    
def save_to_csv(output_dir,data,name_prefix,header=None,n_parts=10):
    path_format = os.path.join(output_dir,"{}_{:02d}.csv")
    filenames = []
    
    for file_idx,row_indices in enumerate(np.array_split(np.arange(len(data)),n_parts)):
        part_csv = path_format.format(name_prefix,file_idx)
        filenames.append(part_csv)
        with open(part_csv,"wt",encoding="utf-8") as f:
            if header is not None:
                f.write(header+"\n")
            for row_index in row_indices:
                f.write(",".join([repr(col) for col in data[row_index]]))
                f.write('\n')
    return filenames

train_data = np.c_[x_train_scaled,y_train]
valid_data = np.c_[x_valid_scaled,y_valid]
test_data = np.c_[x_test_scaled,y_test]
header_cols = housing.feature_names + ["MedianHouseValue"]
header_str = ",".join(header_cols)

train_filenames = save_to_csv(output_dir,train_data,"train",header_str,n_parts=20)
valid_filenames = save_to_csv(output_dir,valid_data,"valid",header_str,n_parts=10)
test_filenames = save_to_csv(output_dir,test_data,"test",header_str,n_parts=10)




import pprint
print("train filenames:")
pprint.pprint(train_filenames)
print("valid filenames:")
pprint.pprint(valid_filenames)
print("test filename:")
pprint.pprint(test_filenames)





# 1.filename->dataset
# 2.read file->dataset->datasets->merge
# 3.parse csv

filename_dataset = tf.data.Dataset.list_files(train_filenames)
# list_files is dedicated to filenames: it turns the list of filenames into a Dataset
for filename in filename_dataset:
    print(filename)
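
Note that list_files shuffles the filenames by default, so the order printed above changes between runs. A small variant, not in the original, for a deterministic or reproducible order:

filename_dataset = tf.data.Dataset.list_files(train_filenames, shuffle=False)
# or keep the shuffling but make it reproducible:
# filename_dataset = tf.data.Dataset.list_files(train_filenames, seed=42)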

n_readers = 5
dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filename).skip(1),  # skip(1) drops the header row of each CSV
    cycle_length = n_readers
)
for line in dataset.take(15):
    print(line.numpy())


# tf.io.decode_csv(records, record_defaults)

sample_str = '1,2,3,4,5'
record_defaults = [
    tf.constant(0,dtype = tf.int32),
    0,
    np.nan,
    "hello",
    tf.constant([])
]
parsed_fields = tf.io.decode_csv(sample_str,record_defaults)
print(parsed_fields)
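
record_defaults both fixes the dtype of each parsed column and supplies a fallback value when a field is empty; a column whose default is an empty constant (tf.constant([]) above) is treated as required. A small sketch, not in the original, of the error raised when a required field is missing:

try:
    tf.io.decode_csv(',,,,', record_defaults)  # all five fields left empty
except tf.errors.InvalidArgumentError as ex:
    print(ex)  # reports that field 4 is required but missing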


def parse_csv_line(line,n_fields=9):
    defs = [tf.constant(np.nan)]*n_fields
    parsed_fields = tf.io.decode_csv(line,record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])
    y = tf.stack(parsed_fields[-1:])
    return x,y

parse_csv_line(b'0.6363646332204844,-1.0895425985107923,0.09260902815633619,-0.20538124656801682,1.2025670451003232,-0.03630122549633783,-0.6784101660505877,0.182235342347858,2.429',
              n_fields=9)


# 1.filename->dataset
# 2.read file->dataset->datasets->merge
# 3.parse csv

def csv_reader_dataset(filenames, n_readers=5, batch_size=32, n_parse_threads=5,
                       shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()  # repeat indefinitely; epoch length is controlled by steps_per_epoch
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length = n_readers)
    dataset = dataset.shuffle(shuffle_buffer_size)  # shuffle returns a new dataset, so the result must be reassigned
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls = n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset
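
A common refinement, not part of the original code, is to prefetch at the end of the pipeline so the next batch is prepared while the model is still training on the current one. The sketch below shows the two lines that would replace the end of csv_reader_dataset:

    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)  # overlap input preparation with training
    return dataset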

train_set = csv_reader_dataset(train_filenames,batch_size=3)
for x_batch,y_batch in train_set.take(2):
    print("x:")
    pprint.pprint(x_batch)
    print("y:")
    pprint.pprint(y_batch)


batch_size=32
train_set = csv_reader_dataset(train_filenames,
                              batch_size=batch_size)
valid_set = csv_reader_dataset(valid_filenames,
                             batch_size=batch_size)
test_set = csv_reader_dataset(test_filenames,
                             batch_size=batch_size)



model = keras.models.Sequential([
    keras.layers.Dense(30,activation="relu",
                      input_shape=[8]),
    keras.layers.Dense(1),
])

model.compile(loss="mean_squared_error",optimizer="sgd")
callbacks = [keras.callbacks.EarlyStopping(
    patience=5,min_delta=1e-2)]

history = model.fit(train_set,
                    validation_data = valid_set,
                    steps_per_epoch = 11610 // batch_size,
                    validation_steps = 3870 // batch_size,
                    epochs = 100,
                    callbacks = callbacks)
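
Because csv_reader_dataset calls repeat() without a count, the dataset is infinite, so Keras needs explicit step counts to know where each epoch ends. The hard-coded numbers are simply the row counts of the three splits (11610 training, 3870 validation, 5160 test rows). A minimal sketch, assuming the arrays from the earlier train_test_split are still in scope, that derives them instead of hard-coding:

steps_per_epoch = len(x_train_scaled) // batch_size    # 11610 training rows
validation_steps = len(x_valid_scaled) // batch_size   # 3870 validation rows
test_steps = len(x_test_scaled) // batch_size          # 5160 test rows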




model.evaluate(test_set,steps = 5160//batch_size)
