TensorFlow构建模型（pandas.DataFrame数据加载）九

最新推荐文章于 2025-03-17 00:10:55 发布

原创最新推荐文章于 2025-03-17 00:10:55 发布 · 1k 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#tensorflow #深度学习 #python #DataFrame

人工智能专栏收录该内容

68 篇文章

订阅专栏

本文介绍如何使用TensorFlow从pandas DataFrame加载数据并训练一个心脏病预测模型。通过实际操作，展示如何处理不同类型的特征数据，包括数值型、二值型及类别型变量。

部署运行你感兴趣的模型镜像

概要

本文源自TensorFlow教程。
主要讲如何使用tensorflow加载pandas DataFrame数据进行模型训练。使用心脏疾病的数据集进行一个二分类例子。本文是一个tensorflow初级入门的教程。内容较为简单，并且和之前的这篇加载csv训练模型的文章差不多，因为数据格式都是pandas的DataFrame。

内容

import pandas as pd
import tensorflow as tf

# Shuffle-buffer size and mini-batch size shared by the training runs below.
SHUFFLE_BUFFER = 500
BATCH_SIZE = 2

# Fetch the heart-disease CSV; get_file returns the local path of the file.
csv_file = tf.keras.utils.get_file(
    fname='heart.csv',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/heart.csv',
)

df = pd.read_csv(csv_file)
# Split off the label column; `df` keeps only the feature columns afterwards.
target = df.pop('target')

如果数据集中特征的数据类型是统一的，我们可以将其作为一个numpy array，使用tf.convert_to_tensor转化成一个张量。然后这个张量可以直接传入tensorflow构建的模型中进行训练。

# Columns selected as the numeric feature set.
numeric_feature_names = [
    'age',
    'thalach',
    'trestbps',
    'chol',
    'oldpeak',
]
numeric_features = df[numeric_feature_names]
# Notebook-style expression: the converted tensor is displayed, not stored.
tf.convert_to_tensor(numeric_features)
"""
<tf.Tensor: shape=(303, 5), dtype=float64, numpy=
array([[ 63. , 150. , 145. , 233. ,   2.3],
       [ 67. , 108. , 160. , 286. ,   1.5],
       [ 67. , 129. , 120. , 229. ,   2.6],
       ...,
       [ 65. , 127. , 135. , 254. ,   2.8],
       [ 48. , 150. , 130. , 256. ,   0. ],
       [ 63. , 154. , 150. , 407. ,   4. ]])>
"""

# First, normalize the data.
# The layer is adapted on the numeric features before it is used in a model.
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(numeric_features)
# Notebook-style check: apply the adapted layer to the first three rows.
normalizer(numeric_features.iloc[:3])
# Model construction.
def get_basic_model():
    """Build and compile a small binary classifier for the numeric features.

    The network applies the module-level ``normalizer`` first, then two
    10-unit ReLU hidden layers, and ends with a single-unit layer that emits
    a raw logit (which is why the loss is created with ``from_logits=True``).

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    layers = [
        normalizer,  # input normalization (adapted outside this function)
        tf.keras.layers.Dense(10, activation='relu'),  # first hidden layer
        tf.keras.layers.Dense(10, activation='relu'),  # second hidden layer
        tf.keras.layers.Dense(1),  # output layer: one logit per example
    ]
    classifier = tf.keras.Sequential(layers)

    bce_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    classifier.compile(optimizer='adam', loss=bce_loss, metrics=['accuracy'])
    return classifier

# Train directly on the numeric-feature DataFrame with the labels in `target`.
model = get_basic_model()
model.fit(numeric_features, target, epochs=15, batch_size=BATCH_SIZE)  # author's reported result: loss: 0.4051 - accuracy: 0.7888

使用tf.data对数据进行处理。Dataset.from_tensor_slices方法可以遍历dataframe的行构建数据集，每一行是一个初始向量，方法返回的是(inputs, labels)对。

# Slice the (features, labels) pair row by row into a tf.data.Dataset.
numeric_dataset = tf.data.Dataset.from_tensor_slices((numeric_features, target))

# Print the first three dataset elements for inspection.
for row in numeric_dataset.take(3):
  print(row)

"""
(<tf.Tensor: shape=(5,), dtype=float64, numpy=array([ 63. , 150. , 145. , 233. ,   2.3])>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(5,), dtype=float64, numpy=array([ 67. , 108. , 160. , 286. ,   1.5])>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(5,), dtype=float64, numpy=array([ 67. , 129. , 120. , 229. ,   2.6])>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
"""

# Shuffle the rows and group them into mini-batches of BATCH_SIZE.
numeric_batches = numeric_dataset.shuffle(1000).batch(BATCH_SIZE)

# The dataset is already batched and carries the labels, so fit() receives
# neither a separate label argument nor a batch_size here.
model = get_basic_model()
model.fit(numeric_batches, epochs=15)  # loss: 0.4170 - accuracy: 0.7954

将dataframe作为一个字典

def stack_dict(inputs, fun=tf.stack):
    """Combine a dict of per-feature values into a single float32 tensor.

    Args:
        inputs: Mapping from feature name to a tensor-like value.
        fun: Callable invoked as ``fun(values, axis=-1)`` to join the
            per-feature values; ``tf.stack`` by default.

    Returns:
        Whatever ``fun`` returns for the list of float32-cast values.
    """
    # Sort the keys so the column order does not depend on dict ordering.
    values = [tf.cast(inputs[key], tf.float32) for key in sorted(inputs.keys())]
    return fun(values, axis=-1)


class DictInputModel(tf.keras.Model):
    """Binary classifier that accepts a dict of numeric feature columns.

    The Sequential model returned by ``get_basic_model`` expects one stacked
    tensor, so it cannot be fitted on a dict. This subclass stacks the dict
    itself (via ``stack_dict``) before normalizing and classifying.
    """

    def __init__(self):
        super().__init__()
        self.normalizer = tf.keras.layers.Normalization(axis=-1)
        self.seq = tf.keras.Sequential([
            self.normalizer,
            tf.keras.layers.Dense(10, activation='relu'),
            tf.keras.layers.Dense(10, activation='relu'),
            tf.keras.layers.Dense(1),
        ])

    def adapt(self, inputs):
        """Stack the feature dict and adapt the normalization layer on it."""
        self.normalizer.adapt(stack_dict(inputs))

    def call(self, inputs):
        """Stack the feature dict and run it through the layer stack."""
        return self.seq(stack_dict(inputs))


model = DictInputModel()
model.adapt(dict(numeric_features))
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'],
              run_eagerly=True)

# A model that stacks its own inputs can be trained on a dict of columns directly.
model.fit(dict(numeric_features), target, epochs=5, batch_size=BATCH_SIZE)
# Alternatively, slice the (dict, labels) pair into a dataset first.
numeric_dict_ds = tf.data.Dataset.from_tensor_slices((dict(numeric_features), target))
numeric_dict_batches = numeric_dict_ds.shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE)
model.fit(numeric_dict_batches, epochs=5)
    
# Build the symbolic computation inputs => x with the Keras functional API.
# ==== begin ====
# One symbolic input per numeric column. These must exist before stack_dict
# is called; shape=(1,) lets tf.concat join them along the last axis into a
# single (batch, n_features) tensor.
inputs = {
    name: tf.keras.Input(shape=(1,), name=name, dtype=tf.float32)
    for name in numeric_features.columns
}
x = stack_dict(inputs, fun=tf.concat)

# Adapt a fresh normalizer on the real data, stacked in the same sorted-key
# column order that stack_dict gives the symbolic inputs.
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(stack_dict(dict(numeric_features)))

x = normalizer(x)
x = tf.keras.layers.Dense(10, activation='relu')(x)
x = tf.keras.layers.Dense(10, activation='relu')(x)
x = tf.keras.layers.Dense(1)(x)
# ==== end ====
model = tf.keras.Model(inputs, x)

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'],
              run_eagerly=True)

# Train on the dict of columns directly ...
model.fit(dict(numeric_features), target, epochs=5, batch_size=BATCH_SIZE)

# ... and on the batched tf.data pipeline built from the same dict.
numeric_dict_batches = numeric_dict_ds.shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE)
model.fit(numeric_dict_batches, epochs=5)

以上例子的前提条件是dataframe的数据类型一致。然而实际情况并非如此。

binary_feature_names = ['sex', 'fbs', 'exang']

categorical_feature_names = ['cp', 'restecg', 'slope', 'thal', 'ca']

# Create one symbolic scalar input per DataFrame column, typed by feature kind:
# string columns -> tf.string, categorical/binary columns -> tf.int64,
# everything else -> tf.float32.
inputs = {}
for name, column in df.items():
    # Inspect the first value by position (independent of the index labels)
    # and use isinstance so str subclasses are recognised as strings too.
    if isinstance(column.iloc[0], str):
        dtype = tf.string
    elif name in categorical_feature_names or name in binary_feature_names:
        dtype = tf.int64
    else:
        dtype = tf.float32

    inputs[name] = tf.keras.Input(shape=(), name=name, dtype=dtype)

# Binary features: add a trailing axis and cast to float32; no other
# preprocessing is applied to them.
preprocessed = [
    tf.cast(inputs[name][:, tf.newaxis], tf.float32)
    for name in binary_feature_names
]

preprocessed
# Numeric features: adapt a normalizer on the real data, then apply it to the
# stacked symbolic inputs (stack_dict orders both by sorted feature name).
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(stack_dict(dict(numeric_features)))

numeric_inputs = stack_dict(
    {name: inputs[name] for name in numeric_feature_names}
)
numeric_normalized = normalizer(numeric_inputs)
preprocessed.append(numeric_normalized)

preprocessed
# Categorical features.
# Demo: one-hot encode strings against a fixed vocabulary. Note that 'zzz'
# is not in the vocabulary.
vocab = ['a','b','c']
lookup = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode='one_hot')
lookup(['c','a','a','b','zzz'])
# Demo: the same idea for integer values. Note that -1 is not in the
# vocabulary.
vocab = [1,4,7,99]
lookup = tf.keras.layers.IntegerLookup(vocabulary=vocab, output_mode='one_hot')

lookup([-1,4,1])
# One-hot encode every categorical column with a lookup layer whose
# vocabulary is the sorted set of values observed in that column.
for name in categorical_feature_names:
    vocab = sorted(set(df[name]))
    print(f'name: {name}')
    print(f'vocab: {vocab}\n')

    # isinstance (rather than an exact type comparison) also routes str
    # subclasses to the string lookup layer.
    if isinstance(vocab[0], str):
        lookup = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode='one_hot')
    else:
        lookup = tf.keras.layers.IntegerLookup(vocabulary=vocab, output_mode='one_hot')

    # Add a trailing axis to the scalar input before the lookup.
    x = inputs[name][:, tf.newaxis]
    x = lookup(x)
    preprocessed.append(x)

# Join every preprocessed feature tensor along the last axis.
# (The variable name is spelled with three s's; it is kept unchanged.)
preprocesssed_result = tf.concat(preprocessed, axis=-1)
preprocesssed_result

# Wrap the preprocessing graph as a model from the raw inputs to that tensor.
preprocessor = tf.keras.Model(inputs, preprocesssed_result)

# Classification head applied on top of the preprocessed feature tensor.
body = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1),
])

# Chain the preprocessor and the head into one model over the raw inputs.
x = preprocessor(inputs)
result = body(x)
model = tf.keras.Model(inputs, result)

model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
# Train on the full DataFrame passed as a dict of columns.
history = model.fit(dict(df), target, epochs=5, batch_size=BATCH_SIZE)