Part II: Feature Data Preprocessing


1. Processing features with tf.feature_column

1.1 Application scenario

tf.feature_column is designed for structured data, such as data stored in CSV files. Feature columns (FC) act as the bridge between structured data and the model (feature columns as a bridge to map from columns in a CSV to features used to train the model): a feature_column takes a pandas DataFrame column upstream and feeds a DenseFeatures layer downstream.
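A minimal sketch of that bridge (toy data; the column name "age" is just an example): a DataFrame column is wrapped in a feature column and turned into a dense model input by the DenseFeatures layer.

import pandas as pd
import tensorflow as tf

df = pd.DataFrame({"age": [22.0, 38.0, 26.0]})
age_fc = tf.feature_column.numeric_column("age")     # one feature column per DataFrame column
dense = tf.keras.layers.DenseFeatures([age_fc])      # feature columns -> dense model input
print(dense({"age": df["age"].to_numpy()}))          # tf.Tensor of shape (3, 1)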

Summing weighted categorical data:

In some cases you need to handle categorical data in which every occurrence of a category carries an associated weight. In feature columns this is handled by tf.feature_column.weighted_categorical_column. When paired with indicator_column, the effect is to sum the weights for each category.
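A minimal, hedged sketch of this pairing (the feature names 'colors' and 'weights' are made up for illustration):

import tensorflow as tf

features = {
    'colors': [['R', 'G'], ['B', 'B']],
    'weights': [[1.0, 2.0], [0.5, 0.5]],
}
colors = tf.feature_column.categorical_column_with_vocabulary_list(
    'colors', vocabulary_list=['R', 'G', 'B'])
weighted = tf.feature_column.weighted_categorical_column(colors, 'weights')
# indicator_column sums the weights per category: row 0 -> [1.0, 2.0, 0.0], row 1 -> [0.0, 0.0, 1.0]
dense = tf.keras.layers.DenseFeatures([tf.feature_column.indicator_column(weighted)])
print(dense(features))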

The Keras preprocessing equivalent uses CategoryEncoding with weighted inputs in "count" mode:

import numpy as np
import tensorflow as tf

layer = tf.keras.layers.CategoryEncoding(
          num_tokens=4, output_mode="count")
count_weights = np.array([[.1, .2], [.1, .1], [.2, .3], [.4, .2]])
layer([[0, 1], [0, 0], [1, 2], [3, 1]], count_weights=count_weights)





1.2 Usage summary

  1. tf.feature_column provides different processing functions for different data types (numeric, categorical). Text data needs its own text embedding and is outside the scope of FC.
  2. Each FC handles a single column. A DataFrame usually contains many columns, so the FCs are typically collected in a list (feature_columns).
  3. The first layer of the model is tf.keras.layers.DenseFeatures(feature_columns) (see the sketch below).
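
A minimal sketch of this pattern (toy columns and values; the full Titanic example in 1.3 follows the same structure):

import pandas as pd
import tensorflow as tf

df = pd.DataFrame({"age": [22.0, 38.0], "fare": [7.25, 71.28], "label": [0, 1]})
feature_columns = [tf.feature_column.numeric_column(c) for c in ["age", "fare"]]  # one FC per column

model = tf.keras.Sequential([
    tf.keras.layers.DenseFeatures(feature_columns),   # first layer of the model
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(optimizer="adam", loss="binary_crossentropy")
model.fit({c: df[c].to_numpy() for c in ["age", "fare"]}, df["label"].to_numpy(), epochs=1, verbose=0)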

1.3 Practical example

Download the data from the Kaggle Titanic competition page.

from datetime import datetime
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models


# Print a timestamped log message
def printLog(info):
    now_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n" + "==========" * 8 + "%s" % now_time)
    print(info + '...\n\n')


printLog("step1: prepare dataset...")

train_raw = pd.read_csv('train.csv')
test_raw = pd.read_csv('test.csv')

data = pd.concat([train_raw, test_raw])


# Handle missing values
def prepare_df_data(df_raw):
    df = df_raw.copy()
    df.columns = [x.lower() for x in df.columns]
    df = df.rename(columns={'survived': 'label'})
    df = df.drop(['passengerid', 'name'], axis=1)  # drop columns that are not useful
    for col, dtype in dict(df.dtypes).items():
        # check whether the column contains any missing values
        if df[col].hasnans:
            # add an indicator column that flags the missing entries
            df[col + '_nan'] = pd.isna(df[col]).astype('int32')
            # fill the missing values: mean for numeric columns, '' for string columns
            if dtype.kind not in ('O', 'U', 'S'):  # np.object/np.str aliases are removed in recent NumPy
                df[col].fillna(df[col].mean(), inplace=True)
            else:
                df[col].fillna('', inplace=True)
    return df


pre_data = prepare_df_data(data)
train = pre_data.iloc[0:len(train_raw), :]
test = pre_data.iloc[len(train_raw):, :]


# Load the data with tf.data
def df_to_dataset(df, shuffle=True, batch_size=32):
    df_data = df.copy()
    if 'label' not in df_data.columns:  # prediction set: no label column
        ds = tf.data.Dataset.from_tensor_slices(df_data.to_dict(orient='list'))
    else:
        labels = df_data.pop('label').values
        ds = tf.data.Dataset.from_tensor_slices((df_data.to_dict(orient='list'), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df_data))
    ds = ds.batch(batch_size)
    return ds


ds_train = df_to_dataset(train)
ds_test = df_to_dataset(test)

# Define the feature columns with tf.feature_column
printLog("step2: make feature columns...")

feature_columns = []
feature_inputs = {}  # inputs for the functional API model
# numeric columns (exclude label_nan, which would leak the label indicator)
for col in ['age', 'fare', 'parch', 'sibsp'] + [c for c in pre_data.columns if c.endswith('_nan') and c != 'label_nan']:
    feature_columns.append(tf.feature_column.numeric_column(col))

    feature_inputs[col] = layers.Input(shape=(1,), name=col, dtype=tf.float32)

# bucketized column
age = tf.feature_column.numeric_column('age')
age_buckets = tf.feature_column.bucketized_column(age,
                                                  boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)
feature_inputs['age'] = layers.Input((1,), name='age', dtype=tf.float32)  # age is a float feature (input already created in the loop above)

# indicator_columns

indicator_column = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'pclass', vocabulary_list=pre_data['pclass'].unique()))

feature_columns.append(indicator_column)

feature_inputs['pclass'] = layers.Input((1,), name='pclass', dtype=tf.int64)

# Categorical columns. Note: every Categorical Column must ultimately be converted to a
# Dense Column (e.g. via indicator_column) before it can be passed to the model!
# indicator_columns
indicator_column_names = ['sex', 'embarked']
for col_name in indicator_column_names:
    indicator_column = tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            col_name, vocabulary_list=pre_data[col_name].unique()))

    feature_columns.append(indicator_column)

    feature_inputs[col_name] = layers.Input((1,), name=col_name, dtype=tf.string)

# hashed feature column
ticket = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_hash_bucket('ticket', 3))

feature_columns.append(ticket)
feature_inputs['ticket'] = layers.Input((1,), name='ticket', dtype=tf.string)

# embedding columns
cabin = tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_hash_bucket('cabin', 32), 2)

feature_columns.append(cabin)
feature_inputs['cabin'] = layers.Input((1,), name='cabin', dtype=tf.string)

# crossed column
pclass_cate = tf.feature_column.categorical_column_with_vocabulary_list(
    key='pclass', vocabulary_list=[1, 2, 3])

crossed_feature = tf.feature_column.indicator_column(
    tf.feature_column.crossed_column([age_buckets, pclass_cate], hash_bucket_size=15))

feature_columns.append(crossed_feature)

# Define the model
printLog("step3: define model...")
# model 1: functional API
features = layers.DenseFeatures(feature_columns)(feature_inputs)
# print(features.shape, features)
x = layers.Dense(64, activation="relu")(features)
x = layers.Dropout(rate=0.2)(x)
x = layers.Dense(64, activation="relu")(x)
output = layers.Dense(1, activation="sigmoid")(x)

model_1 = tf.keras.Model(feature_inputs, output)

# model 2: Sequential API
model = tf.keras.Sequential([
    layers.DenseFeatures(feature_columns),
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Train the model
printLog("step4: train model...")

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(ds_train,
                    validation_data=ds_test,
                    epochs=30)

# Evaluate the model
printLog("step5: eval model...")


def plot_metric(history, metric):
    train_metrics = history.history[metric]
    val_metrics = history.history['val_' + metric]
    epochs = range(1, len(train_metrics) + 1)
    plt.plot(epochs, train_metrics, 'bo--')
    plt.plot(epochs, val_metrics, 'ro-')
    plt.title('Training and validation ' + metric)
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(["train_" + metric, 'val_' + metric])
    plt.show()


plot_metric(history, "accuracy")


2. Processing features with Keras preprocessing layers

The biggest benefit of doing preprocessing with preprocessing layers is that the resulting model carries its own preprocessing. This helps build an end-to-end model and minimizes the burden on callers: users of the model can feed raw strings to it directly.
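A minimal sketch of the idea (toy vocabulary): because the StringLookup layer is part of the model, callers can pass raw strings directly.

import tensorflow as tf

lookup = tf.keras.layers.StringLookup(vocabulary=["cat", "dog", "bird"])
inputs = tf.keras.Input(shape=(1,), dtype=tf.string)
x = lookup(inputs)                                           # raw string -> integer index
x = tf.keras.layers.Embedding(lookup.vocabulary_size(), 4)(x)
x = tf.keras.layers.Flatten()(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model = tf.keras.Model(inputs, outputs)

print(model(tf.constant([["dog"], ["fish"]])))               # "fish" falls into the OOV bucket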

See also the official guide "Working with preprocessing layers".

2.1 Overview of the preprocessing layers

Layer: purpose
TextVectorization: text vectorization. Call adapt() on the data first so the layer can learn from it.
Normalization: normalizes numeric features. Call adapt() on the data first.
Discretization: bins numeric features into categories. Call adapt() on the data first.
CategoryEncoding: one-hot, multi-hot or TF-IDF encoding of categories that have already been converted to indices; usually combined with StringLookup / IntegerLookup. Call adapt() on the data first.
Hashing: hashes a feature.
StringLookup: converts string categories to integer indices. Call adapt() on the data first.
IntegerLookup: converts numeric categories to integer indices. Call adapt() on the data first.
CategoryCrossing: crosses several columns to create new features.
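
Since the Discretization layer listed above has no example elsewhere in this post, here is a minimal sketch (toy data, recent TF versions) of its adapt-then-apply workflow:

import tensorflow as tf

data = tf.constant([[0.1], [0.4], [0.8], [1.5], [2.3], [3.0]])
discretizer = tf.keras.layers.Discretization(num_bins=3)
discretizer.adapt(data)                                   # learn the bin boundaries from the data
print(discretizer(tf.constant([[0.2], [1.0], [2.9]])))    # integer bucket indices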

Note: embedding string data with a vocabulary.
For larger vocabularies, an embedding is usually needed to get good performance. Below is an example that embeds a string feature with feature columns:

# This snippet follows the TF feature-column migration guide, where tf1 is
# tensorflow.compat.v1 and call_feature_columns is a small helper that applies
# a DenseFeatures layer built from the given columns to the given inputs.
vocab_col = tf1.feature_column.categorical_column_with_vocabulary_list(
    'col',
    vocabulary_list=['small', 'medium', 'large'],
    num_oov_buckets=0)
embedding_col = tf1.feature_column.embedding_column(vocab_col, 4)
call_feature_columns(embedding_col, {'col': ['small', 'medium', 'large']})

With Keras preprocessing layers, the same thing is achieved by combining a tf.keras.layers.StringLookup layer with a tf.keras.layers.Embedding layer. The default output of StringLookup is integer indices that can be fed directly into the embedding.

Note: the Embedding layer contains trainable parameters. While the StringLookup layer can be applied to data inside or outside a model, the Embedding must always be part of a trainable Keras model in order to be trained correctly.

string_lookup_layer = tf.keras.layers.StringLookup(
    vocabulary=['small', 'medium', 'large'], num_oov_indices=0)
embedding = tf.keras.layers.Embedding(3, 4)
embedding(string_lookup_layer(['small', 'medium', 'large']))

2.2 Usage tutorial

The snippets below assume the experimental preprocessing module has been imported (from tensorflow.keras.layers.experimental import preprocessing) together with the usual numpy / tensorflow imports, and use small toy data.

2.2.1 Numeric features

# Create a Normalization layer and set its internal state using the training data
# (here `data` is any numeric array or tf.data.Dataset of training values)
normalizer = preprocessing.Normalization()
normalizer.adapt(data)

2.2.2 One-hot / multi-hot encoding

Keras only accepts sequence inputs of equal length. When the dataset contains sequences of unequal length, pad_sequences() can be used to pad them into new sequences of the same length.

For both CategoryEncoding and StringLookup, the feature sequences must all have the same length (see the reference docs for details).
For sequences of unequal length, use tf.keras.preprocessing.sequence.pad_sequences(), as sketched below.
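
A minimal sketch of padding ragged sequences (toy data):

import tensorflow as tf

sequences = [[1, 2, 3], [4, 5], [6]]
padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding="post")  # pad with zeros at the end
print(padded)
# [[1 2 3]
#  [4 5 0]
#  [6 0 0]]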

Categorical features

data = tf.constant([["a"], ["b"], ["c"], ["b"], ["c"], ["a"]])

indexer = preprocessing.StringLookup()   # Use StringLookup to build an index of the feature values
indexer.adapt(data)
# encoder = tf.keras.layers.Embedding(input_dim=len(indexer.get_vocabulary()), output_dim=16)  # alternative: map indices to embeddings
encoder = preprocessing.CategoryEncoding(output_mode="binary")  # Use CategoryEncoding to encode the integer indices to a one-hot vector
encoder.adapt(indexer(data))

Discrete numeric (integer categorical) features

# Define some toy data
data = tf.constant([[10], [20], [20], [10], [30], [0]])

# Use IntegerLookup to build an index of the feature values
indexer = preprocessing.IntegerLookup()
indexer.adapt(data)

# Use CategoryEncoding to encode the integer indices to a one-hot vector
encoder = preprocessing.CategoryEncoding(output_mode="binary")
encoder.adapt(indexer(data))

# Convert new test data (which includes unknown feature values)
test_data = tf.constant([10, 10, 20, 50, 60, 0])
encoded_data = encoder(indexer(test_data))
print(encoded_data)

2.2.3 Feature hashing

data = np.random.randint(0, 100000, size=(10000, 1))

# Use the Hashing layer to hash the values into the range [0, 64)
hasher = preprocessing.Hashing(num_bins=64, salt=1337)

# Use the CategoryEncoding layer to one-hot encode the hashed values
encoder = preprocessing.CategoryEncoding(max_tokens=64, output_mode="binary")
encoded_data = encoder(hasher(data))
print(encoded_data.shape)

2.2.4 Text vectorization

  1. In TF 1.x this is the function tf.nn.embedding_lookup.
  2. In TF 2.x it is the layer layers.Embedding(input_dim, output_dim, embeddings_initializer='uniform', weights=[weight]), where input_dim is the size of the vocabulary. weights: pass pre-trained word vectors here if you already have them, e.g. vectors from Google's Word2Vec (see the sketch after this list).
  3. The Embedding layer can only be used as the first layer of a model; it turns positive integers (indices) into dense vectors of fixed size. The tf.keras.layers.TextVectorization, tf.keras.layers.StringLookup and tf.keras.layers.IntegerLookup preprocessing layers can produce the input for an Embedding layer.
  4. tf.keras.layers.TextVectorization converts texts of different lengths into arrays of the same length.
  5. tf.keras.preprocessing.sequence.pad_sequences(sequences, padding="post"), where sequences is a list of sequences (each sequence is a list of integers). See the "Masking and padding" guide for its use in text classification.
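
A hedged sketch of item 2 above, initializing an Embedding layer with existing word vectors (a random matrix stands in for real Word2Vec vectors; embeddings_initializer is used here as an alternative to the weights argument):

import numpy as np
import tensorflow as tf

vocab_size, embed_dim = 100, 8
pretrained = np.random.rand(vocab_size, embed_dim)            # stand-in for pre-trained word vectors
embedding = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embed_dim,
    embeddings_initializer=tf.keras.initializers.Constant(pretrained),
    trainable=False)                                           # freeze the vectors if they should stay fixed
print(embedding(np.array([[1, 2, 3]])).shape)                  # (1, 3, 8)

The original TextVectorization + Embedding + LSTM example follows:
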
# Define some text data to adapt the layer
data = tf.constant(
    [
        "The Brain is wider than the Sky",
        "For put them side by side",
        "The one the other will contain",
        "With ease and You beside",
    ]
)
# Instantiate TextVectorization with "int" output_mode
text_vectorizer = preprocessing.TextVectorization(output_mode="int")
# Index the vocabulary via `adapt()`
text_vectorizer.adapt(data) 

# You can retrieve the vocabulary we indexed via get_vocabulary()
vocab = text_vectorizer.get_vocabulary()
# input_dim=text_vectorizer.vocabulary_size()
print("Vocabulary:", vocab)

# Create an Embedding + LSTM model
inputs = keras.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)        # 得到 data 中字典下标组成的数组
x = layers.Embedding(input_dim=len(vocab), output_dim=64)(x)
outputs = layers.LSTM(1)(x)
model = keras.Model(inputs, outputs)

# Call the model on test data (which includes unknown tokens)
test_data = tf.constant(["The Brain is deeper than the sea"])
test_output = model(test_data)

2.2.5 N-grams

# Define some text data to adapt the layer
data = tf.constant(
    [
        "The Brain is wider than the Sky",
        "For put them side by side",
        "The one the other will contain",
        "With ease and You beside",
    ]
)
# Instantiate TextVectorization with "binary" output_mode (multi-hot)
# and ngrams=2 (index all bigrams)
text_vectorizer = preprocessing.TextVectorization(output_mode="binary", ngrams=2)
# Index the bigrams via `adapt()`
text_vectorizer.adapt(data)

print(
    "Encoded text:\n",
    text_vectorizer(["The Brain is deeper than the sea"]).numpy(),
    "\n",
)

# Create a Dense model
inputs = keras.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
outputs = layers.Dense(1)(x)
model = keras.Model(inputs, outputs)

# Call the model on test data (which includes unknown tokens)
test_data = tf.constant(["The Brain is deeper than the sea"])
test_output = model(test_data)

print("Model output:", test_output)

2.2.6 Encoding text with TF-IDF

# Define some text data to adapt the layer
data = tf.constant(
    [
        "The Brain is wider than the Sky",
        "For put them side by side",
        "The one the other will contain",
        "With ease and You beside",
    ]
)
# Instantiate TextVectorization with "tf-idf" output_mode
# (multi-hot with TF-IDF weighting) and ngrams=2 (index all bigrams)
text_vectorizer = preprocessing.TextVectorization(output_mode="tf-idf", ngrams=2)
# Index the bigrams and learn the TF-IDF weights via `adapt()`
text_vectorizer.adapt(data)

print(
    "Encoded text:\n",
    text_vectorizer(["The Brain is deeper than the sea"]).numpy(),
    "\n",
)

# Create a Dense model
inputs = keras.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
outputs = layers.Dense(1)(x)
model = keras.Model(inputs, outputs)

# Call the model on test data (which includes unknown tokens)
test_data = tf.constant(["The Brain is deeper than the sea"])
test_output = model(test_data)
print("Model output:", test_output)

2.3 Practical example

import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

import pathlib

dataset_url = 'http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
csv_file = 'datasets/petfinder-mini/petfinder-mini.csv'

tf.keras.utils.get_file('petfinder_mini.zip', dataset_url,
                        extract=True, cache_dir='.')
dataframe = pd.read_csv(csv_file)

# Create the target variable
dataframe['label'] = np.where(dataframe['AdoptionSpeed'] == 4, 0, 1)  # 0 means the pet was not adopted

dataframe = dataframe.drop(columns=['AdoptionSpeed', 'Description'])  # drop columns that are not needed

# Split into train / validation / test sets
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)


# Build an input pipeline with tf.data
def df_to_dataset(df, shuffle=True, batch_size=32):
    df_data = df.copy()
    if 'label' not in df_data.columns:  # prediction set: no label column
        ds = tf.data.Dataset.from_tensor_slices(dict(df_data))
    else:
        labels = df_data.pop('label').values
        ds = tf.data.Dataset.from_tensor_slices((dict(df_data), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df_data))
    ds = ds.batch(batch_size)
    return ds


# Use the preprocessing layers
# Numeric columns
def get_normalization_layer(name, dataset):
    normalizer = preprocessing.Normalization(axis=None)
    feature_ds = dataset.map(lambda x, y: x[name])  # the dataset yields (features, label); keep only this feature
    normalizer.adapt(feature_ds)
    return normalizer


# Categorical columns: map values from a vocabulary to integer indices, then encode the feature
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    if dtype == 'string':  # string type
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:  # discrete integer type
        index = preprocessing.IntegerLookup(max_tokens=max_tokens)

    feature_ds = dataset.map(lambda x, y: x[name])

    index.adapt(feature_ds)  # Learn the set of possible values and assign them a fixed integer index.

    # Note: these column names come from a different project; none of them exist in the
    # petfinder data, so for this dataset the one-hot branch below is always taken.
    if name in ['province', 'current_brand', 'currentchannel', 'currentdevicemanufacturer',
            'currentdevicetype']:
        """
        With tf.keras.Input(shape=(1,)), the Embedding input is a 2D tensor of shape (batch_size, input_length)
        and its output has shape (batch_size, input_length, output_dim), so it must be flattened before being
        concatenated with other features.
        With tf.keras.Input(shape=()), the Embedding output has shape (batch_size, output_dim).
        """
        encoder = tf.keras.layers.Embedding(input_dim=len(index.get_vocabulary()), output_dim=16)  # map indices to embeddings

        return lambda feature: tf.keras.layers.Flatten()(encoder(index(feature)))

    else:
        encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())  # one-hot encoding
        return lambda feature: encoder(index(feature))


batch_size = 256
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

# Take one batch to inspect the format of the data it returns
[(train_features, label_batch)] = train_ds.take(1)
# print('Every feature:', list(train_features.keys()))
# print('A batch of ages:', train_features['Age'])
# print('A batch of targets:', label_batch)

all_inputs = []  # input layers
encoded_features = []  # encoded feature tensors

# Numeric features.
for header in ['PhotoAmt', 'Fee']:
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    normalization_layer = get_normalization_layer(header, train_ds)
    encoded_numeric_col = normalization_layer(numeric_col)
    all_inputs.append(numeric_col)
    encoded_features.append(encoded_numeric_col)

# Categorical features encoded as integers.
age_col = tf.keras.Input(shape=(1,), name='Age', dtype='int64')
encoding_layer = get_category_encoding_layer('Age', train_ds, dtype='int64',
                                             max_tokens=5)
encoded_age_col = encoding_layer(age_col)
all_inputs.append(age_col)
encoded_features.append(encoded_age_col)

# Categorical features encoded as string.
categorical_cols = ['Type', 'Color1', 'Color2', 'Gender', 'MaturitySize',
                    'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Breed1']
for header in categorical_cols:
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
    encoding_layer = get_category_encoding_layer(header, train_ds, dtype='string',
                                                 max_tokens=5)
    encoded_categorical_col = encoding_layer(categorical_col)
    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)

# Build the end-to-end model
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
output = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(all_inputs, output)
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=["accuracy"])

# Train the model
model.fit(train_ds, epochs=10, validation_data=val_ds)
# Evaluate the model
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)
