1. Feature processing with tf.feature_column
1.1 Use cases
tf.feature_column targets structured data, such as data stored in CSV files. A feature column (FC) acts as the bridge between structured data and the model ("feature columns as a bridge to map from columns in a CSV to features used to train the model"): upstream it consumes a pandas DataFrame, downstream it feeds a DenseFeatures layer.
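For example, a single numeric column can be turned into a model-ready dense tensor like this (a minimal sketch; the feature name and values are illustrative):

import tensorflow as tf

# one feature column describing how to interpret the raw 'age' column
age_col = tf.feature_column.numeric_column('age')
# DenseFeatures applies feature columns to a dict of raw tensors
features = {'age': tf.constant([[22.0], [38.0], [26.0]])}
print(tf.keras.layers.DenseFeatures([age_col])(features))  # dense tensor of shape (3, 1)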
Summing weighted categorical data:
In some cases you need to handle categorical data where each occurrence of a category carries an associated weight. Among the feature columns this is handled by tf.feature_column.weighted_categorical_column; paired with indicator_column, the effect is to sum the weights per category.
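A minimal sketch of this pairing (the feature keys 'ids' and 'weights' are illustrative):

ids = tf.feature_column.categorical_column_with_identity('ids', num_buckets=4)
weighted = tf.feature_column.weighted_categorical_column(ids, weight_feature_key='weights')
features = {
    'ids': tf.constant([[0, 1], [3, 1]]),
    'weights': tf.constant([[0.1, 0.2], [0.4, 0.2]]),
}
# indicator_column sums the weights per category
print(tf.keras.layers.DenseFeatures(
    [tf.feature_column.indicator_column(weighted)])(features))
# -> [[0.1, 0.2, 0.0, 0.0], [0.0, 0.2, 0.0, 0.4]]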
Using weighted inputs in "count" mode:

import numpy as np
import tensorflow as tf

layer = tf.keras.layers.CategoryEncoding(
    num_tokens=4, output_mode="count")
count_weights = np.array([[.1, .2], [.1, .1], [.2, .3], [.4, .2]])
layer([[0, 1], [0, 0], [1, 2], [3, 1]], count_weights=count_weights)
# -> [[0.1, 0.2, 0.0, 0.0],
#     [0.2, 0.0, 0.0, 0.0],
#     [0.0, 0.2, 0.3, 0.0],
#     [0.0, 0.2, 0.0, 0.4]]
1.2 Usage summary
- For different data types, tf.feature_column provides different handling functions (numeric, categorical). Text data requires a separate text embedding and is outside FC's scope.
- Each FC targets a single column. A DataFrame usually contains many columns, so the FCs are typically collected in a list (feature_columns).
- The first layer of the model is tf.keras.layers.DenseFeatures(feature_columns); a minimal skeleton follows this list.
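A minimal sketch of the overall pattern (column names and vocabularies are illustrative):

feature_columns = [
    tf.feature_column.numeric_column('age'),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list('sex', ['male', 'female'])),
]
model = tf.keras.Sequential([
    tf.keras.layers.DenseFeatures(feature_columns),  # first layer: the FC bridge
    tf.keras.layers.Dense(1, activation='sigmoid'),
])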
1.3 Worked example
Download the data from the Kaggle Titanic competition page.
from datetime import datetime
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
# logging helper
def printLog(info):
    now_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n" + "==========" * 8 + "%s" % now_time)
    print(info + '...\n\n')
printLog("step1: prepare dataset...")
train_raw = pd.read_csv('train.csv')
test_raw = pd.read_csv('test.csv')
data = pd.concat([train_raw, test_raw])
# handle missing values
def prepare_df_data(df_raw):
    df = df_raw.copy()
    df.columns = [x.lower() for x in df.columns]
    df = df.rename(columns={'survived': 'label'})
    df = df.drop(['passengerid', 'name'], axis=1)  # drop columns that carry no signal
    for col, dtype in dict(df.dtypes).items():
        if col == 'label':  # keep labels untouched (NaN marks the unlabeled test rows)
            continue
        # does this column contain missing values?
        if df[col].hasnans:
            # add an indicator column flagging the missing entries
            df[col + '_nan'] = pd.isna(df[col]).astype('int32')
            # impute: column mean for numeric columns, empty string otherwise
            if pd.api.types.is_numeric_dtype(dtype):
                df[col] = df[col].fillna(df[col].mean())
            else:
                df[col] = df[col].fillna('')
    return df
pre_data = prepare_df_data(data)
train_full = pre_data.iloc[0:len(train_raw), :]
test = pre_data.iloc[len(train_raw):, :].drop(columns=['label'])  # the Kaggle test set is unlabeled
# hold out part of the training data for validation, since the test set has no labels
val = train_full.sample(frac=0.2, random_state=42)
train = train_full.drop(val.index)
# load the data with tf.data
def df_to_dataset(df, shuffle=True, batch_size=32):
    df_data = df.copy()
    if 'label' not in df_data.columns:  # prediction set
        ds = tf.data.Dataset.from_tensor_slices(df_data.to_dict(orient='list'))
    else:
        labels = df_data.pop('label').values
        ds = tf.data.Dataset.from_tensor_slices((df_data.to_dict(orient='list'), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df_data))
    ds = ds.batch(batch_size)
    return ds
ds_train = df_to_dataset(train)
ds_val = df_to_dataset(val, shuffle=False)
ds_test = df_to_dataset(test, shuffle=False)  # prediction only: no labels
# define feature columns with tf.feature_column
printLog("step2: make feature columns...")
feature_columns = []
feature_inputs = {}  # inputs for the functional API
# numeric columns
for col in ['age', 'fare', 'parch', 'sibsp'] + [c for c in pre_data.columns if c.endswith('_nan')]:
    feature_columns.append(tf.feature_column.numeric_column(col))
    feature_inputs[col] = layers.Input(shape=(1,), name=col, dtype=tf.float32)
# bucketized column
age = tf.feature_column.numeric_column('age')
age_buckets = tf.feature_column.bucketized_column(age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)
# the bucketized column reuses the float32 'age' input defined above
# indicator_columns
indicator_column = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'pclass', vocabulary_list=pre_data['pclass'].unique()))
feature_columns.append(indicator_column)
feature_inputs['pclass'] = layers.Input((1,), name='pclass', dtype=tf.int64)
# categorical columns. Note: every Categorical Column must be wrapped into a Dense
# Column (indicator_column or embedding_column) before it can be fed to the model!
# indicator_columns
indicator_column_names = ['sex', 'embarked']
for col_name in indicator_column_names:
    indicator_column = tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            col_name, vocabulary_list=pre_data[col_name].unique()))
    feature_columns.append(indicator_column)
    feature_inputs[col_name] = layers.Input((1,), name=col_name, dtype=tf.string)
# feature hashing
ticket = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_hash_bucket('ticket', 3))
feature_columns.append(ticket)
feature_inputs['ticket'] = layers.Input((1,), name='ticket', dtype=tf.string)
# embedding columns
cabin = tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_hash_bucket('cabin', 32), 2)
feature_columns.append(cabin)
feature_inputs['cabin'] = layers.Input((1,), name='cabin', dtype=tf.string)
# crossed column (reuses the existing 'age' and 'pclass' inputs)
pclass_cate = tf.feature_column.categorical_column_with_vocabulary_list(
    key='pclass', vocabulary_list=[1, 2, 3])
crossed_feature = tf.feature_column.indicator_column(
    tf.feature_column.crossed_column([age_buckets, pclass_cate], hash_bucket_size=15))
feature_columns.append(crossed_feature)
# define the models
printLog("step3: define model...")
# model 1: functional API
features = layers.DenseFeatures(feature_columns)(feature_inputs)
# print(features.shape, features)
x = layers.Dense(64, activation="relu")(features)
x = layers.Dropout(rate=0.2)(x)
x = layers.Dense(64, activation="relu")(x)
output = layers.Dense(1, activation="sigmoid")(x)
model_1 = tf.keras.Model(feature_inputs, output)
# model 2: Sequential API (model_1 above can be compiled and trained the same way)
model = tf.keras.Sequential([
    layers.DenseFeatures(feature_columns),
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
# train the model
printLog("step4: train model...")
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(ds_train,
                    validation_data=ds_val,  # validate on the held-out split, not the unlabeled test set
                    epochs=30)
# evaluate the model
printLog("step5: eval model...")
def plot_metric(history, metric):
    train_metrics = history.history[metric]
    val_metrics = history.history['val_' + metric]
    epochs = range(1, len(train_metrics) + 1)
    plt.plot(epochs, train_metrics, 'bo--')
    plt.plot(epochs, val_metrics, 'ro-')
    plt.title('Training and validation ' + metric)
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(["train_" + metric, 'val_' + metric])
    plt.show()
plot_metric(history, "accuracy")
2. Feature processing with Keras preprocessing layers
The biggest benefit of preprocessing with Processing Layers is that the built model carries its own preprocessing. This helps build an end-to-end model and minimizes the burden on callers: users of the model can feed raw strings directly to it.
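To illustrate, here is a minimal sketch of such an end-to-end text model (assuming TF 2.6+, where the preprocessing layers live under tf.keras.layers; the data and layer sizes are illustrative):

import tensorflow as tf

train_text = tf.constant(["good movie", "bad movie"])
vectorizer = tf.keras.layers.TextVectorization(output_mode="int")
vectorizer.adapt(train_text)
inputs = tf.keras.Input(shape=(1,), dtype=tf.string)
x = vectorizer(inputs)  # preprocessing runs inside the model
x = tf.keras.layers.Embedding(vectorizer.vocabulary_size(), 8)(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model = tf.keras.Model(inputs, outputs)
model(tf.constant([["great movie"]]))  # callers feed raw strings directly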
2.1 Layer overview
| Layer | Purpose |
|---|---|
| TextVectorization | Vectorizes text. Call adapt on the data first so the layer can learn from it. |
| Normalization | Normalizes numeric features. Call adapt on the data first. |
| Discretization | Buckets numeric features into categorical ranges. Call adapt on the data first. |
| CategoryEncoding | One-hot, multi-hot, or TF-IDF encodes categories already converted to indices; usually paired with StringLookup or IntegerLookup. In the experimental API it is also adapted on the data first. |
| Hashing | Hashes feature values into a fixed number of bins. |
| StringLookup | Maps string categories to integer indices. Call adapt on the data first. |
| IntegerLookup | Maps numeric categories to integer indices. Call adapt on the data first. |
| CategoryCrossing | Crosses multiple columns to generate a new feature. |
Note: embedding string data with a vocabulary.
For larger vocabularies, an embedding is usually needed for good performance. Below is an example of embedding a string feature with feature columns (the call_feature_columns helper comes from the TF migration guide and is included here for completeness):
import tensorflow as tf
import tensorflow.compat.v1 as tf1

# helper from the TF migration guide: apply feature columns outside an estimator
def call_feature_columns(feature_columns, features):
    return tf1.keras.layers.DenseFeatures(feature_columns)(features)

vocab_col = tf1.feature_column.categorical_column_with_vocabulary_list(
    'col',
    vocabulary_list=['small', 'medium', 'large'],
    num_oov_buckets=0)
embedding_col = tf1.feature_column.embedding_column(vocab_col, 4)
call_feature_columns(embedding_col, {'col': ['small', 'medium', 'large']})
With Keras preprocessing layers, the same is achieved by combining a tf.keras.layers.StringLookup layer with a tf.keras.layers.Embedding layer. The default StringLookup output is integer indices that can be fed straight into the embedding.
Note: the Embedding layer contains trainable parameters. While a StringLookup layer can be applied to data inside or outside a model, the Embedding must always be part of a trainable Keras model to train correctly.
string_lookup_layer = tf.keras.layers.StringLookup(
    vocabulary=['small', 'medium', 'large'], num_oov_indices=0)
embedding = tf.keras.layers.Embedding(3, 4)
embedding(string_lookup_layer(['small', 'medium', 'large']))
2.2 Usage guide
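Note: the snippets in this section follow the TF 2.3/2.4 experimental preprocessing API (in later releases these layers moved to tf.keras.layers, CategoryEncoding takes num_tokens instead of supporting adapt, and output_mode="binary" was renamed "multi_hot"). They assume this common preamble:

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing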
2.2.1 Numeric features
# Create a Normalization layer and set its internal state using the training data
data = np.array([[0.1], [0.2], [0.3]])  # illustrative training data
normalizer = preprocessing.Normalization()
normalizer.adapt(data)
print(normalizer(data))  # standardized to zero mean, unit variance
2.2.2 One-hot / multi-hot encoding
Keras only accepts sequence inputs of equal length. When a dataset contains sequences of unequal length, pad_sequences() can turn them into new sequences padded to a common length. Both CategoryEncoding and StringLookup require their input feature sequences to have the same length (see their reference docs); for unequal lengths, preprocess with tf.keras.preprocessing.sequence.pad_sequences(), as sketched below.
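A minimal padding sketch (the data is illustrative):

from tensorflow.keras.preprocessing.sequence import pad_sequences
seqs = [[1, 2, 3], [4, 5], [6]]
print(pad_sequences(seqs, padding="post"))  # zero-pads at the end of each sequence
# [[1 2 3]
#  [4 5 0]
#  [6 0 0]]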
Categorical features
data = tf.constant([["a"], ["b"], ["c"], ["b"], ["c"], ["a"]])
# Use StringLookup to build an index of the feature values
indexer = preprocessing.StringLookup()
indexer.adapt(data)
# alternatively, map the indices to dense vectors with an Embedding layer:
# encoder = layers.Embedding(input_dim=len(indexer.get_vocabulary()), output_dim=16)
# Use CategoryEncoding to encode the integer indices to a one-hot vector
encoder = preprocessing.CategoryEncoding(output_mode="binary")
encoder.adapt(indexer(data))
print(encoder(indexer(data)))
Discretizing numeric features
# Define some toy data
data = tf.constant([[10], [20], [20], [10], [30], [0]])
# Use IntegerLookup to build an index of the feature values
indexer = preprocessing.IntegerLookup()
indexer.adapt(data)
# Use CategoryEncoding to encode the integer indices to a one-hot vector
encoder = preprocessing.CategoryEncoding(output_mode="binary")
encoder.adapt(indexer(data))
# Convert new test data (which includes unknown feature values)
test_data = tf.constant([10, 10, 20, 50, 60, 0])
encoded_data = encoder(indexer(test_data))
print(encoded_data)
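For truly continuous values, the Discretization layer from the table in 2.1 bins them by boundary values instead; a minimal sketch, assuming the TF 2.6+ signature with bin_boundaries:

discretizer = tf.keras.layers.Discretization(bin_boundaries=[18., 35., 60.])
print(discretizer(tf.constant([[12.], [40.], [70.]])))  # bucket indices [[0], [2], [3]]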
2.2.3 Feature hashing
data = np.random.randint(0, 100000, size=(10000, 1))
# Use the Hashing layer to hash the values into 64 bins (range [0, 64))
hasher = preprocessing.Hashing(num_bins=64, salt=1337)
# Use the CategoryEncoding layer to one-hot encode the hashed values
encoder = preprocessing.CategoryEncoding(max_tokens=64, output_mode="binary")
encoded_data = encoder(hasher(data))
print(encoded_data.shape)
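With the stable API (assuming TF 2.6+) the same pipeline reads:

hasher = tf.keras.layers.Hashing(num_bins=64, salt=1337)
encoder = tf.keras.layers.CategoryEncoding(num_tokens=64, output_mode="multi_hot")
print(encoder(hasher(data)).shape)  # (10000, 64)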
2.2.4 Text vectorization
- In TF 1.x, embedding lookup is done with the function tf.nn.embedding_lookup.
- In TF 2.x, use layers.Embedding(input_dim, output_dim, embeddings_initializer='uniform', weights=[weight]), where input_dim is the size of the vocabulary. weights: pass already-trained word vectors here if you have them, e.g. vectors from Google's Word2Vec (see the sketch after this list).
- The Embedding layer is typically used as the first layer of a model; it converts positive integers (indices) into dense vectors of fixed size.
- The tf.keras.layers.TextVectorization, tf.keras.layers.StringLookup, and tf.keras.layers.IntegerLookup preprocessing layers can all serve as input to an Embedding layer. tf.keras.layers.TextVectorization converts texts of different lengths into arrays of the same length.
- For manual padding, use tf.keras.preprocessing.sequence.pad_sequences(sequences, padding="post"), where sequences is a list of sequences (each a list of integers). See the text-classification and masking-and-padding guides for applications.
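A minimal sketch of loading pretrained vectors into an Embedding layer (the pretrained matrix here is a random stand-in; in practice it would come from, e.g., Word2Vec):

vocab_size, dim = 1000, 64
pretrained = np.random.rand(vocab_size, dim).astype("float32")  # stand-in for real word vectors
embedding = layers.Embedding(
    input_dim=vocab_size,
    output_dim=dim,
    embeddings_initializer=tf.keras.initializers.Constant(pretrained),
    trainable=False)  # freeze to keep the pretrained vectors fixed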
# Define some text data to adapt the layer
data = tf.constant(
    [
        "The Brain is wider than the Sky",
        "For put them side by side",
        "The one the other will contain",
        "With ease and You beside",
    ]
)
# Instantiate TextVectorization with "int" output_mode
text_vectorizer = preprocessing.TextVectorization(output_mode="int")
# Index the vocabulary via `adapt()`
text_vectorizer.adapt(data)
# You can retrieve the vocabulary we indexed via get_vocabulary()
vocab = text_vectorizer.get_vocabulary()
# input_dim=text_vectorizer.vocabulary_size()
print("Vocabulary:", vocab)
# Create an Embedding + LSTM model
inputs = keras.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)  # turns the raw strings into arrays of vocabulary indices
x = layers.Embedding(input_dim=len(vocab), output_dim=64)(x)
outputs = layers.LSTM(1)(x)
model = keras.Model(inputs, outputs)
# Call the model on test data (which includes unknown tokens)
test_data = tf.constant(["The Brain is deeper than the sea"])
test_output = model(test_data)
2.2.5 N-grams
# Define some text data to adapt the layer
data = tf.constant(
    [
        "The Brain is wider than the Sky",
        "For put them side by side",
        "The one the other will contain",
        "With ease and You beside",
    ]
)
# Instantiate TextVectorization with "binary" output_mode (multi-hot)
# and ngrams=2 (index all bigrams)
text_vectorizer = preprocessing.TextVectorization(output_mode="binary", ngrams=2)
# Index the bigrams via `adapt()`
text_vectorizer.adapt(data)
print(
    "Encoded text:\n",
    text_vectorizer(["The Brain is deeper than the sea"]).numpy(),
    "\n",
)
# Create a Dense model
inputs = keras.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
outputs = layers.Dense(1)(x)
model = keras.Model(inputs, outputs)
# Call the model on test data (which includes unknown tokens)
test_data = tf.constant(["The Brain is deeper than the sea"])
test_output = model(test_data)
print("Model output:", test_output)
2.2.6 Encoding text with TF-IDF
# Define some text data to adapt the layer
data = tf.constant(
    [
        "The Brain is wider than the Sky",
        "For put them side by side",
        "The one the other will contain",
        "With ease and You beside",
    ]
)
# Instantiate TextVectorization with "tf-idf" output_mode
# (multi-hot with TF-IDF weighting) and ngrams=2 (index all bigrams)
text_vectorizer = preprocessing.TextVectorization(output_mode="tf-idf", ngrams=2)
# Index the bigrams and learn the TF-IDF weights via `adapt()`
text_vectorizer.adapt(data)
print(
    "Encoded text:\n",
    text_vectorizer(["The Brain is deeper than the sea"]).numpy(),
    "\n",
)
# Create a Dense model
inputs = keras.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
outputs = layers.Dense(1)(x)
model = keras.Model(inputs, outputs)
# Call the model on test data (which includes unknown tokens)
test_data = tf.constant(["The Brain is deeper than the sea"])
test_output = model(test_data)
print("Model output:", test_output)
2.3 Worked example
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
dataset_url = 'http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
csv_file = 'datasets/petfinder-mini/petfinder-mini.csv'
tf.keras.utils.get_file('petfinder_mini.zip', dataset_url,
                        extract=True, cache_dir='.')
dataframe = pd.read_csv(csv_file)
# create the target variable
dataframe['label'] = np.where(dataframe['AdoptionSpeed'] == 4, 0, 1)  # 0 means the pet was not adopted
dataframe = dataframe.drop(columns=['AdoptionSpeed', 'Description'])  # drop columns not used as features
# split into train, validation, and test sets
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
# build the input pipeline with tf.data
def df_to_dataset(df, shuffle=True, batch_size=32):
    df_data = df.copy()
    if 'label' not in df_data.columns:  # prediction set
        ds = tf.data.Dataset.from_tensor_slices(dict(df_data))
    else:
        labels = df_data.pop('label').values
        ds = tf.data.Dataset.from_tensor_slices((dict(df_data), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df_data))
    ds = ds.batch(batch_size)
    return ds
# use the preprocessing layers
# numeric columns
def get_normalization_layer(name, dataset):
    normalizer = preprocessing.Normalization(axis=None)
    feature_ds = dataset.map(lambda x, y: x[name])  # the dataset yields (features, label); select one feature
    normalizer.adapt(feature_ds)
    return normalizer
# categorical columns: map values from a vocabulary to integer indices, then encode them
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    if dtype == 'string':  # string-valued categories
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:  # integer-valued categories
        index = preprocessing.IntegerLookup(max_tokens=max_tokens)
    feature_ds = dataset.map(lambda x, y: x[name])
    index.adapt(feature_ds)  # learn the set of possible values and assign each a fixed integer index
    if name in ['Breed1']:  # high-cardinality columns get an embedding; adjust this list per dataset
        """
        With tf.keras.Input(shape=(1,)) the Embedding input is a 2D tensor of shape
        (batch_size, input_length), so its output has shape (batch_size, input_length, output_dim)
        and must be flattened before being concatenated with the other features.
        With tf.keras.Input(shape=()) the Embedding output is (batch_size, output_dim) directly.
        """
        encoder = tf.keras.layers.Embedding(input_dim=len(index.get_vocabulary()), output_dim=16)  # embedding
        return lambda feature: tf.keras.layers.Flatten()(encoder(index(feature)))
    else:
        encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())  # one-hot encoding
        return lambda feature: encoder(index(feature))
batch_size = 256
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)
# take one batch to inspect the format of the returned data
[(train_features, label_batch)] = train_ds.take(1)
# print('Every feature:', list(train_features.keys()))
# print('A batch of ages:', train_features['Age'])
# print('A batch of targets:', label_batch)
all_inputs = []  # model inputs
encoded_features = []  # encoded feature tensors
# Numeric features.
for header in ['PhotoAmt', 'Fee']:
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    normalization_layer = get_normalization_layer(header, train_ds)
    encoded_numeric_col = normalization_layer(numeric_col)
    all_inputs.append(numeric_col)
    encoded_features.append(encoded_numeric_col)
# Categorical features encoded as integers.
age_col = tf.keras.Input(shape=(1,), name='Age', dtype='int64')
encoding_layer = get_category_encoding_layer('Age', train_ds, dtype='int64',
                                             max_tokens=5)
encoded_age_col = encoding_layer(age_col)
all_inputs.append(age_col)
encoded_features.append(encoded_age_col)
# Categorical features encoded as string.
categorical_cols = ['Type', 'Color1', 'Color2', 'Gender', 'MaturitySize',
                    'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Breed1']
for header in categorical_cols:
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
    encoding_layer = get_category_encoding_layer(header, train_ds, dtype='string',
                                                 max_tokens=5)
    encoded_categorical_col = encoding_layer(categorical_col)
    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)
# build the end-to-end model
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
output = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(all_inputs, output)
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=["accuracy"])
# train the model
model.fit(train_ds, epochs=10, validation_data=val_ds)
# evaluate the model
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)