An FFM Example
Our data
import pandas as pd
df = pd.read_csv("./train.csv")
df
 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
595 | 596 | 0 | 3 | Van Impe, Mr. Jean Baptiste | male | 36.0 | 1 | 1 | 345773 | 24.1500 | NaN | S |
596 | 597 | 1 | 2 | Leitch, Miss. Jessie Wills | female | NaN | 0 | 0 | 248727 | 33.0000 | NaN | S |
597 | 598 | 0 | 3 | Johnson, Mr. Alfred | male | 49.0 | 0 | 0 | LINE | 0.0000 | NaN | S |
598 | 599 | 0 | 3 | Boulos, Mr. Hanna | male | NaN | 0 | 0 | 2664 | 7.2250 | NaN | C |
599 | 600 | 1 | 1 | Duff Gordon, Sir. Cosmo Edmund ("Mr Morgan") | male | 49.0 | 1 | 0 | PC 17485 | 56.9292 | A20 | C |
600 rows × 12 columns
Survived is the label we want to predict, so this is a binary (0/1) classification problem. Our fields are 'Pclass', "Sex", "SibSp", "Parch", "Fare", and "Embarked".
Among them, 'Pclass', "Sex", "SibSp", "Parch", and "Embarked" are categorical (discrete) variables, while "Fare" is a continuous variable. All of our features come from these 6 fields.
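As a reminder of what the code below implements, FFM scores a sample with a linear part plus field-aware pairwise interactions (this is the standard FFM formulation; only the notation here is mine):

$$
\hat{y}(x) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle \mathbf{v}_{i,f_j}, \mathbf{v}_{j,f_i} \rangle \, x_i x_j
$$

where $f_j$ is the field that feature $j$ belongs to and $\mathbf{v}_{i,f_j}$ is the embedding of feature $i$ used when it interacts with field $f_j$. In the model class below, line_section computes the first two terms and fm_section computes the last one.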
Code
import tensorflow as tf
from collections import namedtuple
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
Input parameters (flags)
tf.flags.DEFINE_string("opt_type", "Adam", "optimizer type (Adagrad, Adam, Ftrl, Momentum, RMSProp, SGD).")
tf.flags.DEFINE_string("train_file_path", "./train.csv", "train file path.")
tf.flags.DEFINE_string("test_file_path", "./test.csv", "train file path.")
tf.flags.DEFINE_string("label", "Survived", "target column name.")
tf.flags.DEFINE_string("activation", "relu", "deep mid activation function(tanh, relu, tanh, sigmoid).")
tf.flags.DEFINE_float("threshold", 0.5, "bi-classification threshold." )
tf.flags.DEFINE_string("loss_type", "log_loss", "bi-classification is log_loss, regression is mse.")
tf.flags.DEFINE_string("model_path", "./checkpoint/", "save model path.")
tf.flags.DEFINE_bool("use_deep", True, "Whether to use deep or not.")
tf.flags.DEFINE_string("model", "fm", "fm or ffm.")
tf.flags.DEFINE_list("layers", [30,30], "deep mid layers.")
tf.flags.DEFINE_list("category_columns", ['Pclass',"Sex","SibSp","Parch","Embarked"], "category columns.")
tf.flags.DEFINE_list("continuation_columns", ['Fare'], "continuation columns.")
tf.flags.DEFINE_float("lr", 0.01, "learning rate.")
tf.flags.DEFINE_float("line_output_keep_dropout", 0.9, "line output keep dropout in deep schema.")
tf.flags.DEFINE_float("fm_output_keep_dropout", 0.9, "fm output keep dropout in deep schema.")
tf.flags.DEFINE_float("deep_output_keep_dropout", 0.9, "deep output keep dropout in deep schema.")
tf.flags.DEFINE_float("deep_input_keep_dropout", 0.9, "deep input keep dropout in deep schema.")
tf.flags.DEFINE_float("deep_mid_keep_dropout", 0.8, "deep mid keep dropout in deep schema.")
tf.flags.DEFINE_integer("embedding_size", 3, "field embedding size")
tf.flags.DEFINE_bool("use_batch_normal", False, "Whether to use batch normal or not.")
tf.flags.DEFINE_integer("batch_size", 64, "batch size.")
tf.flags.DEFINE_integer("epoches", 1000, "epoches.")
tf.flags.DEFINE_integer("logging_level", 20, "tensorflow logging level.")
tf.flags.DEFINE_integer("seed", 20, "tensorflow seed num.")
FLAGS = tf.flags.FLAGS
tf.app.flags.DEFINE_string('f', '', 'kernel')  # dummy flag so tf.flags can be used inside Jupyter
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
HParams = namedtuple(
"HParams",
[
"opt_type",
"threshold",
"loss_type",
"use_deep",
"model",
"layers",
"lr",
"fm_output_keep_dropout",
"line_output_keep_dropout",
"deep_input_keep_dropout",
"deep_mid_keep_dropout",
"deep_output_keep_dropout",
"embedding_size",
"use_batch_normal",
"batch_size",
"epoches",
"field_nums",
"feature_nums",
"activation",
"seed"
])
Build the hyperparameter object
def create_hparams(field_nums, feature_nums):
return HParams(
model=FLAGS.model,
opt_type=FLAGS.opt_type,
threshold=FLAGS.threshold,
loss_type=FLAGS.loss_type,
use_deep=FLAGS.use_deep,
layers=FLAGS.layers,
lr=FLAGS.lr,
fm_output_keep_dropout=FLAGS.fm_output_keep_dropout,
line_output_keep_dropout=FLAGS.line_output_keep_dropout,
deep_input_keep_dropout=FLAGS.deep_input_keep_dropout,
deep_output_keep_dropout=FLAGS.deep_output_keep_dropout,
deep_mid_keep_dropout=FLAGS.deep_mid_keep_dropout,
embedding_size=FLAGS.embedding_size,
use_batch_normal=FLAGS.use_batch_normal,
batch_size=FLAGS.batch_size,
epoches=FLAGS.epoches,
activation=FLAGS.activation,
seed=FLAGS.seed,
field_nums=field_nums,
feature_nums=feature_nums
)
Data-processing helpers
class FieldHandler(object):
def __init__(self, train_file_path, test_file_path=None, category_columns=[], continuation_columns=[]):
"""
        train_file_path: train file path (required)
        test_file_path: None or test file path
"""
self.train_file_path = None
self.test_file_path = None
self.feature_nums = 0
self.field_dict = {}
self.category_columns = category_columns
self.continuation_columns = continuation_columns
        # the training-data path must be a string
if not isinstance(train_file_path, str):
raise ValueError("train file path must str")
# 检查训练数据路径
if os.path.exists(train_file_path):
self.train_file_path = train_file_path
else:
raise OSError("train file path isn't exists!")
# 检查测试数据路径
if test_file_path:
if os.path.exists(test_file_path):
self.test_file_path = test_file_path
else:
raise OSError("test file path isn't exists!")
# 读数据
self.read_data()
        # fill missing values in the categorical columns with "-1"
        # (assign back instead of calling fillna(inplace=True) on a slice, which only fills a
        # copy and triggers pandas' SettingWithCopyWarning)
        self.df[category_columns] = self.df[category_columns].fillna("-1")
        # build the field -> feature-index dictionary
self.build_filed_dict()
        # standardize the continuous columns
self.build_standard_scaler()
self.field_nums = len(self.category_columns + self.continuation_columns)
def build_filed_dict(self):
for column in self.df.columns:
if column in self.category_columns:
cv = self.df[column].unique()
self.field_dict[column] = dict(zip(cv, range(self.feature_nums, self.feature_nums + len(cv))))
self.feature_nums += len(cv)
else:
self.field_dict[column] = self.feature_nums
self.feature_nums += 1
def read_data(self):
if self.train_file_path and self.test_file_path:
train_df = pd.read_csv(self.train_file_path)[self.category_columns + self.continuation_columns]
test_df = pd.read_csv(self.test_file_path)[self.category_columns + self.continuation_columns]
self.df = pd.concat([train_df, test_df])
else:
self.df = pd.read_csv(self.train_file_path)[self.category_columns + self.continuation_columns]
def build_standard_scaler(self):
if self.continuation_columns:
self.standard_scaler = StandardScaler()
self.standard_scaler.fit(self.df[self.continuation_columns].values)
else:
self.standard_scaler = None
fh = FieldHandler(train_file_path=FLAGS.train_file_path,
category_columns=FLAGS.category_columns,
continuation_columns=FLAGS.continuation_columns)
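To get a feel for what FieldHandler built, it helps to print the field-to-feature mapping. A quick inspection along these lines works (the concrete indices depend on the order in which values appear in the data, so the example in the last comment is illustrative only):

# Each categorical field maps every distinct value to its own global feature index;
# each continuous field gets a single index.
for field, mapping in fh.field_dict.items():
    print(field, "->", mapping)
print("field_nums:", fh.field_nums, "feature_nums:", fh.feature_nums)
# e.g. Pclass -> {3: 0, 1: 1, 2: 2}, Sex -> {'male': 3, 'female': 4}, ..., Fare -> 21  (illustrative)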
hparams = create_hparams(fh.field_nums, fh.feature_nums)
Transform the data into the model's input format
def transformation_data(file_path: str, field_hander: FieldHandler, label=None):
"""
    label: target column name
"""
df_v = pd.read_csv(file_path)
    # read the label column and cast it to float32
if label:
if label in df_v.columns:
labels = df_v[[label]].values.astype("float32")
else:
            raise KeyError(f'label "{label}" doesn\'t exist')
    # keep only the categorical and continuous field columns
df_v = df_v[field_hander.category_columns + field_hander.continuation_columns]
    # fill missing categorical values with "-1" and missing continuous values with -999
    # (assign back instead of using fillna(inplace=True) on a slice, which only fills a copy)
    df_v[field_hander.category_columns] = df_v[field_hander.category_columns].fillna("-1")
    df_v[field_hander.continuation_columns] = df_v[field_hander.continuation_columns].fillna(-999)
    # standardize the continuous columns
if field_hander.standard_scaler:
df_v[field_hander.continuation_columns] = field_hander.standard_scaler.transform(
df_v[field_hander.continuation_columns].values)
df_i = df_v.copy()
    # df_i holds feature indices: every field owns its own set of features.
    # e.g. if field1 has two features (feature1-1, feature1-2) and field2 has three
    # (feature2-1, feature2-2, feature2-3), df_i numbers all of them globally,
    # so feature1-1 ... feature2-3 map to 0, 1, 2, 3, 4.
    # df_v holds the feature values: categorical features take the value 1,
    # while a continuous feature keeps its own (scaled) value.
    # Later we use df_i to look up each feature's embedding weight directly,
    # which avoids multiplying against a full one-hot vector when there are many features.
for column in df_v.columns:
if column in field_hander.category_columns:
            # look up the categorical value's global feature index in field_dict
df_i[column] = df_i[column].map(field_hander.field_dict[column])
            # categorical features take the value 1
df_v[column] = 1
else:
            # a continuous field has a single feature index, and df_v keeps the value itself rather than 1
df_i[column] = field_hander.field_dict[column]
df_v = df_v.values.astype("float32")
df_i = df_i.values.astype("int32")
features = {
"df_i": df_i,
"df_v": df_v
}
if label:
return features, labels
return features, None
features, labels = transformation_data(file_path=FLAGS.train_file_path, field_hander=fh, label=FLAGS.label)
test_features, test_labels = transformation_data(file_path=FLAGS.test_file_path, field_hander=fh, label=FLAGS.label)
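To make the df_i / df_v split concrete, it is worth printing one transformed row. The exact indices depend on fh.field_dict, so the values sketched in the comments below are illustrative only:

print(features["df_i"][0])   # the global feature index chosen in each of the 6 fields
print(features["df_v"][0])   # 1.0 for the categorical fields, the standardized fare for Fare
# e.g. a passenger with Pclass=3, Sex='male', SibSp=1, Parch=0, Embarked='S', Fare=7.25 might give
#   df_i = [0, 3, 8, 15, 18, 21]               (illustrative indices)
#   df_v = [1.0, 1.0, 1.0, 1.0, 1.0, -0.50]    (illustrative scaled fare)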
Training and test datasets
def create_input_fn(features, label, batch_size=32, num_epochs=10):
def input_fn():
dataset = tf.data.Dataset.from_tensor_slices((features, label))
        dataset = dataset.shuffle(1000)        # shuffle the data; a larger buffer gives a more thorough shuffle
        dataset = dataset.repeat(num_epochs)   # repeat the dataset for the given number of epochs
        dataset = dataset.batch(batch_size)    # emit batch_size rows at a time; the last batch may be smaller
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()
return next_element
return input_fn
train_input_fn = create_input_fn(features,label=labels,batch_size=hparams.batch_size,num_epochs=hparams.epoches)
test_input_fn = create_input_fn(test_features,label=test_labels,batch_size=hparams.batch_size,num_epochs=hparams.epoches)
Prediction dataset
def create_predict_input_fn(features, batch_size=32, num_epochs=10):
def input_fn():
dataset = tf.data.Dataset.from_tensor_slices((features))
        # no shuffle or repeat here: predictions should stay aligned with the input rows,
        # and the iterator should stop after a single pass over the data
        dataset = dataset.batch(batch_size)
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()
return next_element
return input_fn
predict_input_fn = create_predict_input_fn(test_features,batch_size=hparams.batch_size,num_epochs=hparams.epoches)
Define the FFM model
class FFM(object):
def __init__(self, hparams, df_i, df_v):
# df_i, df_v None * n
self.hparams = hparams
tf.set_random_seed(self.hparams.seed)
self.line_result = self.line_section(df_i, df_v)
self.fm_result = self.fm_section(df_i, df_v)
print(self.line_result, self.fm_result)
self.logits = self.line_result + self.fm_result
def line_section(self, df_i, df_v):
with tf.variable_scope("line"):
weights = tf.get_variable("weights",
shape=[self.hparams.feature_nums, 1],
dtype=tf.float32,
initializer=tf.initializers.glorot_uniform()) # f * 1
batch_weights = tf.nn.embedding_lookup(weights, df_i) # none * n * 1
batch_weights = tf.squeeze(batch_weights, axis=2) # None * n
line_result = tf.multiply(df_v, batch_weights, name="line_w_x") # none * n
biase = tf.get_variable("biase",
shape=[1, 1],
dtype=tf.float32,
initializer=tf.initializers.zeros()) # 1 * 1
line_result = tf.add(tf.reduce_sum(line_result, axis=1, keepdims=True), biase) # None,1
return line_result
def fm_section(self, df_i, df_v):
with tf.variable_scope("fm"):
embedding = tf.get_variable("embedding",
shape=[self.hparams.field_nums,
self.hparams.feature_nums,
self.hparams.embedding_size],
dtype=tf.float32,
initializer=tf.initializers.random_normal()) # field * f * embedding_size
fm_result = None
for i in range(self.hparams.field_nums):
for j in range(i + 1, self.hparams.field_nums):
vi_fj = tf.nn.embedding_lookup(embedding[j], df_i[:, i]) # None * embedding_size
vj_fi = tf.nn.embedding_lookup(embedding[i], df_i[:, j]) # None * embedding_size
wij = tf.multiply(vi_fj, vj_fi)
x_i = tf.expand_dims(df_v[:, i], 1) # None * 1
x_j = tf.expand_dims(df_v[:, j], 1) # None * 1
xij = tf.multiply(x_i, x_j) # None * 1
if fm_result is None:
fm_result = tf.reduce_sum(tf.multiply(wij, xij), axis=1, keepdims=True)
else:
fm_result += tf.reduce_sum(tf.multiply(wij, xij), axis=1, keepdims=True)
            fm_result = tf.reduce_sum(fm_result, axis=1, keepdims=True)
return fm_result
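As a sanity check on the pairwise term above, the same field-aware interaction can be written for a single sample in plain NumPy (a minimal sketch with made-up numbers, separate from the model code):

import numpy as np

# Made-up sizes: 3 fields, 6 features, embedding size 2.
emb = np.random.rand(3, 6, 2)        # [field, feature, embedding_size], like the "embedding" variable above
df_i = np.array([0, 2, 5])           # the feature index picked in each field
df_v = np.array([1.0, 1.0, 0.37])    # 1 for categorical fields, the scaled value for a continuous one

ffm_term = 0.0
for i in range(3):
    for j in range(i + 1, 3):
        vi_fj = emb[j, df_i[i]]      # embedding of feature i used against field j
        vj_fi = emb[i, df_i[j]]      # embedding of feature j used against field i
        ffm_term += np.dot(vi_fj, vj_fi) * df_v[i] * df_v[j]
print(ffm_term)                      # the scalar second-order FFM contribution for this sample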
Create the model_fn
def create_model_fn(model):
def model_fn(features, labels, params, mode):
if params.threshold:
threshold = params.threshold
else:
threshold = 0.5
df_i = features['df_i']
df_v = features['df_v']
logits = model(params, df_i, df_v).logits
        if mode == tf.estimator.ModeKeys.TRAIN:
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits), name="loss")
train_op = tf.contrib.layers.optimize_loss(
loss=loss,
global_step=tf.train.get_global_step(),
learning_rate=params.lr,
clip_gradients=10.0,
optimizer=params.opt_type
)
pre = tf.nn.sigmoid(logits, name="sigmoid")
auc = tf.metrics.auc(labels=labels, predictions=pre, name="auc")
accuracy = tf.metrics.accuracy(labels=labels, predictions=tf.cast(pre > threshold, tf.float32),
name="accuracy")
tf.summary.scalar('train_accuracy', accuracy[1])
tf.summary.scalar('train_auc', auc[1])
return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
if mode == tf.estimator.ModeKeys.EVAL:
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits), name="loss")
pre = tf.nn.sigmoid(logits, name="sigmoid")
predict = tf.cast(pre > threshold, dtype=tf.int32)
auc = tf.metrics.auc(labels=labels, predictions=pre, name="auc")
accuracy = tf.metrics.accuracy(labels=labels, predictions=tf.cast(pre > threshold, tf.float32),
name="accuracy")
metrics = {
"auc": auc,
"accuracy": accuracy
}
return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
if mode == tf.estimator.ModeKeys.PREDICT:
pre = tf.nn.sigmoid(logits, name="sigmoid")
predict = tf.cast(pre > threshold, dtype=tf.int32)
predictions = {
"predict_pro": pre,
"predict": predict
}
return tf.estimator.EstimatorSpec(mode, predictions=predictions)
return model_fn
model_fn = create_model_fn(FFM)
Define the Estimator and the training/evaluation specs
estimator = tf.estimator.Estimator(
model_fn=model_fn,
model_dir=FLAGS.model_path,
params=hparams,
config=tf.estimator.RunConfig(
tf_random_seed=hparams.seed,
log_step_count_steps=500
)
)
show_dict = {
"loss": "loss",
"accuracy": "accuracy/value",
"auc": "auc/value"
}
log_hook = tf.train.LoggingTensorHook(show_dict, every_n_iter=100)
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, hooks=[log_hook])
eval_spec = tf.estimator.EvalSpec(input_fn=test_input_fn)
Start training
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Tensor("line/Add:0", shape=(?, 1), dtype=float32) Tensor("fm/Sum_15:0", shape=(?, 1), dtype=float32)
Tensor("line/Add:0", shape=(?, 1), dtype=float32) Tensor("fm/Sum_15:0", shape=(?, 1), dtype=float32)
({'accuracy': 0.7834375,
'auc': 0.81081283,
'loss': 0.6182821,
'global_step': 9375},
[])
Prediction
results_iterator = estimator.predict(input_fn=predict_input_fn)
Print the first six prediction results
for index, result in enumerate(results_iterator):
    if index > 5:
        break
    print(result)
Tensor("line/Add:0", shape=(?, 1), dtype=float32) Tensor("fm/Sum_15:0", shape=(?, 1), dtype=float32)
{'predict_pro': array([0.9999973], dtype=float32), 'predict': array([1])}
{'predict_pro': array([0.92061496], dtype=float32), 'predict': array([1])}
{'predict_pro': array([0.10704267], dtype=float32), 'predict': array([0])}
{'predict_pro': array([0.45321018], dtype=float32), 'predict': array([0])}
{'predict_pro': array([0.11381763], dtype=float32), 'predict': array([0])}
{'predict_pro': array([0.36696994], dtype=float32), 'predict': array([0])}