An FFM Example
Our data
import pandas as pd
df = pd.read_csv("./train.csv")
df
 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
595 | 596 | 0 | 3 | Van Impe, Mr. Jean Baptiste | male | 36.0 | 1 | 1 | 345773 | 24.1500 | NaN | S |
596 | 597 | 1 | 2 | Leitch, Miss. Jessie Wills | female | NaN | 0 | 0 | 248727 | 33.0000 | NaN | S |
597 | 598 | 0 | 3 | Johnson, Mr. Alfred | male | 49.0 | 0 | 0 | LINE | 0.0000 | NaN | S |
598 | 599 | 0 | 3 | Boulos, Mr. Hanna | male | NaN | 0 | 0 | 2664 | 7.2250 | NaN | C |
599 | 600 | 1 | 1 | Duff Gordon, Sir. Cosmo Edmund ("Mr Morgan") | male | 49.0 | 1 | 0 | PC 17485 | 56.9292 | A20 | C |
600 rows × 12 columns
Survived is the label we want to predict, so this is a binary (0/1) classification problem. Our fields are 'Pclass', "Sex", "SibSp", "Parch", "Fare", and "Embarked".
Among them, 'Pclass', "Sex", "SibSp", "Parch", and "Embarked" are categorical (discrete) variables, while "Fare" is a continuous variable. All of our features come from these 6 fields.
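As a reminder of what the code below implements, FFM scores a sample with a linear part plus field-aware pairwise interactions (this is the standard FFM formulation; only the notation here is mine):

$$
\hat{y}(x) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle \mathbf{v}_{i,f_j}, \mathbf{v}_{j,f_i} \rangle \, x_i x_j
$$

where $f_j$ is the field that feature $j$ belongs to and $\mathbf{v}_{i,f_j}$ is the embedding of feature $i$ used when it interacts with field $f_j$. In the model class below, line_section computes the first two terms and fm_section computes the last one.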
Code
import tensorflow as tf
from collections import namedtuple
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
Input parameters (flags)
tf.flags.DEFINE_string("opt_type", "Adam", "optimizer type (Adagrad, Adam, Ftrl, Momentum, RMSProp, SGD).")
tf.flags.DEFINE_string("train_file_path", "./train.csv", "train file path.")
tf.flags.DEFINE_string("test_file_path", "./test.csv", "train file path.")
tf.flags.DEFINE_string("label", "Survived", "target column name.")
tf.flags.DEFINE_string("activation", "relu", "deep mid activation function(tanh, relu, tanh, sigmoid).")
tf.flags.DEFINE_float("threshold", 0.5, "bi-classification threshold." )
tf.flags.DEFINE_string("loss_type", "log_loss", "bi-classification is log_loss, regression is mse.")
tf.flags.DEFINE_string("model_path", "./checkpoint/", "save model path.")
tf.flags.DEFINE_bool("use_deep", True, "Whether to use deep or not.")
tf.flags.DEFINE_string("model", "fm", "fm or ffm.")
tf.flags.DEFINE_list("layers", [30,30], "deep mid layers.")
tf.flags.DEFINE_list("category_columns", ['Pclass',"Sex","SibSp","Parch","Embarked"], "category columns.")
tf.flags.DEFINE_list("continuation_columns", ['Fare'], "continuation columns.")
tf.flags.DEFINE_float("lr", 0.01, "learning rate.")
tf.flags.DEFINE_float("line_output_keep_dropout", 0.9, "line output keep dropout in deep schema.")
tf.flags.DEFINE_float("fm_output_keep_dropout", 0.9, "fm output keep dropout in deep schema.")
tf.flags.DEFINE_float("deep_output_keep_dropout", 0.9, "deep output keep dropout in deep schema.")
tf.flags.DEFINE_float("deep_input_keep_dropout", 0.9, "deep input keep dropout in deep schema.")
tf.flags.DEFINE_float("deep_mid_keep_dropout", 0.8, "deep mid keep dropout in deep schema.")
tf.flags.DEFINE_integer("embedding_size", 3, "field embedding size")
tf.flags.DEFINE_bool("use_batch_normal", False, "Whether to use batch normal or not.")
tf.flags.DEFINE_integer("batch_size", 64, "batch size.")
tf.flags.DEFINE_integer("epoches", 1000, "epoches.")
tf.flags.DEFINE_integer("logging_level", 20, "tensorflow logging level.")
tf.flags.DEFINE_integer("seed", 20, "tensorflow seed num.")
FLAGS = tf.flags.FLAGS
tf.app.flags.DEFINE_string('f', '', 'kernel')  # dummy flag so tf.flags can be used inside Jupyter
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
HParams = namedtuple(
"HParams",
[
"opt_type",
"threshold",
"loss_type",
"use_deep",
"model",
"layers",
"lr",
"fm_output_keep_dropout",
"line_output_keep_dropout",
"deep_input_keep_dropout",
"deep_mid_keep_dropout",
"deep_output_keep_dropout",
"embedding_size",
"use_batch_normal",
"batch_size",
"epoches",
"field_nums",
"feature_nums",
"activation",
"seed"
])
Build the hyperparameter object
def create_hparams(field_nums, feature_nums):
return HParams(
model=FLAGS.model,
opt_type=FLAGS.opt_type,
threshold=FLAGS.threshold,
loss_type=FLAGS.loss_type,
use_deep=FLAGS.use_deep,
layers=FLAGS.layers,
lr=FLAGS.lr,
fm_output_keep_dropout=FLAGS.fm_output_keep_dropout,
line_output_keep_dropout=FLAGS.line_output_keep_dropout,
deep_input_keep_dropout=FLAGS.deep_input_keep_dropout,
deep_output_keep_dropout=FLAGS.deep_output_keep_dropout,
deep_mid_keep_dropout=FLAGS.deep_mid_keep_dropout,
embedding_size=FLAGS.embedding_size,
use_batch_normal=FLAGS.use_batch_normal,
batch_size=FLAGS.batch_size,
epoches=FLAGS.epoches,
activation=FLAGS.activation,
seed=FLAGS.seed,
field_nums=field_nums,
feature_nums=feature_nums
)
Data-processing helpers
class FieldHandler(object):
def __init__(self, train_file_path, test_file_path=None, category_columns=[], continuation_columns=[]):
"""
        train_file_path: train file path (required)
        test_file_path: None or test file path
"""
self.train_file_path = None
self.test_file_path = None
self.feature_nums = 0
self.field_dict = {}
self.category_columns = category_columns
self.continuation_columns = continuation_columns
        # the training-data path must be a string
if not isinstance(train_file_path, str):
raise ValueError("train file path must str")
# 检查训练数据路径
if os.path.exists(train_file_path):
self.train_file_path = train_file_path
else:
raise OSError("train file path isn't exists!")
# 检查测试数据路径
if test_file_path:
if os.path.exists(test_file_path):
self.test_file_path = test_file_path
else:
raise OSError("test file path isn't exists!")
# 读数据
self.read_data()
        # fill missing values in the categorical columns with "-1"
        # (assign back instead of calling fillna(inplace=True) on a slice, which only fills a
        # copy and triggers pandas' SettingWithCopyWarning)
        self.df[category_columns] = self.df[category_columns].fillna("-1")
        # build the field -> feature-index dictionary
self.build_filed_dict()
        # standardize the continuous columns
self.build_standard_scaler()
self.field_nums = len(self.category_columns + self.continuation_columns)
def build_filed_dict(self):
for column in self.df.columns:
if column in self.category_columns:
cv = self.df[column].unique()
self.field_dict[column] = dict(zip(cv, range(self.feature_nums, self.feature_nums + len(cv))))
self.feature_nums += len(cv)
else:
self.field_dict[column] = self.feature_nums
self.feature_nums += 1
def read_data(self):
if self.train_file_path and self.test_file_path:
train_df = pd.read_csv(self.train_file_path)[self.category_columns + self.continuation_columns]
test_df = pd.read_csv(self.test_file_path)[self.category_columns + self.continuation_columns]
self.df = pd.concat([train_df, test_df])
else:
self.df = pd.read_csv(self.train_file_path)[self.category_columns + self.continuation_columns]
def build_standard_scaler(self):
if self.continuation_columns:
self.standard_scaler = StandardScaler()
self.standard_scaler.fit(self.df[self.continuation_columns].values)
else:
self.standard_scaler = None
fh = FieldHandler(train_file_path=FLAGS.train_file_path,
category_columns=FLAGS.category_columns,
continuation_columns=FLAGS.continuation_columns)
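To get a feel for what FieldHandler built, it helps to print the field-to-feature mapping. A quick inspection along these lines works (the concrete indices depend on the order in which values appear in the data, so the example in the last comment is illustrative only):

# Each categorical field maps every distinct value to its own global feature index;
# each continuous field gets a single index.
for field, mapping in fh.field_dict.items():
    print(field, "->", mapping)
print("field_nums:", fh.field_nums, "feature_nums:", fh.feature_nums)
# e.g. Pclass -> {3: 0, 1: 1, 2: 2}, Sex -> {'male': 3, 'female': 4}, ..., Fare -> 21  (illustrative)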
hparams = create_hparams(fh.field_nums, fh.feature_nums)
Transform the data into the model's input format
def transformation_data(file_path: str, field_hander: FieldHandler, label=None):
"""
    label: target column name
"""
df_v = pd.read_csv(file_path)
    # read the label column and cast it to float32
if label:
if label in df_v.columns:
labels = df_v[[label]].values.astype("float32")
else:
            raise KeyError(f'label "{label}" doesn\'t exist')
    # keep only the categorical and continuous field columns
df_v = df_v[field_hander.category_columns + field_hander.continuation_columns]
    # fill missing categorical values with "-1" and missing continuous values with -999
    # (assign back instead of using fillna(inplace=True) on a slice, which only fills a copy)
    df_v[field_hander.category_columns] = df_v[field_hander.category_columns].fillna("-1")
    df_v[field_hander.continuation_columns] = df_v[field_hander.continuation_columns].fillna(-999)
    # standardize the continuous columns
if field_hander.standard_scaler:
df_v[field_hander.continuation_columns] = field_hander.standard_scaler.transform(
df_v[field_hander.continuation_columns].values)
df_i = df_v.copy()
    # df_i holds feature indices: every field owns its own set of features.
    # e.g. if field1 has two features (feature1-1, feature1-2) and field2 has three
    # (feature2-1, feature2-2, feature2-3), df_i numbers all of them globally,
    # so feature1-1 ... feature2-3 map to 0, 1, 2, 3, 4.
    # df_v holds the feature values: categorical features take the value 1,
    # while a continuous feature keeps its own (scaled) value.
    # Later we use df_i to look up each feature's embedding weight directly,
    # which avoids multiplying against a full one-hot vector when there are many features.
for column in df_v.columns:
if column in field_hander.category_columns:
            # look up the categorical value's global feature index in field_dict
df_i[column] = df_i[column].map(field_hander.field_dict[column])
            # categorical features take the value 1
df_v[column] = 1
else:
            # a continuous field has a single feature index, and df_v keeps the value itself rather than 1
df_i[column] = field_hander.field_dict[column]
df_v = df_v.values.astype("float32")
df_i = df_i.values.astype("int32")
features = {
"df_i": df_i,
"df_v": df_v
}
if label:
return features, labels
return features, None
features, labels = transformation_data(file_path=FLAGS.train_file_path, field_hander=fh, label=FLAGS.label)
test_features, test_labels = transformation_data(file_path=FLAGS.test_file_path, field_hander=fh, label=FLAGS.label)
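To make the df_i / df_v split concrete, it is worth printing one transformed row. The exact indices depend on fh.field_dict, so the values sketched in the comments below are illustrative only:

print(features["df_i"][0])   # the global feature index chosen in each of the 6 fields
print(features["df_v"][0])   # 1.0 for the categorical fields, the standardized fare for Fare
# e.g. a passenger with Pclass=3, Sex='male', SibSp=1, Parch=0, Embarked='S', Fare=7.25 might give
#   df_i = [0, 3, 8, 15, 18, 21]               (illustrative indices)
#   df_v = [1.0, 1.0, 1.0, 1.0, 1.0, -0.50]    (illustrative scaled fare)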
Training and test datasets
def create_input_fn(features, label, batch_size=32, num_epochs=10):
def input_fn():
dataset = tf.data.Dataset.from_tensor_slices((features, label))
        dataset = dataset.shuffle(1000)        # shuffle the data; a larger buffer gives a more thorough shuffle
        dataset = dataset.repeat(num_epochs)   # repeat the dataset for the given number of epochs
        dataset = dataset.batch(batch_size)    # emit batch_size rows at a time; the last batch may be smaller
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()
return next_element
return input_fn
train_input_fn = create_input_fn(features,label=labels,batch_size=hparams.batch_size,num_epochs=hparams.epoches)
test_input_fn = create_input_fn(test_features,label=test_labels,batch_size=hparams.batch_size,num_epochs=hparams.epoches)
Prediction dataset
def create_predict_input_fn(features, batch_size=32, num_epochs=10):
def input_fn():
dataset = tf.data.Dataset.from_tensor_slices((features))
        # no shuffle or repeat here: predictions should stay aligned with the input rows,
        # and the iterator should stop after a single pass over the data
        dataset = dataset.batch(batch_size)
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()
return next_element
return input_fn
predict_input_fn = create_predict_input_fn(test_features,batch_size=hparams.batch_size,num_epochs=hparams.epoches)
Define the FFM model
class FFM(object):
def __init__(self, hparams, df_i, df_v):
# df_i, df_v None * n
self.hparams = hparams
tf.set_random_seed(self.hparams.seed)
self.line_result = self.line_section(df_i, df_v)
self.fm_result = self.fm_section(df_i, df_v)
print(self.line_result, self.fm_result)
self.logits = self.line_result + self.fm_result
def line_section(self, df_i, df_v):
with tf.variable_scope("line"):
weights = tf.get_variable("weights",
shape=[self.hparams.feature_nums, 1],
dtype=tf.float32,
initializer=tf.initializers.glorot_uniform()) # f * 1
batch_weights = tf.nn.embedding_lookup(weights, df_i) # none * n * 1
batch_weights = tf.squeeze(batch_weights, axis=2) # None * n
line_result = tf.multiply(df_v, batch_weights, name="line_w_x") # none * n
biase = tf.get_variable("biase",
shape=[1, 1],
dtype=tf.float32,
initializer=tf.initializers.zeros()) # 1 * 1
line_result = tf.add(tf.reduce_sum(line_result, axis=1, keepdims=True), biase) # None,1
return line_result
def fm_section(self, df_i, df_v):
with tf.variable_scope("fm"):
embedding = tf.get_variable("embedding",
shape=[self.hparams.field_nums,
self.hparams.feature_nums,
self.hparams.embedding_size],
dtype=tf.float32,
initializer=tf.initializers.random_normal()) # field * f * embedding_size
fm_result = None
for i in range(self.hparams.field_nums):
for j in range(i + 1, self.hparams.field_nums):
vi_fj = tf.nn.embedding_lookup(embedding[j], df_i[:, i]) # None * embedding_size
vj_fi = tf.nn.embedding_lookup(embedding[i], df_i[:, j]) # None * embedding_size
wij = tf.multiply(vi_fj, vj_fi)
x_i = tf.expand_dims(df_v[:, i], 1) # None * 1
x_j = tf.expand_dims(df_v[:, j], 1) # None * 1
xij = tf.multiply(x_i, x_j) # None * 1
if fm_result is None:
fm_result = tf.reduce_sum(tf.multiply(wij, xij), axis=1, keepdims=True)
else:
fm_result += tf.reduce_sum(tf.multiply(wij, xij), axis=1, keepdims=True)
            fm_result = tf.reduce_sum(fm_result, axis=1, keepdims=True)
return fm_result
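As a sanity check on the pairwise term above, the same field-aware interaction can be written for a single sample in plain NumPy (a minimal sketch with made-up numbers, separate from the model code):

import numpy as np

# Made-up sizes: 3 fields, 6 features, embedding size 2.
emb = np.random.rand(3, 6, 2)        # [field, feature, embedding_size], like the "embedding" variable above
df_i = np.array([0, 2, 5])           # the feature index picked in each field
df_v = np.array([1.0, 1.0, 0.37])    # 1 for categorical fields, the scaled value for a continuous one

ffm_term = 0.0
for i in range(3):
    for j in range(i + 1, 3):
        vi_fj = emb[j, df_i[i]]      # embedding of feature i used against field j
        vj_fi = emb[i, df_i[j]]      # embedding of feature j used against field i
        ffm_term += np.dot(vi_fj, vj_fi) * df_v[i] * df_v[j]
print(ffm_term)                      # the scalar second-order FFM contribution for this sample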
Create the model_fn
def create_model_fn(model):
def model_fn(features, labels, params, mode):
if params.threshold:
threshold = params.threshold
else:
threshold = 0.5
df_i = features['df_i']
df_v = features['df_v']
logits = model(params, df_i, df_v).logits
        if mode == tf.estimator.ModeKeys.TRAIN:
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits), name="loss")
train_op = tf.contrib.layers.optimize_loss(
loss=loss,
global_step=tf.train.get_global_step(),
learning_rate=params.lr,
clip_gradients=10.0,
optimizer=params.opt_type
)
pre = tf.nn.sigmoid(logits, name="sigmoid")
auc = tf.metrics.auc(labels=labels, predictions=pre, name="auc")
accuracy = tf.metrics.accuracy(labels=labels, predictions=tf.cast(pre > threshold, tf.float32),
name="accuracy")
tf.summary.scalar('train_accuracy', accuracy[1])
tf.summary.scalar('train_auc', auc[1])
return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
if mode == tf.estimator.ModeKeys.EVAL:
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits), name="loss")
pre = tf.nn.sigmoid(logits, name="sigmoid")
predict = tf.cast(pre > threshold, dtype=tf.int32)
auc = tf.metrics.auc(labels=labels, predictions=pre, name="auc")
accuracy = tf.metrics.accuracy(labels=labels, predictions=tf.cast(pre > threshold, tf.float32),
name="accuracy")
metrics = {
"auc": auc,
"accuracy": accuracy
}
return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
if mode == tf.estimator.ModeKeys.PREDICT:
pre = tf.nn.sigmoid(logits, name="sigmoid")
predict = tf.cast(pre > threshold, dtype=tf.int32)
predictions = {
"predict_pro": pre,
"predict": predict
}
return tf.estimator.EstimatorSpec(mode, predictions=predictions)
return model_fn
model_fn = create_model_fn(FFM)
Define the Estimator and the training/evaluation specs
estimator = tf.estimator.Estimator(
model_fn=model_fn,
model_dir=FLAGS.model_path,
params=hparams,
config=tf.estimator.RunConfig(
tf_random_seed=hparams.seed,
log_step_count_steps=500
)
)
show_dict = {
"loss": "loss",
"accuracy": "accuracy/value",
"auc": "auc/value"
}
log_hook = tf.train.LoggingTensorHook(show_dict, every_n_iter=100)
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, hooks=[log_hook])
eval_spec = tf.estimator.EvalSpec(input_fn=test_input_fn)
Start training
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Tensor("line/Add:0", shape=(?, 1), dtype=float32) Tensor("fm/Sum_15:0", shape=(?, 1), dtype=float32)
Tensor("line/Add:0", shape=(?, 1), dtype=float32) Tensor("fm/Sum_15:0", shape=(?, 1), dtype=float32)
({'accuracy': 0.7834375,
'auc': 0.81081283,
'loss': 0.6182821,
'global_step': 9375},
[])
Prediction
results_iterator = estimator.predict(input_fn=predict_input_fn)
Print the first six prediction results
for index, result in enumerate(results_iterator):
    if index > 5:
        break
    print(result)
Tensor("line/Add:0", shape=(?, 1), dtype=float32) Tensor("fm/Sum_15:0", shape=(?, 1), dtype=float32)
{'predict_pro': array([0.9999973], dtype=float32), 'predict': array([1])}
{'predict_pro': array([0.92061496], dtype=float32), 'predict': array([1])}
{'predict_pro': array([0.10704267], dtype=float32), 'predict': array([0])}
{'predict_pro': array([0.45321018], dtype=float32), 'predict': array([0])}
{'predict_pro': array([0.11381763], dtype=float32), 'predict': array([0])}
{'predict_pro': array([0.36696994], dtype=float32), 'predict': array([0])}