FFM

A worked example of FFM

Our data
import pandas as pd
df = pd.read_csv("./train.csv")
df
     PassengerId  Survived  Pclass  Name                                                Sex     Age   SibSp  Parch  Ticket            Fare     Cabin  Embarked
0    1            0         3       Braund, Mr. Owen Harris                             male    22.0  1      0      A/5 21171         7.2500   NaN    S
1    2            1         1       Cumings, Mrs. John Bradley (Florence Briggs Th...   female  38.0  1      0      PC 17599          71.2833  C85    C
2    3            1         3       Heikkinen, Miss. Laina                              female  26.0  0      0      STON/O2. 3101282  7.9250   NaN    S
3    4            1         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)        female  35.0  1      0      113803            53.1000  C123   S
4    5            0         3       Allen, Mr. William Henry                            male    35.0  0      0      373450            8.0500   NaN    S
..   ...          ...       ...     ...                                                 ...     ...   ...    ...    ...               ...      ...    ...
595  596          0         3       Van Impe, Mr. Jean Baptiste                         male    36.0  1      1      345773            24.1500  NaN    S
596  597          1         2       Leitch, Miss. Jessie Wills                          female  NaN   0      0      248727            33.0000  NaN    S
597  598          0         3       Johnson, Mr. Alfred                                 male    49.0  0      0      LINE              0.0000   NaN    S
598  599          0         3       Boulos, Mr. Hanna                                   male    NaN   0      0      2664              7.2250   NaN    C
599  600          1         1       Duff Gordon, Sir. Cosmo Edmund ("Mr Morgan")        male    49.0  1      0      PC 17485          56.9292  A20    C

600 rows × 12 columns

Survived is the label we want to predict, so this is a binary (0/1) classification problem. Our fields are "Pclass", "Sex", "SibSp", "Parch", "Fare", and "Embarked".
Of these, "Pclass", "Sex", "SibSp", "Parch", and "Embarked" are categorical variables, while "Fare" is continuous. All of our features come from these 6 fields.
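
For reference, the model built below is the standard FFM prediction function: a linear part plus field-aware pairwise interactions (these become the line_section and fm_section of the FFM class later in this post). In LaTeX form:

\hat{y}(x) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n} \sum_{j=i+1}^{n} \left\langle \mathbf{v}_{i,f_j}, \mathbf{v}_{j,f_i} \right\rangle \, x_i x_j

where f_j is the field that feature j belongs to and \mathbf{v}_{i,f_j} is the embedding that feature i learns specifically for field f_j. Because every example here has exactly one active feature per field, the double sum effectively runs over pairs of the 6 fields.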

The code
import tensorflow as tf
from collections import namedtuple
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
Input parameters
tf.flags.DEFINE_string("opt_type", "Adam", "optimizer type (Adagrad, Adam, Ftrl, Momentum, RMSProp, SGD).")
tf.flags.DEFINE_string("train_file_path", "./train.csv", "train file path.")
tf.flags.DEFINE_string("test_file_path", "./test.csv", "train file path.")
tf.flags.DEFINE_string("label", "Survived", "target column name.")
tf.flags.DEFINE_string("activation", "relu", "deep mid activation function(tanh, relu, tanh, sigmoid).")
tf.flags.DEFINE_float("threshold", 0.5, "bi-classification threshold." )
tf.flags.DEFINE_string("loss_type", "log_loss", "bi-classification is log_loss, regression is mse.")
tf.flags.DEFINE_string("model_path", "./checkpoint/", "save model path.")
tf.flags.DEFINE_bool("use_deep", True, "Whether to use deep or not.")
tf.flags.DEFINE_string("model", "fm", "fm or ffm.")
tf.flags.DEFINE_list("layers", [30,30], "deep mid layers.")
tf.flags.DEFINE_list("category_columns", ['Pclass',"Sex","SibSp","Parch","Embarked"], "category columns.")
tf.flags.DEFINE_list("continuation_columns", ['Fare'], "continuation columns.")
tf.flags.DEFINE_float("lr", 0.01, "learning rate.")
tf.flags.DEFINE_float("line_output_keep_dropout", 0.9, "line output keep dropout in deep schema.")
tf.flags.DEFINE_float("fm_output_keep_dropout", 0.9, "fm output keep dropout in deep schema.")
tf.flags.DEFINE_float("deep_output_keep_dropout", 0.9, "deep output keep dropout in deep schema.")
tf.flags.DEFINE_float("deep_input_keep_dropout", 0.9, "deep input keep dropout in deep schema.")
tf.flags.DEFINE_float("deep_mid_keep_dropout", 0.8, "deep mid keep dropout in deep schema.")
tf.flags.DEFINE_integer("embedding_size", 3, "field embedding size")
tf.flags.DEFINE_bool("use_batch_normal", False, "Whether to use batch normal or not.")
tf.flags.DEFINE_integer("batch_size", 64, "batch size.")
tf.flags.DEFINE_integer("epoches", 1000, "epoches.")
tf.flags.DEFINE_integer("logging_level", 20, "tensorflow logging level.")
tf.flags.DEFINE_integer("seed", 20, "tensorflow seed num.")
FLAGS = tf.flags.FLAGS
tf.app.flags.DEFINE_string('f', '', 'kernel')  # dummy flag so flag parsing doesn't choke on Jupyter's -f argument
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
HParams = namedtuple(
  "HParams",
  [
    "opt_type",
    "threshold",
    "loss_type",
    "use_deep",
    "model",
    "layers",
    "lr",
    "fm_output_keep_dropout",
    "line_output_keep_dropout",
    "deep_input_keep_dropout",
    "deep_mid_keep_dropout",
    "deep_output_keep_dropout",
    "embedding_size",
    "use_batch_normal",
    "batch_size",
    "epoches",
    "field_nums",
    "feature_nums",
    "activation",
    "seed"
  ])

Define the parameter dictionary
def create_hparams(field_nums, feature_nums):
  return HParams(
    model=FLAGS.model,
    opt_type=FLAGS.opt_type,
    threshold=FLAGS.threshold,
    loss_type=FLAGS.loss_type,
    use_deep=FLAGS.use_deep,
    layers=FLAGS.layers,
    lr=FLAGS.lr,
    fm_output_keep_dropout=FLAGS.fm_output_keep_dropout,
    line_output_keep_dropout=FLAGS.line_output_keep_dropout,
    deep_input_keep_dropout=FLAGS.deep_input_keep_dropout,
    deep_output_keep_dropout=FLAGS.deep_output_keep_dropout,
    deep_mid_keep_dropout=FLAGS.deep_mid_keep_dropout,
    embedding_size=FLAGS.embedding_size,
    use_batch_normal=FLAGS.use_batch_normal,
    batch_size=FLAGS.batch_size,
    epoches=FLAGS.epoches,
    activation=FLAGS.activation,
    seed=FLAGS.seed,
    field_nums=field_nums,
    feature_nums=feature_nums
    )
Data processing
class FieldHandler(object):
    def __init__(self, train_file_path, test_file_path=None, category_columns=[], continuation_columns=[]):
        """
        train_file_path: train file path (required)
        test_file_path: None or test file path
        """
        self.train_file_path = None
        self.test_file_path = None
        self.feature_nums = 0
        self.field_dict = {}

        self.category_columns = category_columns
        self.continuation_columns = continuation_columns

        # validate the train file path type
        if not isinstance(train_file_path, str):
            raise ValueError("train file path must be a str")

        # make sure the train file exists
        if os.path.exists(train_file_path):
            self.train_file_path = train_file_path
        else:
            raise OSError("train file path doesn't exist!")

        # the test file path is optional, but must exist if given
        if test_file_path:
            if os.path.exists(test_file_path):
                self.test_file_path = test_file_path
            else:
                raise OSError("test file path doesn't exist!")

        # read the data
        self.read_data()

        # fill nulls in the categorical columns with "-1"
        # (this operates on a column-subset copy and triggers the SettingWithCopyWarning shown below)
        self.df[category_columns].fillna("-1", inplace=True)

        # build the field -> feature index mapping
        self.build_filed_dict()

        # fit a standard scaler on the continuous columns
        self.build_standard_scaler()

        self.field_nums = len(self.category_columns + self.continuation_columns)

    def build_filed_dict(self):

        for column in self.df.columns:
            if column in self.category_columns:
                cv = self.df[column].unique()
                self.field_dict[column] = dict(zip(cv, range(self.feature_nums, self.feature_nums + len(cv))))
                self.feature_nums += len(cv)
            else:
                self.field_dict[column] = self.feature_nums
                self.feature_nums += 1

    def read_data(self):

        if self.train_file_path and self.test_file_path:

            train_df = pd.read_csv(self.train_file_path)[self.category_columns + self.continuation_columns]
            test_df = pd.read_csv(self.test_file_path)[self.category_columns + self.continuation_columns]
            self.df = pd.concat([train_df, test_df])
        else:
            self.df = pd.read_csv(self.train_file_path)[self.category_columns + self.continuation_columns]

    def build_standard_scaler(self):

        if self.continuation_columns:
            self.standard_scaler = StandardScaler()
            self.standard_scaler.fit(self.df[self.continuation_columns].values)
        else:
            self.standard_scaler = None
fh = FieldHandler(train_file_path=FLAGS.train_file_path,
                  category_columns=FLAGS.category_columns,
                  continuation_columns=FLAGS.continuation_columns)
D:\windows-miniconda\miniconda\envs\jianbo\lib\site-packages\pandas\core\frame.py:4327: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
hparams = create_hparams(fh.field_nums, fh.feature_nums)
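To make the mapping concrete, here is a purely hypothetical sketch of what fh.field_dict ends up looking like; the real indices and value sets depend on the order in which unique values appear in train.csv and test.csv.

# Hypothetical illustration only -- actual indices depend on the data.
# Every distinct value of a categorical field gets its own global feature index;
# a continuous field ("Fare") gets a single feature index.
example_field_dict = {
    "Pclass":   {3: 0, 1: 1, 2: 2},
    "Sex":      {"male": 3, "female": 4},
    "SibSp":    {1: 5, 0: 6, 3: 7},          # one index per distinct SibSp value
    "Parch":    {0: 8, 1: 9, 2: 10},         # one index per distinct Parch value
    "Embarked": {"S": 11, "C": 12, "Q": 13},
    "Fare":     14,
}
# In this toy picture feature_nums would be 15, and field_nums is 6.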
Transform the data into the model input format
def transformation_data(file_path: str, field_hander: FieldHandler, label=None):
    """
    label: target column name
    """
    df_v = pd.read_csv(file_path)

    # read the label column and cast it to float32
    if label:
        if label in df_v.columns:
            labels = df_v[[label]].values.astype("float32")
        else:
            raise KeyError(f'label "{label}" doesn\'t exist')

    # keep only the categorical and continuous field columns
    df_v = df_v[field_hander.category_columns + field_hander.continuation_columns]

    # fill missing categorical values with "-1" and missing continuous values with -999
    # (again on a copy, hence the SettingWithCopyWarning below)
    df_v[field_hander.category_columns].fillna("-1", inplace=True)
    df_v[field_hander.continuation_columns].fillna(-999, inplace=True)

    # standardize the continuous columns with the scaler fitted in FieldHandler
    if field_hander.standard_scaler:
        df_v[field_hander.continuation_columns] = field_hander.standard_scaler.transform(
            df_v[field_hander.continuation_columns].values)

    df_i = df_v.copy()

    # df_i holds the feature indices: every field has its own set of features.
    # e.g. field1 has features feature1-1, feature1-2 and field2 has features
    # feature2-1, feature2-2, feature2-3; df_i numbers all of them globally, so
    # feature1-1, feature1-2, feature2-1, feature2-2, feature2-3 map to 0, 1, 2, 3, 4.
    # df_v holds the feature values: a categorical (one-hot) feature has value 1,
    # while a continuous feature keeps its own value.
    # Later we use df_i to look up each feature's embedding weights, so with many
    # features we only fetch the embeddings we need instead of multiplying a full
    # sparse one-hot vector, which keeps the computation small.
    for column in df_v.columns:

        if column in field_hander.category_columns:
            # map the categorical value to its global feature index in field_dict
            df_i[column] = df_i[column].map(field_hander.field_dict[column])
            # a categorical (one-hot) feature always has value 1
            df_v[column] = 1
        else:
            # a continuous field has a single feature index in field_dict,
            # and df_v keeps the actual value instead of 1
            df_i[column] = field_hander.field_dict[column]

    df_v = df_v.values.astype("float32")
    df_i = df_i.values.astype("int32")

    features = {
        "df_i": df_i,
        "df_v": df_v
    }

    if label:
        return features, labels
    return features, None
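
As a hypothetical illustration of the two outputs (reusing the toy indices sketched above), a single passenger with Pclass=3, Sex=male, SibSp=1, Parch=0, Embarked=S and Fare=7.25 would roughly become:

# Hypothetical single-row illustration; the indices follow the toy field_dict above
# and the Fare value is shown after standardization (the -0.52 is made up).
row_i = [0, 3, 5, 8, 11, 14]              # df_i: which feature is active in each field
row_v = [1.0, 1.0, 1.0, 1.0, 1.0, -0.52]  # df_v: 1 for categorical fields, scaled value for Fare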

features, labels = transformation_data(file_path=FLAGS.train_file_path, field_hander=fh, label=FLAGS.label)
test_features, test_labels = transformation_data(file_path=FLAGS.test_file_path, field_hander=fh, label=FLAGS.label)
D:\windows-miniconda\miniconda\envs\jianbo\lib\site-packages\pandas\core\frame.py:4327: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
D:\windows-miniconda\miniconda\envs\jianbo\lib\site-packages\pandas\core\frame.py:4327: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
Training and test datasets
def create_input_fn(features, label, batch_size=32, num_epochs=10):

    def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((features, label))
        dataset = dataset.shuffle(1000)  # shuffle the data; a larger buffer gives a more thorough shuffle
        dataset = dataset.repeat(num_epochs)  # repeat the dataset for the given number of epochs
        dataset = dataset.batch(batch_size)  # take batch_size rows in order; the last batch may be smaller
        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()
        return next_element

    return input_fn
train_input_fn = create_input_fn(features,label=labels,batch_size=hparams.batch_size,num_epochs=hparams.epoches)

test_input_fn = create_input_fn(test_features,label=test_labels,batch_size=hparams.batch_size,num_epochs=hparams.epoches)
Prediction dataset
def create_predict_input_fn(features, batch_size=32, num_epochs=10):

    def input_fn():

        dataset = tf.data.Dataset.from_tensor_slices((features))
        dataset = dataset.shuffle(1000)  # shuffle the data; a larger buffer gives a more thorough shuffle
        dataset = dataset.repeat(num_epochs)  # repeat the dataset for the given number of epochs
        dataset = dataset.batch(batch_size)  # take batch_size rows in order; the last batch may be smaller
        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()
        return next_element

    return input_fn
predict_input_fn = create_predict_input_fn(test_features,batch_size=hparams.batch_size,num_epochs=hparams.epoches)
Define the FFM model
class FFM(object):
    def __init__(self, hparams, df_i, df_v):
        # df_i, df_v  None * n
        self.hparams = hparams
        tf.set_random_seed(self.hparams.seed)
        self.line_result = self.line_section(df_i, df_v)
        self.fm_result = self.fm_section(df_i, df_v)
        print(self.line_result, self.fm_result)
        self.logits = self.line_result + self.fm_result


    def line_section(self, df_i, df_v):

        with tf.variable_scope("line"):

            weights = tf.get_variable("weights",
                                      shape=[self.hparams.feature_nums, 1],
                                      dtype=tf.float32,
                                      initializer=tf.initializers.glorot_uniform())  # f * 1
            batch_weights = tf.nn.embedding_lookup(weights, df_i)  # none * n * 1
            batch_weights = tf.squeeze(batch_weights, axis=2)  # None * n
            line_result = tf.multiply(df_v, batch_weights, name="line_w_x")  # none * n

            biase = tf.get_variable("biase",
                                    shape=[1, 1],
                                    dtype=tf.float32,
                                    initializer=tf.initializers.zeros())  # 1 * 1
            line_result = tf.add(tf.reduce_sum(line_result, axis=1, keepdims=True), biase)  # None,1
        return line_result

    def fm_section(self, df_i, df_v):
        with tf.variable_scope("fm"):
            embedding = tf.get_variable("embedding",
                                        shape=[self.hparams.field_nums,
                                               self.hparams.feature_nums,
                                               self.hparams.embedding_size],
                                        dtype=tf.float32,
                                        initializer=tf.initializers.random_normal())  # field * f * embedding_size
            fm_result = None
            for i in range(self.hparams.field_nums):
                for j in range(i + 1, self.hparams.field_nums):
                    vi_fj = tf.nn.embedding_lookup(embedding[j], df_i[:, i])  # None * embedding_size
                    vj_fi = tf.nn.embedding_lookup(embedding[i], df_i[:, j])  # None * embedding_size
                    wij = tf.multiply(vi_fj, vj_fi)

                    x_i = tf.expand_dims(df_v[:, i], 1)  # None * 1
                    x_j = tf.expand_dims(df_v[:, j], 1)  # None * 1

                    xij = tf.multiply(x_i, x_j)  # None * 1
                    if fm_result is None:
                        fm_result = tf.reduce_sum(tf.multiply(wij, xij), axis=1, keepdims=True)
                    else:
                        fm_result += tf.reduce_sum(tf.multiply(wij, xij), axis=1, keepdims=True)

            fm_result = tf.reduce_sum(fm_result, axis=1, keepdims=True)
        return fm_result
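
To spell out what the nested loops in fm_section compute, here is an equivalent NumPy sketch for a single example (my own illustration, not part of the original code): for every pair of fields it takes the dot product of the two field-aware embeddings and scales it by the two feature values.

import numpy as np

def ffm_interaction(embedding, feat_idx, feat_val):
    """Field-aware pairwise interaction term for one example.

    embedding: array of shape [field_nums, feature_nums, embedding_size]
    feat_idx:  one row of df_i -- the global feature index active in each field
    feat_val:  one row of df_v -- the value of that feature
    """
    n = len(feat_idx)
    total = 0.0
    for i in range(n):
        for j in range(i + 1, n):
            vi_fj = embedding[j, feat_idx[i]]  # embedding of feature i, learned for field j
            vj_fi = embedding[i, feat_idx[j]]  # embedding of feature j, learned for field i
            total += np.dot(vi_fj, vj_fi) * feat_val[i] * feat_val[j]
    return total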
Create the model function
def create_model_fn(model):

    def model_fn(features, labels, params, mode):

        if params.threshold:
            threshold = params.threshold
        else:
            threshold = 0.5

        df_i = features['df_i']
        df_v = features['df_v']

        logits = model(params, df_i, df_v).logits

        if mode == tf.estimator.ModeKeys.TRAIN:
            loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits), name="loss")
            train_op = tf.contrib.layers.optimize_loss(
                loss=loss,
                global_step=tf.train.get_global_step(),
                learning_rate=params.lr,
                clip_gradients=10.0,
                optimizer=params.opt_type
            )

            pre = tf.nn.sigmoid(logits, name="sigmoid")
            auc = tf.metrics.auc(labels=labels, predictions=pre, name="auc")
            accuracy = tf.metrics.accuracy(labels=labels, predictions=tf.cast(pre > threshold, tf.float32),
                                           name="accuracy")

            tf.summary.scalar('train_accuracy', accuracy[1])
            tf.summary.scalar('train_auc', auc[1])
            return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

        if mode == tf.estimator.ModeKeys.EVAL:
            loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits), name="loss")
            pre = tf.nn.sigmoid(logits, name="sigmoid")
            predict = tf.cast(pre > threshold, dtype=tf.int32)
            auc = tf.metrics.auc(labels=labels, predictions=pre, name="auc")
            accuracy = tf.metrics.accuracy(labels=labels, predictions=tf.cast(pre > threshold, tf.float32),
                                           name="accuracy")

            metrics = {
                "auc": auc,
                "accuracy": accuracy
            }

            return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

        if mode == tf.estimator.ModeKeys.PREDICT:
            pre = tf.nn.sigmoid(logits, name="sigmoid")
            predict = tf.cast(pre > threshold, dtype=tf.int32)
            predictions = {
                "predict_pro": pre,
                "predict": predict
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    return model_fn
model_fn = create_model_fn(FFM)
Define the model training graph
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir=FLAGS.model_path,
    params=hparams,
    config=tf.estimator.RunConfig(
        tf_random_seed=hparams.seed,
        log_step_count_steps=500
    )
)

show_dict = {
    "loss": "loss",
    "accuracy": "accuracy/value",
    "auc": "auc/value"
}

log_hook = tf.train.LoggingTensorHook(show_dict, every_n_iter=100)
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, hooks=[log_hook])
eval_spec = tf.estimator.EvalSpec(input_fn=test_input_fn, )
Start training the model
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Tensor("line/Add:0", shape=(?, 1), dtype=float32) Tensor("fm/Sum_15:0", shape=(?, 1), dtype=float32)
Tensor("line/Add:0", shape=(?, 1), dtype=float32) Tensor("fm/Sum_15:0", shape=(?, 1), dtype=float32)





({'accuracy': 0.7834375,
  'auc': 0.81081283,
  'loss': 0.6182821,
  'global_step': 9375},
 [])
Prediction
results_iterator = estimator.predict(input_fn=predict_input_fn)
Print the first six prediction results
for index ,result in enumerate(results_iterator):
    if index>5:
        break
    print(result)
Tensor("line/Add:0", shape=(?, 1), dtype=float32) Tensor("fm/Sum_15:0", shape=(?, 1), dtype=float32)
{'predict_pro': array([0.9999973], dtype=float32), 'predict': array([1])}
{'predict_pro': array([0.92061496], dtype=float32), 'predict': array([1])}
{'predict_pro': array([0.10704267], dtype=float32), 'predict': array([0])}
{'predict_pro': array([0.45321018], dtype=float32), 'predict': array([0])}
{'predict_pro': array([0.11381763], dtype=float32), 'predict': array([0])}
{'predict_pro': array([0.36696994], dtype=float32), 'predict': array([0])}