mmoe_layer

该博客介绍了如何使用TensorFlow实现MMOE(Multi-gate Mixture-of-Experts)层,这是一种用于多任务学习的模型。MMOE层通过多个专家网络和门控网络来处理不同任务,每个任务可以利用多个专家的输出,并通过门控网络加权求和得到最终结果。文章详细阐述了MMOE层的结构、参数配置、权重初始化、激活函数等,并给出了实例代码演示其工作原理。
#!usr/bin/env python
# coding=utf-8

import numpy as np
import pandas as pd
import datetime
import itertools
import tensorflow as tf
from tensorflow.keras.layers import *
import tensorflow.keras.backend as K
from tensorflow.keras import layers

class MMoELayer(Layer):
    """
    The Multi-gate Mixture-of-Experts layer in the MMOE model.

    Input shape
      - 2D tensor with shape: ``(batch_size, units)``.
    Output shape
      - A list with **num_tasks** elements, each a 2D tensor with shape:
        ``(batch_size, units_experts)``.
    Arguments
      - **units_experts**: integer, the output dimension of each expert
        (and therefore of each task output of this layer).
      - **num_experts**: integer, the number of expert networks.
      - **num_tasks**: integer, the number of tasks, equal to the number of
        outputs (one softmax gate per task).
      - **use_expert_bias** / **use_gate_bias**: whether the expert / gate
        dense transforms use a bias term.
      - **expert_activation** / **gate_activation**: activation names for the
        expert networks and the gates (gates default to ``'softmax'`` so the
        expert weights of each task sum to 1).
      - remaining arguments: standard Keras initializer / regularizer /
        constraint options for the expert and gate kernels and biases.
    References
      - [Jiaqi Ma, Zhe Zhao, Xinyang Yi, et al. Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts[C]](https://dl.acm.org/doi/10.1145/3219819.3220007)
    """

    def __init__(self, units_experts, num_experts, num_tasks,
                 use_expert_bias=True, use_gate_bias=True, expert_activation='relu', gate_activation='softmax',
                 expert_bias_initializer='zeros', gate_bias_initializer='zeros', expert_bias_regularizer=None,
                 gate_bias_regularizer=None, expert_bias_constraint=None, gate_bias_constraint=None,
                 expert_kernel_initializer='VarianceScaling', gate_kernel_initializer='VarianceScaling',
                 expert_kernel_regularizer=None, gate_kernel_regularizer=None, expert_kernel_constraint=None,
                 gate_kernel_constraint=None, activity_regularizer=None, **kwargs):
        super(MMoELayer, self).__init__(**kwargs)

        # Store the layer configuration on the instance so that build()/call()
        # (and get_config()) can use it.
        # NOTE(review): the original source was truncated at this point (a
        # dangling bare ``self`` statement); only the constructor state is
        # reconstructed here — the build()/call() implementation is not
        # visible in this chunk and must be confirmed against the full file.
        self.units_experts = units_experts
        self.num_experts = num_experts
        self.num_tasks = num_tasks
        self.use_expert_bias = use_expert_bias
        self.use_gate_bias = use_gate_bias
        self.expert_activation = expert_activation
        self.gate_activation = gate_activation
        self.expert_bias_initializer = expert_bias_initializer
        self.gate_bias_initializer = gate_bias_initializer
        self.expert_bias_regularizer = expert_bias_regularizer
        self.gate_bias_regularizer = gate_bias_regularizer
        self.expert_bias_constraint = expert_bias_constraint
        self.gate_bias_constraint = gate_bias_constraint
        self.expert_kernel_initializer = expert_kernel_initializer
        self.gate_kernel_initializer = gate_kernel_initializer
        self.expert_kernel_regularizer = expert_kernel_regularizer
        self.gate_kernel_regularizer = gate_kernel_regularizer
        self.expert_kernel_constraint = expert_kernel_constraint
        self.gate_kernel_constraint = gate_kernel_constraint
        self.activity_regularizer = activity_regularizer
class MMoEModel(Model):
    """
    Two-task (CTR / CVR) MMOE model with three shared experts.

    Each expert is a ``MyTower`` (defined elsewhere in this file; per the
    original comment it contains Dropout and BatchNorm).  Each task has its
    own softmax gate over the three experts; the gated, weighted sum of the
    expert outputs feeds a per-task output tower and a sigmoid output unit.

    Arguments
      - **tower_size1**: size passed to each shared expert tower.
      - **tower_size2**: size passed to each task-specific output tower.
      - **name**: Keras model name.

    ``call`` returns a dict with keys ``'ctr'``, ``'cvr'`` (each a
    ``(batch, 1)`` sigmoid output) and ``'gate_output_cvr'`` (the CVR gate
    weights, shape ``(batch, 1, 3)`` after ``expand_dims``).
    """

    def __init__(self, tower_size1, tower_size2, name):
        super(MMoEModel, self).__init__(name=name)
        # Three shared expert networks (expert1/2/3); each expert is a
        # MyTower containing Dropout and BatchNorm.
        self.expert1 = MyTower(tower_size1, name='expert1')
        self.expert2 = MyTower(tower_size1, name='expert2')
        self.expert3 = MyTower(tower_size1, name='expert3')
        # self.expert4 = MyTower(tower_size1, name='expert4')
        # One softmax gate per task; 3 units = one weight per expert.
        self.gate_ctr = Dense(3, activation='softmax', name='gate_ctr')
        self.gate_cvr = Dense(3, activation='softmax', name='gate_cvr')
        # NOTE(review): dropout applied to the gate's softmax output means the
        # surviving gate weights no longer sum to 1 during training — confirm
        # this is intentional regularization.
        self.dropout_ctr = Dropout(0.25)
        self.dropout_cvr = Dropout(0.25)
        self.bn_layer_1 = BatchNormalization(name='batch_norm_1')
        self.bn_layer_2 = BatchNormalization(name='batch_norm_2')
        self.bn_layer_3 = BatchNormalization(name='batch_norm_3')
        self.bn_layer_4 = BatchNormalization(name='batch_norm_4')
        self.bn_layer_5 = BatchNormalization(name='batch_norm_5')
        self.bn_layer_6 = BatchNormalization(name='batch_norm_6')
        self.bn_layer_ctr = BatchNormalization(name='batch_norm_ctr')
        self.bn_layer_cvr = BatchNormalization(name='batch_norm_cvr')
        self.output_tower_ctr = MyTower(tower_size2, name='output_tower_ctr')
        self.output_tower_cvr = MyTower(tower_size2, name='output_tower_cvr')
        self.output_layer_ctr = Dense(1, activation='sigmoid', name='output_layer_ctr')
        self.output_layer_cvr = Dense(1, activation='sigmoid', name='output_layer_cvr')

    def call(self, inputs, training=None):
        # Shared experts, each followed by its own BatchNorm.
        share_output1 = self.expert1(inputs)
        share_output1 = self.bn_layer_1(share_output1)
        share_output2 = self.expert2(inputs)
        share_output2 = self.bn_layer_2(share_output2)
        share_output3 = self.expert3(inputs)
        share_output3 = self.bn_layer_3(share_output3)
        # share_output4 = self.expert4(inputs)
        # share_output4 = self.bn_layer_4(share_output4)
        # expert_outputs = tf.stack([share_output1, share_output2, share_output3, share_output4], axis=-1)
        # Stack experts on a trailing axis: (batch, units, num_experts).
        expert_outputs = tf.stack([share_output1, share_output2, share_output3], axis=-1)
        # CTR gate: BN(inputs) -> softmax gate -> dropout, then broadcast the
        # (batch, 1, 3) weights over the expert axis and reduce.
        gate_output_ctr = self.dropout_ctr(self.gate_ctr(self.bn_layer_5(inputs)))
        gate_output_ctr = tf.expand_dims(gate_output_ctr, axis=1)
        share_output_ctr = tf.reduce_sum(gate_output_ctr * expert_outputs, axis=-1)
        share_output_ctr = self.bn_layer_ctr(share_output_ctr)
        # CVR gate, same structure with its own BN/gate/dropout.
        gate_output_cvr = self.dropout_cvr(self.gate_cvr(self.bn_layer_6(inputs)))
        gate_output_cvr = tf.expand_dims(gate_output_cvr, axis=1)
        share_output_cvr = tf.reduce_sum(gate_output_cvr * expert_outputs, axis=-1)
        share_output_cvr = self.bn_layer_cvr(share_output_cvr)
        # Task-specific towers and sigmoid heads.
        ctr_output = self.output_tower_ctr(share_output_ctr)
        ctr_output = self.output_layer_ctr(ctr_output)
        cvr_output = self.output_tower_cvr(share_output_cvr)
        cvr_output = self.output_layer_cvr(cvr_output)
        # NOTE(review): ``training`` is not forwarded explicitly to the
        # Dropout/BatchNorm sublayers; Keras propagates it implicitly for
        # subclassed models on recent TF versions — confirm target TF version.
        return {'ctr': ctr_output, 'cvr': cvr_output, 'gate_output_cvr': gate_output_cvr}
最新发布
09-16
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值