Unverified commit 4d33a3f0 authored by zhang wenhui, committed by GitHub

add mmoe (#4181)

Parent 27e0706c
# MMoE
## Introduction
MMoE is a classic multi-task model. The original paper, [Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts](https://www.kdd.org/kdd2018/accepted-papers/view/modeling-task-relationships-in-multi-task-learning-with-multi-gate-mixture-), was published at KDD 2018.
Multi-task models improve the learning efficiency and quality of each task by learning the relationships and differences among tasks. Multi-task learning commonly adopts a shared-bottom structure, in which the tasks share the bottom hidden layers. This structure inherently reduces the risk of overfitting, but its effectiveness can suffer when tasks differ or their data distributions diverge. The paper proposes the Multi-gate Mixture-of-Experts (MMoE) structure for multi-task learning: MMoE explicitly models task relationships and learns task-specific functions on top of a shared representation, without significantly increasing the number of parameters. (See also: https://zhuanlan.zhihu.com/p/55752344)
We implement the core idea of MMoE for a real industrial scenario.
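The following is a minimal NumPy sketch (not part of the training script) of the per-task gating idea: each task has its own softmax gate over a set of shared experts, and the gated sum feeds that task's tower. The shapes below are illustrative assumptions; only the expert and task counts match the defaults in the script.

```
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

batch, feat_dim, expert_dim = 4, 16, 3   # illustrative shapes
expert_num, task_num = 8, 3              # matches the defaults in mmoe_train.py

x = np.random.rand(batch, feat_dim)

# Shared experts: here each expert is a single linear map standing in for a small tower.
expert_w = np.random.rand(expert_num, feat_dim, expert_dim)
experts = np.stack([x @ expert_w[i] for i in range(expert_num)],
                   axis=1)                                   # [batch, expert_num, expert_dim]

# One softmax gate per task decides how much each expert contributes to that task.
gate_w = np.random.rand(task_num, feat_dim, expert_num)
for k in range(task_num):
    gate = softmax(x @ gate_w[k])                            # [batch, expert_num]
    task_input = (experts * gate[:, :, None]).sum(axis=1)    # [batch, expert_dim]
    # task_input then goes into task k's own tower (omitted here).
```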
## Data
We use randomly generated data for training; adapt the data-related parts to your own dataset as needed.
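For reference, the feed consumed by mmoe_train.py looks roughly like the sketch below: one sparse id feature "a" plus one 0/1 label per task, all int64 tensors of shape [batch_size, 1]. The batch size here is illustrative; the script itself draws all four tensors from {0, 1}, and any id below dict_dim (1000) is a valid value for "a".

```
import numpy as np

batch_size = 5  # illustrative; matches the default in mmoe_train.py
feed = {
    "a": np.random.randint(1000, size=(batch_size, 1)).astype("int64"),
    "label_like": np.random.randint(2, size=(batch_size, 1)).astype("int64"),
    "label_comment": np.random.randint(2, size=(batch_size, 1)).astype("int64"),
    "label_share": np.random.randint(2, size=(batch_size, 1)).astype("int64"),
}
```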
## Training
```
python mmoe_train.py
```
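The script enables GPU training by default (`use_cuda = True`). To run on CPU only, flipping that flag in mmoe_train.py should be enough:

```
use_cuda = False  # fall back to CPU when no GPU is available
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
```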
## Future work
1. Add an inference/prediction part.
2. Add results on public datasets.
mmoe_train.py

import paddle.fluid as fluid
import numpy as np

# Vocabulary size of the sparse input feature and its embedding width.
dict_dim = 1000
emb_dim = 64
def fc_layers(input, layers, acts, prefix):
    """Stack of fully-connected layers with fan-in-scaled normal initialization."""
    fc_layers_input = [input]
    fc_layers_size = layers
    fc_layers_act = acts
    init_range = 0.2
    # Scale each layer's initializer by init_range / sqrt(fan_in).
    scales_tmp = [input.shape[1]] + fc_layers_size
    scales = []
    for i in range(len(scales_tmp)):
        scales.append(init_range / (scales_tmp[i]**0.5))

    for i in range(len(fc_layers_size)):
        name = prefix + "_" + str(i)
        fc = fluid.layers.fc(
            input=fc_layers_input[-1],
            size=fc_layers_size[i],
            act=fc_layers_act[i],
            param_attr=fluid.ParamAttr(
                learning_rate=1.0,
                initializer=fluid.initializer.NormalInitializer(
                    loc=0.0, scale=1.0 * scales[i])),
            bias_attr=fluid.ParamAttr(
                learning_rate=1.0,
                initializer=fluid.initializer.NormalInitializer(
                    loc=0.0, scale=1.0 * scales[i])),
            name=name)
        fc_layers_input.append(fc)
    return fc_layers_input[-1]
def mmoe_layer(inputs, expert_num=8, gate_num=3):
    # Shared experts: each expert is a small fc tower producing expert_nn[-1] units.
    expert_out = []
    expert_nn = [3]
    expert_act = ['relu']
    for i in range(0, expert_num):
        cur_expert = fc_layers(inputs, expert_nn, expert_act,
                               'expert_' + str(i))
        expert_out.append(cur_expert)
    expert_concat = fluid.layers.concat(expert_out, axis=1)
    expert_concat = fluid.layers.reshape(expert_concat,
                                         [-1, expert_num, expert_nn[-1]])

    # One softmax gate per task weights the experts, followed by a task-specific tower.
    outs = []
    for i in range(0, gate_num):
        cur_gate = fluid.layers.fc(input=inputs,
                                   size=expert_num,
                                   act='softmax',
                                   name='gate_' + str(i))
        # Weight each expert's output by its gate value, then sum over experts.
        cur_gate_expert = fluid.layers.elementwise_mul(
            expert_concat, cur_gate, axis=0)
        cur_gate_expert = fluid.layers.reduce_sum(cur_gate_expert, dim=1)
        cur_fc = fc_layers(cur_gate_expert, [64, 32, 16, 1],
                           ['relu', 'relu', 'relu', None], 'out_' + str(i))
        outs.append(cur_fc)
    return outs
def model():
    # Three binary labels, one per task: like / comment / share.
    label_like = fluid.layers.data(
        name="label_like",
        shape=[-1, 1],
        dtype="int64",
        lod_level=0,
        append_batch_size=False)
    label_comment = fluid.layers.data(
        name="label_comment",
        shape=[-1, 1],
        dtype="int64",
        lod_level=0,
        append_batch_size=False)
    label_share = fluid.layers.data(
        name="label_share",
        shape=[-1, 1],
        dtype="int64",
        lod_level=0,
        append_batch_size=False)

    # A single sparse id feature fed through an embedding as the shared input.
    a_data = fluid.layers.data(
        name="a", shape=[-1, 1], dtype="int64", append_batch_size=False)
    emb = fluid.layers.embedding(input=a_data, size=[dict_dim, emb_dim])

    outs = mmoe_layer(emb, expert_num=8, gate_num=3)

    # Clip the logits before the sigmoid to keep log_loss numerically stable.
    output_like = fluid.layers.sigmoid(
        fluid.layers.clip(
            outs[0], min=-15.0, max=15.0), name="output_like")
    output_comment = fluid.layers.sigmoid(
        fluid.layers.clip(
            outs[1], min=-15.0, max=15.0), name="output_comment")
    output_share = fluid.layers.sigmoid(
        fluid.layers.clip(
            outs[2], min=-15.0, max=15.0), name="output_share")

    cost_like = fluid.layers.log_loss(
        input=output_like,
        label=fluid.layers.cast(
            x=label_like, dtype='float32'))
    cost_comment = fluid.layers.log_loss(
        input=output_comment,
        label=fluid.layers.cast(
            x=label_comment, dtype='float32'))
    cost_share = fluid.layers.log_loss(
        input=output_share,
        label=fluid.layers.cast(
            x=label_share, dtype='float32'))

    avg_cost_like = fluid.layers.mean(x=cost_like)
    avg_cost_comment = fluid.layers.mean(x=cost_comment)
    avg_cost_share = fluid.layers.mean(x=cost_share)

    # The total loss is the sum of the three per-task losses.
    cost = avg_cost_like + avg_cost_comment + avg_cost_share
    return cost, [a_data, label_like, label_comment, label_share]
batch_size = 5
loss, data_list = model()
sgd = fluid.optimizer.SGD(learning_rate=0.001)
sgd.minimize(loss)

use_cuda = True
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=data_list, place=place)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Train on randomly generated batches; replace this with a real data reader as needed.
for batch_id in range(100):
    data = [
        np.random.randint(
            2, size=(batch_size, 1)).astype('int64') for i in range(4)
    ]
    loss_data, = exe.run(fluid.default_main_program(),
                         feed={
                             "a": data[0],
                             "label_like": data[1],
                             "label_comment": data[2],
                             "label_share": data[3]
                         },
                         fetch_list=[loss.name])
    print(batch_id, " loss:", float(np.array(loss_data)))