From 4d33a3f06b7d07d777df85b9e2588b16e178f6e2 Mon Sep 17 00:00:00 2001
From: zhang wenhui
Date: Thu, 9 Jan 2020 13:45:38 +0800
Subject: [PATCH] add mmoe (#4181)

---
 PaddleRec/multi-task/MMoE/README.md     |  24 ++++
 PaddleRec/multi-task/MMoE/mmoe_train.py | 142 ++++++++++++++++++++++++
 2 files changed, 166 insertions(+)
 create mode 100644 PaddleRec/multi-task/MMoE/mmoe_train.py

diff --git a/PaddleRec/multi-task/MMoE/README.md b/PaddleRec/multi-task/MMoE/README.md
index 5e601c99..f52fec4b 100644
--- a/PaddleRec/multi-task/MMoE/README.md
+++ b/PaddleRec/multi-task/MMoE/README.md
@@ -1 +1,25 @@
 # MMoE
+
+## Introduction
+
+MMoE is a classic multi-task model. The original paper, [Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts](https://www.kdd.org/kdd2018/accepted-papers/view/modeling-task-relationships-in-multi-task-learning-with-multi-gate-mixture-), was published at KDD 2018.
+
+By learning the relationships and differences between tasks, a multi-task model can improve the learning efficiency and quality of each task. Multi-task learning frameworks widely adopt a shared-bottom structure, in which all tasks share the bottom hidden layers. This structure inherently reduces the risk of overfitting, but its effectiveness can suffer when tasks differ or their data distributions diverge. The paper proposes the Multi-gate Mixture-of-Experts (MMoE) structure for multi-task learning: MMoE models task relationships explicitly and learns task-specific functions on top of a shared representation, without significantly increasing the number of parameters. (See also: https://zhuanlan.zhihu.com/p/55752344)
+
+We implement the core idea of MMoE based on a real industrial scenario.
+
+## Data
+
+We use randomly generated data for training; adapt the data section to your own dataset.
+
+## Training
+
+```
+python mmoe_train.py
+```
+
+## Future work
+
+1. Add an inference/prediction stage.
+
+2. Add results on public datasets.
diff --git a/PaddleRec/multi-task/MMoE/mmoe_train.py b/PaddleRec/multi-task/MMoE/mmoe_train.py
new file mode 100644
index 00000000..4f915eab
--- /dev/null
+++ b/PaddleRec/multi-task/MMoE/mmoe_train.py
@@ -0,0 +1,142 @@
+import paddle.fluid as fluid
+import numpy as np
+
+dict_dim = 1000
+emb_dim = 64
+
+
+def fc_layers(input, layers, acts, prefix):
+    # Stack of FC layers; each layer's weights use a normal initializer
+    # whose scale shrinks with the layer's fan-in.
+    fc_layers_input = [input]
+    fc_layers_size = layers
+    fc_layers_act = acts
+    init_range = 0.2
+    scales_tmp = [input.shape[1]] + fc_layers_size
+    scales = []
+    for i in range(len(scales_tmp)):
+        scales.append(init_range / (scales_tmp[i]**0.5))
+    for i in range(len(fc_layers_size)):
+        name = prefix + "_" + str(i)
+        fc = fluid.layers.fc(
+            input=fc_layers_input[-1],
+            size=fc_layers_size[i],
+            act=fc_layers_act[i],
+            param_attr=fluid.ParamAttr(
+                learning_rate=1.0,
+                initializer=fluid.initializer.NormalInitializer(
+                    loc=0.0, scale=1.0 * scales[i])),
+            bias_attr=fluid.ParamAttr(
+                learning_rate=1.0,
+                initializer=fluid.initializer.NormalInitializer(
+                    loc=0.0, scale=1.0 * scales[i])),
+            name=name)
+        fc_layers_input.append(fc)
+    return fc_layers_input[-1]
+
+
+def mmoe_layer(inputs, expert_num=8, gate_num=3):
+    # Expert towers: every expert sees the same input and emits a small
+    # representation of expert_nn[-1] units.
+    expert_out = []
+    expert_nn = [3]
+    expert_act = ['relu']
+    for i in range(0, expert_num):
+        cur_expert = fc_layers(inputs, expert_nn, expert_act,
+                               'expert_' + str(i))
+        expert_out.append(cur_expert)
+    expert_concat = fluid.layers.concat(expert_out, axis=1)
+    expert_concat = fluid.layers.reshape(expert_concat,
+                                         [-1, expert_num, expert_nn[-1]])
+
+    outs = []
+    for i in range(0, gate_num):
+        # One softmax gate per task weights the experts; a task-specific
+        # tower then maps the gated mixture to a single logit.
+        cur_gate = fluid.layers.fc(input=inputs,
+                                   size=expert_num,
+                                   act='softmax',
+                                   name='gate_' + str(i))
+        cur_gate_expert = fluid.layers.elementwise_mul(
+            expert_concat, cur_gate, axis=0)
+        cur_gate_expert = fluid.layers.reduce_sum(cur_gate_expert, dim=1)
+        cur_fc = fc_layers(cur_gate_expert, [64, 32, 16, 1],
+                           ['relu', 'relu', 'relu', None], 'out_' + str(i))
+        outs.append(cur_fc)
+    return outs
+
+
+def model():
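+    # Three binary task labels (like, comment, share), one per gate/tower.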
+    label_like = fluid.layers.data(
+        name="label_like",
+        shape=[-1, 1],
+        dtype="int64",
+        lod_level=0,
+        append_batch_size=False)
+    label_comment = fluid.layers.data(
+        name="label_comment",
+        shape=[-1, 1],
+        dtype="int64",
+        lod_level=0,
+        append_batch_size=False)
+    label_share = fluid.layers.data(
+        name="label_share",
+        shape=[-1, 1],
+        dtype="int64",
+        lod_level=0,
+        append_batch_size=False)
+
+    a_data = fluid.layers.data(
+        name="a", shape=[-1, 1], dtype="int64", append_batch_size=False)
+    emb = fluid.layers.embedding(input=a_data, size=[dict_dim, emb_dim])
+
+    outs = mmoe_layer(emb, expert_num=8, gate_num=3)
+
+    # Clip the logits before the sigmoid so log_loss stays numerically stable.
+    output_like = fluid.layers.sigmoid(
+        fluid.layers.clip(
+            outs[0], min=-15.0, max=15.0), name="output_like")
+    output_comment = fluid.layers.sigmoid(
+        fluid.layers.clip(
+            outs[1], min=-15.0, max=15.0), name="output_comment")
+    output_share = fluid.layers.sigmoid(
+        fluid.layers.clip(
+            outs[2], min=-15.0, max=15.0), name="output_share")
+
+    cost_like = fluid.layers.log_loss(
+        input=output_like,
+        label=fluid.layers.cast(
+            x=label_like, dtype='float32'))
+    cost_comment = fluid.layers.log_loss(
+        input=output_comment,
+        label=fluid.layers.cast(
+            x=label_comment, dtype='float32'))
+    cost_share = fluid.layers.log_loss(
+        input=output_share,
+        label=fluid.layers.cast(
+            x=label_share, dtype='float32'))
+
+    avg_cost_like = fluid.layers.mean(x=cost_like)
+    avg_cost_comment = fluid.layers.mean(x=cost_comment)
+    avg_cost_share = fluid.layers.mean(x=cost_share)
+
+    # Single training objective: the sum of the three per-task losses.
+    cost = avg_cost_like + avg_cost_comment + avg_cost_share
+    return cost, [a_data, label_like, label_comment, label_share]
+
+
+batch_size = 5
+
+loss, data_list = model()
+sgd = fluid.optimizer.SGD(learning_rate=0.001)
+sgd.minimize(loss)
+use_cuda = True
+place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+feeder = fluid.DataFeeder(feed_list=data_list, place=place)
+exe = fluid.Executor(place)
+exe.run(fluid.default_startup_program())
+for batch_id in range(100):
+    # Random demo batch: one integer feature column and three binary labels.
+    data = [
+        np.random.randint(
+            2, size=(batch_size, 1)).astype('int64') for i in range(4)
+    ]
+    loss_data, = exe.run(fluid.default_main_program(),
+                         feed={
+                             "a": data[0],
+                             "label_like": data[1],
+                             "label_comment": data[2],
+                             "label_share": data[3]
+                         },
+                         fetch_list=[loss.name])
+    print(batch_id, " loss:", float(np.array(loss_data)))
--
GitLab
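For reference, the per-task mixing that `mmoe_layer` performs can be written out in a few lines of plain NumPy. The sketch below is illustrative only and is not part of the patch: the names (`expert_out`, `gate_logits`, `mixed`) are hypothetical stand-ins, and the shapes assume the defaults above (`expert_num=8` experts, each emitting an `expert_nn[-1]=3`-dimensional output).

```python
import numpy as np

def softmax(z):
    # Row-wise softmax, shifted for numerical stability.
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

batch, num_experts, expert_dim = 5, 8, 3  # mirrors expert_num=8, expert_nn=[3]
expert_out = np.random.randn(batch, num_experts, expert_dim)  # stand-in for expert_concat
gate_logits = np.random.randn(batch, num_experts)             # stand-in for one gate FC

gate = softmax(gate_logits)                          # (batch, num_experts), rows sum to 1
mixed = (expert_out * gate[:, :, None]).sum(axis=1)  # (batch, expert_dim)
# `mixed` corresponds to cur_gate_expert after reduce_sum; each task's tower
# (the [64, 32, 16, 1] fc_layers) consumes its own such mixture.
print(mixed.shape)  # (5, 3)
```

Because every task owns its own softmax gate, weakly related tasks can learn to weight the experts differently; this is what distinguishes MMoE from a plain shared-bottom model.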