diff --git a/LICENSE b/LICENSE index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 100644 --- a/LICENSE +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..96811af3ade79f0e27ef9cd231e78b69997995b4 --- /dev/null +++ b/README.md @@ -0,0 +1,217 @@ +

+[PaddleRec logo]
+
+[Badges: Release | License | Slack]
+
+## What is PaddleRec
+

+- A **one-stop, out-of-the-box toolkit** of search and recommendation models from the PaddlePaddle ecosystem
+- A full-pipeline solution for beginners, developers, and researchers, covering everything from research and training to inference and deployment
+- A recommendation/search algorithm library spanning semantic understanding, recall, pre-ranking, ranking, multi-task learning, fusion, and more
+- Customize options in a **yaml** config to quickly get started with single-machine training, large-scale distributed training, offline inference, and online deployment
+

+## PaddleRec Overview
+
+[PaddleRec overview diagram]
+

+## Recommender System: Pipeline Overview
+
+[Recommender system pipeline overview diagram]
+

+## Easy Installation

+
+### Environment Requirements
+* Python 2.7 / 3.5 / 3.6 / 3.7
+* PaddlePaddle >= 1.7.2
+* OS: Windows/Mac/Linux
+
+  > On Windows, only single-machine training is currently available; Linux is recommended
+
+### Installation
+
+- Method 1: install via pip
+  ```bash
+  python -m pip install paddle-rec
+  ```
+
+- Method 2: compile and install from source
+
+  1. Install PaddlePaddle  **Note: a PaddlePaddle version >= 1.7.2 is required**
+
+     ```shell
+     python -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple
+     ```
+
+  2. Install PaddleRec from source
+
+     ```shell
+     git clone https://github.com/PaddlePaddle/PaddleRec/
+     cd PaddleRec
+     python setup.py install
+     ```
+
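+A quick way to verify the environment (a minimal sanity check; it assumes nothing beyond the packages installed above) is to print the Paddle version and run Paddle's built-in install check:
+
+```python
+# Sanity check: the version should print >= 1.7.2, and run_check executes a
+# tiny training program to confirm Paddle works on this machine.
+import paddle
+import paddle.fluid as fluid
+import paddlerec  # should import cleanly once PaddleRec is installed
+
+print(paddle.__version__)
+fluid.install_check.run_check()
+```
+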

+## Quick Start

+
+### Run a built-in model with its default configuration
+
+The framework ships with multiple built-in models, and a single command starts single-machine training or locally simulated distributed training with any of them.
+  > Locally simulated distributed training (`local_cluster`) is a parameter-server setup with `1 server + 1 trainer`
+
+We take the `dnn` ranking model as an example of basic PaddleRec usage. The training data comes from the [Criteo dataset](https://www.kaggle.com/c/criteo-display-ad-challenge/), from which we sampled 100 lines so you can quickly walk through the complete PaddleRec workflow.
+
+```bash
+# single-machine training on CPU
+python -m paddlerec.run -m paddlerec.models.rank.dnn
+```
+
+### Run a built-in model with a custom configuration
+
+If you reuse a built-in model and modify its **yaml** configuration file, e.g. changing hyperparameters or reconfiguring the data, you can run that yaml file directly with paddlerec.
+
+Taking the dnn model as an example, in the paddlerec code directory:
+```bash
+cd paddlerec
+```
+
+Edit the dnn model's [hyperparameter configuration](./models/rank/dnn/config.yaml), for example reducing the number of training epochs from 10 to 5:
+```yaml
+train:
+  # epochs: 10
+  epochs: 5
+```
+
+On Linux, you can edit the yaml file with a text editor such as `vim`:
+
+```bash
+vim ./models/rank/dnn/config.yaml
+# press i to enter insert mode
+# edit the yaml configuration
+# when finished, press esc to leave insert mode
+# type :wq to save the file and quit
+```
+
+After finishing the changes to `models/rank/dnn/config.yaml`, run the `dnn` model:
+```bash
+# train with the custom configuration
+python -m paddlerec.run -m ./models/rank/dnn/config.yaml
+```
+
+### Distributed training
+
+Distributed training requires editing `config.yaml`: add or change the `engine` option to `cluster` for distributed training, or to `local_cluster` for locally simulated distributed training.
+
+#### Locally simulated distributed training
+
+Taking the dnn model as an example, edit the model's `config.yaml` in the paddlerec code directory:
+
+```yaml
+train:
+  #engine: single
+  engine: local_cluster
+```
+Then launch paddlerec training:
+
+```bash
+# locally simulated distributed training
+python -m paddlerec.run -m ./models/rank/dnn/config.yaml
+```
+
+#### Cluster distributed training
+
+Taking the dnn model as an example, first edit the model's `config.yaml` in the paddlerec code directory:
+
+```yaml
+train:
+  #engine: single
+  engine: cluster
+```
+Then add the distributed launch configuration file `backend.yaml`; the configuration rules are covered in the [distributed training](doc/distributed_train.md) tutorial. Finally, launch paddlerec training:
+
+```bash
+# after setting up the mpi/k8s/paddlecloud cluster environment
+python -m paddlerec.run -m ./models/rank/dnn/config.yaml -b backend.yaml
+```
+
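+If you switch `engine` often, a small helper script can toggle it programmatically. The helper below is hypothetical (not part of PaddleRec); it assumes only PyYAML plus the `train.engine` layout shown above:
+
+```python
+# Hypothetical helper: flip train.engine in a PaddleRec config yaml.
+import yaml
+
+def set_engine(config_path, engine):
+    assert engine in ("single", "local_cluster", "cluster")
+    with open(config_path) as f:
+        conf = yaml.safe_load(f)
+    conf["train"]["engine"] = engine  # same field edited by hand above
+    with open(config_path, "w") as f:
+        yaml.safe_dump(conf, f, default_flow_style=False)
+
+set_engine("./models/rank/dnn/config.yaml", "local_cluster")
+```
+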

+## Supported Models

+
+| Category | Model | Single-machine CPU training | Single-machine GPU training | Distributed CPU training |
+| :------: | :-----------------------------------------------------------------------: | :---------: | :---------: | :-----------: |
+| Content understanding | [Text-Classification](models/contentunderstanding/classification/model.py) | ✓ | x | ✓ |
+| Content understanding | [TagSpace](models/contentunderstanding/tagspace/model.py) | ✓ | x | ✓ |
+| Recall | [DSSM](models/match/dssm/model.py) | ✓ | x | ✓ |
+| Recall | [MultiView-Simnet](models/match/multiview-simnet/model.py) | ✓ | x | ✓ |
+| Recall | [TDM](models/treebased/tdm/model.py) | ✓ | x | ✓ |
+| Recall | [Word2Vec](models/recall/word2vec/model.py) | ✓ | x | ✓ |
+| Recall | [SSR](models/recall/ssr/model.py) | ✓ | ✓ | ✓ |
+| Recall | [Gru4Rec](models/recall/gru4rec/model.py) | ✓ | ✓ | ✓ |
+| Ranking | [Dnn](models/rank/dnn/model.py) | ✓ | x | ✓ |
+| Ranking | [DeepFM](models/rank/deepfm/model.py) | ✓ | x | ✓ |
+| Ranking | [xDeepFM](models/rank/xdeepfm/model.py) | ✓ | x | ✓ |
+| Ranking | [DIN](models/rank/din/model.py) | ✓ | x | ✓ |
+| Ranking | [Wide&Deep](models/rank/wide_deep/model.py) | ✓ | x | ✓ |
+| Multi-task | [ESMM](models/multitask/esmm/model.py) | ✓ | ✓ | ✓ |
+| Multi-task | [MMOE](models/multitask/mmoe/model.py) | ✓ | ✓ | ✓ |
+| Multi-task | [ShareBottom](models/multitask/share-bottom/model.py) | ✓ | ✓ | ✓ |
+

+## Documentation

+
+### Background
+* [Introduction to recommender systems](doc/rec_background.md)
+* [Introduction to distributed deep learning](doc/ps_background.md)
+
+### Beginner tutorials
+* [Environment requirements](#environment-requirements)
+* [Installation](#installation)
+* [Quick start](#run-a-built-in-model-with-its-default-configuration)
+
+### Advanced tutorials
+* [Custom datasets and Readers](doc/custom_dataset_reader.md)
+* [Distributed training](doc/distributed_train.md)
+
+### Developer guide
+* [PaddleRec design document](doc/design.md)
+
+### PaddleRec performance
+* [Benchmark](doc/benchmark.md)
+
+### FAQ
+* [Frequently asked questions](doc/faq.md)
+

+## Community

+
+### Feedback
+For comments, suggestions, and bugs encountered in use, feel free to file a `GitHub Issue`
+
+### Release history
+- 2020.5.14 - PaddleRec v0.1
+
+### License
+This project is released under the [Apache 2.0 license](LICENSE).
+
diff --git a/core/engine/cluster/cloud/cluster.sh b/core/engine/cluster/cloud/cluster.sh
index c2e984b85c1210fb7dccdad1eb3b2f6b3101fc71..1a0605fd9aeefbf87542e5e5156470eb1d81b836 100644
--- a/core/engine/cluster/cloud/cluster.sh
+++ b/core/engine/cluster/cloud/cluster.sh
@@ -1,8 +1,22 @@
 #!/bin/bash
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 ###################################################
 # Usage: submit.sh
-# Description: run mpi submit clinet implement
+# Description: run mpi submit client implement
 ###################################################
 
 # ---------------------------------------------------------------------------- #
diff --git a/core/engine/cluster/cluster.py b/core/engine/cluster/cluster.py
index ae5703f15fbb8221bdd430c9687a76da46796084..6dfcec3a929ad8124192014f48270ebd1862dc2c 100644
--- a/core/engine/cluster/cluster.py
+++ b/core/engine/cluster/cluster.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,10 +15,9 @@
 from __future__ import print_function
 from __future__ import unicode_literals
 
-import subprocess
-import sys
-import os
 import copy
+import os
+import subprocess
 
 from paddlerec.core.engine.engine import Engine
 from paddlerec.core.factory import TrainerFactory
@@ -32,7 +31,7 @@ class ClusterEngine(Engine):
         if backend == "PaddleCloud":
             self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh")
         else:
-            raise ValueError("{} can not supported now".format(backend))
+            raise ValueError("{} is not supported now".format(backend))
 
     def start_worker_procs(self):
         trainer = TrainerFactory.create(self.trainer)
diff --git a/core/engine/engine.py b/core/engine/engine.py
index afe00dd931752399c35215ae7a306124e22630b1..492bf8e1c6f83f015be4fbd287ebef7d432e953d 100755
--- a/core/engine/engine.py
+++ b/core/engine/engine.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ import abc @@ -15,4 +29,3 @@ class Engine: @abc.abstractmethod def run(self): pass - diff --git a/core/engine/local_cluster.py b/core/engine/local_cluster.py index 1b850bd05ee559749c193fa25122d19eb32aa5c4..4cf614f02315acbff2a3c21126d8c061c10ba8ad 100755 --- a/core/engine/local_cluster.py +++ b/core/engine/local_cluster.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,10 +14,11 @@ from __future__ import print_function from __future__ import unicode_literals -import subprocess -import sys -import os + import copy +import os +import sys +import subprocess from paddlerec.core.engine.engine import Engine from paddlerec.core.utils import envs diff --git a/core/engine/local_mpi.py b/core/engine/local_mpi.py index 29e34907845a286a49b66286eefa54a1367512ce..49db821fe5764ae9ef7f42cbd3ca2fe77b83a1d1 100755 --- a/core/engine/local_mpi.py +++ b/core/engine/local_mpi.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,10 +14,11 @@ from __future__ import print_function from __future__ import unicode_literals -import subprocess -import sys -import os + import copy +import os +import sys +import subprocess from paddlerec.core.engine.engine import Engine diff --git a/core/layer.py b/core/layer.py index ab06841706400ceae1ab4444ba77bba66ff9924a..07c058c77d3fb947334d5d8fca89b15e2629c724 100755 --- a/core/layer.py +++ b/core/layer.py @@ -25,23 +25,8 @@ class Layer(object): """ pass - def generate(self, mode, param): - """R - """ - if mode == 'fluid': - return self.generate_fluid(param) - elif mode == 'tensorflow': - return self.generate_tensorflow(param) - print('unsupport this mode: ' + mode) - return None, None - @abc.abstractmethod - def generate_fluid(self, param): + def generate(self, param): """R """ pass - - def generate_tensorflow(self, param): - """ Not implement currently - """ - pass diff --git a/core/metric.py b/core/metric.py index 469fd56c8cb97c8e21739b402b4b89daab57bde9..e0f6b24e7e6bfc3e4e1689622019ffd540c8c033 100755 --- a/core/metric.py +++ b/core/metric.py @@ -53,7 +53,7 @@ class Metric(object): pass @abc.abstractmethod - def get_result_to_string(self): + def __str__(self): """ Return: result(string) : calculate result with string format, for output diff --git a/core/metrics/auc_metrics.py b/core/metrics/auc_metrics.py index 3c48040db29fc31846780f6f4e871168ac65f3f8..5dd16cc078aa43d8fb07a50a4b006d4fdae3b2e9 100755 --- a/core/metrics/auc_metrics.py +++ b/core/metrics/auc_metrics.py @@ -13,8 +13,10 @@ # limitations under the License. import math + import numpy as np import paddle.fluid as fluid + from paddlerec.core.metric import Metric @@ -198,7 +200,7 @@ class AUCMetric(Metric): """ """ return self._result - def get_result_to_string(self): + def __str__(self): """ """ result = self.get_result() result_str = "%s AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f RMSE=%.6f " \ diff --git a/core/model.py b/core/model.py index 24d3ab4fb165c0e8d4a5cc9c7229555aa74211fb..2771c26e9b13c9addc07e27434685da091bef8a4 100755 --- a/core/model.py +++ b/core/model.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import abc import paddle.fluid as fluid @@ -62,7 +76,7 @@ class Model(object): def get_infer_results(self): return self._infer_results - def get_cost_op(self): + def get_avg_cost(self): """R """ return self._cost @@ -72,12 +86,6 @@ class Model(object): """ return self._metrics - def custom_preprocess(self): - """ - do something after exe.run(stratup_program) and before run() - """ - pass - def get_fetch_period(self): return self._fetch_interval diff --git a/core/modules/modul/build.py b/core/modules/modul/build.py index 20e7702ecb220c939ed919f77747f8c768f4c5d7..0263cbf60e3b1647a05cbc471b7bbff1840f88ba 100755 --- a/core/modules/modul/build.py +++ b/core/modules/modul/build.py @@ -1,7 +1,22 @@ -import yaml +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import copy + import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet +import yaml from paddlerec.core.model import Model from paddlerec.core.utils import table diff --git a/core/modules/modul/layers.py b/core/modules/modul/layers.py index f533a46db24da5e6feb3e8723ff3265529cfbcb3..26ee98a816a63c4121428b2dd5d2c835d05f7216 100755 --- a/core/modules/modul/layers.py +++ b/core/modules/modul/layers.py @@ -13,10 +13,11 @@ # limitations under the License. 
import paddle.fluid as fluid + from paddlerec.core.layer import Layer -class EmbeddingInputLayer(Layer): +class EmbeddingFuseLayer(Layer): """R """ @@ -31,7 +32,7 @@ class EmbeddingInputLayer(Layer): self._emb_dim = self._mf_dim + 3 # append show ctr lr self._emb_layers = [] - def generate_fluid(self, param): + def generate(self, param): """R """ show_clk = fluid.layers.concat( @@ -63,7 +64,7 @@ class LabelInputLayer(Layer): self._data_type = config.get('data_type', "int64") self._label_idx = config['label_idx'] - def generate_fluid(self, param): + def generate(self, param): """R """ label = fluid.layers.data(name=self._name, shape=[-1, self._dim], \ @@ -85,7 +86,7 @@ class TagInputLayer(Layer): self._dim = config.get('dim', 1) self._data_type = config['data_type'] - def generate_fluid(self, param): + def generate(self, param): """R """ output = fluid.layers.data(name=self._name, shape=[-1, self._dim], \ @@ -107,7 +108,7 @@ class ParamLayer(Layer): self._data_type = config.get('data_type', 'float32') self._config = config - def generate_fluid(self, param): + def generate(self, param): """R """ return self._config, {'inference_param': {'name': 'param', 'params': [], 'table_id': self._table_id}} @@ -125,7 +126,7 @@ class SummaryLayer(Layer): self._data_type = config.get('data_type', 'float32') self._config = config - def generate_fluid(self, param): + def generate(self, param): """R """ return self._config, {'inference_param': {'name': 'summary', 'params': [], 'table_id': self._table_id}} @@ -143,7 +144,7 @@ class NormalizetionLayer(Layer): self._summary = config['summary'] self._table_id = config.get('table_id', -1) - def generate_fluid(self, param): + def generate(self, param): """R """ input_layer = param['layer'][self._input[0]] @@ -158,7 +159,7 @@ class NormalizetionLayer(Layer): 'params': inference_param, 'table_id': summary_layer.get('table_id', -1)}} -class NeuralLayer(Layer): +class FCLayer(Layer): """R """ @@ -171,7 +172,7 @@ class NeuralLayer(Layer): self._bias = config.get('bias', True) self._act_func = config.get('act_func', None) - def generate_fluid(self, param): + def generate(self, param): """R """ param_layer = param['layer'][self._param] @@ -199,7 +200,7 @@ class NeuralLayer(Layer): 'table_id': param_layer.get('table_id', -1)}} -class SigmoidLossLayer(Layer): +class LogLossLayer(Layer): """R """ @@ -230,7 +231,7 @@ class SigmoidLossLayer(Layer): } } - def generate_fluid(self, param): + def generate(self, param): """R """ input_layer = param['layer'][self._input[0]] diff --git a/core/trainer.py b/core/trainer.py index df5626b7a5317b9fd7f108964adf11afe0c59c0e..40fc35de973ce7841bfdf28dfc6c6a3751484be7 100755 --- a/core/trainer.py +++ b/core/trainer.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import sys - import abc +import os import time +import sys import yaml from paddle import fluid + from paddlerec.core.utils import envs diff --git a/core/trainers/__init__.py b/core/trainers/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..cd9c9db5e6b93fd6171bca0a5b0f97f69306aedc 100755 --- a/core/trainers/__init__.py +++ b/core/trainers/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+trainer implementations.
+
+        ↗ (single/cluster) CtrTrainer
+Trainer
+        ↘ TranspileTrainer ↗ (for single training)          SingleTrainer/TDMSingleTrainer
+                           → (for cluster training)         ClusterTrainer/TDMClusterTrainer
+                           ↘ (for online learning training) OnlineLearningTrainer
+"""
+
diff --git a/core/trainers/cluster_trainer.py b/core/trainers/cluster_trainer.py
index fa4238bf3e4e188df124c298e15211454f2c2ade..faa960359bc82d6130302002a99fb664c7374249 100755
--- a/core/trainers/cluster_trainer.py
+++ b/core/trainers/cluster_trainer.py
@@ -25,7 +25,6 @@
 import paddle.fluid as fluid
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
 from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker
-from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
 
 from paddlerec.core.utils import envs
 from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer
@@ -83,7 +82,7 @@ class ClusterTrainer(TranspileTrainer):
         strategy = self.build_strategy()
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
-        optimizer.minimize(self.model.get_cost_op())
+        optimizer.minimize(self.model.get_avg_cost())
 
         if fleet.is_server():
             context['status'] = 'server_pass'
@@ -115,7 +114,7 @@
         program = fluid.compiler.CompiledProgram(
             fleet.main_program).with_data_parallel(
-            loss_name=self.model.get_cost_op().name,
+            loss_name=self.model.get_avg_cost().name,
             build_strategy=self.strategy.get_build_strategy(),
             exec_strategy=self.strategy.get_execute_strategy())
diff --git a/core/trainers/ctr_coding_trainer.py b/core/trainers/ctr_coding_trainer.py
index a6377c853853d3f1ab53c55b9a5253cf0ff8d5b7..3bfec28cfd149bdbe47fdc202107c7ed7af58fdd 100755
--- a/core/trainers/ctr_coding_trainer.py
+++ b/core/trainers/ctr_coding_trainer.py
@@ -11,9 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+ import os -import numpy as np +import numpy as np import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker @@ -22,7 +23,7 @@ from paddlerec.core.utils import envs from paddlerec.core.trainer import Trainer -class CtrPaddleTrainer(Trainer): +class CtrTrainer(Trainer): """R """ @@ -87,7 +88,7 @@ class CtrPaddleTrainer(Trainer): optimizer = self.model.optimizer() optimizer = fleet.distributed_optimizer(optimizer, strategy={"use_cvm": False}) - optimizer.minimize(self.model.get_cost_op()) + optimizer.minimize(self.model.get_avg_cost()) if fleet.is_server(): context['status'] = 'server_pass' diff --git a/core/trainers/ctr_modul_trainer.py b/core/trainers/ctr_modul_trainer.py index 8c99a5c885ed1b8a577556c52d56c4a8ad04944c..7b3bd7874359059c3b03289cc10da7d7756ac35b 100755 --- a/core/trainers/ctr_modul_trainer.py +++ b/core/trainers/ctr_modul_trainer.py @@ -13,12 +13,12 @@ # limitations under the License. +import datetime +import json import sys import time -import json -import datetime -import numpy as np +import numpy as np import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker @@ -72,7 +72,7 @@ def worker_numric_max(value, env="mpi"): return wroker_numric_opt(value, env, "max") -class CtrPaddleTrainer(Trainer): +class CtrTrainer(Trainer): """R """ @@ -129,7 +129,7 @@ class CtrPaddleTrainer(Trainer): model = self._exector_context[executor['name']]['model'] self._metrics.update(model.get_metrics()) runnnable_scope.append(scope) - runnnable_cost_op.append(model.get_cost_op()) + runnnable_cost_op.append(model.get_avg_cost()) for var in model._data_var: if var.name in data_var_name_dict: continue @@ -146,7 +146,7 @@ class CtrPaddleTrainer(Trainer): model = self._exector_context[executor['name']]['model'] program = model._build_param['model']['train_program'] if not executor['is_update_sparse']: - program._fleet_opt["program_configs"][str(id(model.get_cost_op().block.program))]["push_sparse"] = [] + program._fleet_opt["program_configs"][str(id(model.get_avg_cost().block.program))]["push_sparse"] = [] if 'train_thread_num' not in executor: executor['train_thread_num'] = self.global_config['train_thread_num'] with fluid.scope_guard(scope): diff --git a/core/trainers/online_learning_trainer.py b/core/trainers/online_learning_trainer.py new file mode 100755 index 0000000000000000000000000000000000000000..0303e96ac0bb20b1f46cdc9f5836d18fa73b9a8e --- /dev/null +++ b/core/trainers/online_learning_trainer.py @@ -0,0 +1,185 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Training use fluid with one node only. 
+""" + +from __future__ import print_function + +import datetime +import os +import time + +import paddle.fluid as fluid +from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet +from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory +from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker + +from paddlerec.core.utils import envs +from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer + + +class OnlineLearningTrainer(TranspileTrainer): + def processor_register(self): + role = PaddleCloudRoleMaker() + fleet.init(role) + + if fleet.is_server(): + self.regist_context_processor('uninit', self.instance) + self.regist_context_processor('init_pass', self.init) + self.regist_context_processor('server_pass', self.server) + else: + self.regist_context_processor('uninit', self.instance) + self.regist_context_processor('init_pass', self.init) + self.regist_context_processor('startup_pass', self.startup) + if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader": + self.regist_context_processor('train_pass', self.dataset_train) + else: + self.regist_context_processor( + 'train_pass', self.dataloader_train) + self.regist_context_processor('infer_pass', self.infer) + self.regist_context_processor('terminal_pass', self.terminal) + + def build_strategy(self): + mode = envs.get_runtime_environ("train.trainer.strategy") + assert mode in ["async", "geo", "sync", "half_async"] + + strategy = None + + if mode == "async": + strategy = StrategyFactory.create_async_strategy() + elif mode == "geo": + push_num = envs.get_global_env("train.strategy.mode.push_num", 100) + strategy = StrategyFactory.create_geo_strategy(push_num) + elif mode == "sync": + strategy = StrategyFactory.create_sync_strategy() + elif mode == "half_async": + strategy = StrategyFactory.create_half_async_strategy() + + assert strategy is not None + + self.strategy = strategy + return strategy + + def init(self, context): + self.model.train_net() + optimizer = self.model.optimizer() + strategy = self.build_strategy() + optimizer = fleet.distributed_optimizer(optimizer, strategy) + optimizer.minimize(self.model.get_avg_cost()) + + if fleet.is_server(): + context['status'] = 'server_pass' + else: + self.fetch_vars = [] + self.fetch_alias = [] + self.fetch_period = self.model.get_fetch_period() + + metrics = self.model.get_metrics() + if metrics: + self.fetch_vars = metrics.values() + self.fetch_alias = metrics.keys() + context['status'] = 'startup_pass' + + def server(self, context): + fleet.init_server() + fleet.run_server() + context['is_exit'] = True + + def startup(self, context): + self._exe.run(fleet.startup_program) + context['status'] = 'train_pass' + + def dataloader_train(self, context): + print("online learning can only support LINUX only") + context['status'] = 'terminal_pass' + + def _get_dataset(self, state="TRAIN", hour=None): + if state == "TRAIN": + inputs = self.model.get_inputs() + namespace = "train.reader" + train_data_path = envs.get_global_env( + "train_data_path", None, namespace) + else: + inputs = self.model.get_infer_inputs() + namespace = "evaluate.reader" + train_data_path = envs.get_global_env( + "test_data_path", None, namespace) + + threads = int(envs.get_runtime_environ("train.trainer.threads")) + batch_size = envs.get_global_env("batch_size", None, namespace) + reader_class = envs.get_global_env("class", None, namespace) + abs_dir = 
+        abs_dir = os.path.dirname(os.path.abspath(__file__))
+        reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
+        pipe_cmd = "python {} {} {} {}".format(
+            reader, reader_class, state, self._config_yaml)
+
+        if train_data_path.startswith("paddlerec::"):
+            package_base = envs.get_runtime_environ("PACKAGE_BASE")
+            assert package_base is not None
+            train_data_path = os.path.join(
+                package_base, train_data_path.split("::")[1])
+
+        dataset = fluid.DatasetFactory().create_dataset()
+        dataset.set_use_var(inputs)
+        dataset.set_pipe_command(pipe_cmd)
+        dataset.set_batch_size(batch_size)
+        dataset.set_thread(threads)
+
+        if hour is not None:
+            train_data_path = os.path.join(train_data_path, hour)
+
+        file_list = [
+            os.path.join(train_data_path, x)
+            for x in os.listdir(train_data_path)
+        ]
+
+        self.files = file_list
+        dataset.set_filelist(self.files)
+        return dataset
+
+    def dataset_train(self, context):
+        fleet.init_worker()
+
+        days = envs.get_global_env("train.days")
+        # the training start date comes from the yaml config; the key name
+        # "train.begin_day" is an assumption (the former literal "begin_day_d"
+        # can never match the '%Y%m%d' format)
+        begin_day = datetime.datetime.strptime(
+            envs.get_global_env("train.begin_day"), '%Y%m%d')
+
+        for day in range(days):
+            for hour in range(24):
+                # use a separate name so the integer loop index `day` stays
+                # usable for timedelta on the following iterations
+                cur_time = begin_day + datetime.timedelta(days=day, hours=hour)
+                day_s = cur_time.strftime('%Y%m%d/%H')
+                i = cur_time.strftime('%Y%m%d_%H')
+
+                dataset = self._get_dataset(hour=day_s)
+                ins = self._get_dataset_ins()
+
+                begin_time = time.time()
+                self._exe.train_from_dataset(program=fluid.default_main_program(),
+                                             dataset=dataset,
+                                             fetch_list=self.fetch_vars,
+                                             fetch_info=self.fetch_alias,
+                                             print_period=self.fetch_period)
+                end_time = time.time()
+                times = end_time - begin_time
+                print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins / times))
+                self.save(i, "train", is_fleet=True)
+
+        fleet.stop_worker()
+        context['status'] = 'infer_pass'
+
+    def terminal(self, context):
+        for model in self.increment_models:
+            print("epoch :{}, dir: {}".format(model[0], model[1]))
+        context['is_exit'] = True
diff --git a/core/trainers/single_trainer.py b/core/trainers/single_trainer.py
index 888a93f3ddb7c1045505fc98eebc15d6dd19e950..8079377ba257041e4946d6e452cacaa388ca36ce 100755
--- a/core/trainers/single_trainer.py
+++ b/core/trainers/single_trainer.py
@@ -17,14 +17,14 @@
 Training use fluid with one node only.
""" from __future__ import print_function -import logging + import time +import logging import paddle.fluid as fluid from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer from paddlerec.core.utils import envs -import numpy as np logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger("fluid") @@ -36,7 +36,8 @@ class SingleTrainer(TranspileTrainer): self.regist_context_processor('uninit', self.instance) self.regist_context_processor('init_pass', self.init) self.regist_context_processor('startup_pass', self.startup) - if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader": + if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, + "train.reader") != "DataLoader": self.regist_context_processor('train_pass', self.dataset_train) else: self.regist_context_processor('train_pass', self.dataloader_train) @@ -47,7 +48,7 @@ class SingleTrainer(TranspileTrainer): def init(self, context): self.model.train_net() optimizer = self.model.optimizer() - optimizer.minimize((self.model.get_cost_op())) + optimizer.minimize((self.model.get_avg_cost())) self.fetch_vars = [] self.fetch_alias = [] @@ -74,7 +75,7 @@ class SingleTrainer(TranspileTrainer): program = fluid.compiler.CompiledProgram( fluid.default_main_program()).with_data_parallel( - loss_name=self.model.get_cost_op().name) + loss_name=self.model.get_avg_cost().name) metrics_varnames = [] metrics_format = [] @@ -122,8 +123,8 @@ class SingleTrainer(TranspileTrainer): fetch_info=self.fetch_alias, print_period=self.fetch_period) end_time = time.time() - times = end_time-begin_time - print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins/times)) + times = end_time - begin_time + print("epoch {} using time {}, speed {:.2f} lines/s".format(i, times, ins / times)) self.save(i, "train", is_fleet=False) context['status'] = 'infer_pass' diff --git a/core/trainers/tdm_cluster_trainer.py b/core/trainers/tdm_cluster_trainer.py index 11fa23340c7587995df2b39181cd0bc9007982a1..3bd1ad3367f340019333e8f83cf5abdd3b36b25f 100755 --- a/core/trainers/tdm_cluster_trainer.py +++ b/core/trainers/tdm_cluster_trainer.py @@ -1,4 +1,3 @@ -# -*- coding=utf-8 -*- # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,17 +17,16 @@ Training use fluid with one node only. """ from __future__ import print_function + import logging + import numpy as np import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet -from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory -from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker from paddlerec.core.utils import envs from paddlerec.core.trainers.cluster_trainer import ClusterTrainer - logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger("fluid") logger.setLevel(logging.INFO) @@ -65,11 +63,10 @@ class TDMClusterTrainer(ClusterTrainer): "cluster.init_model_path", "", namespace) if load_tree: - # 将明文树结构及数据,set到组网中的Variale中 - # 不使用NumpyInitialize方法是考虑到树结构相关数据size过大,有性能风险 + # covert tree to tensor, set it into Fluid's variable. 
             for param_name in special_param:
                 param_t = fluid.global_scope().find_var(param_name).get_tensor()
-                param_array = self.tdm_prepare(param_name)
+                param_array = self._tdm_prepare(param_name)
                 param_t.set(param_array.astype('int32'), self._place)
 
         if save_init_model:
@@ -80,27 +77,27 @@
 
         context['status'] = 'train_pass'
 
-    def tdm_prepare(self, param_name):
+    def _tdm_prepare(self, param_name):
         if param_name == "TDM_Tree_Travel":
-            travel_array = self.tdm_travel_prepare()
+            travel_array = self._tdm_travel_prepare()
             return travel_array
         elif param_name == "TDM_Tree_Layer":
-            layer_array, _ = self.tdm_layer_prepare()
+            layer_array, _ = self._tdm_layer_prepare()
             return layer_array
         elif param_name == "TDM_Tree_Info":
-            info_array = self.tdm_info_prepare()
+            info_array = self._tdm_info_prepare()
             return info_array
         else:
             raise " {} is not a special tdm param name".format(param_name)
 
-    def tdm_travel_prepare(self):
+    def _tdm_travel_prepare(self):
         """load tdm tree param from npy/list file"""
         travel_array = np.load(self.tree_travel_path)
         logger.info("TDM Tree leaf node nums: {}".format(
             travel_array.shape[0]))
         return travel_array
 
-    def tdm_layer_prepare(self):
+    def _tdm_layer_prepare(self):
         """load tdm tree param from npy/list file"""
         layer_list = []
         layer_list_flat = []
@@ -120,7 +117,7 @@
             [len(i) for i in layer_list]))
         return layer_array, layer_list
 
-    def tdm_info_prepare(self):
+    def _tdm_info_prepare(self):
         """load tdm tree param from list file"""
         info_array = np.load(self.tree_info_path)
         return info_array
diff --git a/core/trainers/tdm_single_trainer.py b/core/trainers/tdm_single_trainer.py
index 118548abd62fb0c3a22d2e43265afd028f64cfe0..21be66a677750f6e817b63794819b14ed72d9fa2 100755
--- a/core/trainers/tdm_single_trainer.py
+++ b/core/trainers/tdm_single_trainer.py
@@ -1,4 +1,3 @@
-# -*- coding=utf-8 -*-
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,12 +18,11 @@
 Training use fluid with one node only.
 """
 from __future__ import print_function
 
 import logging
-import paddle.fluid as fluid
 
-from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer
+import numpy as np
+import paddle.fluid as fluid
 from paddlerec.core.trainers.single_trainer import SingleTrainer
 from paddlerec.core.utils import envs
-import numpy as np
 
 logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger("fluid")
@@ -68,11 +66,10 @@
             persistables_model_path))
 
         if load_tree:
-            # set the plaintext tree structure and its data into the network's Variables
-            # (NumpyInitialize is avoided because the tree data is large and risks poor performance)
+            # convert the tree to a tensor and set it into Fluid's variable.
for param_name in special_param: param_t = fluid.global_scope().find_var(param_name).get_tensor() - param_array = self.tdm_prepare(param_name) + param_array = self._tdm_prepare(param_name) if param_name == 'TDM_Tree_Emb': param_t.set(param_array.astype('float32'), self._place) else: @@ -86,37 +83,37 @@ class TDMSingleTrainer(SingleTrainer): context['status'] = 'train_pass' - def tdm_prepare(self, param_name): + def _tdm_prepare(self, param_name): if param_name == "TDM_Tree_Travel": - travel_array = self.tdm_travel_prepare() + travel_array = self._tdm_travel_prepare() return travel_array elif param_name == "TDM_Tree_Layer": - layer_array, _ = self.tdm_layer_prepare() + layer_array, _ = self._tdm_layer_prepare() return layer_array elif param_name == "TDM_Tree_Info": - info_array = self.tdm_info_prepare() + info_array = self._tdm_info_prepare() return info_array elif param_name == "TDM_Tree_Emb": - emb_array = self.tdm_emb_prepare() + emb_array = self._tdm_emb_prepare() return emb_array else: raise " {} is not a special tdm param name".format(param_name) - def tdm_travel_prepare(self): + def _tdm_travel_prepare(self): """load tdm tree param from npy/list file""" travel_array = np.load(self.tree_travel_path) logger.info("TDM Tree leaf node nums: {}".format( travel_array.shape[0])) return travel_array - def tdm_emb_prepare(self): + def _tdm_emb_prepare(self): """load tdm tree param from npy/list file""" emb_array = np.load(self.tree_emb_path) logger.info("TDM Tree node nums from emb: {}".format( emb_array.shape[0])) return emb_array - def tdm_layer_prepare(self): + def _tdm_layer_prepare(self): """load tdm tree param from npy/list file""" layer_list = [] layer_list_flat = [] @@ -136,7 +133,7 @@ class TDMSingleTrainer(SingleTrainer): [len(i) for i in layer_list])) return layer_array, layer_list - def tdm_info_prepare(self): + def _tdm_info_prepare(self): """load tdm tree param from list file""" info_array = np.load(self.tree_info_path) return info_array diff --git a/core/trainers/transpiler_trainer.py b/core/trainers/transpiler_trainer.py index 82dfe538b1400ab5ac3b93b7d7ce120d2e2b1eb0..2fc2e19d6f3d8e426f3f01aa2689be37d513f670 100755 --- a/core/trainers/transpiler_trainer.py +++ b/core/trainers/transpiler_trainer.py @@ -29,7 +29,7 @@ from paddlerec.core.reader import SlotReader class TranspileTrainer(Trainer): def __init__(self, config=None): Trainer.__init__(self, config) - device = envs.get_global_env("train.device") + device = envs.get_global_env("train.device", "cpu") if device == 'gpu': self._place = fluid.CUDAPlace(0) self._exe = fluid.Executor(self._place) @@ -265,7 +265,7 @@ class TranspileTrainer(Trainer): 'evaluate_model_path', "", namespace='evaluate'))] is_return_numpy = envs.get_global_env( - 'is_return_numpy', True, namespace='evaluate') + 'is_return_numpy', True, namespace='evaluate') for (epoch, model_dir) in model_list: print("Begin to infer No.{} model, model_dir: {}".format( diff --git a/core/utils/dataloader_instance.py b/core/utils/dataloader_instance.py index e9154e65ced877747f224c00e50dce01777fb265..8d4db6f82c05a41a0945f2c882caefd2a3c83d36 100755 --- a/core/utils/dataloader_instance.py +++ b/core/utils/dataloader_instance.py @@ -14,7 +14,6 @@ from __future__ import print_function import os -import sys from paddlerec.core.utils.envs import lazy_instance_by_fliename from paddlerec.core.utils.envs import get_global_env diff --git a/core/utils/dataset.py b/core/utils/dataset_holder.py similarity index 96% rename from core/utils/dataset.py rename to core/utils/dataset_holder.py 
index 0eff8da071920b9931ce073ff94abe253baa31a0..cd195450336cac0265f76670ca0e3fa24c45a7ba 100755 --- a/core/utils/dataset.py +++ b/core/utils/dataset_holder.py @@ -13,8 +13,8 @@ # limitations under the License. import abc -import time import datetime +import time import paddle.fluid as fluid @@ -22,7 +22,7 @@ from paddlerec.core.utils import fs as fs from paddlerec.core.utils import util as util -class Dataset(object): +class DatasetHolder(object): """ Dataset Base """ @@ -62,7 +62,7 @@ class Dataset(object): pass -class TimeSplitDataset(Dataset): +class TimeSplitDatasetHolder(DatasetHolder): """ Dataset with time split dir. root_path/$DAY/$HOUR """ @@ -142,16 +142,6 @@ class TimeSplitDataset(Dataset): data_time = data_time + datetime.timedelta(minutes=self._split_interval) return data_file_list - -class FluidTimeSplitDataset(TimeSplitDataset): - """ - A Dataset with time split for PaddleFluid - """ - - def __init__(self, config): - """ """ - TimeSplitDataset.__init__(self, config) - def _alloc_dataset(self, file_list): """ """ dataset = fluid.DatasetFactory().create_dataset(self._config['dataset_type']) diff --git a/core/utils/dataset_instance.py b/core/utils/dataset_instance.py index d6b3742c8eeec86e054e2512085aa014100d2239..f5175c48df978919c51519d027561011bd3ceb44 100755 --- a/core/utils/dataset_instance.py +++ b/core/utils/dataset_instance.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function import sys diff --git a/core/utils/envs.py b/core/utils/envs.py index 7f6cfb61d3997ce6a673bae10919779ef696b054..4fe57b32a6a534c630005d42e66b1aedc7972089 100755 --- a/core/utils/envs.py +++ b/core/utils/envs.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os
+from contextlib import closing
 import copy
-import sys
+import os
 import socket
-from contextlib import closing
+import sys
 
 global_envs = {}
@@ -89,22 +89,38 @@
     return global_envs
 
 
+def path_adapter(path):
+    def adapt(l_p):
+        if get_platform() == "WINDOWS":
+            adapted_p = l_p.split("paddlerec.")[1].replace(".", "\\")
+        else:
+            adapted_p = l_p.split("paddlerec.")[1].replace(".", "/")
+        return adapted_p
+
+    if path.startswith("paddlerec."):
+        package = get_runtime_environ("PACKAGE_BASE")
+        return os.path.join(package, adapt(path))
+    else:
+        # paths without the "paddlerec." prefix need no adaptation; calling
+        # adapt() here would raise IndexError on the split
+        return path
+
+
+def windows_path_converter(path):
+    if get_platform() == "WINDOWS":
+        return path.replace("/", "\\")
+    else:
+        return path.replace("\\", "/")
+
+
 def update_workspace():
     workspace = global_envs.get("train.workspace", None)
     if not workspace:
         return
-
-    # is fleet inner models
-    if workspace.startswith("paddlerec."):
-        fleet_package = get_runtime_environ("PACKAGE_BASE")
-        workspace_dir = workspace.split("paddlerec.")[1].replace(".", "/")
-        path = os.path.join(fleet_package, workspace_dir)
-    else:
-        path = workspace
+    workspace = path_adapter(workspace)
 
     for name, value in global_envs.items():
         if isinstance(value, str):
-            value = value.replace("{workspace}", path)
+            value = value.replace("{workspace}", workspace)
+            value = windows_path_converter(value)
             global_envs[name] = value
diff --git a/core/utils/fs.py b/core/utils/fs.py
index df2379721041197170945ebfd03ccf8f8cc61d0a..836c6f598b9c423b0922e30f536a669c55e83098 100755
--- a/core/utils/fs.py
+++ b/core/utils/fs.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+
 from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
 
 
@@ -28,12 +29,12 @@
     """
     Util for local disk file_system io
     """
-    
+
     def __init__(self):
         """R
         """
         pass
-    
+
     def write(self, content, path, mode):
         """
         write to file
@@ -43,7 +44,7 @@
             mode(string): w/a  w:clear_write a:append_write
         """
         temp_dir = os.path.dirname(path)
-        if not os.path.exists(temp_dir): 
+        if not os.path.exists(temp_dir):
             os.makedirs(temp_dir)
         f = open(path, mode)
         f.write(content)
@@ -75,7 +76,7 @@
         """R
         """
         os.system("rm -rf " + path)
-    
+
     def is_exist(self, path):
         """R
         """
@@ -94,13 +95,14 @@ class FileHandler(object):
     """
     A Smart file handler.
    auto judge local/afs by path
     """
+
     def __init__(self, config):
         """R
         """
         if 'fs_name' in config:
-            hadoop_home="$HADOOP_HOME"
+            hadoop_home = "$HADOOP_HOME"
             hdfs_configs = {
-                "hadoop.job.ugi": config['fs_ugi'], 
+                "hadoop.job.ugi": config['fs_ugi'],
                 "fs.default.name": config['fs_name']
             }
             self._hdfs_client = HDFSClient(hadoop_home, hdfs_configs)
@@ -131,7 +133,8 @@
             if mode.find('a') >= 0:
                 org_content = self._hdfs_client.cat(dest_path)
                 content = content + org_content
-            self._local_fs_client.write(content, temp_local_file, mode) #fleet hdfs_client only support upload, so write tmp file
+            self._local_fs_client.write(content, temp_local_file,
+                                        mode)  # fleet hdfs_client only supports upload, so write a tmp file
             self._hdfs_client.delete(dest_path + ".tmp")
             self._hdfs_client.upload(dest_path + ".tmp", temp_local_file)
             self._hdfs_client.delete(dest_path + ".bak")
@@ -139,7 +142,7 @@
             self._hdfs_client.rename(dest_path + ".tmp", dest_path)
         else:
             self._local_fs_client.write(content, dest_path, mode)
-    
+
     def cat(self, path):
         """R
         """
@@ -148,7 +151,7 @@
             return hdfs_cat
         else:
             return self._local_fs_client.cat(path)
-    
+
     def ls(self, path):
         """R
         """
@@ -160,7 +163,7 @@
             files = self._local_fs_client.ls(path)
             files = [path + '/' + fi for fi in files]  # absolute path
             return files
-    
+
     def cp(self, org_path, dest_path):
         """R
         """
@@ -170,6 +173,6 @@
             return self._local_fs_client.cp(org_path, dest_path)
         if not org_is_afs and dest_is_afs:
             return self._hdfs_client.upload(dest_path, org_path)
-        if org_is_afs and not dest_is_afs: 
+        if org_is_afs and not dest_is_afs:
             return self._hdfs_client.download(org_path, dest_path)
         print("Not Suppor hdfs cp currently")
diff --git a/core/utils/table.py b/core/utils/table.py
index 7c86d0f03edb47a7835ab7364ebe318985d091e2..558cd26d61b1be165e964b4dea3a1f3dfe82e0ba 100755
--- a/core/utils/table.py
+++ b/core/utils/table.py
@@ -12,16 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import copy
-import yaml
-
 
 class TableMeta(object):
     """
     Simple ParamTable Meta, Contain table_id
     """
     TableId = 1
-    
+
     @staticmethod
     def alloc_new_table(table_id):
         """
diff --git a/core/utils/util.py b/core/utils/util.py
index 76965f50ccf298dd63bd67e1c74d4f3d23c2fe72..bd63284873b6c6be80c9849f40535cebe1b7fb14 100755
--- a/core/utils/util.py
+++ b/core/utils/util.py
@@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import datetime
 import os
 import time
-import datetime
 
 from paddle import fluid
+
 from paddlerec.core.utils import fs as fs
 
 
diff --git a/doc/custom_dataset_reader.md b/doc/custom_dataset_reader.md
index 8133ab2cdb90f1fb3d495a73b933c19836a235d6..82b0fd12d4f52fe83155fd371f6041be52c8bcba 100644
--- a/doc/custom_dataset_reader.md
+++ b/doc/custom_dataset_reader.md
@@ -70,39 +70,189 @@ self._dense_data_var
 
 # PaddleRec custom datasets and Readers
 
-## Overview of dataset and reader configuration
+To use a custom dataset and configure an asynchronous Reader, you need to work through the following steps:
 
-Take the `ctr-dnn` model as an example:
+* [Dataset preparation](#dataset-preparation)
+* [Adding input placeholders to the model network](#adding-input-placeholders-to-the-model-network)
+* [Implementing the Reader](#implementing-the-reader)
+* [Configuring the Reader in the yaml file](#configuring-the-reader-in-the-yaml-file)
 
-```yaml
-reader:
-  batch_size: 2
-  class: "{workspace}/../criteo_reader.py"
-  train_data_path: "{workspace}/data/train"
-  reader_debug_mode: False
-```
+Taking the CTR-DNN model as an example, we give the complete journey from data preparation and variable definition to Reader implementation and debugging.
+
+* [Data and Reader example: DNN](#data-and-reader-example-dnn)
+
+
+## Dataset preparation
+
+PaddleRec supports custom datasets for its models.
+
+A few tips about the data:
数据量:
+
+    PaddleRec面向大规模数据设计,可以轻松支持亿级的数据读取,工业级的数据读写api:`dataset`在搜索、推荐、信息流等业务得到了充分打磨。
+2. 文件类型:
+
+    支持任意直接可读的文本数据,`dataset`同时支持`.gz`格式的文本压缩数据,无需额外代码,可直接读取。数据样本应以`\n`为标志,按行组织。
+
+3. 文件存放位置:
+
+    文件通常存放在训练节点本地,但同时,`dataset`支持使用`hadoop`远程读取数据,数据无需下载到本地,为dataset配置hadoop相关账户及地址即可。
+4. 数据类型:
+
+    Reader处理的是以行为单位的`string`数据,喂入网络的数据需要转为`int`、`float`的数值数据,不支持`string`喂入网络,不建议明文保存及处理训练数据。
+5. Tips
+
+    Dataset模式下,训练线程与数据读取线程的关系强相关,为了充分利用多线程,`强烈建议将文件合理地拆为多个小文件`,尤其是在分布式训练场景下,可以均衡各个节点的数据量,同时加快数据的下载速度。
+
+## 在模型组网中加入输入占位符
+
+Reader读取文件后,产出的数据喂入网络,需要有占位符进行接收。占位符在Paddle中使用`fluid.data`或`fluid.layers.data`进行定义。`data`的定义可以参考[fluid.data](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/data_cn.html#data)以及[fluid.layers.data](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/layers_cn/data_cn.html#data)。
+
+假如您希望输入三个数据,分别是维度为32的数据A,维度变长的稀疏数据B,以及一个一维的标签数据C,并希望梯度可以经过该变量向前传递,则示例如下:
+
+数据A的定义:
+```python
+var_a = fluid.data(name='A', shape=[-1, 32], dtype='float32')
+```
+
+数据B的定义,变长数据的使用可以参考[LoDTensor](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#cn-user-guide-lod-tensor):
+```python
+var_b = fluid.data(name='B', shape=[-1, 1], lod_level=1, dtype='int64')
+```
-有以上4个需要重点关注的配置选项:
-- batch_size: 网络进行小批量训练的一组数据的大小
-- class: 指定数据处理及读取的`reader` python文件
-- train_data_path: 训练数据所在地址
-- reader_debug_mode: 测试reader语法,及输出是否符合预期的debug模式的开关
+数据C的定义:
+```python
+var_c = fluid.data(name='C', shape=[-1, 1], dtype='int32')
+var_c.stop_gradient = False
+```
+
+当我们完成以上三个数据的定义后,在PaddleRec的模型定义中,还需将其加入model基类成员变量`self._data_var`:
+
+```python
+self._data_var.append(var_a)
+self._data_var.append(var_b)
+self._data_var.append(var_c)
+```
+至此,我们完成了在组网中定义输入数据的工作。
+
+## Reader的实现
+
+### Reader的实现范式
+
+Reader的逻辑需要一个单独的python文件进行描述。我们试写一个`test_reader.py`,实现的具体流程如下:
+1. 首先我们需要引入Reader基类
+
+    ```python
+    from paddlerec.core.reader import Reader
+    ```
+2. 创建一个子类,继承Reader的基类,训练所需Reader命名为`TrainerReader`
+    ```python
+    class TrainerReader(Reader):
+        def init(self):
+            pass
+
+        def generate_sample(self, line):
+            pass
+    ```
+
+3. 在`init(self)`函数中声明一些在数据读取中会用到的变量,必要时可以在`config.yaml`文件中配置变量,利用`env.get_global_env()`拿到。
+
+    比如,我们希望从yaml文件中读取一个数据预处理变量`avg=10`,目的是将数据A的数据缩小10倍,可以这样实现:
+
+    首先更改yaml文件,在某个space下加入该变量
+
+    ```yaml
+    ...
+    train:
+        reader:
+            avg: 10
+    ...
+    ```
+
+    再更改Reader的init函数
+
+    ```python
+    from paddlerec.core.utils import envs
+    class TrainerReader(Reader):
+        def init(self):
+            self.avg = envs.get_global_env("avg", None, "train.reader")
+
+        def generate_sample(self, line):
+            pass
+    ```
+
+4. 继承并实现基类中的`generate_sample(self, line)`函数,逐行读取数据。
+    - 该函数应返回一个可以迭代的reader方法(带有yield的函数不再是一个普通的函数,而是一个生成器generator,成为了可以迭代的对象,等价于一个数组、链表、文件、字符串等),可参考紧随其后的极简示意。
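+    - 一个与具体业务无关的极简示意如下(假设性示例:假设组网中只定义了一个名为`A`、维度为1的float输入,字段名与维度请以实际组网为准):
+
+      ```python
+      class TrainerReader(Reader):
+          def init(self):
+              pass
+
+          def generate_sample(self, line):
+              def reader():
+                  # reader带有yield,因此是一个生成器,框架可以直接迭代它
+                  yield [('A', [1.0])]
+              # 返回reader方法本身,而非其产出的数据
+              return reader
+      ```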
+    - 在这个可以迭代的函数中,如示例代码中的`def reader()`,我们定义数据读取的逻辑,以行为单位对数据进行截取、转换及预处理。
+    - 最后,我们需要将数据整理为特定的格式,才能够被PaddleRec的Reader正确读取,并灌入训练的网络中。简单来说,数据的输出顺序与我们在网络中创建的`inputs`必须是严格一一对应的,并转换为类似字典的形式。
+
+    示例: 假设数据ABC在文本数据中,每行以这样的形式存储:
+    ```shell
+    0.1,0.2,0.3...3.0,3.1,3.2 \t 99999,99998,99997 \t 1 \n
+    ```
+
+    则示例代码如下:
+    ```python
+    from paddlerec.core.utils import envs
+    class TrainerReader(Reader):
+        def init(self):
+            self.avg = envs.get_global_env("avg", None, "train.reader")
+
+        def generate_sample(self, line):
+            def reader():
+                # 先分割 '\n', 再以 '\t'为标志分割为list
+                variables = (line.strip('\n')).split('\t')
+
+                # A是第一个元素,并且每个数据之间使用','分割
+                var_a = variables[0].split(',')  # list
+                var_a = [float(i) / self.avg for i in var_a]  # 将str数据转换为float
+
+                # B是第二个元素,同样以 ',' 分割
+                var_b = variables[1].split(',')  # list
+                var_b = [int(i) for i in var_b]  # 将str数据转换为int
+
+                # C是第三个元素, 只有一个元素,没有分割符
+                var_c = variables[2]
+                var_c = int(var_c)  # 将str数据转换为int
+                var_c = [var_c]  # 将单独的数据元素置入list中
+
+                # 将数据与数据名结合,组织为类似dict的形式
+                # 如下,output形式为[('A', var_a), ('B', var_b), ('C', var_c)]
+                variable_name = ['A', 'B', 'C']
+                output = zip(variable_name, [var_a] + [var_b] + [var_c])
+
+                # 将数据输出,使用yield方法,将reader变为了一个可迭代的对象
+                yield output
+
+            # 注意:需要将内部定义的reader方法返回给框架
+            return reader
+    ```
+
+    至此,我们完成了Reader的实现。
+
-## 自定义数据集
+## 在yaml文件中配置Reader
-PaddleRec支持模型自定义数据集,在model.config.yaml文件中的reader部分,通过`train_data_path`指定数据读取路径。
+在模型的yaml配置文件中,主要的修改有以下几处:
-关于数据的tips
+```yaml
+reader:
+  batch_size: 2
+  class: "{workspace}/reader.py"
+  train_data_path: "{workspace}/data/train_data"
+  reader_debug_mode: False
+```
-- PaddleRec 面向的是推荐与搜索领域,数据以文本格式为主
-- Dataset模式支持读取文本数据压缩后的`.gz`格式
-- Dataset模式下,训练线程与数据读取线程的关系强相关,为了多线程充分利用,`强烈建议将文件拆成多个小文件`,尤其是在分布式训练场景下,可以均衡各个节点的数据量。
+- batch_size: 顾名思义,是小批量训练时的样本大小
+- class: 运行该模型所需reader的路径
+- train_data_path: 训练数据所在文件夹
+- reader_debug_mode: 测试reader语法,及输出是否符合预期的debug模式的开关
-## 自定义Reader
-数据集准备就绪后,需要适当修改或重写一个新的reader以适配数据集或新组网。
+## 数据及Reader示例-DNN
-我们以`ctr-dnn`网络举例`reader`的正确打开方式,网络文件位于`models/rank/dnn`。
+Reader代码来源于[criteo_reader.py](../models/rank/criteo_reader.py),组网代码来源于[model.py](../models/rank/dnn/model.py)。

 ### Criteo数据集格式
@@ -170,13 +320,8 @@ for input in self.sparse_inputs:
     self._data_var.append(self.label_input)

-if self._platform != "LINUX":
-    self._data_loader = fluid.io.DataLoader.from_generator(
-        feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False)
 ```

-若运行于**Linux**环境下,默认使用**dataset**模式读取数据集;若运行于**windows**或**mac**下,默认使用**dataloader**模式读取数据集。以上两种方法是paddle.io中提供的不同模式,`dataset`运行速度更快,但依赖于linux的环境,因此会有该逻辑判断。
-> Paddle的组网中不支持数据输入为`str`类型,`强烈不建议使用明文保存和读取数据`

 ### Criteo Reader写法
@@ -235,24 +380,6 @@ class TrainReader(Reader):
         return reader
 ```

-### 如何自定义数据读取规则
-
-在上文我们看到了由`criteo_reader.py`实现具体的数据读取规则,那么,怎样为自己的数据集写规则呢?
-
-具体流程如下:
-1. 首先我们需要引入Reader基类
-
-    ```python
-    from paddlerec.core.reader import Reader
-    ```
-2. 创建一个子类,继承Reader的基类,训练所需Reader命名为`TrainerReader`
-3. 在`init(self)`函数中声明一些在数据读取中会用到的变量,如示例代码中的`cont_min_`、`categorical_range_`等,必要时可以在`config.yaml`文件中配置变量,通过`env.get_global_env()`拿到。
-4. 继承并实现基类中的`generate_sample(self, line)`函数,逐行读取数据。该函数应返回一个可以迭代的reader方法(带有yield的函数不再是一个普通的函数,而是一个生成器generator,成为了可以迭代的对象,等价于一个数组、链表、文件、字符串etc.)
-5. 在这个可以迭代的函数中,如示例代码中的`def reader()`,我们定义数据读取的逻辑。以行为单位的数据进行截取,转换及预处理。
-6. 最后,我们需要将数据整理为特定的格式,才能够被dataset正确读取,并灌入的训练的网络中。简单来说,数据的输出顺序与我们在网络中创建的`inputs`必须是严格一一对应的,并转换为类似字典的形式。在示例代码中,我们使用`zip`的方法将参数名与数值构成的元组组成了一个list,并将其yield输出。如果展开来看,我们输出的数据形如
-
-   `[('dense_feature',[value]),('C1',[value]),('C2',[value]),...,('C26',[value]),('label',[value])]`
-

 ### 调试Reader
diff --git a/doc/design.md b/doc/design.md
index 6b16a2428a029adb2ce8f77e55a53c0eced1d44d..2975d77f14e461547921f74b9ced5cf73703e2e7 100644
--- a/doc/design.md
+++ b/doc/design.md
@@ -1,2 +1,282 @@
 # PaddleRec 设计
-> 占位
\ No newline at end of file
+
+
+## PaddleRec 整体设计概览
+PaddleRec将推荐模型的训练与预测流程,整体抽象为了五个大模块:
+
+* [Engine 流程执行引擎](#engine)
+* [Trainer 流程具体定义](#trainer)
+* [Model 模型组网定义](#model)
+* [Reader 数据读取定义](#reader)
+* [Metric 精度指标打印](#metric)
+
+层级结构,以及一键启动训练时的调用关系如下图所示:
+

+ +

+ +core的文件结构如下,后续分别对各个模块进行介绍。 +``` +.core +├── engine/ 运行引擎实现 +├── metrics/ 全局指标实现 +├── modules/ 自定义op实现 +├── trainers/ 运行流程实现 +├── utils/ 辅助工具 +├── factory.py 运行流程的注册 +├── layer.py 自定义op基类定义 +├── metric.py Metric基类定义 +├── model.py Model基类定义 +├── reader.py Reader基类定义 +└── trainer.py Trainer基类定义 +``` + + +## Engine + +Engine是整体训练的执行引擎,与组网逻辑及数据无关,只与当前运行模式、运行环境及运行设备有关。 + +运行模式具体是指: +- 单机运行 +- 分布式运行 +- 本地模拟分布式 + +运行环境是指: +- Linux +- Windows +- Mac + +运行设备是指: +- CPU +- GPU +- AI芯片 + +在用户调用`python -m paddlerec.run`时,首先会根据`yaml`文件中的配置信息选择合适的执行引擎, 以下代码位于[run.py](../run.py): +```python +engine_registry() +which_engine = get_engine(args) +engine = which_engine(args) +engine.run() +``` + +我们以`single engine`为例,概览engine的行为: +```python +def single_engine(args): + trainer = get_trainer_prefix(args) + "SingleTrainer" + single_envs = {} + single_envs["train.trainer.trainer"] = trainer + single_envs["train.trainer.threads"] = "2" + single_envs["train.trainer.engine"] = "single" + single_envs["train.trainer.device"] = args.device + single_envs["train.trainer.platform"] = envs.get_platform() + print("use {} engine to run model: {}".format(trainer, args.model)) + + set_runtime_envs(single_envs, args.model) + trainer = TrainerFactory.create(args.model) + return trainer +``` +single_engine被调用后,主要进行了以下两个工作: + +1. 根据`yaml`配置文件,设置了**当前进程的环境变量**,后续的所有流程都依赖于环境变量。 +2. 根据模型及环境,指定并初始化了运行流程所用的`Trainer` + +进一步细化第一步工作 +- 本地模拟分布式引擎会在单机环境变量的基础上,额外设置本地模拟分布式的环境变量,比如:为各个进程设置不同通信端口,分配ID。最后会启动多个`Trainer`完成本地模拟分布式的工作。 +- 分布式引擎会在单机环境变量的基础上,基于运行参数`-b --backend`所指定的脚本或配置文件,完成分布式任务的文件打包,上传,提交等操作。该脚本格式与分布式任务运行的集群有关,如MPI/K8S/PaddleCloud等,用户可以自定义分布式运行逻辑。 + +Engine的自定义实现,可以参考[local_cluster.py](../core/engine/local_cluster.py) + +## Trainer + +`Trainer`是训练与预测流程的具体实现,会run模型中定义的各个流程,与model、reader、metric紧密相关。PaddleRec以有限状态机的逻辑定义了训练中的各个阶段,不同的Trainer子类会分别实现阶段中的特殊需求。有限状态机的流程在`def processor_register()`中注册。 + +我们以SingleTrainer为例,概览Trainer行为: + +```python +class SingleTrainer(TranspileTrainer): + def processor_register(self): + self.regist_context_processor('uninit', self.instance) + self.regist_context_processor('init_pass', self.init) + self.regist_context_processor('startup_pass', self.startup) + if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader": + self.regist_context_processor('train_pass', self.dataset_train) + else: + self.regist_context_processor('train_pass', self.dataloader_train) + + self.regist_context_processor('infer_pass', self.infer) + self.regist_context_processor('terminal_pass', self.terminal) +``` + +SingleTrainer首先注册了完成任务所需的步骤,各步骤首先按照注册顺序加入`Trainer`基类中名为`status_processor`的字典,运行的先后顺序,可以在每个执行步骤中改变`context['status']`的值,指定下一步运行哪个步骤。 + +SingleTrainer指定了以下6个步骤: +1. uninit:默认排在首位,通过环境变量决定model的对象 +1. init_pass:调用model_的接口,生成模型的组网,初始化fetch及metric的变量 +2. startup_pass:初始化模型组网中的各个参数,run(fluid.default_startup_program) +3. train_pass:会根据环境分别调用`dataset`与`dataloader`进行训练的流程。 +4. infer_pass:在训练结束后,会对训练保存的模型在测试集上验证效果 +5. 
terminal_pass:打印全局变量及预测结果等自定义的信息。 + +Trainer的自定义实现,可以参照[single_trainer.py](../core/trainers/single_trainer.py) + +## Model + +Model定义了各个模型实现的范式,模型只要继承并实现基类中的函数,并给一些成员赋值,就可以保证模型被Trainer正确调用。 + +我们首先看一下Model基类中的部分重要定义,对模型的实现流程有初步概念。 + +```python +class Model(object): + __metaclass__ = abc.ABCMeta + + def __init__(self, config): + self._cost = None + self._metrics = {} + self._data_var = [] + self._infer_data_var = [] + self._infer_results = {} + self._data_loader = None + self._infer_data_loader = None + self._fetch_interval = 20 + self._namespace = "train.model" + self._platform = envs.get_platform() + + def get_inputs(self): + return self._data_var + + @abc.abstractmethod + def train_net(self): + pass + + @abc.abstractmethod + def infer_net(self): + pass + + def get_avg_cost(self): + return self._cost + +``` + +每个模型都一定需要继承`def train_net`与`def infer_net`,并且给`self._data_var`与`self._cost`成员赋值,指定模型入口,实现组网的整体逻辑。若有更多或更复杂的需求,可以参照下面的接口,分别继承各个函数,并实现需要的功能: + +```python +def get_infer_inputs(self): + return self._infer_data_var + +def get_infer_results(self): + return self._infer_results + +def get_metrics(self): + return self._metrics + +def get_fetch_period(self): + return self._fetch_interval +``` + +model的具体实现,可以参考dnn的示例[model.py](../../models/rank/dnn/../../../paddlerec/core/model.py) + + +## Reader + +PaddleRec会根据运行环境,分别指定不同的数据IO方式。在Linux下,优先使用`Dataset`,Win及Mac优先使用`Dataloader`。 + + +Dataset的使用介绍可以参考[DatasetFactory](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/dataset_cn/DatasetFactory_cn.html) + +Dataloader的使用介绍可以参考[异步数据读取](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/data_preparing/use_py_reader.html) + + +考虑到以上两种高效的数据IO方式仍然有很高的学习门槛,PaddleRec将两种数据读取方式进行了更高层次的封装,用户需要实现的仅是每行数据的处理逻辑,剩下的工作交给PaddleRec的Reader基类完成。 + +首先浏览以下Reader基类的定义,有一个初步的印象: + +```python +class Reader(dg.MultiSlotDataGenerator): + __metaclass__ = abc.ABCMeta + + def __init__(self, config): + dg.MultiSlotDataGenerator.__init__(self) + + if os.path.isfile(config): + with open(config, 'r') as rb: + _config = yaml.load(rb.read(), Loader=yaml.FullLoader) + else: + raise ValueError("reader config only support yaml") + + envs.set_global_envs(_config) + envs.update_workspace() + + @abc.abstractmethod + def init(self): + pass + + @abc.abstractmethod + def generate_sample(self, line): + pass + +``` + +用户需要关注并实现的是`def init(self)`与`def generate_sample(self,line)`函数,分别执行数据读取中预处理所需变量的初始化,以及每一行string的切分及处理逻辑。 + +当用户定义好以上两个函数,完成自己的Reader后,PaddleRec分别使用 +- [dataset_instance.py](../core/utils/dataset_instance.py) +- [dataloader_instance.py](../core/utils/dataloader_instance.py) + +完成reader的构建工作。 + +Reader数据处理的逻辑,可以参考[criteo_reader.py](../../models/rank/../../paddlerec/models/rank/criteo_reader.py) + + + +## Metric + +训练必然伴随着训练指标的打印,当单机运行时,打印相关信息比较简单。但分布式训练时,单机指标与全局指标往往有很大diff,比如`auc`以及正逆序`pn`。PaddleRec面向大规模分布式训练,将指标打印的逻辑抽象出来单独实现,以解决分布式训练时全局指标打印的问题。 + +Metric基类定义了基本的接口,如下: +```python +class Metric(object): + __metaclass__ = abc.ABCMeta + + def __init__(self, config): + """ init """ + pass + + @abc.abstractmethod + def clear(self, scope, params): + """ + clear current value + Args: + scope: value container + params: extend varilable for clear + """ + pass + + @abc.abstractmethod + def calculate(self, scope, params): + """ + calculate result + Args: + scope: value container + params: extend varilable for clear + """ + pass + + @abc.abstractmethod + def get_result(self): + """ + Return: + result(dict) : calculate result + """ + pass + + @abc.abstractmethod + def get_result_to_string(self): + """ + Return: + 
result(string) : calculate result with string format, for output + """ + pass +``` + +全局指标的计算及输出,需要分别继承并实现以上四个成员函数。具体实现的例子,可以参考[auc_metric.py](../core/metrics/auc_metrics.py) \ No newline at end of file diff --git a/doc/distributed_train.md b/doc/distributed_train.md index c28425f7aa9a50fd165a4071b2d09a28694a4ab1..425f141ab76e173a9484dff90cd5cfb55acaf853 100644 --- a/doc/distributed_train.md +++ b/doc/distributed_train.md @@ -4,10 +4,7 @@ > 占位 ### 本地模拟分布式 > 占位 -### MPI集群运行分布式 -> 占位 -### PaddleCloud集群运行分布式 -> 占位 + ### K8S集群运行分布式 > 占位 diff --git a/doc/imgs/design.png b/doc/imgs/design.png new file mode 100644 index 0000000000000000000000000000000000000000..740112697e5d6ce82521446ac05336cad33d6324 Binary files /dev/null and b/doc/imgs/design.png differ diff --git a/doc/imgs/dssm.png b/doc/imgs/dssm.png new file mode 100644 index 0000000000000000000000000000000000000000..c1c462b825225ce74f9669dc1b301ced8823287d Binary files /dev/null and b/doc/imgs/dssm.png differ diff --git a/doc/imgs/gnn.png b/doc/imgs/gnn.png new file mode 100644 index 0000000000000000000000000000000000000000..8a5e111e92853a92cd04df5de1c347f85afde56f Binary files /dev/null and b/doc/imgs/gnn.png differ diff --git a/doc/imgs/gru4rec.png b/doc/imgs/gru4rec.png new file mode 100644 index 0000000000000000000000000000000000000000..ab7f6074b15ee85ce6f24e19ad74f3c16a72559d Binary files /dev/null and b/doc/imgs/gru4rec.png differ diff --git a/doc/imgs/multiview-simnet.png b/doc/imgs/multiview-simnet.png new file mode 100644 index 0000000000000000000000000000000000000000..01c6974d6fb60b317b741669a41f2eecd949ca57 Binary files /dev/null and b/doc/imgs/multiview-simnet.png differ diff --git a/doc/imgs/overview.png b/doc/imgs/overview.png index f6e8186ebcae25eb75cf4e71dac18a00b6cb814c..f0d951fddfc6d498ec6b4a0636bdafc95dc966bc 100644 Binary files a/doc/imgs/overview.png and b/doc/imgs/overview.png differ diff --git a/doc/imgs/rec-overview.png b/doc/imgs/rec-overview.png new file mode 100644 index 0000000000000000000000000000000000000000..952bc447b7a2b602e9a2348d403092afaee1af08 Binary files /dev/null and b/doc/imgs/rec-overview.png differ diff --git a/doc/imgs/ssr.png b/doc/imgs/ssr.png new file mode 100644 index 0000000000000000000000000000000000000000..03326c1681c56cfaa53ea6e0593b3fdee0c016e5 Binary files /dev/null and b/doc/imgs/ssr.png differ diff --git a/doc/imgs/word2vec.png b/doc/imgs/word2vec.png new file mode 100644 index 0000000000000000000000000000000000000000..947ff6174685fd0ce24632d3c75487aa3d011df3 Binary files /dev/null and b/doc/imgs/word2vec.png differ diff --git a/doc/ps_background.md b/doc/ps_background.md index 567c30a9672be8b3fa02da22e755555c68d38ea8..984e1b00c96242843cceaf68cf15bb8deb52c391 100644 --- a/doc/ps_background.md +++ b/doc/ps_background.md @@ -1,140 +1,8 @@ -# 参数服务器训练简介 +## [分布式训练概述](https://www.paddlepaddle.org.cn/tutorials/projectdetail/459124) -如图1所示,参数服务器是分布式训练领域普遍采用的编程架构,主要包含Server和Worker两个部分,其中Server负责参数的存储和更新,而Worker负责训练。飞桨的参数服务器功能也是基于这种经典的架构进行设计和开发的,同时在这基础上进行了SGD(Stochastic Gradient Descent)算法的创新(Geometric Stochastic Gradient Descent)。当前经过大量的实验验证,最佳的方案是每台机器上启动Server和Worker两个进程,而一个Worker进程中可以包含多个用于训练的线程。 -

- -

+## [多机多卡训练](https://www.paddlepaddle.org.cn/tutorials/projectdetail/459127) -参数服务器是主要解决两类问题: -- 模型参数过大:单机内存空间不足,需要采用分布式存储。 -- 训练数据过多:单机训练太慢,需要加大训练节点,来提高并发训练速度。 设想,当训练数据过多,一个Worker训练太慢时,可以引入多个Worker同时训练,这时Worker之间需要同步模型参数。直观想法是,引入一个Server,Server充当Worker间参数交换的媒介。但当模型参数过大以至于单机存储空间不足时或Worker过多导致一个Server是瓶颈时,就需要引入多个Server。 +## [参数服务器训练](https://www.paddlepaddle.org.cn/tutorials/projectdetail/464839) - -具体训练流程: - -- 将训练数据均匀的分配给不同的Worker。 -- 将模型参数分片,存储在不同的Server上。 -- Worker端:读取一个minibatch训练数据,从Server端拉取最新的参数,计算梯度,并根据分片上传给不同的Server。 -- Server端:接收Worker端上传的梯度,根据优化算法更新参数。根据Server端每次参数更新是否需要等待所有Worker端的梯度,分为同步训练和异步训练两种机制。 - - -飞桨的参数服务器框架也是基于这种经典的参数服务器模式进行设计和开发的,同时在这基础上进行了SGD(Stochastic Gradient Descent)算法的创新(GEO-SGD)。目前飞桨支持3种模式,分别是同步训练模式、异步训练模式、GEO异步训练模式,如图2所示。 - -

- -

- -## 同步训练 - -Worker在训练一个batch的数据后,会合并所有线程的梯度发给Server, Server在收到所有节点的梯度后,会统一进行梯度合并及参数更新。同步训练的优势在于Loss可以比较稳定的下降,缺点是整个训练速度较慢,这是典型的木桶原理,速度的快慢取决于最慢的那个线程的训练计算时间,因此在训练较为复杂的模型时,即模型训练过程中神经网络训练耗时远大于节点间通信耗时的场景下,推荐使用同步训练模式。 - -## 异步训练 - -在训练一个batch的数据后,Worker的每个线程会发送梯度给Server。而Server不会等待接收所有节点的梯度,而是直接基于已收到的梯度进行参数更新。异步训练去除了训练过程中的等待机制,训练速度得到了极大的提升,但是缺点也很明显,那就是Loss下降不稳定,容易发生抖动。建议在个性化推荐(召回、排序)、语义匹配等数据量大的场景使用。 尤其是推荐领域的点击率预估场景,该场景可能会出现千亿甚至万亿规模的稀疏特征,而稀疏参数也可以达到万亿数量级,且需要小时级或分钟级流式增量训练。如果使用异步训练模式,可以很好的满足该场景的online-learning需求。 - -## GEO异步训练 - -GEO(Geometric Stochastic Gradient Descent)异步训练是飞桨自研的异步训练模式,其最大的特点是将参数的更新从Server转移到Worker上。每个Worker在本地训练过程中会使用SGD优化算法更新本地模型参数,在训练若干个batch的数据后,Worker将发送参数更新信息给Server。Server在接收后会通过加和方式更新保存的参数信息。所以显而易见,在GEO异步训练模式下,Worker不用再等待Server发来新的参数即可执行训练,在训练效果和训练速度上有了极大的提升。但是此模式比较适合可以在单机内能完整保存的模型,在搜索、NLP等类型的业务上应用广泛,推荐在词向量、语义匹配等场景中使用。 - -> 运行策略的详细描述可以参考文档:[PaddlePaddle Fluid CPU分布式训练(Trainspiler)使用指南](https://www.paddlepaddle.org.cn/tutorials/projectdetail/454253) - -## 单机代码转分布式 - -### 训练代码准备 -参数服务器架构,有两个重要的组成部分:Server与Worker。为了启动训练,我们是否要准备两套代码分别运行呢?答案是不需要的。Paddle Fleet API将两者运行的逻辑进行了很好的统一,用户只需使用`fleet.init(role)`就可以判断当前启动的程序扮演server还是worker。使用如下的编程范式,只需10行,便可将单机代码转变为分布式代码: -``` python -role = role_maker.PaddleCloudRoleMaker() -fleet.init(role) - -# Define your network, choose your optimizer(SGD/Adam/Adagrad etc.) -strategy = StrategyFactory.create_sync_strategy() -optimizer = fleet.distributed_optimizer(optimizer, strategy) - -if fleet.is_server(): - fleet.init_server() - fleet.run_server() -if fleet.is_worker(): - fleet.init_worker() - # run training - fleet.stop_worker() -``` - -### 运行环境准备 -- Paddle参数服务器模式的训练,目前只支持在`Liunx`环境下运行,推荐使用`ubuntu`或`CentOS` -- Paddle参数服务器模式的前端代码支持`python 2.7`及`python 3.5+`,若使用`Dataset`模式的高性能IO,需使用`python 2.7` -- 使用多台机器进行分布式训练,请确保各自之间可以通过`ip:port`的方式访问`rpc`服务,使用`http/https`代理会导致通信失败 -- 各个机器之间的通信耗费应尽量少 - -假设我们有两台机器,想要在每台机器上分别启动一个`server`进程以及一个`worker`进程,完成2x2(2个参数服务器,2个训练节点)的参数服务器模式分布式训练,按照如下步骤操作。 - -### 启动server -机器A,IP地址是`10.89.176.11`,通信端口是`36000`,配置如下环境变量后,运行训练的入口程序: -```bash -export PADDLE_PSERVERS_IP_PORT_LIST="10.89.176.11:36000,10.89.176.12:36000" -export TRAINING_ROLE=PSERVER -export POD_IP=10.89.176.11 # node A:10.89.176.11 -export PADDLE_PORT=36000 -export PADDLE_TRAINERS_NUM=2 -python -u train.py --is_cloud=1 -``` -应能在日志中看到如下输出: - -> I0318 21:47:01.298220 188592128 grpc_server.cc:470] Server listening on 127.0.0.1:36000 selected port: 36000 - -查看系统进程 -> 8624 | ttys000 | 0:02.31 | python -u train.py --is_cloud=1 - -查看系统进程及端口占用: - -> python3.7 | 8624 | paddle | 8u | IPv6 | 0xe149b87d093872e5 | 0t0 | TCP | localhost:36000 (LISTEN) - -也可以看到我们的`server`进程8624的确在`36000`端口开始了监听,等待`worker`的通信。 - -机器B,IP地址是`10.89.176.12`,通信端口是`36000`,配置如下环境变量后,运行训练的入口程序: -```bash -export PADDLE_PSERVERS_IP_PORT_LIST="10.89.176.11:36000,10.89.176.12:36000" -export TRAINING_ROLE=PSERVER -export POD_IP=10.89.176.12 # node B: 10.89.176.12 -export PADDLE_PORT=36000 -export PADDLE_TRAINERS_NUM=2 -python -u train.py --is_cloud=1 -``` -也可以看到相似的日志输出与进程状况。(进行验证时,请务必确保IP与端口的正确性) - -### 启动worker - -接下来我们分别在机器A与B上开启训练进程。配置如下环境变量并开启训练进程: - -机器A: -```bash -export PADDLE_PSERVERS_IP_PORT_LIST="10.89.176.11:36000,10.89.176.12:36000" -export TRAINING_ROLE=TRAINER -export PADDLE_TRAINERS_NUM=2 -export PADDLE_TRAINER_ID=0 # node A:trainer_id = 0 -python -u train.py --is_cloud=1 -``` - -机器B: -```bash -export PADDLE_PSERVERS_IP_PORT_LIST="10.89.176.11:36000,10.89.176.12:36000" -export TRAINING_ROLE=TRAINER -export PADDLE_TRAINERS_NUM=2 -export PADDLE_TRAINER_ID=1 # node B: trainer_id = 1 -python -u 
train.py --is_cloud=1 -``` - -运行该命令时,若pserver还未就绪,可在日志输出中看到如下信息: -> server not ready, wait 3 sec to retry... -> -> not ready endpoints:['10.89.176.11:36000', '10.89.176.12:36000'] - -worker进程将持续等待,直到server开始监听,或等待超时。 - -当pserver都准备就绪后,可以在日志输出看到如下信息: -> I0317 11:38:48.099179 16719 communicator.cc:271] Communicator start -> -> I0317 11:38:49.838711 16719 rpc_client.h:107] init rpc client with trainer_id 0 - -至此,分布式训练启动完毕,将开始训练。 \ No newline at end of file diff --git a/doc/rec_background.md b/doc/rec_background.md index be3e2a8116e0bbcef811223b8d0feed8a0b192ef..b6cf8b1c360165033f5640a775f3c8721754f4a3 100644 --- a/doc/rec_background.md +++ b/doc/rec_background.md @@ -1,4 +1,5 @@ # 推荐系统背景知识 +本文来源于[个性化推荐](https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/README.cn.md),进行了节选。 本文代码目录在[book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system),初次使用请您参考[Book文档使用说明](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。 diff --git a/example/__init__.py b/example/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/example/cloud/backend.yaml b/example/cloud/backend.yaml deleted file mode 100755 index 2eff4222fc2f18ccf80ad25b108d8fb762bf6b99..0000000000000000000000000000000000000000 --- a/example/cloud/backend.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -engine: - workspace: "/home/tangwei/fleet_rec_env/paddlerec/example/cloud" - backend: "PaddleCloud" - - submit: - sk: "2907445932295c57bffab28676fc5a66" - ak: "55cb2027d7a05b9f8b213de483698cc8" - version: "paddle-fluid-v1.7.1" - priority: "high" - jobname: "paddlerec_ctr_dnn" - group: "paddle_benchmark" - cluster: "SZWG-SUNWAY" - config: "{workspace}/config.ini" - nodes: 8 - - submit_scrpit: "{workspace}/submit.sh" diff --git a/example/cloud/before_hook.sh b/example/cloud/before_hook.sh deleted file mode 100644 index 35f8dfa0e717715ea0dc912cea823a7073b34976..0000000000000000000000000000000000000000 --- a/example/cloud/before_hook.sh +++ /dev/null @@ -1,3 +0,0 @@ -echo "Run before_hook.sh ..." -../python27-gcc482/bin/python ../python27-gcc482/bin/pip install ./thirdparty/paddle_rec-0.0.2-py2-none-any.whl --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com -echo "End before_hook.sh ..." 
diff --git a/example/cloud/config.ini b/example/cloud/config.ini deleted file mode 100644 index 4baa49dc91e0e3bcd889ca9b41cf823a28daa94e..0000000000000000000000000000000000000000 --- a/example/cloud/config.ini +++ /dev/null @@ -1,17 +0,0 @@ -#type of storage cluster -storage_type="hdfs" -#attention: files for training should be put on hdfs -force_reuse_output_path="True" - -fs_name="afs://yinglong.afs.baidu.com:9902" -fs_ugi="paddle,paddle" - -thirdparty_path="/user/paddle/benchmark/ctr/thirdparty" - -#train data path on hdfs -train_data_path="/user/paddle/benchmark/ctr/train_data_paddle/part_1*" -#test data path on hdfs -#test_data_path="/user/paddle/benchmark/ctr/test_data" - -#the output directory on hdfs -output_path="/user/paddle/ly" \ No newline at end of file diff --git a/example/cloud/config.yaml b/example/cloud/config.yaml deleted file mode 100755 index 8cec449bda487c47cba064773a70a0117ff8037b..0000000000000000000000000000000000000000 --- a/example/cloud/config.yaml +++ /dev/null @@ -1,34 +0,0 @@ -train: - trainer: - # for cluster training - strategy: "async" - - epochs: 10 - workspace: "paddlerec.models.rank.dnn" - - reader: - batch_size: 512 - class: "{workspace}/../criteo_reader.py" - train_data_path: "train_data" - reader_debug_mode: False - - model: - models: "{workspace}/model.py" - hyper_parameters: - sparse_inputs_slots: 27 - sparse_feature_number: 1000001 - sparse_feature_dim: 10 - dense_input_dim: 13 - fc_sizes: [400, 400, 400] - learning_rate: 0.0001 - optimizer: adam - - save: - increment: - dirname: "increment" - epoch_interval: 2 - save_last: True - inference: - dirname: "inference" - epoch_interval: 4 - save_last: True diff --git a/example/cloud/job.sh b/example/cloud/job.sh deleted file mode 100644 index 5f001a6f3005983e189d7a066df997a2e6b77b01..0000000000000000000000000000000000000000 --- a/example/cloud/job.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -################################################### -# Usage: job.sh -# Description: run mpi job clinet implement -################################################### - - -# ---------------------------------------------------------------------------- # -# variable define # -# ---------------------------------------------------------------------------- # -export CPU_NUM=16 -export GLOG_v=0 -export FLAGS_rpc_deadline=300000 -# ---------------------------------------------------------------------------- # - -python -m paddlerec.run -m paddle_rec_config.yaml -e cluster -r worker diff --git a/example/cloud/submit.sh b/example/cloud/submit.sh deleted file mode 100644 index 9a5cdd8ccb9dde270db5bf456ddabc38b025abb6..0000000000000000000000000000000000000000 --- a/example/cloud/submit.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -################################################### -# Usage: submit.sh -# Description: run mpi submit clinet implement -################################################### - -g_package_files="" - -#----------------------------------------------------------------------------------------------------------------- -#fun : before hook submit to cluster -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function before_submit_hook() { - echo "before_submit" -} - -#----------------------------------------------------------------------------------------------------------------- -#fun : after hook submit to cluster -#param : N/A -#return : 0 -- success; not 0 -- failure 
-#----------------------------------------------------------------------------------------------------------------- -function after_submit_hook() { - echo "after_submit" -} - -#----------------------------------------------------------------------------------------------------------------- -#fun : package to cluster -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function package() { - echo "package" - temp=${engine_temp_path} - - cp ${engine_workspace}/job.sh ${temp} - cp ${engine_workspace}/before_hook.sh ${temp} - cp ${engine_run_config} ${temp}/paddle_rec_config.yaml - - g_submitfiles="job.sh before_hook.sh paddle_rec_config.yaml" - g_run_cmd="sh job.sh" -} diff --git a/example/mpi/__init__.py b/example/mpi/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/example/mpi/backend.yaml b/example/mpi/backend.yaml deleted file mode 100755 index 77384db41d4ab9163ec7b3dcc196358341f7a51a..0000000000000000000000000000000000000000 --- a/example/mpi/backend.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -engine: - workspace: "paddlerec.models.rank.dnn" - backend: "MPI" - - hdfs: - name: "hdfs://nmg01-taihang-hdfs.dmop.baidu.com:54310" - ugi: "fcr,SaK2VqfEDeXzKPor" - output: "/app/ecom/fcr/fanyabo/wadstyleimageq/tangwei12/output_1/" - - package: - build_script: "{workspace}/package.sh" - python: "/home/tangwei/fleet_rec_env/cpython-2.7.11-ucs4" - paddlerec: "/home/tangwei/fleet_rec_env/PaddleRec" - - submit: - hpc: "/home/tangwei/Plines/client/smart_client_khan/" - qconf: "/home/tangwei/Plines/imageq/qsub_f.conf" - nodes: 10 - - submit_scrpit: "{workspace}/submit.sh" - job_scrpit: "{workspace}/worker.sh" diff --git a/example/mpi/job.sh b/example/mpi/job.sh deleted file mode 100644 index 4362901b4b0e2c329a74d185e3b33fb995bc05fb..0000000000000000000000000000000000000000 --- a/example/mpi/job.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -################################################### -# Usage: job.sh -# Description: run job on mpi per node -################################################### - -# ---------------------------------------------------------------------------- # -# variable define # -# ---------------------------------------------------------------------------- # -declare g_curPath="" -declare g_scriptName="" -declare g_workPath="" -declare g_run_stage="" - -# ---------------------------------------------------------------------------- # -# const define # -# ---------------------------------------------------------------------------- # -export FLAGS_communicator_thread_pool_size=5 -export FLAGS_communicator_send_queue_size=18 -export FLAGS_communicator_thread_pool_size=20 -export FLAGS_communicator_max_merge_var_num=18 -################################################################################ - -#----------------------------------------------------------------------------------------------------------------- -#fun : check function return code -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function check_error() { - if [ ${?} -ne 0 ]; then - echo "execute " + $g_run_stage + " raise exception! please check ..." 
- exit 1 - fi -} - -#----------------------------------------------------------------------------------------------------------------- -#fun : check function return code -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function env_prepare() { - g_run_stage="env_prepare" - WORKDIR=$(pwd) - mpirun -npernode 1 mv package/* ./ - echo "current:"$WORKDIR - - mpirun -npernode 1 tar -zxvf python.tar.gz > /dev/null - - export PYTHONPATH=$WORKDIR/python/ - export PYTHONROOT=$WORKDIR/python/ - export LIBRARY_PATH=$PYTHONPATH/lib:$LIBRARY_PATH - export LD_LIBRARY_PATH=$PYTHONPATH/lib:$LD_LIBRARY_PATH - export PATH=$PYTHONPATH/bin:$PATH - export LIBRARY_PATH=$PYTHONROOT/lib:$LIBRARY_PATH - - mpirun -npernode 1 python/bin/python -m pip uninstall -y paddle-rec - mpirun -npernode 1 python/bin/python -m pip install whl/fleet_rec-0.0.2-py2-none-any.whl --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com - check_error -} - -function run() { - echo "run" - g_run_stage="run" - mpirun -npernode 2 -timestamp-output -tag-output -machinefile ${PBS_NODEFILE} python/bin/python -u -m paddlerec.run -m paddlerec.models.rank.dnn --engine cluster --role worker -} - -function main() { - env_prepare - run -} - -main diff --git a/example/mpi/submit.sh b/example/mpi/submit.sh deleted file mode 100644 index 33eba8394aae179d09ca8ae85547d98fd5db38bd..0000000000000000000000000000000000000000 --- a/example/mpi/submit.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -################################################### -# Usage: submit.sh -# Description: run mpi submit clinet implement -################################################### - -#----------------------------------------------------------------------------------------------------------------- -#fun : get argument from env, set it into variables -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function vars_get_from_env() { - echo "xx" -} - -#----------------------------------------------------------------------------------------------------------------- -#fun : package -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function package() { - g_run_stage="package" - - temp=${engine_temp_path} - echo "package temp dir: " ${temp} - - cp ${engine_job_scrpit} ${temp} - cp ${engine_submit_qconf} ${temp} - echo "copy job.sh from " ${engine_worker} " to " ${temp} - - mkdir -p ${temp}/package - cp -r ${engine_package_python} ${temp}/package/ - echo "copy python from " ${engine_package_python} " to " ${temp} - - mkdir ${temp}/package/whl - cp ${engine_package_paddlerec} ${temp}/package/whl/ - echo "copy " ${engine_package_paddlerec} " to " ${temp}"/whl/" -} - -#----------------------------------------------------------------------------------------------------------------- -#fun : before hook submit to cluster -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function before_submit() { - echo "before_submit" -} - -#----------------------------------------------------------------------------------------------------------------- -#fun : after hook submit to cluster -#param : 
N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function after_submit() { - echo "after_submit" -} - -#----------------------------------------------------------------------------------------------------------------- -#fun : submit to cluster -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function submit() { - g_run_stage="submit" - g_job_name="paddle_rec_mpi" - g_hdfs_path=$g_hdfs_path - g_job_entry="job.sh" - - engine_hdfs_output=${engine_hdfs_output}/$(date +%Y%m%d%H%M%S) - - cd ${engine_temp_path} - - ${engine_submit_hpc}/bin/qsub_f \ - -N ${g_job_name} \ - --conf ${engine_submit_qconf} \ - --hdfs ${engine_hdfs_name} \ - --ugi ${engine_hdfs_ugi} \ - --hout ${engine_hdfs_output} \ - --files ./package \ - -l nodes=${engine_submit_nodes},walltime=1000:00:00,resource=full ${g_job_entry} -} - -function main() { - package - - before_submit - submit - after_submit -} diff --git a/models/contentunderstanding/classification/model.py b/models/contentunderstanding/classification/model.py index 79c21ed3ff6770abdc600fc8c237442b41bef76d..9e853aa01d4a0b6bd5c7a20d8e13164bd9905ad0 100644 --- a/models/contentunderstanding/classification/model.py +++ b/models/contentunderstanding/classification/model.py @@ -1,13 +1,21 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
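+# Text classification model. train_net consumes (input, label, seq_len),
+# predicts with a softmax fc layer, and optimizes the mean cross-entropy
+# loss; accuracy is exposed through the "acc" metric.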
+ import paddle.fluid as fluid -import math -from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase -import paddle.fluid as fluid -import paddle.fluid.layers.nn as nn -import paddle.fluid.layers.tensor as tensor -import paddle.fluid.layers.control_flow as cf class Model(ModelBase): def __init__(self, config): @@ -22,7 +30,7 @@ class Model(ModelBase): def train_net(self): """ network definition """ - + data = fluid.data(name="input", shape=[None, self.max_len], dtype='int64') label = fluid.data(name="label", shape=[None, 1], dtype='int64') seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64') @@ -46,12 +54,12 @@ class Model(ModelBase): prediction = fluid.layers.fc(input=[fc_1], size=self.class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) + acc = fluid.layers.accuracy(input=prediction, label=label) self.cost = avg_cost self._metrics["acc"] = acc - def get_cost_op(self): + def get_avg_cost(self): return self.cost def get_metrics(self): diff --git a/models/contentunderstanding/classification/reader.py b/models/contentunderstanding/classification/reader.py index a15ba371887a5434fe9f77325f6cd75c84b4b730..136a5668856c0fb558a016a3bc3a0b8a56651d3b 100644 --- a/models/contentunderstanding/classification/reader.py +++ b/models/contentunderstanding/classification/reader.py @@ -1,28 +1,33 @@ -import re +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
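+# Each sample line holds space-separated integers: 10 text feature ids,
+# followed by one sequence-length field and one label field.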
+ + import sys -import collections -import os -import six -import time -import numpy as np -import paddle.fluid as fluid -import paddle -import csv -import io from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs + class TrainReader(Reader): def init(self): pass - def _process_line(self, l): + def _process_line(self, l): l = l.strip().split(" ") data = l[0:10] seq_len = l[10:11] label = l[11:] - return data, label, seq_len + return data, label, seq_len def generate_sample(self, line): def data_iter(): @@ -33,6 +38,7 @@ class TrainReader(Reader): data = [int(i) for i in data] label = [int(i) for i in label] seq_len = [int(i) for i in seq_len] - print >>sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)]) + print >> sys.stderr, str([('data', data), ('label', label), ('seq_len', seq_len)]) yield [('data', data), ('label', label), ('seq_len', seq_len)] + return data_iter diff --git a/models/contentunderstanding/classification/train_data/part-0 b/models/contentunderstanding/classification/train_data/part-0 index 4159fad2700b0b1e7b071efba39f005922289748..421b9e90a38b918180dcb673963574adc8547776 100644 --- a/models/contentunderstanding/classification/train_data/part-0 +++ b/models/contentunderstanding/classification/train_data/part-0 @@ -1,320 +1,320 @@ -12 27 13 0 25 52 89 20 39 4 9 1 +12 27 13 0 25 52 89 20 39 4 9 1 78 10 61 58 29 79 85 16 46 41 9 1 -81 77 44 4 5 57 43 97 42 89 6 0 -7 77 86 3 98 89 56 24 7 59 9 1 -65 89 99 27 65 98 16 89 42 0 3 0 -66 14 48 38 66 5 56 89 98 19 4 1 -78 7 10 20 77 16 37 43 59 23 6 1 -84 95 28 35 0 82 55 19 13 81 7 0 -34 32 98 37 43 51 6 38 20 40 9 0 +81 77 44 4 5 57 43 97 42 89 6 0 + 7 77 86 3 98 89 56 24 7 59 9 1 +65 89 99 27 65 98 16 89 42 0 3 0 +66 14 48 38 66 5 56 89 98 19 4 1 +78 7 10 20 77 16 37 43 59 23 6 1 +84 95 28 35 0 82 55 19 13 81 7 0 +34 32 98 37 43 51 6 38 20 40 9 0 75 36 13 51 70 24 62 90 32 91 7 1 -13 5 49 21 57 21 67 85 74 14 1 0 +13 5 49 21 57 21 67 85 74 14 1 0 68 13 86 16 52 50 23 11 65 99 1 1 15 20 75 55 15 90 54 54 15 91 9 0 -44 56 15 88 57 3 62 53 89 57 8 1 -23 8 40 25 60 33 8 69 44 88 7 1 -63 94 5 43 23 70 31 67 21 55 6 0 +44 56 15 88 57 3 62 53 89 57 8 1 +23 8 40 25 60 33 8 69 44 88 7 1 +63 94 5 43 23 70 31 67 21 55 6 0 44 11 64 92 10 37 30 84 19 71 5 1 89 18 71 13 16 58 47 60 77 87 7 1 13 48 56 39 98 53 32 93 13 91 7 0 56 78 67 68 27 11 77 48 45 10 1 1 -52 12 14 5 2 8 3 36 33 59 6 0 -86 42 91 81 2 9 21 0 44 7 9 1 +52 12 14 5 2 8 3 36 33 59 6 0 +86 42 91 81 2 9 21 0 44 7 9 1 96 27 82 55 81 30 91 41 91 58 2 1 97 69 76 47 80 62 23 30 87 22 7 1 42 56 25 47 42 18 80 53 15 57 7 0 34 73 75 88 61 79 40 74 87 87 6 1 -7 91 9 24 42 60 76 31 10 13 4 0 -21 1 46 59 61 54 99 54 89 55 5 1 -67 21 1 29 88 5 3 85 39 22 5 1 -90 99 7 8 17 77 73 3 32 10 5 0 + 7 91 9 24 42 60 76 31 10 13 4 0 +21 1 46 59 61 54 99 54 89 55 5 1 +67 21 1 29 88 5 3 85 39 22 5 1 +90 99 7 8 17 77 73 3 32 10 5 0 30 44 26 32 37 74 90 71 42 29 9 1 -79 68 3 24 21 37 35 3 76 23 6 1 -3 66 7 4 2 88 94 64 47 81 6 1 +79 68 3 24 21 37 35 3 76 23 6 1 + 3 66 7 4 2 88 94 64 47 81 6 1 10 48 16 49 96 93 61 97 84 39 3 1 73 28 67 59 89 92 17 24 52 71 3 1 -98 4 35 62 91 2 78 51 72 93 1 1 +98 4 35 62 91 2 78 51 72 93 1 1 37 42 96 10 48 49 84 45 59 47 5 1 -13 24 7 49 63 78 29 75 45 92 7 1 -1 6 95 23 38 34 85 94 33 47 6 1 +13 24 7 49 63 78 29 75 45 92 7 1 + 1 6 95 23 38 34 85 94 33 47 6 1 99 63 65 39 72 73 91 20 16 45 9 0 -35 8 81 24 62 0 95 0 52 46 4 1 -58 66 88 42 86 94 91 8 18 92 7 0 -12 62 56 43 99 31 63 80 11 7 4 1 -22 36 1 39 69 20 56 75 17 15 7 0 -25 
97 62 50 99 98 32 2 98 75 7 1 -7 59 98 68 62 19 28 28 60 27 7 0 -39 63 43 45 43 11 40 81 4 25 6 0 +35 8 81 24 62 0 95 0 52 46 4 1 +58 66 88 42 86 94 91 8 18 92 7 0 +12 62 56 43 99 31 63 80 11 7 4 1 +22 36 1 39 69 20 56 75 17 15 7 0 +25 97 62 50 99 98 32 2 98 75 7 1 + 7 59 98 68 62 19 28 28 60 27 7 0 +39 63 43 45 43 11 40 81 4 25 6 0 81 95 27 84 71 45 87 65 40 50 1 0 82 21 69 55 71 92 52 65 90 16 3 0 -24 6 5 22 36 34 66 71 3 52 2 0 -5 14 66 71 49 10 52 81 32 14 1 0 -8 94 52 23 60 27 43 19 89 91 9 0 +24 6 5 22 36 34 66 71 3 52 2 0 + 5 14 66 71 49 10 52 81 32 14 1 0 + 8 94 52 23 60 27 43 19 89 91 9 0 26 14 36 37 28 94 46 96 11 80 8 1 89 19 77 66 48 75 62 58 90 81 8 1 -25 43 95 21 25 81 39 79 9 74 9 0 -25 2 64 27 67 36 59 68 99 66 5 1 +25 43 95 21 25 81 39 79 9 74 9 0 +25 2 64 27 67 36 59 68 99 66 5 1 13 46 41 55 89 93 79 83 32 52 6 0 -49 77 57 9 91 49 86 50 32 5 2 0 -94 7 53 54 70 69 5 51 59 91 5 1 -24 72 94 13 17 12 2 67 0 89 6 1 +49 77 57 9 91 49 86 50 32 5 2 0 +94 7 53 54 70 69 5 51 59 91 5 1 +24 72 94 13 17 12 2 67 0 89 6 1 70 38 19 27 38 87 72 41 98 84 6 1 -89 76 82 4 69 64 97 77 88 58 9 0 -67 41 99 1 80 38 96 24 67 59 3 1 +89 76 82 4 69 64 97 77 88 58 9 0 +67 41 99 1 80 38 96 24 67 59 3 1 42 83 50 19 97 99 99 50 46 76 8 1 -43 99 63 40 93 15 3 57 11 0 1 0 +43 99 63 40 93 15 3 57 11 0 1 0 16 65 31 43 89 37 98 63 29 69 8 1 -39 5 65 45 12 82 46 87 82 93 8 0 -34 69 82 13 4 20 92 58 46 83 2 1 +39 5 65 45 12 82 46 87 82 93 8 0 +34 69 82 13 4 20 92 58 46 83 2 1 46 79 87 57 87 23 72 95 37 88 8 0 -41 72 81 71 60 15 32 1 9 97 3 0 +41 72 81 71 60 15 32 1 9 97 3 0 84 98 15 78 39 82 89 74 46 32 9 0 16 18 92 80 50 44 98 45 15 41 3 1 -74 78 81 40 17 65 38 21 27 9 1 0 -14 69 68 50 57 11 62 2 89 54 6 0 -70 29 79 29 44 56 33 27 25 4 3 1 +74 78 81 40 17 65 38 21 27 9 1 0 +14 69 68 50 57 11 62 2 89 54 6 0 +70 29 79 29 44 56 33 27 25 4 3 1 44 20 87 67 65 41 93 37 99 78 1 1 -93 57 87 11 33 40 21 3 47 87 9 1 -8 3 24 49 99 48 40 22 99 41 2 0 -19 90 9 83 93 22 36 96 44 73 7 1 -4 73 2 88 79 90 32 48 45 12 5 0 +93 57 87 11 33 40 21 3 47 87 9 1 + 8 3 24 49 99 48 40 22 99 41 2 0 +19 90 9 83 93 22 36 96 44 73 7 1 + 4 73 2 88 79 90 32 48 45 12 5 0 24 58 34 67 85 62 84 48 14 79 5 1 54 69 19 18 59 78 84 48 61 46 4 0 72 69 95 26 30 74 49 30 95 61 8 0 73 29 46 39 48 30 97 63 89 34 9 1 51 32 44 22 70 69 91 81 74 52 3 0 -99 66 89 71 31 42 5 40 21 12 6 0 +99 66 89 71 31 42 5 40 21 12 6 0 58 26 59 56 91 49 79 57 57 74 6 1 -30 36 59 74 6 30 17 1 99 38 4 0 -43 48 77 86 67 25 38 36 3 91 4 1 -67 24 51 34 37 8 98 76 84 13 1 1 +30 36 59 74 6 30 17 1 99 38 4 0 +43 48 77 86 67 25 38 36 3 91 4 1 +67 24 51 34 37 8 98 76 84 13 1 1 73 47 88 15 32 99 67 26 28 89 3 1 -91 66 11 86 5 12 15 43 79 89 1 1 -15 60 43 58 61 0 62 32 98 29 9 0 -80 36 78 42 70 52 2 10 42 41 6 1 -36 16 46 34 96 39 8 21 86 54 5 1 -80 72 13 1 28 49 73 90 81 34 1 0 -73 64 86 9 94 49 44 38 47 64 2 0 -69 90 69 36 60 45 39 7 41 72 8 0 +91 66 11 86 5 12 15 43 79 89 1 1 +15 60 43 58 61 0 62 32 98 29 9 0 +80 36 78 42 70 52 2 10 42 41 6 1 +36 16 46 34 96 39 8 21 86 54 5 1 +80 72 13 1 28 49 73 90 81 34 1 0 +73 64 86 9 94 49 44 38 47 64 2 0 +69 90 69 36 60 45 39 7 41 72 8 0 31 86 54 82 81 77 93 99 68 63 1 1 -95 76 97 36 40 12 4 95 59 64 4 1 +95 76 97 36 40 12 4 95 59 64 4 1 88 20 64 40 27 11 96 40 41 73 6 0 28 72 70 43 34 54 98 43 29 63 5 0 -78 72 4 47 47 38 73 8 65 40 3 1 -91 64 51 93 8 78 53 15 42 32 4 0 -34 36 45 9 16 0 51 40 90 29 2 1 -80 93 65 80 11 19 26 61 29 8 4 0 -94 11 60 36 58 98 43 90 64 1 1 0 +78 72 4 47 47 38 73 8 65 40 3 1 +91 64 51 93 8 78 53 15 42 32 4 0 +34 36 
45 9 16 0 51 40 90 29 2 1 +80 93 65 80 11 19 26 61 29 8 4 0 +94 11 60 36 58 98 43 90 64 1 1 0 42 54 89 86 80 72 81 48 19 67 5 0 81 25 30 60 59 20 75 38 75 29 6 0 84 16 48 28 23 20 53 13 32 90 1 0 58 31 77 68 27 88 51 97 70 93 8 1 -63 67 85 6 35 22 28 65 8 7 3 0 -54 75 93 58 98 9 15 37 61 38 6 1 -56 24 50 62 63 47 9 4 58 30 8 1 -64 91 32 68 50 90 51 86 52 6 1 1 -55 50 46 41 28 1 11 39 75 9 1 0 -23 27 98 73 25 7 89 48 7 44 4 1 -86 98 68 1 74 46 15 92 59 25 9 1 +63 67 85 6 35 22 28 65 8 7 3 0 +54 75 93 58 98 9 15 37 61 38 6 1 +56 24 50 62 63 47 9 4 58 30 8 1 +64 91 32 68 50 90 51 86 52 6 1 1 +55 50 46 41 28 1 11 39 75 9 1 0 +23 27 98 73 25 7 89 48 7 44 4 1 +86 98 68 1 74 46 15 92 59 25 9 1 95 86 72 13 33 60 62 83 96 84 1 0 -9 58 37 50 57 16 78 0 21 80 2 0 -82 94 74 42 3 60 61 93 34 22 3 1 -16 97 97 14 47 50 90 35 9 58 5 0 -70 94 82 42 85 88 59 58 6 68 9 0 -14 58 24 44 8 29 12 18 26 80 7 0 -22 23 7 82 39 28 96 92 23 40 5 1 -40 31 72 94 20 81 89 4 42 1 5 0 -57 63 71 41 28 2 39 67 90 54 6 0 -9 74 4 41 11 31 15 21 44 32 6 1 + 9 58 37 50 57 16 78 0 21 80 2 0 +82 94 74 42 3 60 61 93 34 22 3 1 +16 97 97 14 47 50 90 35 9 58 5 0 +70 94 82 42 85 88 59 58 6 68 9 0 +14 58 24 44 8 29 12 18 26 80 7 0 +22 23 7 82 39 28 96 92 23 40 5 1 +40 31 72 94 20 81 89 4 42 1 5 0 +57 63 71 41 28 2 39 67 90 54 6 0 + 9 74 4 41 11 31 15 21 44 32 6 1 31 28 66 66 61 78 72 80 82 88 3 1 -79 18 1 59 35 62 0 72 78 97 7 0 +79 18 1 59 35 62 0 72 78 97 7 0 14 19 30 63 38 37 12 15 54 15 6 1 54 91 37 79 60 35 55 62 94 84 7 1 10 55 78 96 45 55 35 56 54 70 6 1 23 46 15 93 66 11 32 45 74 25 4 0 -51 55 9 9 88 59 21 66 87 12 1 1 -90 22 38 66 12 9 30 48 55 85 1 1 -39 23 82 29 57 76 79 56 3 19 2 0 -7 72 76 15 90 23 40 40 33 39 4 1 +51 55 9 9 88 59 21 66 87 12 1 1 +90 22 38 66 12 9 30 48 55 85 1 1 +39 23 82 29 57 76 79 56 3 19 2 0 + 7 72 76 15 90 23 40 40 33 39 4 1 60 64 34 11 18 18 38 39 53 37 1 1 85 72 51 47 83 90 32 96 78 23 9 1 -85 51 96 31 83 70 57 65 15 0 6 0 -41 11 56 94 40 6 62 86 68 83 7 0 -34 82 44 30 2 2 94 62 41 27 6 1 -54 86 50 83 76 65 0 87 80 70 7 0 -97 50 65 78 2 90 28 5 12 56 5 1 -34 19 68 93 11 9 14 87 22 70 9 0 +85 51 96 31 83 70 57 65 15 0 6 0 +41 11 56 94 40 6 62 86 68 83 7 0 +34 82 44 30 2 2 94 62 41 27 6 1 +54 86 50 83 76 65 0 87 80 70 7 0 +97 50 65 78 2 90 28 5 12 56 5 1 +34 19 68 93 11 9 14 87 22 70 9 0 63 77 27 20 20 37 65 51 29 29 9 1 -22 79 98 57 56 97 43 49 4 80 4 1 -6 4 35 54 4 36 1 79 85 35 6 0 -12 55 68 61 91 43 49 5 93 27 8 0 +22 79 98 57 56 97 43 49 4 80 4 1 + 6 4 35 54 4 36 1 79 85 35 6 0 +12 55 68 61 91 43 49 5 93 27 8 0 64 22 69 16 63 20 28 60 13 35 7 1 -9 19 60 89 62 29 47 33 6 13 4 0 -14 15 39 86 47 75 7 70 57 60 6 1 + 9 19 60 89 62 29 47 33 6 13 4 0 +14 15 39 86 47 75 7 70 57 60 6 1 90 63 12 43 28 46 39 97 83 42 6 0 -49 3 3 64 59 46 30 13 61 10 2 0 +49 3 3 64 59 46 30 13 61 10 2 0 79 47 29 47 54 38 50 66 18 63 5 1 -98 67 1 22 66 32 91 77 63 33 3 0 -72 22 10 27 28 44 29 66 71 1 7 0 -20 52 19 23 9 38 1 93 83 73 5 0 -88 57 22 64 93 66 20 90 78 2 7 1 -90 86 41 28 14 25 86 73 7 21 4 0 -63 91 0 29 2 78 86 76 9 20 4 1 -3 57 91 37 21 85 80 99 18 79 1 1 -69 95 36 6 85 47 83 83 61 52 4 0 -72 4 34 16 59 78 56 70 27 44 9 1 -58 42 6 53 21 7 83 38 86 66 5 0 +98 67 1 22 66 32 91 77 63 33 3 0 +72 22 10 27 28 44 29 66 71 1 7 0 +20 52 19 23 9 38 1 93 83 73 5 0 +88 57 22 64 93 66 20 90 78 2 7 1 +90 86 41 28 14 25 86 73 7 21 4 0 +63 91 0 29 2 78 86 76 9 20 4 1 + 3 57 91 37 21 85 80 99 18 79 1 1 +69 95 36 6 85 47 83 83 61 52 4 0 +72 4 34 16 59 78 56 70 27 44 9 1 +58 42 6 53 21 7 83 38 86 66 5 0 22 86 22 21 86 22 83 38 62 
19 4 0 14 63 20 53 98 76 10 22 35 76 9 1 16 88 13 66 37 33 11 40 61 97 2 1 -60 9 98 35 51 11 98 73 67 26 6 1 -25 48 87 93 58 58 15 9 23 13 7 1 -61 47 47 36 97 22 63 35 9 38 5 1 -94 49 41 38 0 81 59 39 13 65 3 0 +60 9 98 35 51 11 98 73 67 26 6 1 +25 48 87 93 58 58 15 9 23 13 7 1 +61 47 47 36 97 22 63 35 9 38 5 1 +94 49 41 38 0 81 59 39 13 65 3 0 88 82 71 96 76 16 57 24 72 36 5 1 -28 46 8 95 94 86 63 1 42 63 6 0 +28 46 8 95 94 86 63 1 42 63 6 0 12 95 29 66 64 77 19 26 73 53 4 0 -19 5 52 34 13 62 6 4 25 58 5 0 -18 39 39 56 73 29 5 15 13 82 1 1 +19 5 52 34 13 62 6 4 25 58 5 0 +18 39 39 56 73 29 5 15 13 82 1 1 50 66 99 67 76 25 43 12 24 67 9 0 -74 56 61 97 23 63 22 63 6 83 2 1 +74 56 61 97 23 63 22 63 6 83 2 1 10 96 13 49 43 20 58 19 99 58 7 1 -2 95 31 4 99 91 27 90 85 32 3 0 + 2 95 31 4 99 91 27 90 85 32 3 0 41 23 20 71 41 75 75 35 16 12 3 1 21 33 87 57 19 27 94 36 80 10 6 0 -8 0 25 74 14 61 86 8 42 82 9 0 + 8 0 25 74 14 61 86 8 42 82 9 0 23 33 91 19 84 99 95 92 29 31 8 0 -94 94 5 6 98 23 37 65 14 25 6 1 -42 16 39 32 2 20 86 81 90 91 8 0 +94 94 5 6 98 23 37 65 14 25 6 1 +42 16 39 32 2 20 86 81 90 91 8 0 72 39 20 63 88 52 65 81 77 96 4 0 48 73 65 75 89 36 75 36 11 35 8 0 -79 74 3 29 63 20 76 46 8 82 5 0 -7 46 38 77 79 92 71 98 30 35 6 0 +79 74 3 29 63 20 76 46 8 82 5 0 + 7 46 38 77 79 92 71 98 30 35 6 0 44 69 93 31 22 68 91 70 32 86 5 0 45 38 77 87 64 44 69 19 28 82 9 0 -93 63 92 84 22 44 51 94 4 99 9 0 -77 10 49 29 59 55 44 7 95 39 2 0 -10 85 99 9 91 29 64 14 50 24 6 1 -74 4 21 12 77 36 71 51 50 31 9 1 -66 76 28 18 23 49 33 31 6 44 1 1 -92 50 90 64 95 58 93 4 78 88 6 1 +93 63 92 84 22 44 51 94 4 99 9 0 +77 10 49 29 59 55 44 7 95 39 2 0 +10 85 99 9 91 29 64 14 50 24 6 1 +74 4 21 12 77 36 71 51 50 31 9 1 +66 76 28 18 23 49 33 31 6 44 1 1 +92 50 90 64 95 58 93 4 78 88 6 1 69 79 76 47 46 26 30 40 33 58 8 1 -97 12 87 82 6 18 57 49 49 58 1 1 +97 12 87 82 6 18 57 49 49 58 1 1 70 79 55 86 29 88 55 39 17 74 5 1 65 51 45 62 54 17 59 12 29 79 5 0 -5 63 82 51 54 97 54 36 57 46 3 0 -74 77 52 10 12 9 34 95 2 0 5 0 + 5 63 82 51 54 97 54 36 57 46 3 0 +74 77 52 10 12 9 34 95 2 0 5 0 50 20 22 89 50 70 55 98 80 50 1 0 -61 80 7 3 78 36 44 37 90 18 9 0 +61 80 7 3 78 36 44 37 90 18 9 0 81 13 55 57 88 81 66 55 18 34 2 1 52 30 54 70 28 56 48 82 67 20 8 1 -0 41 15 63 27 90 12 16 56 79 3 0 -69 89 54 1 93 10 15 2 25 59 8 0 + 0 41 15 63 27 90 12 16 56 79 3 0 +69 89 54 1 93 10 15 2 25 59 8 0 74 99 17 93 96 82 38 77 98 85 4 0 -8 59 17 92 60 21 59 76 55 73 2 1 + 8 59 17 92 60 21 59 76 55 73 2 1 53 56 79 19 29 94 86 96 62 39 3 1 -23 44 25 63 41 94 65 10 8 40 9 1 -7 18 80 43 20 70 14 59 72 17 9 0 -84 97 79 14 37 64 23 68 8 24 2 0 -63 94 98 77 8 62 10 77 63 56 4 0 -8 63 74 34 49 22 52 54 44 93 3 0 +23 44 25 63 41 94 65 10 8 40 9 1 + 7 18 80 43 20 70 14 59 72 17 9 0 +84 97 79 14 37 64 23 68 8 24 2 0 +63 94 98 77 8 62 10 77 63 56 4 0 + 8 63 74 34 49 22 52 54 44 93 3 0 94 48 92 58 82 48 53 34 96 25 2 0 -33 15 3 95 48 93 9 69 44 77 7 1 +33 15 3 95 48 93 9 69 44 77 7 1 69 72 80 77 64 24 52 21 36 49 2 0 59 34 54 66 60 19 76 79 16 70 5 1 -8 83 9 91 67 79 31 20 31 88 2 0 -64 95 46 95 78 63 4 60 66 63 7 1 -10 39 78 45 36 4 89 94 68 75 7 0 + 8 83 9 91 67 79 31 20 31 88 2 0 +64 95 46 95 78 63 4 60 66 63 7 1 +10 39 78 45 36 4 89 94 68 75 7 0 81 52 70 11 48 15 40 63 29 14 8 1 94 49 30 14 53 12 53 42 77 82 8 1 -40 88 46 20 54 84 76 15 2 73 2 1 +40 88 46 20 54 84 76 15 2 73 2 1 71 50 79 54 17 58 30 16 17 99 1 1 74 79 74 61 61 36 28 39 89 36 6 0 -53 45 45 23 51 32 93 26 10 8 3 0 -1 97 6 67 88 20 41 63 49 6 8 0 -3 64 41 19 41 80 75 71 69 90 8 
0 -31 90 38 93 52 0 38 86 41 68 9 1 -50 94 53 9 73 59 94 7 24 57 3 0 -87 11 4 62 96 7 0 59 46 11 6 1 +53 45 45 23 51 32 93 26 10 8 3 0 + 1 97 6 67 88 20 41 63 49 6 8 0 + 3 64 41 19 41 80 75 71 69 90 8 0 +31 90 38 93 52 0 38 86 41 68 9 1 +50 94 53 9 73 59 94 7 24 57 3 0 +87 11 4 62 96 7 0 59 46 11 6 1 77 67 56 88 45 62 10 51 86 27 6 1 62 62 59 99 83 84 79 97 56 37 5 0 -19 55 0 37 44 44 2 7 54 50 5 1 -23 60 11 83 6 48 20 77 54 31 6 0 -27 53 52 30 3 70 57 38 47 96 5 0 -75 14 5 83 72 46 47 64 14 12 7 0 +19 55 0 37 44 44 2 7 54 50 5 1 +23 60 11 83 6 48 20 77 54 31 6 0 +27 53 52 30 3 70 57 38 47 96 5 0 +75 14 5 83 72 46 47 64 14 12 7 0 29 95 36 63 59 49 38 44 13 15 2 1 -38 3 70 89 2 94 89 74 33 6 8 1 -28 56 49 43 83 34 7 63 36 13 7 0 +38 3 70 89 2 94 89 74 33 6 8 1 +28 56 49 43 83 34 7 63 36 13 7 0 25 90 23 85 50 65 36 10 64 38 5 0 35 94 48 38 99 71 42 39 61 75 8 1 -28 73 34 22 51 8 52 98 74 19 8 1 -12 40 65 12 7 96 73 65 12 90 5 0 +28 73 34 22 51 8 52 98 74 19 8 1 +12 40 65 12 7 96 73 65 12 90 5 0 42 42 48 16 80 14 48 29 29 45 5 0 -58 20 4 0 69 99 15 4 16 4 1 1 -93 30 90 5 23 63 25 30 99 32 7 1 +58 20 4 0 69 99 15 4 16 4 1 1 +93 30 90 5 23 63 25 30 99 32 7 1 91 23 20 26 84 78 58 76 58 90 5 1 -33 2 36 59 55 9 79 34 92 57 9 0 +33 2 36 59 55 9 79 34 92 57 9 0 80 63 84 73 22 40 70 94 59 34 5 0 49 95 50 32 90 22 18 66 46 32 2 0 -47 72 3 94 33 78 87 43 11 67 5 0 +47 72 3 94 33 78 87 43 11 67 5 0 76 44 86 81 95 48 79 46 11 65 8 1 -59 51 97 75 17 5 40 59 32 62 6 0 -41 13 58 7 54 84 8 84 27 55 1 0 +59 51 97 75 17 5 40 59 32 62 6 0 +41 13 58 7 54 84 8 84 27 55 1 0 24 80 44 26 86 99 68 80 81 22 9 0 -12 45 16 44 66 76 33 53 3 20 9 0 -22 3 79 6 32 38 75 66 15 25 9 1 -51 48 26 53 33 26 18 74 9 39 5 1 +12 45 16 44 66 76 33 53 3 20 9 0 +22 3 79 6 32 38 75 66 15 25 9 1 +51 48 26 53 33 26 18 74 9 39 5 1 35 67 89 91 29 81 23 52 19 11 6 0 -64 50 43 1 43 49 19 20 84 19 8 0 -34 4 9 77 24 61 55 82 42 76 9 0 -37 84 94 33 67 60 3 95 78 8 9 0 -82 10 54 12 47 23 78 97 6 51 5 0 -70 40 38 47 5 38 83 70 37 90 2 0 +64 50 43 1 43 49 19 20 84 19 8 0 +34 4 9 77 24 61 55 82 42 76 9 0 +37 84 94 33 67 60 3 95 78 8 9 0 +82 10 54 12 47 23 78 97 6 51 5 0 +70 40 38 47 5 38 83 70 37 90 2 0 42 21 62 27 43 47 82 80 88 49 4 0 68 68 67 12 38 13 32 30 93 27 3 1 -5 44 98 28 5 81 20 56 10 34 9 1 + 5 44 98 28 5 81 20 56 10 34 9 1 40 46 11 33 73 62 68 70 66 85 4 0 -9 46 11 84 6 31 18 89 66 32 1 1 -6 78 44 98 77 29 69 39 62 78 1 0 -47 90 18 0 3 8 12 20 51 75 4 1 + 9 46 11 84 6 31 18 89 66 32 1 1 + 6 78 44 98 77 29 69 39 62 78 1 0 +47 90 18 0 3 8 12 20 51 75 4 1 21 29 74 19 12 29 41 22 63 47 8 1 -22 59 64 62 18 89 19 92 87 8 8 0 -6 21 24 58 14 53 18 93 62 15 8 0 -20 33 88 25 37 52 1 72 74 11 2 0 -90 49 28 53 28 80 22 81 0 46 9 0 -87 31 51 27 15 31 68 93 5 4 7 1 -21 72 60 2 24 79 22 24 77 61 9 0 -20 4 6 40 28 14 16 78 58 99 7 1 -80 35 98 20 91 35 47 29 3 19 2 1 -57 21 24 61 60 39 83 34 53 2 2 0 +22 59 64 62 18 89 19 92 87 8 8 0 + 6 21 24 58 14 53 18 93 62 15 8 0 +20 33 88 25 37 52 1 72 74 11 2 0 +90 49 28 53 28 80 22 81 0 46 9 0 +87 31 51 27 15 31 68 93 5 4 7 1 +21 72 60 2 24 79 22 24 77 61 9 0 +20 4 6 40 28 14 16 78 58 99 7 1 +80 35 98 20 91 35 47 29 3 19 2 1 +57 21 24 61 60 39 83 34 53 2 2 0 74 86 78 78 18 44 20 94 85 71 4 1 27 48 44 92 10 18 74 54 25 85 2 0 74 77 28 75 74 91 69 36 95 68 7 0 32 84 17 18 55 79 59 57 21 69 2 1 -69 77 40 98 83 40 4 66 39 83 1 1 -63 24 32 39 75 92 81 49 2 51 5 1 -35 40 84 71 3 16 82 91 44 52 8 0 -21 78 66 4 57 27 21 89 4 34 7 1 +69 77 40 98 83 40 4 66 39 83 1 1 +63 24 32 39 75 92 81 49 2 51 5 1 +35 40 84 71 3 16 82 
91 44 52 8 0 +21 78 66 4 57 27 21 89 4 34 7 1 94 18 57 49 88 26 29 76 56 67 6 0 -14 91 71 30 5 36 28 74 16 73 3 1 +14 91 71 30 5 36 28 74 16 73 3 1 93 36 43 46 77 44 59 19 56 84 3 0 -11 16 2 67 11 96 20 91 20 59 2 1 +11 16 2 67 11 96 20 91 20 59 2 1 72 79 26 99 90 71 56 46 35 99 3 0 29 87 20 40 13 14 14 40 61 27 6 0 41 64 28 51 56 52 87 67 37 91 6 1 -33 14 5 30 99 54 27 80 54 55 4 1 +33 14 5 30 99 54 27 80 54 55 4 1 60 44 73 91 71 53 54 95 59 81 6 0 -69 33 11 83 4 53 34 39 43 84 1 0 -73 31 19 4 50 20 66 73 94 88 4 0 -30 49 41 76 5 21 88 69 76 3 2 0 +69 33 11 83 4 53 34 39 43 84 1 0 +73 31 19 4 50 20 66 73 94 88 4 0 +30 49 41 76 5 21 88 69 76 3 2 0 18 50 27 76 67 38 87 16 52 87 5 1 -33 36 80 8 43 82 89 76 37 3 5 0 -98 21 61 24 58 13 9 85 56 74 1 1 -84 27 50 96 9 56 30 31 85 65 1 1 -65 74 40 2 8 40 18 57 30 38 1 1 -76 44 64 6 10 32 84 70 74 24 1 1 -14 29 59 34 27 8 0 37 27 68 3 0 -6 47 5 77 15 41 93 49 59 83 4 1 -39 88 43 89 32 98 82 0 5 12 9 0 -78 79 30 26 58 6 9 58 37 65 8 1 +33 36 80 8 43 82 89 76 37 3 5 0 +98 21 61 24 58 13 9 85 56 74 1 1 +84 27 50 96 9 56 30 31 85 65 1 1 +65 74 40 2 8 40 18 57 30 38 1 1 +76 44 64 6 10 32 84 70 74 24 1 1 +14 29 59 34 27 8 0 37 27 68 3 0 + 6 47 5 77 15 41 93 49 59 83 4 1 +39 88 43 89 32 98 82 0 5 12 9 0 +78 79 30 26 58 6 9 58 37 65 8 1 25 28 66 41 70 87 76 62 29 39 7 1 diff --git a/models/contentunderstanding/tagspace/model.py b/models/contentunderstanding/tagspace/model.py index aeb569c6b712fe10e698c2def30ae17004ac7e9a..033d51b8f5d50ddcb1199f566b679eff61acfccb 100644 --- a/models/contentunderstanding/tagspace/model.py +++ b/models/contentunderstanding/tagspace/model.py @@ -1,27 +1,38 @@ -import paddle.fluid as fluid -import math - -from paddlerec.core.utils import envs -from paddlerec.core.model import Model as ModelBase +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
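+# TagSpace model. Hyper-parameters (vocab_text_size, emb_dim, margin, etc.)
+# are read from the yaml config via envs.get_global_env instead of being
+# hard-coded.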
import paddle.fluid as fluid import paddle.fluid.layers.nn as nn import paddle.fluid.layers.tensor as tensor import paddle.fluid.layers.control_flow as cf +from paddlerec.core.model import Model as ModelBase +from paddlerec.core.utils import envs + + class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) self.cost = None self.metrics = {} - self.vocab_text_size = 11447#envs.get_global_env("vocab_text_size", None, self._namespace) - self.vocab_tag_size = 4#envs.get_global_env("vocab_tag_size", None, self._namespace) - self.emb_dim = 10#envs.get_global_env("emb_dim", None, self._namespace) - self.hid_dim = 1000#envs.get_global_env("hid_dim", None, self._namespace) - self.win_size = 5#envs.get_global_env("win_size", None, self._namespace) - self.margin = 0.1#envs.get_global_env("margin", None, self._namespace) - self.neg_size = 3#envs.get_global_env("neg_size", None, self._namespace) - print self.emb_dim + self.vocab_text_size = envs.get_global_env("vocab_text_size", None, self._namespace) + self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None, self._namespace) + self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace) + self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace) + self.win_size = envs.get_global_env("win_size", None, self._namespace) + self.margin = envs.get_global_env("margin", None, self._namespace) + self.neg_size = envs.get_global_env("neg_size", None, self._namespace) def train_net(self): """ network definition """ @@ -78,18 +89,16 @@ class Model(ModelBase): self.metrics["correct"] = correct self.metrics["cos_pos"] = cos_pos - def get_cost_op(self): + def get_avg_cost(self): return self.cost def get_metrics(self): return self.metrics def optimizer(self): - learning_rate = 0.01#envs.get_global_env("hyper_parameters.base_lr", None, self._namespace) + learning_rate = envs.get_global_env("hyper_parameters.base_lr", None, self._namespace) sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate) - #sgd_optimizer.minimize(avg_cost) return sgd_optimizer - def infer_net(self, parameter_list): self.train_net() diff --git a/models/contentunderstanding/tagspace/reader.py b/models/contentunderstanding/tagspace/reader.py index 93446e712f204c7caaf59c10d116e89e818ae78f..0f63b85fd1a322b55c6d0e451fe61ff90c82eaa5 100644 --- a/models/contentunderstanding/tagspace/reader.py +++ b/models/contentunderstanding/tagspace/reader.py @@ -1,23 +1,30 @@ -import re +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
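The tagspace `model.py` hunk above replaces hardcoded hyperparameters (`11447`, `4`, `10`, ...) with `envs.get_global_env(...)` lookups. A minimal, hypothetical sketch of that lookup pattern follows; it is not PaddleRec's actual `envs` module, and the config nesting shown is an assumption:

```python
# Hypothetical stand-in for paddlerec.core.utils.envs.get_global_env:
# hyperparameters live in a nested config instead of being hardcoded.
_GLOBAL_ENVS = {
    "train.model": {
        "vocab_text_size": 11447,  # the constants the old code hardcoded
        "vocab_tag_size": 4,
        "emb_dim": 10,
    }
}


def get_global_env(name, default=None, namespace="train.model"):
    """Return config value `name` under `namespace`, or `default` if absent."""
    return _GLOBAL_ENVS.get(namespace, {}).get(name, default)


emb_dim = get_global_env("emb_dim")        # -> 10, read from the config
win_size = get_global_env("win_size", 5)   # missing key -> falls back to 5
```

The same file diff also renames `get_cost_op` to `get_avg_cost` and drops the Python 2-only `print self.emb_dim` statement, which would be a syntax error under Python 3.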
+ + import sys -import collections -import os -import six -import time + import numpy as np -import paddle.fluid as fluid -import paddle -import csv -import io from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs + class TrainReader(Reader): def init(self): pass - def _process_line(self, l): + def _process_line(self, l): tag_size = 4 neg_size = 3 l = l.strip().split(",") @@ -40,10 +47,7 @@ class TrainReader(Reader): neg_index = rand_i neg_tag.append(neg_index) sum_n += 1 - # if n > 0 and len(text) > n: - # #yield None - # return None, None, None - return text, pos_tag, neg_tag + return text, pos_tag, neg_tag def generate_sample(self, line): def data_iter(): @@ -52,4 +56,5 @@ class TrainReader(Reader): yield None return yield [('text', text), ('pos_tag', pos_tag), ('neg_tag', neg_tag)] + return data_iter diff --git a/models/match/dssm/model.py b/models/match/dssm/model.py index ef77a8f01ebb2e0cf13551001ecb8f071f1ace2f..630fb3eeef062bdfda7720c2c54dd884ec033a71 100755 --- a/models/match/dssm/model.py +++ b/models/match/dssm/model.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math import paddle.fluid as fluid from paddlerec.core.utils import envs @@ -25,11 +24,12 @@ class Model(ModelBase): def input(self): TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None, self._namespace) - Neg = envs.get_global_env("hyper_parameters.NEG", None, self._namespace) + Neg = envs.get_global_env("hyper_parameters.NEG", None, self._namespace) self.query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) - self.doc_negs = [fluid.data(name="doc_neg_" + str(i), shape=[-1, TRIGRAM_D], dtype="float32", lod_level=0) for i in range(Neg)] + self.doc_negs = [fluid.data(name="doc_neg_" + str(i), shape=[-1, TRIGRAM_D], dtype="float32", lod_level=0) for i + in range(Neg)] self._data_var.append(self.query) self._data_var.append(self.doc_pos) for input in self.doc_negs: @@ -38,40 +38,40 @@ class Model(ModelBase): if self._platform != "LINUX": self._data_loader = fluid.io.DataLoader.from_generator( feed_list=self._data_var, capacity=64, use_double_buffer=False, iterable=False) - def net(self, is_infer=False): - hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace) + hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace) hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None, self._namespace) - + def fc(data, hidden_layers, hidden_acts, names): fc_inputs = [data] - for i in range(len(hidden_layers)): - xavier=fluid.initializer.Xavier(uniform=True, fan_in=fc_inputs[-1].shape[1], fan_out=hidden_layers[i]) - out = fluid.layers.fc(input=fc_inputs[-1], - size=hidden_layers[i], - act=hidden_acts[i], - param_attr=xavier, - bias_attr=xavier, - name=names[i]) - fc_inputs.append(out) - return fc_inputs[-1] - - query_fc = fc(self.query, hidden_layers, hidden_acts, ['query_l1', 'query_l2', 'query_l3']) - doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts, ['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3']) - self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc) + for i in range(len(hidden_layers)): + xavier = fluid.initializer.Xavier(uniform=True, fan_in=fc_inputs[-1].shape[1], fan_out=hidden_layers[i]) + out = fluid.layers.fc(input=fc_inputs[-1], + size=hidden_layers[i], + act=hidden_acts[i], + param_attr=xavier, 
+ bias_attr=xavier, + name=names[i]) + fc_inputs.append(out) + return fc_inputs[-1] + + query_fc = fc(self.query, hidden_layers, hidden_acts, ['query_l1', 'query_l2', 'query_l3']) + doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts, ['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3']) + self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc) if is_infer: return R_Q_D_ns = [] - for i, doc_neg in enumerate(self.doc_negs): - doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, ['doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i), 'doc_neg_l3_' + str(i)]) + for i, doc_neg in enumerate(self.doc_negs): + doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, + ['doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i), 'doc_neg_l3_' + str(i)]) R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i)) concat_Rs = fluid.layers.concat(input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1) - prob = fluid.layers.softmax(concat_Rs, axis=1) - - hit_prob = fluid.layers.slice(prob, axes=[0,1], starts=[0,0], ends=[4, 1]) + prob = fluid.layers.softmax(concat_Rs, axis=1) + + hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[4, 1]) loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob)) self.avg_cost = fluid.layers.mean(x=loss) @@ -101,10 +101,10 @@ class Model(ModelBase): self.doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0) self._infer_data_var = [self.query, self.doc_pos] - self._infer_data_loader = fluid.io.DataLoader.from_generator( + self._infer_data_loader = fluid.io.DataLoader.from_generator( feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) - + def infer_net(self): - self.infer_input() + self.infer_input() self.net(is_infer=True) - self.infer_results() + self.infer_results() diff --git a/models/match/dssm/synthetic_evaluate_reader.py b/models/match/dssm/synthetic_evaluate_reader.py index 1be5b13d48e17dfca1a0c11736074e80aadcbdc7..97f50abf9720060b008b90c7729e93d13701bb3b 100755 --- a/models/match/dssm/synthetic_evaluate_reader.py +++ b/models/match/dssm/synthetic_evaluate_reader.py @@ -14,7 +14,6 @@ from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs class EvaluateReader(Reader): diff --git a/models/match/dssm/synthetic_reader.py b/models/match/dssm/synthetic_reader.py index c4cfaf6dab61b4122a86e946dc004847c70465fb..13f57a6663ca372bc287386dd939214f362b503d 100755 --- a/models/match/dssm/synthetic_reader.py +++ b/models/match/dssm/synthetic_reader.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
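For reference, a NumPy sketch of the listwise loss the DSSM `net()` above assembles from the cosine similarities (a sketch of the math, not the Paddle graph itself):

```python
import numpy as np


# cos_pos: (batch, 1) query/pos-doc cosines; cos_negs: (batch, NEG).
def dssm_loss(cos_pos, cos_negs):
    logits = np.concatenate([cos_pos, cos_negs], axis=1)      # (batch, NEG + 1)
    exp = np.exp(logits - logits.max(axis=1, keepdims=True))  # stable softmax
    prob = exp / exp.sum(axis=1, keepdims=True)
    hit_prob = prob[:, 0:1]           # column 0 is the positive document
    return -np.log(hit_prob).sum()    # summed negative log-likelihood


print(dssm_loss(np.random.rand(4, 1), np.random.rand(4, 3)))
```

Note that the graph's `fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])` hardcodes a batch size of 4 in `ends`; the sketch slices the positive column for any batch size.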
+ from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs class TrainReader(Reader): @@ -37,7 +37,7 @@ class TrainReader(Reader): neg_docs = [] for i in range(len(features) - 2): feature_names.append('doc_neg_' + str(i)) - neg_docs.append(map(float, features[i+2].split(','))) + neg_docs.append(map(float, features[i + 2].split(','))) yield zip(feature_names, [query] + [pos_doc] + neg_docs) diff --git a/models/match/multiview-simnet/config.yaml b/models/match/multiview-simnet/config.yaml index 8399b40d4074f64d63e74e9cc0573ac6b422082c..53ac4c095c0d347cca8cba1afb9866c66ab85218 100755 --- a/models/match/multiview-simnet/config.yaml +++ b/models/match/multiview-simnet/config.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. evaluate: - workspace: "paddlerec.models.recall.multiview-simnet" + workspace: "paddlerec.models.match.multiview-simnet" reader: batch_size: 2 class: "{workspace}/evaluate_reader.py" @@ -24,7 +24,7 @@ train: strategy: "async" epochs: 2 - workspace: "paddlerec.models.recall.multiview-simnet" + workspace: "paddlerec.models.match.multiview-simnet" reader: batch_size: 2 diff --git a/models/match/multiview-simnet/data_process.sh b/models/match/multiview-simnet/data_process.sh index 15c6c908477cd3ba6a72a65bad039bb10295bd9c..91548c5863063f7a23cb2f713a3754b121b235b5 100755 --- a/models/match/multiview-simnet/data_process.sh +++ b/models/match/multiview-simnet/data_process.sh @@ -1,5 +1,20 @@ #! /bin/bash +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + set -e echo "begin to prepare data" diff --git a/models/match/multiview-simnet/evaluate_reader.py b/models/match/multiview-simnet/evaluate_reader.py index f7752e47825e358aee6ab5bc8bcd6b6d71fb539e..e0f8f9e43de80d003834056ea417914f1d10e898 100755 --- a/models/match/multiview-simnet/evaluate_reader.py +++ b/models/match/multiview-simnet/evaluate_reader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,18 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
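The `synthetic_reader.py` hunk earlier only reflows `features[i + 2]`, but it shows the sample layout: field 0 is the query vector, field 1 the positive doc, and every further field a negative doc. A hedged sketch of the whole parse; the outer separator and the leading `feature_names` entries sit outside the hunk and are assumptions here:

```python
# Assumed line layout: outer fields separated by ';', each field a
# comma-separated float vector -- both assumptions, since the outer
# split is not part of the hunk shown above.
def parse_line(line):
    features = line.strip().split(';')
    query = [float(x) for x in features[0].split(',')]
    pos_doc = [float(x) for x in features[1].split(',')]
    feature_names, neg_docs = ['query', 'doc_pos'], []
    for i in range(len(features) - 2):
        feature_names.append('doc_neg_' + str(i))
        # list comprehension instead of a bare map(): on Python 3,
        # map() yields a lazy iterator rather than a list
        neg_docs.append([float(x) for x in features[i + 2].split(',')])
    return list(zip(feature_names, [query, pos_doc] + neg_docs))


print(parse_line("0.1,0.2;0.3,0.4;0.5,0.6;0.7,0.8"))
```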
-import numpy as np -import io -import copy -import random + from paddlerec.core.reader import Reader from paddlerec.core.utils import envs class EvaluateReader(Reader): def init(self): - self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model") - self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model") + self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model") + self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model") self.all_slots = [] for i in range(self.query_slots): @@ -52,6 +49,7 @@ class EvaluateReader(Reader): if visit: self._all_slots_dict[slot][0] = False else: - output[index][1].append(padding) + output[index][1].append(padding) yield output + return data_iter diff --git a/models/match/multiview-simnet/generate_synthetic_data.py b/models/match/multiview-simnet/generate_synthetic_data.py index 5ebb3a355f3904dc0a5cccff8e9d0b48b89f18f4..d453e031cdca9be29892b913ea5f2636a6c05f5e 100755 --- a/models/match/multiview-simnet/generate_synthetic_data.py +++ b/models/match/multiview-simnet/generate_synthetic_data.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,10 +14,12 @@ import random + class Dataset: def __init__(self): pass + class SyntheticDataset(Dataset): def __init__(self, sparse_feature_dim, query_slot_num, title_slot_num, dataset_size=10000): # ids are randomly generated @@ -39,7 +41,7 @@ class SyntheticDataset(Dataset): for i in range(self.query_slot_num): qslot = generate_ids(self.ids_per_slot, self.sparse_feature_dim) - qslot = [str(fea) + ':' + str(i) for fea in qslot] + qslot = [str(fea) + ':' + str(i) for fea in qslot] query_slots += qslot for i in range(self.title_slot_num): pt_slot = generate_ids(self.ids_per_slot, @@ -50,7 +52,8 @@ class SyntheticDataset(Dataset): for i in range(self.title_slot_num): nt_slot = generate_ids(self.ids_per_slot, self.sparse_feature_dim) - nt_slot = [str(fea) + ':' + str(i + self.query_slot_num + self.title_slot_num) for fea in nt_slot] + nt_slot = [str(fea) + ':' + str(i + self.query_slot_num + self.title_slot_num) for fea in + nt_slot] neg_title_slots += nt_slot yield query_slots + pos_title_slots + neg_title_slots else: @@ -67,6 +70,7 @@ class SyntheticDataset(Dataset): def test(self): return self._reader_creator(False) + if __name__ == '__main__': sparse_feature_dim = 1000001 query_slots = 1 @@ -75,7 +79,7 @@ if __name__ == '__main__': dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots, dataset_size) train_reader = dataset.train() test_reader = dataset.test() - + with open("data/train/train.txt", 'w') as fout: for data in train_reader(): fout.write(' '.join(data)) diff --git a/models/match/multiview-simnet/model.py b/models/match/multiview-simnet/model.py index aa8d8db676155a3839496aa8096acc35cb784e90..5ba9fb5d05b27339d924bfe42c0e6ba0c2c68da3 100755 --- a/models/match/multiview-simnet/model.py +++ b/models/match/multiview-simnet/model.py @@ -12,16 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
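`generate_synthetic_data.py` above emits tokens of the form `<feature_id>:<slot_index>`, with query slots first, then positive-title slots, then negative-title slots. A condensed sketch of that layout (simplified from the class in the diff; the id sampling is an assumption):

```python
import random


def make_sample(sparse_dim, query_slots, title_slots, ids_per_slot=1):
    tokens, slot = [], 0
    # slot order: query slots, then pos-title slots, then neg-title slots
    for group_size in (query_slots, title_slots, title_slots):
        for _ in range(group_size):
            ids = [random.randint(0, sparse_dim - 1) for _ in range(ids_per_slot)]
            tokens += ["%d:%d" % (fea, slot) for fea in ids]
            slot += 1
    return " ".join(tokens)


# same dimensions as the script's __main__ block
print(make_sample(1000001, query_slots=1, title_slots=1))
```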
-import numpy as np -import math import paddle.fluid as fluid -import paddle.fluid.layers as layers import paddle.fluid.layers.tensor as tensor import paddle.fluid.layers.control_flow as cf from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase + class BowEncoder(object): """ bow-encoder """ @@ -97,13 +95,14 @@ class SimpleEncoderFactory(object): rnn_encode = GrnnEncoder(hidden_size=enc_hid_size) return rnn_encode + class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) self.init_config() - + def init_config(self): - self._fetch_interval = 1 + self._fetch_interval = 1 query_encoder = envs.get_global_env("hyper_parameters.query_encoder", None, self._namespace) title_encoder = envs.get_global_env("hyper_parameters.title_encoder", None, self._namespace) query_encode_dim = envs.get_global_env("hyper_parameters.query_encode_dim", None, self._namespace) @@ -115,19 +114,19 @@ class Model(ModelBase): factory.create(query_encoder, query_encode_dim) for i in range(query_slots) ] - self.title_encoders = [ + self.title_encoders = [ factory.create(title_encoder, title_encode_dim) for i in range(title_slots) ] - self.emb_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) - self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim", None, self._namespace) - self.emb_shape = [self.emb_size, self.emb_dim] - self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace) - self.margin = 0.1 + self.emb_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) + self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim", None, self._namespace) + self.emb_shape = [self.emb_size, self.emb_dim] + self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace) + self.margin = 0.1 def input(self, is_train=True): - self.q_slots = [ + self.q_slots = [ fluid.data( name="%d" % i, shape=[None, 1], lod_level=1, dtype='int64') for i in range(len(self.query_encoders)) @@ -138,22 +137,23 @@ class Model(ModelBase): for i in range(len(self.title_encoders)) ] - if is_train == False: - return self.q_slots + self.pt_slots + if is_train == False: + return self.q_slots + self.pt_slots self.nt_slots = [ fluid.data( - name="%d" % (i + len(self.query_encoders) + len(self.title_encoders)), shape=[None, 1], lod_level=1, dtype='int64') + name="%d" % (i + len(self.query_encoders) + len(self.title_encoders)), shape=[None, 1], lod_level=1, + dtype='int64') for i in range(len(self.title_encoders)) ] return self.q_slots + self.pt_slots + self.nt_slots - + def train_input(self): res = self.input() self._data_var = res - use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace) + use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace) if self._platform != "LINUX" or use_dataloader: self._data_loader = fluid.io.DataLoader.from_generator( @@ -161,15 +161,15 @@ class Model(ModelBase): def get_acc(self, x, y): less = tensor.cast(cf.less_than(x, y), dtype='float32') - label_ones = fluid.layers.fill_constant_batch_size_like( + label_ones = fluid.layers.fill_constant_batch_size_like( input=x, dtype='float32', shape=[-1, 1], value=1.0) correct = fluid.layers.reduce_sum(less) - total = fluid.layers.reduce_sum(label_ones) + total = fluid.layers.reduce_sum(label_ones) acc = fluid.layers.elementwise_div(correct, total) - return acc + return acc def net(self): - q_embs = 
[ + q_embs = [ fluid.embedding( input=query, size=self.emb_shape, param_attr="emb") for query in self.q_slots @@ -184,8 +184,8 @@ class Model(ModelBase): input=title, size=self.emb_shape, param_attr="emb") for title in self.nt_slots ] - - # encode each embedding field with encoder + + # encode each embedding field with encoder q_encodes = [ self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs) ] @@ -201,7 +201,7 @@ class Model(ModelBase): pt_concat = fluid.layers.concat(pt_encodes) nt_concat = fluid.layers.concat(nt_encodes) - # projection of hidden layer + # projection of hidden layer q_hid = fluid.layers.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w', @@ -219,7 +219,7 @@ class Model(ModelBase): cos_pos = fluid.layers.cos_sim(q_hid, pt_hid) cos_neg = fluid.layers.cos_sim(q_hid, nt_hid) - # pairwise hinge_loss + # pairwise hinge_loss loss_part1 = fluid.layers.elementwise_sub( tensor.fill_constant_batch_size_like( input=cos_pos, @@ -236,7 +236,7 @@ class Model(ModelBase): loss_part2) self.avg_cost = fluid.layers.mean(loss_part3) - self.acc = self.get_acc(cos_neg, cos_pos) + self.acc = self.get_acc(cos_neg, cos_pos) def avg_loss(self): self._cost = self.avg_cost @@ -253,19 +253,19 @@ class Model(ModelBase): def optimizer(self): learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) - optimizer = fluid.optimizer.Adam(learning_rate=learning_rate) - return optimizer + optimizer = fluid.optimizer.Adam(learning_rate=learning_rate) + return optimizer def infer_input(self): res = self.input(is_train=False) - self._infer_data_var = res + self._infer_data_var = res self._infer_data_loader = fluid.io.DataLoader.from_generator( feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) - + def infer_net(self): - self.infer_input() - # lookup embedding for each slot + self.infer_input() + # lookup embedding for each slot q_embs = [ fluid.embedding( input=query, size=self.emb_shape, param_attr="emb") @@ -276,14 +276,14 @@ class Model(ModelBase): input=title, size=self.emb_shape, param_attr="emb") for title in self.pt_slots ] - # encode each embedding field with encoder + # encode each embedding field with encoder q_encodes = [ self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs) ] pt_encodes = [ self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs) ] - # concat multi view for query, pos_title, neg_title + # concat multi view for query, pos_title, neg_title q_concat = fluid.layers.concat(q_encodes) pt_concat = fluid.layers.concat(pt_encodes) # projection of hidden layer diff --git a/models/match/multiview-simnet/reader.py b/models/match/multiview-simnet/reader.py index 5616547ee167707dd8d1cf6ab81d57f96152e023..43cd1a629a7540e727e423a98d497964203134ac 100755 --- a/models/match/multiview-simnet/reader.py +++ b/models/match/multiview-simnet/reader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,18 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
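A NumPy sketch of the pairwise hinge loss and the accuracy metric that `net()` and `get_acc()` above build, with `margin` being the hardcoded `0.1` from `init_config`:

```python
import numpy as np


def hinge_loss_and_acc(cos_pos, cos_neg, margin=0.1):
    # loss_part1..3 in the graph: max(0, margin - cos_pos + cos_neg)
    loss = np.maximum(0.0, margin - cos_pos + cos_neg).mean()
    # get_acc(): fraction of pairs ranked correctly (cos_neg < cos_pos)
    acc = (cos_neg < cos_pos).mean()
    return loss, acc


pos = np.array([0.9, 0.4, 0.7])
neg = np.array([0.2, 0.5, 0.1])
print(hinge_loss_and_acc(pos, neg))   # small loss, acc = 2/3
```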
-import numpy as np -import io -import copy -import random + from paddlerec.core.reader import Reader from paddlerec.core.utils import envs class TrainReader(Reader): def init(self): - self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model") - self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model") + self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model") + self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model") self.all_slots = [] for i in range(self.query_slots): @@ -55,6 +52,7 @@ class TrainReader(Reader): if visit: self._all_slots_dict[slot][0] = False else: - output[index][1].append(padding) + output[index][1].append(padding) yield output + return data_iter diff --git a/models/match/readme.md b/models/match/readme.md index c2b52d1c2dd3b28715bb9b9c8660aa9e26880555..6bccc109ff14582e816dee64b72b786a1e90f49e 100755 --- a/models/match/readme.md +++ b/models/match/readme.md @@ -9,13 +9,7 @@ * [整体介绍](#整体介绍) * [匹配模型列表](#匹配模型列表) * [使用教程](#使用教程) - * [数据处理](#数据处理) - * [训练](#训练) - * [预测](#预测) -* [效果对比](#效果对比) - * [模型效果列表](#模型效果列表) -* [分布式](#分布式) - * [模型性能列表](#模型性能列表) + * [训练&预测](#训练&预测) ## 整体介绍 ### 匹配模型列表 @@ -25,22 +19,22 @@ | DSSM | Deep Structured Semantic Models | [Learning Deep Structured Semantic Models for Web Search using Clickthrough Data](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2013_DSSM_fullversion.pdf)(2013) | | MultiView-Simnet | Multi-view Simnet for Personalized recommendation | [A Multi-View Deep Learning Approach for Cross Domain User Modeling in Recommendation Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/frp1159-songA.pdf)(2015) | +下面是每个模型的简介(注:图片引用自链接中的论文) + +[DSSM](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2013_DSSM_fullversion.pdf): +
+(figure: DSSM model architecture, image from the linked paper)
+
+[MultiView-Simnet](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/frp1159-songA.pdf):
+(figure: MultiView-Simnet model architecture, image from the linked paper)
+
+ ## 使用教程 -### 数据处理 -### 训练 -### 预测 - -## 效果对比 -### 模型效果列表 - -| 数据集 | 模型 | loss | auc | -| :------------------: | :--------------------: | :---------: |:---------: | -| - | DSSM | -- | -- | -| - | MultiView-Simnet | -- | -- | - -## 分布式 -### 模型性能列表 -| 数据集 | 模型 | 单机 | 多机(同步) | 多机(异步) | GPU | -| :------------------: | :--------------------: | :---------: |:---------: |:---------: |:---------: | -| - | DSSM | -- | -- | -- | -- | -| - | MultiView-Simnet | -- | -- | -- | -- | +### 训练&预测 +```shell +python -m paddlerec.run -m paddlerec.models.match.dssm # dssm +python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet +``` + diff --git a/models/multitask/esmm/esmm_infer_reader.py b/models/multitask/esmm/esmm_infer_reader.py index 6e94a1eed07bab82fa80ece0041f9b1e94bb531d..8ca9eca67fdbb9e11f39db34b5dd9cfae518773b 100644 --- a/models/multitask/esmm/esmm_infer_reader.py +++ b/models/multitask/esmm/esmm_infer_reader.py @@ -13,19 +13,19 @@ # limitations under the License. from __future__ import print_function -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs from collections import defaultdict -import numpy as np + +from paddlerec.core.reader import Reader class EvaluateReader(Reader): def init(self): - all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128', '129', + all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128', + '129', '205', '206', '207', '210', '216', '508', '509', '702', '853', '301'] self.all_field_id_dict = defaultdict(int) - for i,field_id in enumerate(all_field_id): - self.all_field_id_dict[field_id] = [False,i] + for i, field_id in enumerate(all_field_id): + self.all_field_id_dict[field_id] = [False, i] def generate_sample(self, line): """ @@ -39,25 +39,26 @@ class EvaluateReader(Reader): features = line.strip().split(',') ctr = int(features[1]) cvr = int(features[2]) - + padding = 0 - output = [(field_id,[]) for field_id in self.all_field_id_dict] + output = [(field_id, []) for field_id in self.all_field_id_dict] for elem in features[4:]: - field_id,feat_id = elem.strip().split(':') + field_id, feat_id = elem.strip().split(':') if field_id not in self.all_field_id_dict: continue self.all_field_id_dict[field_id][0] = True index = self.all_field_id_dict[field_id][1] - output[index][1].append(int(feat_id)) - + output[index][1].append(int(feat_id)) + for field_id in self.all_field_id_dict: - visited,index = self.all_field_id_dict[field_id] + visited, index = self.all_field_id_dict[field_id] if visited: self.all_field_id_dict[field_id][0] = False else: - output[index][1].append(padding) + output[index][1].append(padding) output.append(('ctr', [ctr])) output.append(('cvr', [cvr])) yield output + return reader diff --git a/models/multitask/esmm/esmm_reader.py b/models/multitask/esmm/esmm_reader.py index f7aef643cd3db5c01c1cc3a632a99fa3fa342658..3d663038eefb4971b466336601ba436ff884e580 100644 --- a/models/multitask/esmm/esmm_reader.py +++ b/models/multitask/esmm/esmm_reader.py @@ -11,21 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
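Both ESMM readers above keep the same field-slot bookkeeping: every known field id owns one output slot, tokens arrive as `field_id:feat_id`, unknown fields are skipped, and fields absent from a line are padded with a single 0. A runnable sketch with an abbreviated field list:

```python
ALL_FIELD_ID = ['101', '109_14', '205']   # abbreviated list for the demo
field_dict = {f: [False, i] for i, f in enumerate(ALL_FIELD_ID)}


def parse(tokens, padding=0):
    output = [(field_id, []) for field_id in field_dict]
    for elem in tokens:
        field_id, feat_id = elem.split(':')
        if field_id not in field_dict:
            continue                       # e.g. field '999' below
        field_dict[field_id][0] = True     # mark the field as visited
        output[field_dict[field_id][1]][1].append(int(feat_id))
    for field_id, (visited, index) in field_dict.items():
        if visited:
            field_dict[field_id][0] = False   # reset for the next line
        else:
            output[index][1].append(padding)  # pad fields never seen
    return output


print(parse(["101:7", "205:42", "999:1"]))
# [('101', [7]), ('109_14', [0]), ('205', [42])]
```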
+ from __future__ import print_function -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs from collections import defaultdict -import numpy as np + +from paddlerec.core.reader import Reader class TrainReader(Reader): def init(self): - all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128', '129', + all_field_id = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128', + '129', '205', '206', '207', '210', '216', '508', '509', '702', '853', '301'] self.all_field_id_dict = defaultdict(int) - for i,field_id in enumerate(all_field_id): - self.all_field_id_dict[field_id] = [False,i] + for i, field_id in enumerate(all_field_id): + self.all_field_id_dict[field_id] = [False, i] def generate_sample(self, line): """ @@ -37,30 +38,31 @@ class TrainReader(Reader): This function needs to be implemented by the user, based on data format """ features = line.strip().split(',') - #ctr = list(map(int, features[1])) - #cvr = list(map(int, features[2])) + # ctr = list(map(int, features[1])) + # cvr = list(map(int, features[2])) ctr = int(features[1]) cvr = int(features[2]) - + padding = 0 - output = [(field_id,[]) for field_id in self.all_field_id_dict] + output = [(field_id, []) for field_id in self.all_field_id_dict] for elem in features[4:]: - field_id,feat_id = elem.strip().split(':') + field_id, feat_id = elem.strip().split(':') if field_id not in self.all_field_id_dict: continue self.all_field_id_dict[field_id][0] = True index = self.all_field_id_dict[field_id][1] - #feat_id = list(map(int, feat_id)) - output[index][1].append(int(feat_id)) - + # feat_id = list(map(int, feat_id)) + output[index][1].append(int(feat_id)) + for field_id in self.all_field_id_dict: - visited,index = self.all_field_id_dict[field_id] + visited, index = self.all_field_id_dict[field_id] if visited: self.all_field_id_dict[field_id][0] = False else: - output[index][1].append(padding) + output[index][1].append(padding) output.append(('ctr', [ctr])) output.append(('cvr', [cvr])) yield output + return reader diff --git a/models/multitask/esmm/model.py b/models/multitask/esmm/model.py index 1641f72d1e4eab39cfe7ce1aa5055c25d139fb16..8a8a203a87504cff310c0a799df40e937e2bbde8 100644 --- a/models/multitask/esmm/model.py +++ b/models/multitask/esmm/model.py @@ -12,83 +12,84 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import math +import numpy as np import paddle.fluid as fluid from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase -import numpy as np class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) - def fc(self,tag, data, out_dim, active='prelu'): - + def fc(self, tag, data, out_dim, active='prelu'): + init_stddev = 1.0 - scales = 1.0 / np.sqrt(data.shape[1]) - + scales = 1.0 / np.sqrt(data.shape[1]) + p_attr = fluid.param_attr.ParamAttr(name='%s_weight' % tag, - initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=init_stddev * scales)) - + initializer=fluid.initializer.NormalInitializer(loc=0.0, + scale=init_stddev * scales)) + b_attr = fluid.ParamAttr(name='%s_bias' % tag, initializer=fluid.initializer.Constant(0.1)) - + out = fluid.layers.fc(input=data, - size=out_dim, - act=active, - param_attr=p_attr, - bias_attr =b_attr, - name=tag) + size=out_dim, + act=active, + param_attr=p_attr, + bias_attr=b_attr, + name=tag) return out - + def input_data(self): sparse_input_ids = [ - fluid.data(name="field_" + str(i), shape=[-1, 1], dtype="int64", lod_level=1) for i in range(0,23) + fluid.data(name="field_" + str(i), shape=[-1, 1], dtype="int64", lod_level=1) for i in range(0, 23) ] label_ctr = fluid.data(name="ctr", shape=[-1, 1], dtype="int64") label_cvr = fluid.data(name="cvr", shape=[-1, 1], dtype="int64") inputs = sparse_input_ids + [label_ctr] + [label_cvr] self._data_var.extend(inputs) - + return inputs - + def net(self, inputs, is_infer=False): - + vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace) embed_size = envs.get_global_env("hyper_parameters.embed_size", None, self._namespace) emb = [] for data in inputs[0:-2]: feat_emb = fluid.embedding(input=data, - size=[vocab_size, embed_size], - param_attr=fluid.ParamAttr(name='dis_emb', - learning_rate=5, - initializer=fluid.initializer.Xavier(fan_in=embed_size,fan_out=embed_size) - ), - is_sparse=True) - field_emb = fluid.layers.sequence_pool(input=feat_emb,pool_type='sum') + size=[vocab_size, embed_size], + param_attr=fluid.ParamAttr(name='dis_emb', + learning_rate=5, + initializer=fluid.initializer.Xavier( + fan_in=embed_size, fan_out=embed_size) + ), + is_sparse=True) + field_emb = fluid.layers.sequence_pool(input=feat_emb, pool_type='sum') emb.append(field_emb) concat_emb = fluid.layers.concat(emb, axis=1) - + # ctr active = 'relu' ctr_fc1 = self.fc('ctr_fc1', concat_emb, 200, active) ctr_fc2 = self.fc('ctr_fc2', ctr_fc1, 80, active) ctr_out = self.fc('ctr_out', ctr_fc2, 2, 'softmax') - + # cvr cvr_fc1 = self.fc('cvr_fc1', concat_emb, 200, active) cvr_fc2 = self.fc('cvr_fc2', cvr_fc1, 80, active) - cvr_out = self.fc('cvr_out', cvr_fc2, 2,'softmax') - + cvr_out = self.fc('cvr_out', cvr_fc2, 2, 'softmax') + ctr_clk = inputs[-2] ctcvr_buy = inputs[-1] - + ctr_prop_one = fluid.layers.slice(ctr_out, axes=[1], starts=[1], ends=[2]) cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2]) - + ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one) - ctcvr_prop = fluid.layers.concat(input=[1-ctcvr_prop_one,ctcvr_prop_one], axis = 1) + ctcvr_prop = fluid.layers.concat(input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis=1) auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk) auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy) @@ -98,27 +99,23 @@ class Model(ModelBase): self._infer_results["AUC_ctcvr"] = auc_ctcvr return - 
loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk) loss_ctcvr = fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy) cost = loss_ctr + loss_ctcvr avg_cost = fluid.layers.mean(cost) - self._cost = avg_cost self._metrics["AUC_ctr"] = auc_ctr self._metrics["BATCH_AUC_ctr"] = batch_auc_ctr self._metrics["AUC_ctcvr"] = auc_ctcvr self._metrics["BATCH_AUC_ctcvr"] = batch_auc_ctcvr - def train_net(self): input_data = self.input_data() self.net(input_data) - def infer_net(self): self._infer_data_var = self.input_data() self._infer_data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) self.net(self._infer_data_var, is_infer=True) diff --git a/models/multitask/mmoe/census_infer_reader.py b/models/multitask/mmoe/census_infer_reader.py index c25ccea8c4416bbfe64d3cdda59f5ee13b0cfac1..c62de8e69ce6ccfbb4df1e1252d9630a84fc56b3 100644 --- a/models/multitask/mmoe/census_infer_reader.py +++ b/models/multitask/mmoe/census_infer_reader.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs -import numpy as np class EvaluateReader(Reader): diff --git a/models/multitask/mmoe/census_reader.py b/models/multitask/mmoe/census_reader.py index 30997c632302c2a6e963099c9e2679978c9d9b61..211e566882e5d8a7f50f22b0a1628307777099c8 100644 --- a/models/multitask/mmoe/census_reader.py +++ b/models/multitask/mmoe/census_reader.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs -import numpy as np class TrainReader(Reader): @@ -44,8 +43,8 @@ class TrainReader(Reader): label_marital = [1, 0] elif int(l[0]) == 1: label_marital = [0, 1] - #label_income = np.array(label_income) - #label_marital = np.array(label_marital) + # label_income = np.array(label_income) + # label_marital = np.array(label_marital) feature_name = ["input", "label_income", "label_marital"] yield zip(feature_name, [data] + [label_income] + [label_marital]) diff --git a/models/multitask/mmoe/model.py b/models/multitask/mmoe/model.py index 753bacae77f089dc709e34812ef109ac06aebcfb..525e9d5cc0086757901262253cf0f23ee72f314c 100644 --- a/models/multitask/mmoe/model.py +++ b/models/multitask/mmoe/model.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
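A NumPy sketch of the probability decomposition the ESMM `net()` above implements: the CVR tower is never supervised directly; instead `pCTCVR = pCTR * pCVR` is trained against the purchase label, so both losses live on the full exposure space. The sketch is written as binary cross-entropy, equivalent to the two-column `fluid.layers.cross_entropy` calls in the diff:

```python
import numpy as np


def esmm_loss(ctr_softmax, cvr_softmax, click, buy, eps=1e-15):
    p_ctr = ctr_softmax[:, 1]        # the slice(..., starts=[1], ends=[2]) above
    p_cvr = cvr_softmax[:, 1]
    p_ctcvr = p_ctr * p_cvr          # elementwise_mul in the graph
    loss_ctr = -(click * np.log(p_ctr + eps)
                 + (1 - click) * np.log(1 - p_ctr + eps))
    loss_ctcvr = -(buy * np.log(p_ctcvr + eps)
                   + (1 - buy) * np.log(1 - p_ctcvr + eps))
    return (loss_ctr + loss_ctcvr).mean()


ctr = np.array([[0.2, 0.8], [0.9, 0.1]])   # softmax outputs, col 1 = positive
cvr = np.array([[0.5, 0.5], [0.7, 0.3]])
print(esmm_loss(ctr, cvr, click=np.array([1, 0]), buy=np.array([1, 0])))
```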
-import math import paddle.fluid as fluid from paddlerec.core.utils import envs @@ -37,22 +36,21 @@ class Model(ModelBase): if is_infer: self._infer_data_var = [input_data, label_income, label_marital] self._infer_data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) - + feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + self._data_var.extend([input_data, label_income, label_marital]) # f_{i}(x) = activation(W_{i} * x + b), where activation is ReLU according to the paper expert_outputs = [] for i in range(0, expert_num): expert_output = fluid.layers.fc(input=input_data, - size=expert_size, - act='relu', - bias_attr=fluid.ParamAttr(learning_rate=1.0), - name='expert_' + str(i)) + size=expert_size, + act='relu', + bias_attr=fluid.ParamAttr(learning_rate=1.0), + name='expert_' + str(i)) expert_outputs.append(expert_output) expert_concat = fluid.layers.concat(expert_outputs, axis=1) - expert_concat = fluid.layers.reshape(expert_concat,[-1, expert_num, expert_size]) - - + expert_concat = fluid.layers.reshape(expert_concat, [-1, expert_num, expert_size]) + # g^{k}(x) = activation(W_{gk} * x + b), where activation is softmax according to the paper output_layers = [] for i in range(0, gate_num): @@ -62,52 +60,53 @@ class Model(ModelBase): bias_attr=fluid.ParamAttr(learning_rate=1.0), name='gate_' + str(i)) # f^{k}(x) = sum_{i=1}^{n}(g^{k}(x)_{i} * f_{i}(x)) - cur_gate_expert = fluid.layers.elementwise_mul(expert_concat, cur_gate, axis=0) + cur_gate_expert = fluid.layers.elementwise_mul(expert_concat, cur_gate, axis=0) cur_gate_expert = fluid.layers.reduce_sum(cur_gate_expert, dim=1) # Build tower layer - cur_tower = fluid.layers.fc(input=cur_gate_expert, - size=tower_size, - act='relu', - name='task_layer_' + str(i)) - out = fluid.layers.fc(input=cur_tower, - size=2, - act='softmax', - name='out_' + str(i)) - + cur_tower = fluid.layers.fc(input=cur_gate_expert, + size=tower_size, + act='relu', + name='task_layer_' + str(i)) + out = fluid.layers.fc(input=cur_tower, + size=2, + act='softmax', + name='out_' + str(i)) + output_layers.append(out) pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15) pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15) - label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2]) label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2]) - - auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income, label=fluid.layers.cast(x=label_income_1, dtype='int64')) - auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital, label=fluid.layers.cast(x=label_marital_1, dtype='int64')) + + auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income, + label=fluid.layers.cast(x=label_income_1, + dtype='int64')) + auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital, + label=fluid.layers.cast(x=label_marital_1, + dtype='int64')) if is_infer: self._infer_results["AUC_income"] = auc_income self._infer_results["AUC_marital"] = auc_marital return - cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income,soft_label = True) - cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital,soft_label = True) - + cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True) + cost_marital = 
fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True) + avg_cost_income = fluid.layers.mean(x=cost_income) avg_cost_marital = fluid.layers.mean(x=cost_marital) - - cost = avg_cost_income + avg_cost_marital - + + cost = avg_cost_income + avg_cost_marital + self._cost = cost self._metrics["AUC_income"] = auc_income self._metrics["BATCH_AUC_income"] = batch_auc_1 self._metrics["AUC_marital"] = auc_marital self._metrics["BATCH_AUC_marital"] = batch_auc_2 - def train_net(self): self.MMOE() - def infer_net(self): self.MMOE(is_infer=True) diff --git a/models/multitask/readme.md b/models/multitask/readme.md index 3f31035351747ada8bcee3652f709c795a86e98b..d234f42f146e18bf254e518db0e78acc1e1d3e10 100755 --- a/models/multitask/readme.md +++ b/models/multitask/readme.md @@ -9,8 +9,7 @@ * [整体介绍](#整体介绍) * [多任务模型列表](#多任务模型列表) * [使用教程](#使用教程) - * [训练](#训练) - * [预测](#预测) + * [训练&预测](#训练&预测) * [效果对比](#效果对比) * [模型效果列表](#模型效果列表) @@ -42,16 +41,17 @@
## 使用教程 -### 训练 +### 训练&预测 ```shell -python -m paddlerec.run -m config.yaml -d cpu -e single +python -m paddlerec.run -m paddlerec.models.multitask.mmoe # mmoe +python -m paddlerec.run -m paddlerec.models.multitask.share-bottom # share-bottom +python -m paddlerec.run -m paddlerec.models.multitask.esmm # esmm ``` -### 预测 ## 效果对比 ### 模型效果列表 -| 数据集 | 模型 | loss | 评价指标 | +| 数据集 | 模型 | loss | auc | | :------------------: | :--------------------: | :---------: |:---------: | | Census-income Data | Share-Bottom | -- | 0.93120/0.99256 | | Census-income Data | MMoE | -- | 0.94465/0.99324 | diff --git a/models/multitask/share-bottom/census_infer_reader.py b/models/multitask/share-bottom/census_infer_reader.py index c25ccea8c4416bbfe64d3cdda59f5ee13b0cfac1..c62de8e69ce6ccfbb4df1e1252d9630a84fc56b3 100644 --- a/models/multitask/share-bottom/census_infer_reader.py +++ b/models/multitask/share-bottom/census_infer_reader.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs -import numpy as np class EvaluateReader(Reader): diff --git a/models/multitask/share-bottom/census_reader.py b/models/multitask/share-bottom/census_reader.py index 30997c632302c2a6e963099c9e2679978c9d9b61..211e566882e5d8a7f50f22b0a1628307777099c8 100644 --- a/models/multitask/share-bottom/census_reader.py +++ b/models/multitask/share-bottom/census_reader.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs -import numpy as np class TrainReader(Reader): @@ -44,8 +43,8 @@ class TrainReader(Reader): label_marital = [1, 0] elif int(l[0]) == 1: label_marital = [0, 1] - #label_income = np.array(label_income) - #label_marital = np.array(label_marital) + # label_income = np.array(label_income) + # label_marital = np.array(label_marital) feature_name = ["input", "label_income", "label_marital"] yield zip(feature_name, [data] + [label_income] + [label_marital]) diff --git a/models/multitask/share-bottom/model.py b/models/multitask/share-bottom/model.py index a91729953421704986725ccd3540fa2fc29e27e4..d570ba77067985b518247c8f6bba16a6431e1f9c 100644 --- a/models/multitask/share-bottom/model.py +++ b/models/multitask/share-bottom/model.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
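MMoE's trunk, shown earlier in this diff, mixes expert outputs with one softmax gate per task; the share-bottom model whose diff begins above instead shares a single FC trunk before the two towers. A NumPy sketch of the gate mixing MMoE adds:

```python
import numpy as np


def mmoe_mix(expert_concat, cur_gate):
    # expert_concat: (batch, expert_num, expert_size), as after the
    # reshape in mmoe/model.py; cur_gate: (batch, expert_num) softmax rows
    mixed = expert_concat * cur_gate[:, :, None]   # elementwise_mul, axis=0
    return mixed.sum(axis=1)                       # reduce_sum over experts


experts = np.random.rand(2, 8, 4)      # batch=2, expert_num=8, expert_size=4
gate = np.full((2, 8), 1.0 / 8)        # a uniform gate, just for the demo
print(mmoe_mix(experts, gate).shape)   # (2, 4) -> fed to that task's tower
```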
-import math import paddle.fluid as fluid from paddlerec.core.utils import envs @@ -33,65 +32,65 @@ class Model(ModelBase): input_data = fluid.data(name="input", shape=[-1, feature_size], dtype="float32") label_income = fluid.data(name="label_income", shape=[-1, 2], dtype="float32", lod_level=0) label_marital = fluid.data(name="label_marital", shape=[-1, 2], dtype="float32", lod_level=0) - + if is_infer: self._infer_data_var = [input_data, label_income, label_marital] self._infer_data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) self._data_var.extend([input_data, label_income, label_marital]) bottom_output = fluid.layers.fc(input=input_data, - size=bottom_size, - act='relu', - bias_attr=fluid.ParamAttr(learning_rate=1.0), - name='bottom_output') - - + size=bottom_size, + act='relu', + bias_attr=fluid.ParamAttr(learning_rate=1.0), + name='bottom_output') + # Build tower layer from bottom layer output_layers = [] - for index in range(tower_nums): + for index in range(tower_nums): tower_layer = fluid.layers.fc(input=bottom_output, - size=tower_size, - act='relu', - name='task_layer_' + str(index)) + size=tower_size, + act='relu', + name='task_layer_' + str(index)) output_layer = fluid.layers.fc(input=tower_layer, - size=2, - act='softmax', - name='output_layer_' + str(index)) + size=2, + act='softmax', + name='output_layer_' + str(index)) output_layers.append(output_layer) - pred_income = fluid.layers.clip(output_layers[0], min=1e-15, max=1.0 - 1e-15) pred_marital = fluid.layers.clip(output_layers[1], min=1e-15, max=1.0 - 1e-15) label_income_1 = fluid.layers.slice(label_income, axes=[1], starts=[1], ends=[2]) label_marital_1 = fluid.layers.slice(label_marital, axes=[1], starts=[1], ends=[2]) - - auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income, label=fluid.layers.cast(x=label_income_1, dtype='int64')) - auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital, label=fluid.layers.cast(x=label_marital_1, dtype='int64')) + + auc_income, batch_auc_1, auc_states_1 = fluid.layers.auc(input=pred_income, + label=fluid.layers.cast(x=label_income_1, + dtype='int64')) + auc_marital, batch_auc_2, auc_states_2 = fluid.layers.auc(input=pred_marital, + label=fluid.layers.cast(x=label_marital_1, + dtype='int64')) if is_infer: self._infer_results["AUC_income"] = auc_income self._infer_results["AUC_marital"] = auc_marital return - cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income,soft_label = True) - cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital,soft_label = True) + cost_income = fluid.layers.cross_entropy(input=pred_income, label=label_income, soft_label=True) + cost_marital = fluid.layers.cross_entropy(input=pred_marital, label=label_marital, soft_label=True) cost = fluid.layers.elementwise_add(cost_income, cost_marital, axis=1) - - avg_cost = fluid.layers.mean(x=cost) - + + avg_cost = fluid.layers.mean(x=cost) + self._cost = avg_cost self._metrics["AUC_income"] = auc_income self._metrics["BATCH_AUC_income"] = batch_auc_1 self._metrics["AUC_marital"] = auc_marital self._metrics["BATCH_AUC_marital"] = batch_auc_2 - def train_net(self): self.model() - def infer_net(self): self.model(is_infer=True) diff --git a/models/rank/criteo_reader.py b/models/rank/criteo_reader.py new file mode 100755 index 
0000000000000000000000000000000000000000..75994fb43f6ee3a72ab2aae25c36e0591c530fee --- /dev/null +++ b/models/rank/criteo_reader.py @@ -0,0 +1,61 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from paddlerec.core.reader import Reader +from paddlerec.core.utils import envs + + +class TrainReader(Reader): + def init(self): + self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + self.cont_max_ = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50] + self.cont_diff_ = [20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50] + self.hash_dim_ = envs.get_global_env("hyper_parameters.sparse_feature_number", None, "train.model") + self.continuous_range_ = range(1, 14) + self.categorical_range_ = range(14, 40) + + def generate_sample(self, line): + """ + Read the data line by line and process it as a dictionary + """ + + def reader(): + """ + This function needs to be implemented by the user, based on data format + """ + features = line.rstrip('\n').split('\t') + + dense_feature = [] + sparse_feature = [] + for idx in self.continuous_range_: + if features[idx] == "": + dense_feature.append(0.0) + else: + dense_feature.append( + (float(features[idx]) - self.cont_min_[idx - 1]) / + self.cont_diff_[idx - 1]) + + for idx in self.categorical_range_: + sparse_feature.append( + [hash(str(idx) + features[idx]) % self.hash_dim_]) + label = [int(features[0])] + feature_name = ["D"] + for idx in self.categorical_range_: + feature_name.append("S" + str(idx - 13)) + feature_name.append("label") + yield zip(feature_name, [dense_feature] + sparse_feature + [label]) + + return reader diff --git a/models/rank/dcn/data/get_slot_data.py b/models/rank/dcn/data/get_slot_data.py index 131bff2afe548cdb8bfb942b6c609a837fe8ab6d..77b30296ab9ba99757039f053c6133fd175c2811 100755 --- a/models/rank/dcn/data/get_slot_data.py +++ b/models/rank/dcn/data/get_slot_data.py @@ -16,6 +16,8 @@ import sys import yaml from paddlerec.core.reader import Reader from paddlerec.core.utils import envs +import math +import os try: import cPickle as pickle except ImportError: @@ -55,7 +57,7 @@ class TrainReader(dg.MultiSlotDataGenerator): self.label_feat_names = target + dense_feat_names + sparse_feat_names self.cat_feat_idx_dict_list = [{} for _ in range(26)] - + # TODO: set vocabulary dictionary vocab_dir = "./vocab/" for i in range(26): @@ -63,7 +65,7 @@ class TrainReader(dg.MultiSlotDataGenerator): for line in open( os.path.join(vocab_dir, 'C' + str(i + 1) + '.txt')): self.cat_feat_idx_dict_list[i][line.strip()] = lookup_idx - lookup_idx += 1 + lookup_idx += 1 def _process_line(self, line): features = line.rstrip('\n').split('\t') @@ -81,18 +83,19 @@ class TrainReader(dg.MultiSlotDataGenerator): if idx == 2 else math.log(1 + float(features[idx]))) for idx in self.cat_idx_: if features[idx] == '' or features[ - idx] not in self.cat_feat_idx_dict_list[idx - 14]: + idx] not in self.cat_feat_idx_dict_list[idx - 14]: 
label_feat_list[idx].append(0) else: label_feat_list[idx].append(self.cat_feat_idx_dict_list[ - idx - 14][features[idx]]) + idx - 14][features[idx]]) label_feat_list[0].append(int(features[0])) return label_feat_list - + def generate_sample(self, line): """ Read the data line by line and process it as a dictionary """ + def data_iter(): label_feat_list = self._process_line(line) s = "" diff --git a/models/rank/dcn/model.py b/models/rank/dcn/model.py index 053e74e96eb1ef22376d7f7272e66d31295e0728..930646428714d1f77df9e6539dc4db231d8fc0a3 100755 --- a/models/rank/dcn/model.py +++ b/models/rank/dcn/model.py @@ -1,14 +1,29 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + import paddle.fluid as fluid -import math from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase -from collections import OrderedDict + class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) - + def init_network(self): self.cross_num = envs.get_global_env("hyper_parameters.cross_num", None, self._namespace) self.dnn_hidden_units = envs.get_global_env("hyper_parameters.dnn_hidden_units", None, self._namespace) @@ -64,7 +79,7 @@ class Model(ModelBase): net_input = fluid.layers.concat([dense_input, sparse_input], axis=-1) return net_input - + def _deep_net(self, input, hidden_units, use_bn=False, is_test=False): for units in hidden_units: input = fluid.layers.fc(input=input, size=units) @@ -81,7 +96,7 @@ class Model(ModelBase): [input_dim], dtype='float32', name=prefix + "_b") xw = fluid.layers.reduce_sum(x * w, dim=1, keep_dim=True) # (N, 1) return x0 * xw + b + x, w - + def _cross_net(self, input, num_corss_layers): x = x0 = input l2_reg_cross_list = [] @@ -92,10 +107,10 @@ class Model(ModelBase): fluid.layers.concat( l2_reg_cross_list, axis=-1)) return x, l2_reg_cross_loss - + def _l2_loss(self, w): return fluid.layers.reduce_sum(fluid.layers.square(w)) - + def train_net(self): self.init_network() @@ -104,8 +119,8 @@ class Model(ModelBase): deep_out = self._deep_net(self.net_input, self.dnn_hidden_units, self.dnn_use_bn, False) cross_out, l2_reg_cross_loss = self._cross_net(self.net_input, - self.cross_num) - + self.cross_num) + last_out = fluid.layers.concat([deep_out, cross_out], axis=-1) logit = fluid.layers.fc(last_out, 1) diff --git a/models/rank/deepfm/data/get_slot_data.py b/models/rank/deepfm/data/get_slot_data.py index dcf441d9ea1afbae2d5c8e1c52a6cef4ff75ccdd..59dc33b0d7b6aa1b2087134c9952fc160ca6cd04 100755 --- a/models/rank/deepfm/data/get_slot_data.py +++ b/models/rank/deepfm/data/get_slot_data.py @@ -19,7 +19,6 @@ try: import cPickle as pickle except ImportError: import pickle - class TrainReader(dg.MultiSlotDataGenerator): def __init__(self, config): @@ -69,11 +68,12 @@ class TrainReader(dg.MultiSlotDataGenerator): feat_value.append(1.0) label = [int(features[0])] return feat_idx, feat_value, label - + def generate_sample(self, line): """ Read the 
data line by line and process it as a dictionary """ + def data_iter(): feat_idx, feat_value, label = self._process_line(line) s = "" diff --git a/models/rank/deepfm/model.py b/models/rank/deepfm/model.py index 778fb58ed16b9af13ca7b13621e1cd17596dc1ad..3addd00025e1f821e757926fd4f4592b7a824e18 100755 --- a/models/rank/deepfm/model.py +++ b/models/rank/deepfm/model.py @@ -1,6 +1,21 @@ -import paddle.fluid as fluid +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math +import paddle.fluid as fluid + from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase @@ -14,9 +29,9 @@ class Model(ModelBase): is_distributed = True if envs.get_trainer() == "CtrTrainer" else False sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace) sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) - + # ------------------------- network input -------------------------- - + num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace) raw_feat_idx = self._sparse_data_var[1] @@ -26,8 +41,6 @@ class Model(ModelBase): feat_idx = raw_feat_idx feat_value = fluid.layers.reshape(raw_feat_value, [-1, num_field, 1]) # None * num_field * 1 - #------------------------- first order term -------------------------- - reg = envs.get_global_env("hyper_parameters.reg", 1e-4, self._namespace) first_weights_re = fluid.embedding( input=feat_idx, @@ -44,7 +57,7 @@ class Model(ModelBase): first_weights_re, shape=[-1, num_field, 1]) # None * num_field * 1 y_first_order = fluid.layers.reduce_sum((first_weights * feat_value), 1) - #------------------------- second order term -------------------------- + # ------------------------- second order term -------------------------- feat_embeddings_re = fluid.embedding( input=feat_idx, @@ -59,12 +72,12 @@ class Model(ModelBase): feat_embeddings = fluid.layers.reshape( feat_embeddings_re, shape=[-1, num_field, - sparse_feature_dim]) # None * num_field * embedding_size + sparse_feature_dim]) # None * num_field * embedding_size feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size - + # sum_square part summed_features_emb = fluid.layers.reduce_sum(feat_embeddings, - 1) # None * embedding_size + 1) # None * embedding_size summed_features_emb_square = fluid.layers.square( summed_features_emb) # None * embedding_size @@ -78,13 +91,12 @@ class Model(ModelBase): summed_features_emb_square - squared_sum_features_emb, 1, keep_dim=True) # None * 1 - - #------------------------- DNN -------------------------- + # ------------------------- DNN -------------------------- layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", None, self._namespace) act = envs.get_global_env("hyper_parameters.act", None, self._namespace) y_dnn = fluid.layers.reshape(feat_embeddings, - [-1, num_field * sparse_feature_dim]) + [-1, num_field * sparse_feature_dim]) for s in 
layer_sizes: y_dnn = fluid.layers.fc( input=y_dnn, @@ -106,28 +118,28 @@ class Model(ModelBase): bias_attr=fluid.ParamAttr( initializer=fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=init_value_))) - - #------------------------- DeepFM -------------------------- + + # ------------------------- DeepFM -------------------------- self.predict = fluid.layers.sigmoid(y_first_order + y_second_order + y_dnn) - + def train_net(self): self.deepfm_net() - - #------------------------- Cost(logloss) -------------------------- + + # ------------------------- Cost(logloss) -------------------------- cost = fluid.layers.log_loss(input=self.predict, label=fluid.layers.cast(self.label, "float32")) avg_cost = fluid.layers.reduce_sum(cost) - + self._cost = avg_cost - #------------------------- Metric(Auc) -------------------------- - + # ------------------------- Metric(Auc) -------------------------- + predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1) label_int = fluid.layers.cast(self.label, 'int64') auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d, - label=label_int, - slide_steps=0) + label=label_int, + slide_steps=0) self._metrics["AUC"] = auc_var self._metrics["BATCH_AUC"] = batch_auc_var diff --git a/models/rank/din/model.py b/models/rank/din/model.py index 68585c8e512005aa942d088baeaa7be7b91c0cd4..2abc658b6d5cb58aaff222e1121d2c4282bcd65f 100755 --- a/models/rank/din/model.py +++ b/models/rank/din/model.py @@ -1,5 +1,18 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle.fluid as fluid -import math from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase diff --git a/models/rank/din/reader.py b/models/rank/din/reader.py index d1742bddd8bbb5b98bdecaad0f43f438701cc480..39ed690fdc6fd35d50ebdcb46b5becc5ae399b62 100755 --- a/models/rank/din/reader.py +++ b/models/rank/din/reader.py @@ -13,24 +13,28 @@ # limitations under the License. 
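For reference, the DeepFM second-order term in the hunk above uses the FM sum-square identity, 0.5 * ((Σᵢ eᵢ)² − Σᵢ eᵢ²), to get all pairwise feature interactions in O(num_field · emb_dim). A minimal NumPy sketch of that step (illustrative only; the function name and shapes are assumptions, not part of this patch):

```python
import numpy as np

def fm_second_order(feat_embeddings):
    """FM second-order term via the sum-square identity.

    feat_embeddings: (batch, num_field, emb_dim), already multiplied by the
    feature values, mirroring feat_embeddings in deepfm_net above.
    """
    summed = feat_embeddings.sum(axis=1)                 # (batch, emb_dim)
    summed_square = summed ** 2                          # square-of-sum
    square_summed = (feat_embeddings ** 2).sum(axis=1)   # sum-of-squares
    # reduce over emb_dim with keep_dim, matching y_second_order: (batch, 1)
    return 0.5 * (summed_square - square_summed).sum(axis=1, keepdims=True)
```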
from __future__ import print_function -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs -import numpy as np import os import random + try: import cPickle as pickle except ImportError: import pickle +import numpy as np + +from paddlerec.core.reader import Reader +from paddlerec.core.utils import envs + + class TrainReader(Reader): def init(self): self.train_data_path = envs.get_global_env("train_data_path", None, "train.reader") self.res = [] self.max_len = 0 - + data_file_list = os.listdir(self.train_data_path) - for i in range(0, len(data_file_list)): + for i in range(0, len(data_file_list)): train_data_file = os.path.join(self.train_data_path, data_file_list[i]) with open(train_data_file, "r") as fin: for line in fin: @@ -43,9 +47,6 @@ class TrainReader(Reader): self.batch_size = envs.get_global_env("batch_size", 32, "train.reader") self.group_size = self.batch_size * 20 - - - def _process_line(self, line): line = line.strip().split(';') hist = line[0].split() @@ -54,22 +55,22 @@ class TrainReader(Reader): cate = [int(i) for i in cate] return [hist, cate, [int(line[2])], [int(line[3])], [float(line[4])]] - def generate_sample(self, line): """ Read the data line by line and process it as a dictionary """ + def data_iter(): - #feat_idx, feat_value, label = self._process_line(line) + # feat_idx, feat_value, label = self._process_line(line) yield self._process_line(line) return data_iter - + def pad_batch_data(self, input, max_len): res = np.array([x + [0] * (max_len - len(x)) for x in input]) res = res.astype("int64").reshape([-1, max_len]) return res - + def make_data(self, b): max_len = max(len(x[0]) for x in b) item = self.pad_batch_data([x[0] for x in b], max_len) @@ -77,7 +78,7 @@ class TrainReader(Reader): len_array = [len(x[0]) for x in b] mask = np.array( [[0] * x + [-1e9] * (max_len - x) for x in len_array]).reshape( - [-1, max_len, 1]) + [-1, max_len, 1]) target_item_seq = np.array( [[x[2]] * max_len for x in b]).astype("int64").reshape([-1, max_len]) target_cat_seq = np.array( @@ -89,7 +90,7 @@ class TrainReader(Reader): target_item_seq[i], target_cat_seq[i] ]) return res - + def batch_reader(self, reader, batch_size, group_size): def batch_reader(): bg = [] @@ -111,7 +112,7 @@ class TrainReader(Reader): yield self.make_data(b) return batch_reader - + def base_read(self, file_dir): res = [] for train_file in file_dir: @@ -122,10 +123,8 @@ class TrainReader(Reader): cate = line[1].split() res.append([hist, cate, line[2], line[3], float(line[4])]) return res - + def generate_batch_from_trainfiles(self, files): data_set = self.base_read(files) random.shuffle(data_set) return self.batch_reader(data_set, self.batch_size, self.batch_size * 20) - - \ No newline at end of file diff --git a/models/rank/dnn/config.yaml b/models/rank/dnn/config.yaml index df01841ddeeceb58f6be6d8ffecda7e11cd93a4a..16e0be0b8fb5535d495e9278879081b77b127fa0 100755 --- a/models/rank/dnn/config.yaml +++ b/models/rank/dnn/config.yaml @@ -13,13 +13,14 @@ # limitations under the License. 
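As context for the DIN reader above: make_data right-pads each behavior sequence with zeros and builds an additive attention mask in which padded positions get -1e9, so a later softmax assigns them near-zero weight. A standalone sketch of that logic (NumPy; the helper name is mine):

```python
import numpy as np

def pad_with_mask(seqs):
    """Right-pad id sequences and build the additive attention mask."""
    max_len = max(len(s) for s in seqs)
    padded = np.array([s + [0] * (max_len - len(s)) for s in seqs],
                      dtype="int64").reshape([-1, max_len])
    mask = np.array([[0.0] * len(s) + [-1e9] * (max_len - len(s)) for s in seqs],
                    dtype="float32").reshape([-1, max_len, 1])
    return padded, mask

# pad_with_mask([[3, 7, 2], [5]]) pads both rows to length 3; the second
# row's mask is [0, -1e9, -1e9], silencing its two padded positions.
```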
train: + epochs: 10 + engine: single + workspace: "paddlerec.models.rank.dnn" + trainer: # for cluster training strategy: "async" - epochs: 10 - workspace: "paddlerec.models.rank.dnn" - reader: batch_size: 2 train_data_path: "{workspace}/data/slot_train_data" diff --git a/models/rank/dnn/model.py b/models/rank/dnn/model.py index 43f6cbb653e244f5cf455b5fa896acc4c546adfc..85986ae4742b89c3bf42a8331720b86cd993d228 100755 --- a/models/rank/dnn/model.py +++ b/models/rank/dnn/model.py @@ -13,6 +13,7 @@ # limitations under the License. import math + import paddle.fluid as fluid from paddlerec.core.utils import envs diff --git a/models/rank/readme.md b/models/rank/readme.md index 326fb481356982dfb2acccaba670c072363bdb76..ac75c89fdd6b2d734aec8c8e8408f9036753d4ec 100755 --- a/models/rank/readme.md +++ b/models/rank/readme.md @@ -58,8 +58,15 @@ ## 使用教程 ### 数据处理 +参考每个模型目录数据下载&预处理脚本 ### 训练 +``` +python -m paddlerec.run -m paddlerec.models.rank.dnn # 以DNN为例 +``` ### 预测 +``` +python -m paddlerec.run -m paddlerec.models.rank.dnn # 以DNN为例 +``` ## 效果对比 ### 模型效果 (测试) @@ -69,7 +76,7 @@ | Criteo | DNN | -- | 0.79395 | -- | -- | | Criteo | DeepFM | 0.44797 | 0.80460 | -- | -- | | Criteo | DCN | 0.44704 | 0.80654 | -- | -- | -| Criteo | xDeepFM | -- | -- | 0.48657 | -- | +| Criteo | xDeepFM | 0.48657 | -- | -- | -- | | Census-income Data | Wide&Deep | 0.76195 | 0.90577 | -- | -- | | Amazon Product | DIN | 0.47005 | 0.86379 | -- | -- | diff --git a/models/rank/wide_deep/data/get_slot_data.py b/models/rank/wide_deep/data/get_slot_data.py index 6ccc75c0e64f4255688a6285709607f2445e67b7..b928ae1267113215aa2b71f8dccffddc0db048fb 100755 --- a/models/rank/wide_deep/data/get_slot_data.py +++ b/models/rank/wide_deep/data/get_slot_data.py @@ -37,14 +37,15 @@ class TrainReader(dg.MultiSlotDataGenerator): line = line.strip().split(',') features = list(map(float, line)) wide_feat = features[0:8] - deep_feat = features[8:58+8] - label = int(features[-1]) + deep_feat = features[8:58 + 8] + label = features[-1] return wide_feat, deep_feat, [label] - + def generate_sample(self, line): """ Read the data line by line and process it as a dictionary """ + def data_iter(): wide_feat, deep_deat, label = self._process_line(line) diff --git a/models/rank/wide_deep/model.py b/models/rank/wide_deep/model.py index 6243a2175e1af17a1b4249844079888665208ce8..d1901ce495e29a4a0c55a619ef76d3c0d59cdbca 100755 --- a/models/rank/wide_deep/model.py +++ b/models/rank/wide_deep/model.py @@ -1,6 +1,21 @@ -import paddle.fluid as fluid +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
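The slices in _process_line above imply a record of 8 wide features, 58 deep features, and a trailing label. An illustrative parse (the sample line is hypothetical, not from the dataset):

```python
# Hypothetical record: 66 comma-separated feature columns plus a label.
line = ",".join(["0.5"] * 66) + ",1"

features = list(map(float, line.strip().split(',')))
wide_feat = features[0:8]        # 8 wide (linear-part) features
deep_feat = features[8:58 + 8]   # 58 deep-part features
label = [features[-1]]           # label; cast to int64 later in the model
```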
+ import math +import paddle.fluid as fluid + from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase @@ -8,32 +23,39 @@ from paddlerec.core.model import Model as ModelBase class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) - + def wide_part(self, data): out = fluid.layers.fc(input=data, - size=1, - param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1])), - regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)), - act=None, - name='wide') + size=1, + param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, + scale=1.0 / math.sqrt( + data.shape[ + 1])), + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-4)), + act=None, + name='wide') return out - + def fc(self, data, hidden_units, active, tag): output = fluid.layers.fc(input=data, - size=hidden_units, - param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0 / math.sqrt(data.shape[1]))), - act=active, - name=tag) - + size=hidden_units, + param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, + scale=1.0 / math.sqrt( + data.shape[ + 1]))), + act=active, + name=tag) + return output - + def deep_part(self, data, hidden1_units, hidden2_units, hidden3_units): l1 = self.fc(data, hidden1_units, 'relu', 'l1') l2 = self.fc(l1, hidden2_units, 'relu', 'l2') l3 = self.fc(l2, hidden3_units, 'relu', 'l3') - + return l3 - + def train_net(self): wide_input = self._dense_data_var[0] deep_input = self._dense_data_var[1] @@ -44,26 +66,28 @@ class Model(ModelBase): hidden3_units = envs.get_global_env("hyper_parameters.hidden3_units", 25, self._namespace) wide_output = self.wide_part(wide_input) deep_output = self.deep_part(deep_input, hidden1_units, hidden2_units, hidden3_units) - + wide_model = fluid.layers.fc(input=wide_output, - size=1, - param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)), - act=None, - name='w_wide') - + size=1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)), + act=None, + name='w_wide') + deep_model = fluid.layers.fc(input=deep_output, - size=1, - param_attr=fluid.ParamAttr(initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)), - act=None, - name='w_deep') - + size=1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=1.0)), + act=None, + name='w_deep') + prediction = fluid.layers.elementwise_add(wide_model, deep_model) pred = fluid.layers.sigmoid(fluid.layers.clip(prediction, min=-15.0, max=15.0), name="prediction") num_seqs = fluid.layers.create_tensor(dtype='int64') acc = fluid.layers.accuracy(input=pred, label=fluid.layers.cast(x=label, dtype='int64'), total=num_seqs) auc_var, batch_auc, auc_states = fluid.layers.auc(input=pred, label=fluid.layers.cast(x=label, dtype='int64')) - + self._metrics["AUC"] = auc_var self._metrics["BATCH_AUC"] = batch_auc self._metrics["ACC"] = acc diff --git a/models/rank/xdeepfm/data/get_slot_data.py b/models/rank/xdeepfm/data/get_slot_data.py index 2030f42902d4e478b8b4b77384fb05b420ad1283..d71444135c2198b426638bc4b2665ec053acb2aa 100755 --- a/models/rank/xdeepfm/data/get_slot_data.py +++ b/models/rank/xdeepfm/data/get_slot_data.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function + import yaml from paddlerec.core.reader import Reader from paddlerec.core.utils import envs @@ -32,7 +32,7 @@ class TrainReader(dg.MultiSlotDataGenerator): def init(self): pass - + def _process_line(self, line): features = line.strip('\n').split('\t') feat_idx = [] @@ -42,7 +42,7 @@ class TrainReader(dg.MultiSlotDataGenerator): feat_value.append(1.0) label = [int(features[0])] return feat_idx, feat_value, label - + def generate_sample(self, line): def data_iter(): feat_idx, feat_value, label = self._process_line(line) diff --git a/models/rank/xdeepfm/model.py b/models/rank/xdeepfm/model.py index 3e3100e6768e233d2cdca051962167967746984d..c83a6f5bf5e4bcfba9b1eeb816cb4e0d43f677df 100755 --- a/models/rank/xdeepfm/model.py +++ b/models/rank/xdeepfm/model.py @@ -1,5 +1,18 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle.fluid as fluid -import math from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase @@ -13,13 +26,13 @@ class Model(ModelBase): init_value_ = 0.1 initer = fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=init_value_) - + is_distributed = True if envs.get_trainer() == "CtrTrainer" else False sparse_feature_number = envs.get_global_env("hyper_parameters.sparse_feature_number", None, self._namespace) sparse_feature_dim = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) - + # ------------------------- network input -------------------------- - + num_field = envs.get_global_env("hyper_parameters.num_field", None, self._namespace) raw_feat_idx = self._sparse_data_var[1] raw_feat_value = self._dense_data_var[0] @@ -39,7 +52,7 @@ class Model(ModelBase): feat_embeddings, [-1, num_field, sparse_feature_dim]) # None * num_field * embedding_size feat_embeddings = feat_embeddings * feat_value # None * num_field * embedding_size - + # -------------------- linear -------------------- weights_linear = fluid.embedding( @@ -57,7 +70,7 @@ class Model(ModelBase): default_initializer=fluid.initializer.ConstantInitializer(value=0)) y_linear = fluid.layers.reduce_sum( (weights_linear * feat_value), 1) + b_linear - + # -------------------- CIN -------------------- layer_sizes_cin = envs.get_global_env("hyper_parameters.layer_sizes_cin", None, self._namespace) @@ -68,7 +81,7 @@ class Model(ModelBase): X_0 = fluid.layers.reshape( fluid.layers.transpose(Xs[0], [0, 2, 1]), [-1, sparse_feature_dim, num_field, - 1]) # None, embedding_size, num_field, 1 + 1]) # None, embedding_size, num_field, 1 X_k = fluid.layers.reshape( fluid.layers.transpose(Xs[-1], [0, 2, 1]), [-1, sparse_feature_dim, 1, last_s]) # None, embedding_size, 1, last_s @@ -114,7 +127,7 @@ class Model(ModelBase): layer_sizes_dnn = envs.get_global_env("hyper_parameters.layer_sizes_dnn", None, self._namespace) act = 
envs.get_global_env("hyper_parameters.act", None, self._namespace) y_dnn = fluid.layers.reshape(feat_embeddings, - [-1, num_field * sparse_feature_dim]) + [-1, num_field * sparse_feature_dim]) for s in layer_sizes_dnn: y_dnn = fluid.layers.fc(input=y_dnn, size=s, @@ -130,7 +143,7 @@ class Model(ModelBase): # ------------------- xDeepFM ------------------ self.predict = fluid.layers.sigmoid(y_linear + y_cin + y_dnn) - + def train_net(self): self.xdeepfm_net() @@ -142,11 +155,11 @@ class Model(ModelBase): predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1) label_int = fluid.layers.cast(self.label, 'int64') auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d, - label=label_int, - slide_steps=0) + label=label_int, + slide_steps=0) self._metrics["AUC"] = auc_var self._metrics["BATCH_AUC"] = batch_auc_var - + def optimizer(self): learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) optimizer = fluid.optimizer.Adam(learning_rate, lazy_mode=True) diff --git a/models/recall/gnn/data_process.sh b/models/recall/gnn/data_process.sh index 9aa009f03cf2595ea0cb54e691800486f26a21bf..38877b6906ecd65ef190aae5f1dcf5a74cece6d0 100755 --- a/models/recall/gnn/data_process.sh +++ b/models/recall/gnn/data_process.sh @@ -1,5 +1,19 @@ #! /bin/bash +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -e echo "begin to download data" diff --git a/models/recall/gnn/evaluate_reader.py b/models/recall/gnn/evaluate_reader.py index 10d2f8e9a6ed70e3540b0b1f9038685a9a15525a..904140c2febf5164592348d0b4e8f90f197bbf06 100755 --- a/models/recall/gnn/evaluate_reader.py +++ b/models/recall/gnn/evaluate_reader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,10 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
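The CIN block in the hunk above builds pairwise products between the original field embeddings X⁰ and the previous feature map Xᵏ along the embedding axis, then compresses them with learned weights. A minimal NumPy sketch of one such layer (the weight layout is an assumption; the patch itself realizes this with transpose/reshape operations):

```python
import numpy as np

def cin_layer(x0, xk, w):
    """One illustrative CIN layer (xDeepFM).

    x0: (batch, m, d)      original field embeddings
    xk: (batch, h_k, d)    previous CIN feature map
    w:  (h_next, m, h_k)   assumed parameter layout
    returns: (batch, h_next, d)
    """
    # pairwise products per embedding dimension: (batch, m, h_k, d)
    outer = np.einsum('bmd,bhd->bmhd', x0, xk)
    # weighted sum over (m, h_k) pairs -> next feature map
    return np.einsum('nmh,bmhd->bnd', w, outer)
```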
-import numpy as np -import io + import copy import random + +import numpy as np + from paddlerec.core.reader import Reader from paddlerec.core.utils import envs @@ -22,17 +24,17 @@ from paddlerec.core.utils import envs class EvaluateReader(Reader): def init(self): self.batch_size = envs.get_global_env("batch_size", None, "evaluate.reader") - + self.input = [] self.length = None def base_read(self, files): res = [] for f in files: - with open(f, "r") as fin: + with open(f, "r") as fin: for line in fin: - line = line.strip().split('\t') - res.append(tuple([map(int, line[0].split(',')), int(line[1])])) + line = line.strip().split('\t') + res.append(tuple([map(int, line[0].split(',')), int(line[1])])) return res def make_data(self, cur_batch, batch_size): @@ -120,10 +122,11 @@ class EvaluateReader(Reader): else: # Due to fixed batch_size, discard the remaining ins return - #cur_batch = remain_data[i:] - #yield self.make_data(cur_batch, group_remain % batch_size) + # cur_batch = remain_data[i:] + # yield self.make_data(cur_batch, group_remain % batch_size) + return _reader - + def generate_batch_from_trainfiles(self, files): self.input = self.base_read(files) self.length = len(self.input) @@ -132,4 +135,5 @@ class EvaluateReader(Reader): def generate_sample(self, line): def data_iter(): yield [] + return data_iter diff --git a/models/recall/gnn/model.py b/models/recall/gnn/model.py index ac628ec6209e9489ac20550a7837b60c9e8e3771..b98625a6afc094e106b26d1e2b31a8712a9d7b94 100755 --- a/models/recall/gnn/model.py +++ b/models/recall/gnn/model.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import math +import numpy as np + import paddle.fluid as fluid import paddle.fluid.layers as layers @@ -25,19 +26,19 @@ class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) self.init_config() - + def init_config(self): self._fetch_interval = 1 - self.items_num, self.ins_num = self.config_read(envs.get_global_env("hyper_parameters.config_path", None, self._namespace)) + self.items_num, self.ins_num = self.config_read( + envs.get_global_env("hyper_parameters.config_path", None, self._namespace)) self.train_batch_size = envs.get_global_env("batch_size", None, "train.reader") self.evaluate_batch_size = envs.get_global_env("batch_size", None, "evaluate.reader") self.hidden_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) self.step = envs.get_global_env("hyper_parameters.gnn_propogation_steps", None, self._namespace) - def config_read(self, config_path=None): - if config_path is None: - raise ValueError("please set train.model.hyper_parameters.config_path at first") + if config_path is None: + raise ValueError("please set train.model.hyper_parameters.config_path at first") with open(config_path, "r") as fin: item_nums = int(fin.readline().strip()) ins_nums = int(fin.readline().strip()) @@ -47,49 +48,49 @@ class Model(ModelBase): self.items = fluid.data( name="items", shape=[bs, -1], - dtype="int64") #[batch_size, uniq_max] + dtype="int64") # [batch_size, uniq_max] self.seq_index = fluid.data( name="seq_index", shape=[bs, -1, 2], - dtype="int32") #[batch_size, seq_max, 2] + dtype="int32") # [batch_size, seq_max, 2] self.last_index = fluid.data( name="last_index", shape=[bs, 2], - dtype="int32") #[batch_size, 2] + dtype="int32") # [batch_size, 2] self.adj_in = fluid.data( name="adj_in", shape=[bs, -1, -1], - dtype="float32") #[batch_size, seq_max, seq_max] 
+ dtype="float32") # [batch_size, seq_max, seq_max] self.adj_out = fluid.data( name="adj_out", shape=[bs, -1, -1], - dtype="float32") #[batch_size, seq_max, seq_max] + dtype="float32") # [batch_size, seq_max, seq_max] self.mask = fluid.data( name="mask", shape=[bs, -1, 1], - dtype="float32") #[batch_size, seq_max, 1] + dtype="float32") # [batch_size, seq_max, 1] self.label = fluid.data( name="label", shape=[bs, 1], - dtype="int64") #[batch_size, 1] + dtype="int64") # [batch_size, 1] res = [self.items, self.seq_index, self.last_index, self.adj_in, self.adj_out, self.mask, self.label] return res - + def train_input(self): res = self.input(self.train_batch_size) self._data_var = res - use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace) + use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace) if self._platform != "LINUX" or use_dataloader: self._data_loader = fluid.io.DataLoader.from_generator( feed_list=self._data_var, capacity=256, use_double_buffer=False, iterable=False) def net(self, items_num, hidden_size, step, bs): - stdv = 1.0 / math.sqrt(hidden_size) + stdv = 1.0 / math.sqrt(hidden_size) - def embedding_layer(input, table_name, emb_dim, initializer_instance=None): + def embedding_layer(input, table_name, emb_dim, initializer_instance=None): emb = fluid.embedding( input=input, size=[items_num, emb_dim], @@ -97,10 +98,10 @@ class Model(ModelBase): name=table_name, initializer=initializer_instance), ) - return emb - - sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv) - items_emb = embedding_layer(self.items, "emb", hidden_size, sparse_initializer) + return emb + + sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv) + items_emb = embedding_layer(self.items, "emb", hidden_size, sparse_initializer) pre_state = items_emb for i in range(step): pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size]) @@ -113,7 +114,7 @@ class Model(ModelBase): param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) #[batch_size, uniq_max, h] + low=-stdv, high=stdv))) # [batch_size, uniq_max, h] state_out = layers.fc( input=pre_state, name="state_out", @@ -123,13 +124,13 @@ class Model(ModelBase): param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) #[batch_size, uniq_max, h] - - state_adj_in = layers.matmul(self.adj_in, state_in) #[batch_size, uniq_max, h] - state_adj_out = layers.matmul(self.adj_out, state_out) #[batch_size, uniq_max, h] - + low=-stdv, high=stdv))) # [batch_size, uniq_max, h] + + state_adj_in = layers.matmul(self.adj_in, state_in) # [batch_size, uniq_max, h] + state_adj_out = layers.matmul(self.adj_out, state_out) # [batch_size, uniq_max, h] + gru_input = layers.concat([state_adj_in, state_adj_out], axis=2) - + gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2]) gru_fc = layers.fc( input=gru_input, @@ -140,11 +141,11 @@ class Model(ModelBase): input=gru_fc, hidden=layers.reshape(x=pre_state, shape=[-1, hidden_size]), size=3 * hidden_size) - + final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size]) seq = layers.gather_nd(final_state, self.seq_index) last = layers.gather_nd(final_state, self.last_index) - + seq_fc = layers.fc( input=seq, name="seq_fc", @@ -154,7 +155,7 @@ class 
Model(ModelBase): num_flatten_dims=2, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) #[batch_size, seq_max, h] + low=-stdv, high=stdv))) # [batch_size, seq_max, h] last_fc = layers.fc( input=last, name="last_fc", @@ -164,22 +165,22 @@ class Model(ModelBase): num_flatten_dims=1, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) #[bathc_size, h] - + low=-stdv, high=stdv))) # [bathc_size, h] + seq_fc_t = layers.transpose( - seq_fc, perm=[1, 0, 2]) #[seq_max, batch_size, h] + seq_fc, perm=[1, 0, 2]) # [seq_max, batch_size, h] add = layers.elementwise_add( - seq_fc_t, last_fc) #[seq_max, batch_size, h] + seq_fc_t, last_fc) # [seq_max, batch_size, h] b = layers.create_parameter( shape=[hidden_size], dtype='float32', - default_initializer=fluid.initializer.Constant(value=0.0)) #[h] - add = layers.elementwise_add(add, b) #[seq_max, batch_size, h] - - add_sigmoid = layers.sigmoid(add) #[seq_max, batch_size, h] + default_initializer=fluid.initializer.Constant(value=0.0)) # [h] + add = layers.elementwise_add(add, b) # [seq_max, batch_size, h] + + add_sigmoid = layers.sigmoid(add) # [seq_max, batch_size, h] add_sigmoid = layers.transpose( - add_sigmoid, perm=[1, 0, 2]) #[batch_size, seq_max, h] - + add_sigmoid, perm=[1, 0, 2]) # [batch_size, seq_max, h] + weight = layers.fc( input=add_sigmoid, name="weight_fc", @@ -189,13 +190,13 @@ class Model(ModelBase): bias_attr=False, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) #[batch_size, seq_max, 1] + low=-stdv, high=stdv))) # [batch_size, seq_max, 1] weight *= self.mask - weight_mask = layers.elementwise_mul(seq, weight, axis=0) #[batch_size, seq_max, h] - global_attention = layers.reduce_sum(weight_mask, dim=1) #[batch_size, h] - + weight_mask = layers.elementwise_mul(seq, weight, axis=0) # [batch_size, seq_max, h] + global_attention = layers.reduce_sum(weight_mask, dim=1) # [batch_size, h] + final_attention = layers.concat( - [global_attention, last], axis=1) #[batch_size, 2*h] + [global_attention, last], axis=1) # [batch_size, 2*h] final_attention_fc = layers.fc( input=final_attention, name="final_attention_fc", @@ -203,14 +204,14 @@ class Model(ModelBase): bias_attr=False, act=None, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( - low=-stdv, high=stdv))) #[batch_size, h] - - # all_vocab = layers.create_global_var( - # shape=[items_num - 1], - # value=0, - # dtype="int64", - # persistable=True, - # name="all_vocab") + low=-stdv, high=stdv))) # [batch_size, h] + + # all_vocab = layers.create_global_var( + # shape=[items_num - 1], + # value=0, + # dtype="int64", + # persistable=True, + # name="all_vocab") all_vocab = np.arange(1, items_num).reshape((-1)).astype('int32') all_vocab = fluid.layers.cast(x=fluid.layers.assign(all_vocab), dtype='int64') @@ -220,13 +221,13 @@ class Model(ModelBase): name="emb", initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), - size=[items_num, hidden_size]) #[all_vocab, h] - + size=[items_num, hidden_size]) # [all_vocab, h] + logits = layers.matmul( x=final_attention_fc, y=all_emb, - transpose_y=True) #[batch_size, all_vocab] + transpose_y=True) # [batch_size, all_vocab] softmax = layers.softmax_with_cross_entropy( - logits=logits, label=self.label) #[batch_size, 1] + logits=logits, label=self.label) # [batch_size, 1] self.loss = layers.reduce_mean(softmax) # [1] self.acc = layers.accuracy(input=logits, label=self.label, k=20) @@ -249,7 +250,7 @@ class 
Model(ModelBase): decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None, self._namespace) decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None, self._namespace) l2 = envs.get_global_env("hyper_parameters.l2", None, self._namespace) - optimizer = fluid.optimizer.Adam( + optimizer = fluid.optimizer.Adam( learning_rate=fluid.layers.exponential_decay( learning_rate=learning_rate, decay_steps=decay_steps * step_per_epoch, @@ -257,18 +258,18 @@ class Model(ModelBase): regularization=fluid.regularizer.L2DecayRegularizer( regularization_coeff=l2)) - return optimizer + return optimizer def infer_input(self): self._reader_namespace = "evaluate.reader" res = self.input(self.evaluate_batch_size) - self._infer_data_var = res + self._infer_data_var = res self._infer_data_loader = fluid.io.DataLoader.from_generator( feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) - + def infer_net(self): - self.infer_input() - self.net(self.items_num, self.hidden_size, self.step, self.evaluate_batch_size) + self.infer_input() + self.net(self.items_num, self.hidden_size, self.step, self.evaluate_batch_size) self._infer_results['acc'] = self.acc - self._infer_results['loss'] = self.loss + self._infer_results['loss'] = self.loss diff --git a/models/recall/gnn/reader.py b/models/recall/gnn/reader.py index 7e9433797458f795e40b881dd324521a0bce1de4..cffb45115ed6a3dd6232b34db8758ad6a20447e2 100755 --- a/models/recall/gnn/reader.py +++ b/models/recall/gnn/reader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,10 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
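For orientation, net above ends with an attention readout: every session position is scored against the last-click state, padded positions are zeroed via the mask, and the weighted sum is concatenated with the last state before the final fc. A NumPy sketch under assumed shapes (parameter names are mine, not the patch's):

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def session_readout(seq, last, w_seq, w_last, b, w_att, mask):
    """seq: (batch, seq_max, h); last: (batch, h); mask: (batch, seq_max, 1);
    w_seq, w_last: (h, h); b: (h,); w_att: (h, 1)."""
    add = seq @ w_seq + (last @ w_last)[:, None, :] + b  # (batch, seq_max, h)
    weight = sigmoid(add) @ w_att                        # (batch, seq_max, 1)
    weight = weight * mask                               # silence padding
    global_att = (weight * seq).sum(axis=1)              # (batch, h)
    return np.concatenate([global_att, last], axis=1)    # (batch, 2h)
```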
-import numpy as np -import io + import copy import random + +import numpy as np + from paddlerec.core.reader import Reader from paddlerec.core.utils import envs @@ -22,17 +24,17 @@ from paddlerec.core.utils import envs class TrainReader(Reader): def init(self): self.batch_size = envs.get_global_env("batch_size", None, "train.reader") - + self.input = [] self.length = None def base_read(self, files): res = [] for f in files: - with open(f, "r") as fin: + with open(f, "r") as fin: for line in fin: - line = line.strip().split('\t') - res.append(tuple([map(int, line[0].split(',')), int(line[1])])) + line = line.strip().split('\t') + res.append(tuple([map(int, line[0].split(',')), int(line[1])])) return res def make_data(self, cur_batch, batch_size): @@ -120,10 +122,11 @@ class TrainReader(Reader): else: # Due to fixed batch_size, discard the remaining ins return - #cur_batch = remain_data[i:] - #yield self.make_data(cur_batch, group_remain % batch_size) + # cur_batch = remain_data[i:] + # yield self.make_data(cur_batch, group_remain % batch_size) + return _reader - + def generate_batch_from_trainfiles(self, files): self.input = self.base_read(files) self.length = len(self.input) @@ -132,4 +135,5 @@ class TrainReader(Reader): def generate_sample(self, line): def data_iter(): yield [] + return data_iter diff --git a/models/recall/gru4rec/hdfs.log b/models/recall/gru4rec/hdfs.log deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/models/recall/gru4rec/model.py b/models/recall/gru4rec/model.py index d77f3c254f4f20b12854dbf05af35c64613c4b84..b79c7642201990efae56a640954154404bf2e606 100644 --- a/models/recall/gru4rec/model.py +++ b/models/recall/gru4rec/model.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math import paddle.fluid as fluid from paddlerec.core.utils import envs @@ -87,10 +86,8 @@ class Model(ModelBase): self._metrics["cost"] = avg_cost self._metrics["acc"] = acc - def train_net(self): self.all_vocab_network() - def infer_net(self): self.all_vocab_network(is_infer=True) diff --git a/models/recall/gru4rec/rsc15_infer_reader.py b/models/recall/gru4rec/rsc15_infer_reader.py index 829726e3d8861292266c25dfba4298a1ee2f502a..b58532a471f4b70eedfebeeadb35df20b4c40e72 100644 --- a/models/recall/gru4rec/rsc15_infer_reader.py +++ b/models/recall/gru4rec/rsc15_infer_reader.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs class EvaluateReader(Reader): diff --git a/models/recall/gru4rec/rsc15_reader.py b/models/recall/gru4rec/rsc15_reader.py index eeb2441dd697bce4b3d3dc51bc64f1e2bfc0d2ab..4fe9433a65e1ceec508891eecfaaa5e464bc9e24 100644 --- a/models/recall/gru4rec/rsc15_reader.py +++ b/models/recall/gru4rec/rsc15_reader.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
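base_read in the readers above expects one session per line: comma-separated item ids, a tab, then the target id. A sketch with a made-up record; note that under Python 3 a bare map() stays a lazy iterator, so the eager conversion below is the safer pattern:

```python
# Hypothetical session record (ids invented for illustration).
line = "101,205,309\t309"

seq_str, label_str = line.strip().split('\t')
session = [int(i) for i in seq_str.split(',')]  # eager, unlike bare map(int, ...)
label = int(label_str)
```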
+ from __future__ import print_function from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs class TrainReader(Reader): diff --git a/models/recall/readme.md b/models/recall/readme.md index f1e90134e7808b46c310cb440ced306e4c467122..806e0a99e7576b96ee6d64bb1acc9e695dacb281 100755 --- a/models/recall/readme.md +++ b/models/recall/readme.md @@ -1,7 +1,7 @@ # 召回模型库 ## 简介 -我们提供了常见的召回任务中使用的模型算法的PaddleRec实现, 单机训练&预测效果指标以及分布式训练&预测性能指标等。实现的召回模型包括 [SR-GNN](http://gitlab.baidu.com/tangwei12/paddlerec/tree/develop/models/recall/gnn)、[GRU4REC](http://gitlab.baidu.com/tangwei12/paddlerec/tree/develop/models/recall/gru4rec)、[Sequence Semantic Retrieval Model](http://gitlab.baidu.com/tangwei12/paddlerec/tree/develop/models/recall/ssr)、 [Tree-based Deep Model](http://gitlab.baidu.com/tangwei12/paddlerec/tree/develop/models/recall/tdm)、[Word2Vector](http://gitlab.baidu.com/tangwei12/paddlerec/tree/develop/models/recall/word2vec)。 +我们提供了常见的召回任务中使用的模型算法的PaddleRec实现, 单机训练&预测效果指标以及分布式训练&预测性能指标等。实现的召回模型包括 [SR-GNN](http://gitlab.baidu.com/tangwei12/paddlerec/tree/develop/models/recall/gnn)、[GRU4REC](http://gitlab.baidu.com/tangwei12/paddlerec/tree/develop/models/recall/gru4rec)、[Sequence Semantic Retrieval Model](http://gitlab.baidu.com/tangwei12/paddlerec/tree/develop/models/recall/ssr)、[Word2Vector](http://gitlab.baidu.com/tangwei12/paddlerec/tree/develop/models/recall/word2vec)。 模型算法库在持续添加中,欢迎关注。 @@ -9,53 +9,57 @@ * [整体介绍](#整体介绍) * [召回模型列表](#召回模型列表) * [使用教程](#使用教程) - * [数据处理](#数据处理) - * [训练](#训练) - * [预测](#预测) + * [训练&预测](#训练&预测) * [效果对比](#效果对比) * [模型效果列表](#模型效果列表) -* [分布式](#分布式) - * [模型性能列表](#模型性能列表) ## 整体介绍 ### 召回模型列表 | 模型 | 简介 | 论文 | | :------------------: | :--------------------: | :---------: | -| GNN | SR-GNN | [Session-based Recommendation with Graph Neural Networks](https://arxiv.org/abs/1811.00855)(2018) | +| Word2Vec | word2vector | [Distributed Representations of Words and Phrases and their Compositionality](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)(2013) | | GRU4REC | SR-GRU | [Session-based Recommendations with Recurrent Neural Networks](https://arxiv.org/abs/1511.06939)(2015) | | SSR | Sequence Semantic Retrieval Model | [Multi-Rate Deep Learning for Temporal Recommendation](http://sonyis.me/paperpdf/spr209-song_sigir16.pdf)(2016) | -| TDM | Tree-based Deep Model | [Learning Tree-based Deep Model for Recommender Systems](https://arxiv.org/pdf/1801.02294.pdf)(2018) | -| Word2Vec | word2vector | [Distributed Representations of Words and Phrases and their Compositionality](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)(2013) | +| GNN | SR-GNN | [Session-based Recommendation with Graph Neural Networks](https://arxiv.org/abs/1811.00855)(2018) | + +下面是每个模型的简介(注:图片引用自链接中的论文) + +[Word2Vec](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf): +

+(图:Word2Vec 模型结构,见上方论文链接)
+
+[GRU4REC](https://arxiv.org/abs/1511.06939):
+(图:GRU4REC 模型结构,见上方论文链接)
+
+[SSR](http://sonyis.me/paperpdf/spr209-song_sigir16.pdf):
+(图:SSR 模型结构,见上方论文链接)
+
+[GNN](https://arxiv.org/abs/1811.00855):
+(图:GNN 模型结构,见上方论文链接)
+
## 使用教程 -### 数据处理 +### 训练&预测 ```shell -sh data_process.sh +python -m paddlerec.run -m paddlerec.models.recall.word2vec # word2vec +python -m paddlerec.run -m paddlerec.models.recall.ssr # ssr +python -m paddlerec.run -m paddlerec.models.recall.gru4rec # gru4rec +python -m paddlerec.run -m paddlerec.models.recall.gnn # gnn ``` -### 训练 -```shell -python -m paddlerec.run -m config.yaml -d cpu -e single -``` -### 预测 - ## 效果对比 ### 模型效果列表 | 数据集 | 模型 | loss | Recall@20 | | :------------------: | :--------------------: | :---------: |:---------: | | DIGINETICA | GNN | -- | 0.507 | -| RSC15 | GRU4REC | -- | 0.67 | -| RSC15 | SSR | -- | 无 | -| - | TDM | -- | -- | +| RSC15 | GRU4REC | -- | 0.670 | +| RSC15 | SSR | -- | 0.590 | | 1 Billion Word Language Model Benchmark | Word2Vec | -- | 0.54 | -## 分布式 -### 模型性能列表 -| 数据集 | 模型 | 单机 | 多机(同步) | 多机(异步) | GPU | -| :------------------: | :--------------------: | :---------: |:---------: |:---------: |:---------: | -| DIGINETICA | GNN | -- | -- | -- | -- | -| RSC15 | GRU4REC | -- | -- | -- | -- | -| RSC15 | SSR | -- | -- | -- | -- | -| - | TDM | -- | -- | -- | -- | -| 1 Billion Word Language Model Benchmark | Word2Vec | -- | -- | -- | -- | diff --git a/models/recall/ssr/model.py b/models/recall/ssr/model.py index 3f93c8a12d6432634d1a4bf5d5835cef91cd70b7..2c4b7f190088cd7681720f83e3a53730b790d462 100644 --- a/models/recall/ssr/model.py +++ b/models/recall/ssr/model.py @@ -12,15 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math import paddle.fluid as fluid - -from paddlerec.core.utils import envs -from paddlerec.core.model import Model as ModelBase import paddle.fluid.layers.tensor as tensor -import paddle.fluid.layers.io as io import paddle.fluid.layers.control_flow as cf +from paddlerec.core.utils import envs +from paddlerec.core.model import Model as ModelBase class BowEncoder(object): @@ -54,6 +51,7 @@ class GrnnEncoder(object): bias_attr=self.param_name + ".bias") return fluid.layers.sequence_pool(input=gru_h, pool_type='max') + class PairwiseHingeLoss(object): def __init__(self, margin=0.8): self.margin = margin @@ -70,6 +68,7 @@ class PairwiseHingeLoss(object): loss_part2) return loss_part3 + class Model(ModelBase): def __init__(self, config): ModelBase.__init__(self, config) @@ -80,7 +79,6 @@ class Model(ModelBase): return correct def train(self): - vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace) emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace) hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace) @@ -124,16 +122,14 @@ class Model(ModelBase): hinge_loss = self.pairwise_hinge_loss.forward(cos_pos, cos_neg) avg_cost = fluid.layers.mean(hinge_loss) correct = self.get_correct(cos_neg, cos_pos) - + self._cost = avg_cost self._metrics["correct"] = correct self._metrics["hinge_loss"] = hinge_loss - def train_net(self): self.train() - def infer(self): vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace) emb_dim = envs.get_global_env("hyper_parameters.emb_dim", None, self._namespace) @@ -146,7 +142,7 @@ class Model(ModelBase): pos_label = fluid.data(name="pos_label", shape=[None, 1], dtype="int64") self._infer_data_var = [user_data, all_item_data, pos_label] self._infer_data_loader = fluid.io.DataLoader.from_generator( - feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + feed_list=self._infer_data_var, capacity=64, 
use_double_buffer=False, iterable=False) user_emb = fluid.embedding( input=user_data, size=[vocab_size, emb_dim], param_attr="emb.item") @@ -173,6 +169,5 @@ class Model(ModelBase): self._infer_results['recall20'] = acc - def infer_net(self): self.infer() diff --git a/models/recall/ssr/ssr_infer_reader.py b/models/recall/ssr/ssr_infer_reader.py index b9e5f7263162551a7ae8cecdf86dc9b5423b53b0..18f3fc2f37236907801fb00047fd3b6da5b5fa8c 100644 --- a/models/recall/ssr/ssr_infer_reader.py +++ b/models/recall/ssr/ssr_infer_reader.py @@ -11,19 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function +import numpy as np + from paddlerec.core.reader import Reader from paddlerec.core.utils import envs -import random -import numpy as np class EvaluateReader(Reader): def init(self): self.vocab_size = envs.get_global_env("vocab_size", 10, "train.model.hyper_parameters") - def generate_sample(self, line): """ Read the data line by line and process it as a dictionary @@ -39,6 +39,6 @@ class EvaluateReader(Reader): src = conv_ids[:boundary] pos_tgt = [conv_ids[boundary]] feature_name = ["user", "all_item", "p_item"] - yield zip(feature_name, [src] + [np.arange(self.vocab_size).astype("int64").tolist()]+ [pos_tgt]) + yield zip(feature_name, [src] + [np.arange(self.vocab_size).astype("int64").tolist()] + [pos_tgt]) return reader diff --git a/models/recall/ssr/ssr_reader.py b/models/recall/ssr/ssr_reader.py index e2a352650cb6d5bb09fda78d27c3925f5ce670a5..d2d35458d867bd560e7e0b751f61de83d0f822b6 100644 --- a/models/recall/ssr/ssr_reader.py +++ b/models/recall/ssr/ssr_reader.py @@ -11,12 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from __future__ import print_function -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs import random +from paddlerec.core.reader import Reader + class TrainReader(Reader): def init(self): @@ -25,7 +26,6 @@ class TrainReader(Reader): def sample_neg_from_seq(self, seq): return seq[random.randint(0, len(seq) - 1)] - def generate_sample(self, line): """ Read the data line by line and process it as a dictionary diff --git a/models/recall/word2vec/model.py b/models/recall/word2vec/model.py index ea8e0f5e81e5554668ba9ad81cc7f80718a0b3b9..bf09a04648a71a6618b99ef7de7d7244aaecbdba 100755 --- a/models/recall/word2vec/model.py +++ b/models/recall/word2vec/model.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math import numpy as np import paddle.fluid as fluid diff --git a/models/recall/word2vec/prepare_data.sh b/models/recall/word2vec/prepare_data.sh index 743ae99871ba1cc2d7309cda11117446613eb0e5..8b78eeedd94f088e206e35729a6b35d349b99039 100755 --- a/models/recall/word2vec/prepare_data.sh +++ b/models/recall/word2vec/prepare_data.sh @@ -1,5 +1,20 @@ #! /bin/bash +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + # download train_data mkdir raw_data wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar diff --git a/models/recall/word2vec/preprocess.py b/models/recall/word2vec/preprocess.py index 31088efb90498e44ffd5fe4fe24942e1d52c2e6b..9c9934e40589bdc700b7df5dc432d9b6dc92a8cc 100755 --- a/models/recall/word2vec/preprocess.py +++ b/models/recall/word2vec/preprocess.py @@ -1,11 +1,27 @@ # -*- coding: utf-8 -* +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import io +import math import os import random import re import six + import argparse -import io -import math + prog = re.compile("[^a-z ]", flags=0) @@ -59,7 +75,7 @@ def parse_args(): def text_strip(text): - #English Preprocess Rule + # English Preprocess Rule return prog.sub("", text.lower()) @@ -101,7 +117,7 @@ def filter_corpus(args): word_all_count = 0 id_counts = [] word_id = 0 - #read dict + # read dict with io.open(args.dict_path, 'r', encoding='utf-8') as f: for line in f: word, count = line.split()[0], int(line.split()[1]) @@ -111,13 +127,13 @@ def filter_corpus(args): id_counts.append(count) word_all_count += count - #write word2id file + # write word2id file print("write word2id file to : " + args.dict_path + "_word_to_id_") with io.open( args.dict_path + "_word_to_id_", 'w+', encoding='utf-8') as fid: for k, v in word_to_id_.items(): fid.write(k + " " + str(v) + '\n') - #filter corpus and convert id + # filter corpus and convert id if not os.path.exists(args.output_corpus_dir): os.makedirs(args.output_corpus_dir) for file in os.listdir(args.input_corpus_dir): @@ -138,9 +154,9 @@ def filter_corpus(args): count_w = id_counts[idx] corpus_size = word_all_count keep_prob = ( - math.sqrt(count_w / - (args.downsample * corpus_size)) + 1 - ) * (args.downsample * corpus_size) / count_w + math.sqrt(count_w / + (args.downsample * corpus_size)) + 1 + ) * (args.downsample * corpus_size) / count_w r_value = random.random() if r_value > keep_prob: continue @@ -186,7 +202,7 @@ def build_dict(args): for item in item_to_remove: unk_sum += word_count[item] del word_count[item] - #sort by count + # sort by count word_count[native_to_unicode('')] = unk_sum word_count = sorted( word_count.items(), key=lambda word_count: -word_count[1]) @@ -208,17 +224,18 @@ def data_split(args): for file_ in files: with open(os.path.join(raw_data_dir, file_), 'r') as f: contents.extend(f.readlines()) - + num = int(args.file_nums) lines_per_file = len(contents) / num print("contents: ", str(len(contents))) 
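filter_corpus above applies Mikolov-style subsampling: an occurrence of word w is kept with probability (sqrt(f/t) + 1) * t / f, where f is the word's count and t = downsample * corpus_size. A standalone sketch (the 1e-3 default is an assumption; the real value comes from the --downsample flag):

```python
import math
import random

def keep_word(count_w, corpus_size, downsample=1e-3):
    """True if this occurrence survives subsampling (illustrative)."""
    t = downsample * corpus_size
    keep_prob = (math.sqrt(count_w / t) + 1) * t / count_w
    return random.random() <= keep_prob  # frequent words are dropped more often
```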
print("lines_per_file: ", str(lines_per_file)) - - for i in range(1, num+1): + + for i in range(1, num + 1): with open(os.path.join(new_data_dir, "part_" + str(i)), 'w') as fout: - data = contents[(i-1)*lines_per_file:min(i*lines_per_file,len(contents))] + data = contents[(i - 1) * lines_per_file:min(i * lines_per_file, len(contents))] for line in data: - fout.write(line) + fout.write(line) + if __name__ == "__main__": args = parse_args() diff --git a/models/recall/word2vec/w2v_evaluate_reader.py b/models/recall/word2vec/w2v_evaluate_reader.py index 6a5c1e83d61e64b35ef8492ccf55399bcf1acee6..04be9d41b2cd1ec51768696817a57c38dd958a44 100755 --- a/models/recall/word2vec/w2v_evaluate_reader.py +++ b/models/recall/word2vec/w2v_evaluate_reader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,16 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np + import io + import six + from paddlerec.core.reader import Reader from paddlerec.core.utils import envs class EvaluateReader(Reader): def init(self): - dict_path = envs.get_global_env("word_id_dict_path", None, "evaluate.reader") + dict_path = envs.get_global_env("word_id_dict_path", None, "evaluate.reader") self.word_to_id = dict() self.id_to_word = dict() with io.open(dict_path, 'r', encoding='utf-8') as f: @@ -46,19 +48,16 @@ class EvaluateReader(Reader): if isinstance(s, str): return True return False - - + def _to_unicode(self, s, ignore_errors=False): if self._is_unicode(s): return s error_mode = "ignore" if ignore_errors else "strict" return s.decode("utf-8", errors=error_mode) - - + def strip_lines(self, line, vocab): return self._replace_oov(vocab, self.native_to_unicode(line)) - - + def _replace_oov(self, original_vocab, line): """Replace out-of-vocab words with "". This maintains compatibility with published results. @@ -76,5 +75,7 @@ class EvaluateReader(Reader): def reader(): features = self.strip_lines(line.lower(), self.word_to_id) features = features.split() - yield [('analogy_a', [self.word_to_id[features[0]]]), ('analogy_b', [self.word_to_id[features[1]]]), ('analogy_c', [self.word_to_id[features[2]]]), ('analogy_d', [self.word_to_id[features[3]]])] + yield [('analogy_a', [self.word_to_id[features[0]]]), ('analogy_b', [self.word_to_id[features[1]]]), + ('analogy_c', [self.word_to_id[features[2]]]), ('analogy_d', [self.word_to_id[features[3]]])] + return reader diff --git a/models/recall/word2vec/w2v_reader.py b/models/recall/word2vec/w2v_reader.py index 24cf2e304e1cbf81af622dde30262959cb74e74f..88e52b47692778feef8396dd037448a8053aa958 100755 --- a/models/recall/word2vec/w2v_reader.py +++ b/models/recall/word2vec/w2v_reader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,8 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import numpy as np + import io + +import numpy as np + from paddlerec.core.reader import Reader from paddlerec.core.utils import envs @@ -37,7 +40,7 @@ class NumpyRandomInt(object): class TrainReader(Reader): def init(self): - dict_path = envs.get_global_env("word_count_dict_path", None, "train.reader") + dict_path = envs.get_global_env("word_count_dict_path", None, "train.reader") self.window_size = envs.get_global_env("hyper_parameters.window_size", None, "train.model") self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None, "train.model") self.with_shuffle_batch = envs.get_global_env("hyper_parameters.with_shuffle_batch", None, "train.model") @@ -72,7 +75,7 @@ class TrainReader(Reader): start_point = 0 end_point = idx + target_window targets = words[start_point:idx] + words[idx + 1:end_point + 1] - return targets + return targets def generate_sample(self, line): def reader(): @@ -84,7 +87,7 @@ class TrainReader(Reader): output = [('input_word', [int(target_id)]), ('true_label', [int(context_id)])] if not self.with_shuffle_batch: neg_array = self.cs.searchsorted(np.random.sample(self.neg_num)) - output += [('neg_label', [int(str(i)) for i in neg_array ])] + output += [('neg_label', [int(str(i)) for i in neg_array])] yield output - return reader + return reader diff --git a/models/treebased/tdm/model.py b/models/treebased/tdm/model.py index 0fc1adf9cd04286cdb31cc3ba08715574dbee18b..fa5f225f68068f826df6fc9ef0c7c9d35dbd9b89 100755 --- a/models/treebased/tdm/model.py +++ b/models/treebased/tdm/model.py @@ -14,8 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + import paddle.fluid as fluid -import math from paddlerec.core.utils import envs from paddlerec.core.model import Model as ModelBase @@ -134,7 +134,7 @@ class Model(ModelBase): sample_nodes_emb = [ fluid.layers.reshape(sample_nodes_emb[i], [-1, self.neg_sampling_list[i] + - self.output_positive, self.node_emb_size] + self.output_positive, self.node_emb_size] ) for i in range(self.max_layers) ] @@ -229,7 +229,7 @@ class Model(ModelBase): act=self.act, param_attr=fluid.ParamAttr( name="trans.layer_fc.weight." + str(i)), - bias_attr=fluid.ParamAttr(name="trans.layer_fc.bias."+str(i)), + bias_attr=fluid.ParamAttr(name="trans.layer_fc.bias." + str(i)), ) for i in range(self.max_layers) ] @@ -268,8 +268,8 @@ class Model(ModelBase): num_flatten_dims=2, act=self.act, param_attr=fluid.ParamAttr( - name="cls.concat_fc.weight."+str(i)), - bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias."+str(i)) + name="cls.concat_fc.weight." + str(i)), + bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(i)) ) for i in range(self.max_layers) ] @@ -348,7 +348,7 @@ class Model(ModelBase): current_layer_node_num = self.first_layer_node.shape[1] else: current_layer_node_num = current_layer_node.shape[1] * \ - current_layer_node.shape[2] + current_layer_node.shape[2] current_layer_node = fluid.layers.reshape( current_layer_node, [-1, current_layer_node_num]) @@ -458,7 +458,7 @@ class Model(ModelBase): param_attr=fluid.ParamAttr( name="trans.layer_fc.weight." + str(layer_idx)), bias_attr=fluid.ParamAttr( - name="trans.layer_fc.bias."+str(layer_idx)), + name="trans.layer_fc.bias." + str(layer_idx)), ) return input_layer_fc_out @@ -479,6 +479,6 @@ class Model(ModelBase): num_flatten_dims=2, act=self.act, param_attr=fluid.ParamAttr( - name="cls.concat_fc.weight."+str(layer_idx)), - bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias."+str(layer_idx))) + name="cls.concat_fc.weight." 
+ str(layer_idx)), + bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(layer_idx))) return hidden_states_fc diff --git a/models/treebased/tdm/tdm_evaluate_reader.py b/models/treebased/tdm/tdm_evaluate_reader.py index 844e441fbda303ea4a5ab3c0f549711579dbf5d5..4e3b64770b42c05726dd3c90466d77e422e00902 100644 --- a/models/treebased/tdm/tdm_evaluate_reader.py +++ b/models/treebased/tdm/tdm_evaluate_reader.py @@ -28,6 +28,7 @@ class EvaluateReader(Reader): """ Read the data line by line and process it as a dictionary """ + def reader(): """ This function needs to be implemented by the user, based on data format diff --git a/models/treebased/tdm/tdm_reader.py b/models/treebased/tdm/tdm_reader.py index 0b8ada9ea4d695aafd38c1e87831c9939e483618..709900649a03c3439cbf474781a5c0ae7b087dd7 100755 --- a/models/treebased/tdm/tdm_reader.py +++ b/models/treebased/tdm/tdm_reader.py @@ -28,6 +28,7 @@ class TrainReader(Reader): """ Read the data line by line and process it as a dictionary """ + def reader(): """ This function needs to be implemented by the user, based on data format diff --git a/readme.md b/readme.md deleted file mode 100644 index ff2b64b8d7eea316b4d4a73249a84ff97751b21e..0000000000000000000000000000000000000000 --- a/readme.md +++ /dev/null @@ -1,165 +0,0 @@ - -

-(图片:PaddleRec logo 与 Release / License / Slack 徽章)
-
-## 什么是PaddleRec
-
- -- 源于飞桨生态的搜索推荐模型**一站式开箱即用工具** -- 适合初学者,开发者,研究者从调研,训练到预测部署的全流程解决方案 -- 包含语义理解、召回、粗排、精排、多任务学习、融合等多个任务的推荐搜索算法库 -- 配置**yaml**自定义选项,即可快速上手使用单机训练、大规模分布式训练、离线预测、在线部署 - - -

-## PaddleRec概览
-(图片:PaddleRec 整体架构概览)
-
-## 便捷安装
-
- -### 环境要求 -* Python 2.7/ 3.5 / 3.6 / 3.7 -* PaddlePaddle >= 1.7.2 -* 操作系统: Windows/Mac/Linux - -### 安装命令 - -- 安装方法一: - ```bash - python -m pip install paddle-rec - ``` - -- 安装方法二 - - 源码编译安装 - 1. 安装飞桨 **注:需要用户安装版本 >1.7.2 的飞桨** - - ```shell - python -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple - ``` - - 2. 源码安装PaddleRec - - ``` - git clone https://github.com/PaddlePaddle/PaddleRec/ - cd PaddleRec - python setup.py install - ``` - - -

-## Quick Start

-### Run built-in models with their default configuration
-
-The framework currently ships with multiple built-in models; a simple command starts single-machine training or local 1x1 simulated distributed training with any of them. We take `dnn` as an example to introduce basic PaddleRec usage.
-
-#### Single-machine training
-
-```bash
-# single-machine training on CPU
-python -m paddlerec.run -m paddlerec.models.rank.dnn -d cpu -e single
-
-# single-machine training on GPU
-python -m paddlerec.run -m paddlerec.models.rank.dnn -d gpu -e single
-```
-
-#### Local simulated distributed training
-
-```bash
-# simulate distributed training locally on CPU resources
-python -m paddlerec.run -m paddlerec.models.rank.dnn -e local_cluster
-```
-
-#### Distributed training on a cluster
-
-```bash
-# after setting up an mpi/k8s/paddlecloud cluster environment
-python -m paddlerec.run -m paddlerec.models.rank.dnn -e cluster
-```
-
-### Run built-in models with a custom configuration
-
-If you reuse a built-in model and modify its **yaml** configuration file, e.g. changing hyper-parameters or reconfiguring the data, you can run that yaml file directly with paddlerec.
-
-Taking the dnn model as an example: after modifying the dnn model's `config.yaml` in the paddlerec source directory, run the `dnn` model with:
-
-```bash
-python -m paddlerec.run -m ./models/rank/dnn/config.yaml -e single
-```
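Note that this readme is deleted by the patch, and the `-e/--engine` and `-d/--device` flags shown above are removed from run.py in the same change: the engine is now read from the model's yaml. A rough sketch of the new resolution logic, condensed from the `get_inters_from_yaml`/`get_engine` code in the run.py diff later in this patch; `flatten` is a simplified stand-in for `envs.flatten_environs`, and everything else is taken from the patch itself:

```python
# Sketch of how the patched run.py resolves the engine from the yaml.
import yaml

ENGINE_CHOICES = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER",
                  "TDM_SINGLE", "TDM_LOCAL_CLUSTER", "TDM_CLUSTER"]


def flatten(d, prefix=""):
    # nested dicts become dot-separated keys,
    # e.g. {"train": {"engine": "single"}} -> {"train.engine": "single"}
    flat = {}
    for k, v in d.items():
        key = "{}.{}".format(prefix, k) if prefix else k
        if isinstance(v, dict):
            flat.update(flatten(v, key))
        else:
            flat[key] = v
    return flat


def resolve_engine(config_path):
    # read the model yaml, flatten it, and validate train.engine
    with open(config_path, 'r') as rb:
        conf = yaml.load(rb.read(), Loader=yaml.FullLoader)
    engine = str(flatten(conf).get("train.engine", "single")).upper()
    if engine not in ENGINE_CHOICES:
        raise ValueError("train.engine must be one of {}".format(ENGINE_CHOICES))
    return engine
```

With that in place, `python -m paddlerec.run -m ./models/rank/dnn/config.yaml` alone is enough; the engine choice travels with the config rather than the command line.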

-## Supported Models

-| Category | Model | Single-machine CPU | Single-machine GPU | Distributed CPU |
-| :------: | :---------------------------------------------------------------: | :---: | :---: | :---: |
-| Content Understanding | [Text-Classification](models/contentunderstanding/classification/model.py) | ✓ | x | ✓ |
-| Content Understanding | [TagSpace](models/contentunderstanding/tagspace/model.py) | ✓ | x | ✓ |
-| Recall | [TDM](models/treebased/tdm/model.py) | ✓ | x | ✓ |
-| Recall | [Word2Vec](models/recall/word2vec/model.py) | ✓ | x | ✓ |
-| Recall | [SSR](models/recall/ssr/model.py) | ✓ | ✓ | ✓ |
-| Recall | [Gru4Rec](models/recall/gru4rec/model.py) | ✓ | ✓ | ✓ |
-| Ranking | [Dnn](models/rank/dnn/model.py) | ✓ | x | ✓ |
-| Ranking | [DeepFM](models/rank/deepfm/model.py) | ✓ | x | ✓ |
-| Ranking | [xDeepFM](models/rank/xdeepfm/model.py) | ✓ | x | ✓ |
-| Ranking | [DIN](models/rank/din/model.py) | ✓ | x | ✓ |
-| Ranking | [Wide&Deep](models/rank/wide_deep/model.py) | ✓ | x | ✓ |
-| Multi-task | [ESMM](models/multitask/esmm/model.py) | ✓ | ✓ | ✓ |
-| Multi-task | [MMOE](models/multitask/mmoe/model.py) | ✓ | ✓ | ✓ |
-| Multi-task | [ShareBottom](models/multitask/share-bottom/model.py) | ✓ | ✓ | ✓ |
-| Matching | [DSSM](models/match/dssm/model.py) | ✓ | x | ✓ |
-| Matching | [MultiView-Simnet](models/match/multiview-simnet/model.py) | ✓ | x | ✓ |

-## Documentation

-### Background
-* [Recommender Systems](doc/rec_background.md)
-* [Distributed Parameter Server](doc/ps_background.md)
-
-### Getting Started
-* [Environment Requirements](#environment-requirements)
-* [Installation Commands](#installation-commands)
-* [Quick Start](#run-built-in-models-with-their-default-configuration)
-
-### Advanced Tutorials
-* [Custom Datasets and Readers](doc/custom_dataset_reader.md)
-* [Distributed Training](doc/distributed_train.md)
-
-### Developer Guide
-* [PaddleRec Design Document](doc/design.md)
-
-### PaddleRec Performance
-* [Benchmark](doc/benchmark.md)
-
-### FAQ
-* [Frequently Asked Questions](doc/faq.md)

-## Community

-### Feedback
-Comments, suggestions, and bug reports are welcome via `GitHub Issue`.
-
-### Release History
-- 2020.5.14 - PaddleRec v0.1
-
-### License
-This project is released under the [Apache 2.0 license](LICENSE).
-
diff --git a/run.py b/run.py
index 0a755cc263d554144b735a7f274363bb6ea69726..56999935f21bc1de2b2bc7b4a080da023559174a 100755
--- a/run.py
+++ b/run.py
@@ -1,8 +1,22 @@
-import argparse
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import subprocess
-import tempfile
+import argparse
+import tempfile
 
 import yaml
 
 from paddlerec.core.factory import TrainerFactory
@@ -12,37 +26,52 @@ from paddlerec.core.utils import util
 engines = {}
 device = ["CPU", "GPU"]
 clusters = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER"]
-custom_model = ['tdm']
+engine_choices = ["SINGLE", "LOCAL_CLUSTER", "CLUSTER",
+                  "TDM_SINGLE", "TDM_LOCAL_CLUSTER", "TDM_CLUSTER"]
+custom_model = ['TDM']
 model_name = ""
 
 
 def engine_registry():
-    cpu = {"TRANSPILER": {}, "PSLIB": {}}
-    cpu["TRANSPILER"]["SINGLE"] = single_engine
-    cpu["TRANSPILER"]["LOCAL_CLUSTER"] = local_cluster_engine
-    cpu["TRANSPILER"]["CLUSTER"] = cluster_engine
-    cpu["PSLIB"]["SINGLE"] = local_mpi_engine
-    cpu["PSLIB"]["LOCAL_CLUSTER"] = local_mpi_engine
-    cpu["PSLIB"]["CLUSTER"] = cluster_mpi_engine
+    engines["TRANSPILER"] = {}
+    engines["PSLIB"] = {}
+
+    engines["TRANSPILER"]["SINGLE"] = single_engine
+    engines["TRANSPILER"]["LOCAL_CLUSTER"] = local_cluster_engine
+    engines["TRANSPILER"]["CLUSTER"] = cluster_engine
+
+    engines["PSLIB"]["SINGLE"] = local_mpi_engine
+    engines["PSLIB"]["LOCAL_CLUSTER"] = local_mpi_engine
+    engines["PSLIB"]["CLUSTER"] = cluster_mpi_engine
 
-    gpu = {"TRANSPILER": {}, "PSLIB": {}}
-    gpu["TRANSPILER"]["SINGLE"] = single_engine
 
-    engines["CPU"] = cpu
-    engines["GPU"] = gpu
+def get_inters_from_yaml(file, filter):
+    with open(file, 'r') as rb:
+        _envs = yaml.load(rb.read(), Loader=yaml.FullLoader)
+
+    flattens = envs.flatten_environs(_envs)
+
+    inters = {}
+    for k, v in flattens.items():
+        if k.startswith(filter):
+            inters[k] = v
+    return inters
 
 
 def get_engine(args):
-    device = args.device
-    d_engine = engines[device]
     transpiler = get_transpiler()
+    run_extras = get_inters_from_yaml(args.model, "train.")
+
+    engine = run_extras.get("train.engine", "single")
+    engine = engine.upper()
 
-    engine = args.engine
-    run_engine = d_engine[transpiler].get(engine, None)
+    if engine not in engine_choices:
+        raise ValueError("train.engine must be one of {}".format(engine_choices))
+
+    print("engines: \n{}".format(engines))
+
+    run_engine = engines[transpiler].get(engine, None)
 
-    if run_engine is None:
-        raise ValueError(
-            "engine {} can not be supported on device: {}".format(engine, device))
     return run_engine
 
@@ -59,24 +88,13 @@ def get_transpiler():
 
 def set_runtime_envs(cluster_envs, engine_yaml):
-    def get_engine_extras():
-        with open(engine_yaml, 'r') as rb:
-            _envs = yaml.load(rb.read(), Loader=yaml.FullLoader)
-
-        flattens = envs.flatten_environs(_envs)
-
-        engine_extras = {}
-        for k, v in flattens.items():
-            if k.startswith("train.trainer."):
-                engine_extras[k] = v
-        return engine_extras
-
     if cluster_envs is None:
         cluster_envs = {}
 
-    engine_extras = get_engine_extras()
+    engine_extras = get_inters_from_yaml(engine_yaml, "train.trainer.")
 
     if "train.trainer.threads" in engine_extras and "CPU_NUM" in cluster_envs:
         cluster_envs["CPU_NUM"] = engine_extras["train.trainer.threads"]
+
     envs.set_runtime_environs(cluster_envs)
     envs.set_runtime_environs(engine_extras)
@@ -100,7 +118,6 @@
     single_envs["train.trainer.trainer"] = trainer
     single_envs["train.trainer.threads"] = "2"
     single_envs["train.trainer.engine"] = "single"
-    single_envs["train.trainer.device"] = args.device
     single_envs["train.trainer.platform"] = envs.get_platform()
 
     print("use {} engine to run model: {}".format(trainer, args.model))
@@ -110,32 +127,26 @@
 
 def cluster_engine(args):
-
     def update_workspace(cluster_envs):
         workspace = cluster_envs.get("engine_workspace", None)
+
         if not workspace:
             return
-
-        # is fleet inner models
-        if workspace.startswith("paddlerec."):
-            fleet_package = envs.get_runtime_environ("PACKAGE_BASE")
-            workspace_dir = workspace.split("paddlerec.")[1].replace(".", "/")
-            path = os.path.join(fleet_package, workspace_dir)
-        else:
-            path = workspace
-
+        path = envs.path_adapter(workspace)
         for name, value in cluster_envs.items():
            if isinstance(value, str):
                value = value.replace("{workspace}", path)
+               value = envs.windows_path_converter(value)
                cluster_envs[name] = value
 
     def master():
+        role = "MASTER"
         from paddlerec.core.engine.cluster.cluster import ClusterEngine
         with open(args.backend, 'r') as rb:
             _envs = yaml.load(rb.read(), Loader=yaml.FullLoader)
 
         flattens = envs.flatten_environs(_envs, "_")
-        flattens["engine_role"] = args.role
+        flattens["engine_role"] = role
         flattens["engine_run_config"] = args.model
         flattens["engine_temp_path"] = tempfile.mkdtemp()
         update_workspace(flattens)
@@ -147,12 +158,12 @@ def cluster_engine(args):
         return launch
 
     def worker():
+        role = "WORKER"
         trainer = get_trainer_prefix(args) + "ClusterTrainer"
         cluster_envs = {}
         cluster_envs["train.trainer.trainer"] = trainer
         cluster_envs["train.trainer.engine"] = "cluster"
         cluster_envs["train.trainer.threads"] = envs.get_runtime_environ("CPU_NUM")
-        cluster_envs["train.trainer.device"] = args.device
         cluster_envs["train.trainer.platform"] = envs.get_platform()
         print("launch {} engine with cluster to run model: {}".format(
             trainer, args.model))
 
         trainer = TrainerFactory.create(args.model)
         return trainer
 
-    if args.role == "WORKER":
+    role = os.getenv("PADDLE_PADDLEREC_ROLE", "MASTER")
+
+    if role == "WORKER":
         return worker()
     else:
         return master()
@@ -172,7 +185,6 @@ def cluster_mpi_engine(args):
 
     cluster_envs = {}
     cluster_envs["train.trainer.trainer"] = "CtrCodingTrainer"
-    cluster_envs["train.trainer.device"] = args.device
     cluster_envs["train.trainer.platform"] = envs.get_platform()
 
     set_runtime_envs(cluster_envs, args.model)
@@ -194,8 +206,6 @@
     cluster_envs["train.trainer.strategy"] = "async"
     cluster_envs["train.trainer.threads"] = "2"
     cluster_envs["train.trainer.engine"] = "local_cluster"
-
-    cluster_envs["train.trainer.device"] = args.device
     cluster_envs["train.trainer.platform"] = envs.get_platform()
 
     cluster_envs["CPU_NUM"] = "2"
@@ -221,7 +231,6 @@
 
     cluster_envs["log_dir"] = "logs"
     cluster_envs["train.trainer.engine"] = "local_cluster"
-    cluster_envs["train.trainer.device"] = args.device
cluster_envs["train.trainer.platform"] = envs.get_platform() set_runtime_envs(cluster_envs, args.model) @@ -231,9 +240,8 @@ def local_mpi_engine(args): def get_abs_model(model): if model.startswith("paddlerec."): - fleet_base = envs.get_runtime_environ("PACKAGE_BASE") - workspace_dir = model.split("paddlerec.")[1].replace(".", "/") - path = os.path.join(fleet_base, workspace_dir, "config.yaml") + dir = envs.path_adapter(model) + path = os.path.join(dir, "config.yaml") else: if not os.path.isfile(model): raise IOError("model config: {} invalid".format(model)) @@ -244,29 +252,17 @@ def get_abs_model(model): if __name__ == "__main__": parser = argparse.ArgumentParser(description='paddle-rec run') parser.add_argument("-m", "--model", type=str) - parser.add_argument("-e", "--engine", type=str, - choices=["single", "local_cluster", "cluster", - "tdm_single", "tdm_local_cluster", "tdm_cluster"]) - - parser.add_argument("-d", "--device", type=str, - choices=["cpu", "gpu"], default="cpu") parser.add_argument("-b", "--backend", type=str, default=None) - parser.add_argument("-r", "--role", type=str, - choices=["master", "worker"], default="master") abs_dir = os.path.dirname(os.path.abspath(__file__)) envs.set_runtime_environs({"PACKAGE_BASE": abs_dir}) args = parser.parse_args() - args.engine = args.engine.upper() - args.device = args.device.upper() - args.role = args.role.upper() model_name = args.model.split('.')[-1] args.model = get_abs_model(args.model) engine_registry() which_engine = get_engine(args) - engine = which_engine(args) engine.run() diff --git a/setup.py b/setup.py index f3e97f97b1b51c6344275a4aa96583fd79004efe..c655c37576e310fac825bd1cc01dfca5d051d18c 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,27 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ setup for paddle-rec. """ + import os + from setuptools import setup, find_packages -import tempfile import shutil +import tempfile + requires = [ "paddlepaddle == 1.7.2", @@ -19,7 +36,7 @@ about["__author__"] = "paddle-dev" about["__author_email__"] = "paddle-dev@baidu.com" about["__url__"] = "https://github.com/PaddlePaddle/PaddleRec" -readme = "..." +readme = "" def run_cmd(command): diff --git a/tools/tools.py b/tools/tools.py index da34a027c027d6809603869f946499ec45edf8e6..8508a790bc8012954b53cc43b088d2e50655647d 100644 --- a/tools/tools.py +++ b/tools/tools.py @@ -1,12 +1,27 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import functools import os -import time +import platform +import sys import shutil +import time + import requests -import sys import tarfile import zipfile -import platform -import functools lasttime = time.time() FLUSH_INTERVAL = 0.1
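One consequence of the run.py changes above: with the `-r/--role` flag removed, cluster role selection now comes from the `PADDLE_PADDLEREC_ROLE` environment variable, defaulting to MASTER. A minimal sketch of that dispatch pattern follows; the branch bodies are illustrative placeholders, not PaddleRec code:

```python
import os


def launch():
    # As in the patched cluster_engine(): the role is read from the
    # environment, and any value other than WORKER falls back to MASTER.
    role = os.getenv("PADDLE_PADDLEREC_ROLE", "MASTER")

    if role == "WORKER":
        return "start a worker that trains the model"   # placeholder
    return "start the master that launches the workers"  # placeholder


if __name__ == "__main__":
    print(launch())
```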