diff --git a/.gitignore b/.gitignore index cca95ea7cc11c4ac25907f7646d14fe27bcd597c..445ef0ac33add299980040930490f5f6a9cdcdd0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,6 @@ *.o output .idea/ -build/ -dist/ -fleetrec.egg-info/ +paddlerec.egg-info/ *~ *.pyc diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6d715f5a61674c72082313517cc40f13b1ea85ed --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,28 @@ +repos: +- repo: https://github.com/Lucas-C/pre-commit-hooks.git + sha: v1.0.1 + hooks: + - id: remove-crlf + files: (?!.*third_party)^.*$ | (?!.*book)^.*$ +- repo: https://github.com/PaddlePaddle/mirrors-yapf.git + sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37 + hooks: + - id: yapf + files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ +- repo: https://github.com/pre-commit/pre-commit-hooks + sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0 + hooks: + - id: check-added-large-files + - id: check-merge-conflict + - id: check-symlinks + - id: detect-private-key + files: (?!.*third_party)^.*$ | (?!.*book)^.*$ + - id: end-of-file-fixer +- repo: local + hooks: + - id: copyright_checker + name: copyright_checker + entry: python ./tools/codestyle/copyright.hook + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ + exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000000000000000000000000000000000000..5b00ebbf73523eb310c16dcef60f78df9ab48156 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,30 @@ +language: generic +sudo: required +dist: trusty + +services: + - docker + +os: + - linux + +env: + - JOB=check_style + +before_install: + # For pylint dockstring checker + - sudo pip install pylint pytest astroid isort pre-commit + - | + function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } + +script: + - "travis_wait 30 sleep 1800 &" + - | + # 43min timeout + tools/build_script.sh ${JOB} + if [ $? 
-eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi; + +notifications: + email: + on_success: change + on_failure: always diff --git a/LICENSE b/LICENSE index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 100644 --- a/LICENSE +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..49b4b466f41f913e42006518101b39444d367e07 --- /dev/null +++ b/README.md @@ -0,0 +1,228 @@ +

+ +

+ +

+
+ Release + License + Slack +
+

+ + +

什么是PaddleRec

+ +

+ +

+ +- 源于飞桨生态的搜索推荐模型**一站式开箱即用工具** +- 适合初学者,开发者,研究者从调研,训练到预测部署的全流程解决方案 +- 包含语义理解、召回、粗排、精排、多任务学习、融合等多个任务的推荐搜索算法库 +- 配置**yaml**自定义选项,即可快速上手使用单机训练、大规模分布式训练、离线预测、在线部署 + + +

PaddleRec概览

+ +

+ +

+ + +

推荐系统-流程概览

+ +

+ +

+ +

便捷安装

+ +### 环境要求 +* Python 2.7/ 3.5 / 3.6 / 3.7 +* PaddlePaddle >= 1.7.2 +* 操作系统: Windows/Mac/Linux + + > Windows下目前仅提供单机训练,建议使用Linux + +### 安装命令 + +- 安装方法一: + ```bash + python -m pip install paddle-rec + ``` + +- 安装方法二 + + 源码编译安装 + 1. 安装飞桨 **注:需要用户安装版本 >1.7.2 的飞桨** + + ```shell + python -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple + ``` + + 2. 源码安装PaddleRec + + ``` + git clone https://github.com/PaddlePaddle/PaddleRec/ + cd PaddleRec + python setup.py install + ``` + + +

快速启动

+ +### 启动内置模型的默认配置 + +目前框架内置了多个模型,一行命令即可使用内置模型开始单机训练和本地模拟分布式训练。 + > 本地模拟分布式(`local_cluster`)为`1个server + 1个trainer`的参数服务器模式 + + +我们以排序模型中的`dnn`模型为例介绍PaddleRec的简单使用。训练数据来源为[Criteo数据集](https://www.kaggle.com/c/criteo-display-ad-challenge/),我们从中截取了100条方便您快速上手体验完整的PaddleRec流程。 + +```bash +# 使用CPU进行单机训练 +python -m paddlerec.run -m paddlerec.models.rank.dnn +``` + +### 启动内置模型的自定配置 + +若您复用内置模型,对**yaml**配置文件进行了修改,如更改超参,重新配置数据后,可以直接使用paddlerec运行该yaml文件。 + +我们以dnn模型为例,在paddlerec代码目录下: +```bash +cd paddlerec +``` + +修改dnn模型的[超参配置](./models/rank/dnn/config.yaml),例如将迭代训练轮数从10轮修改为5轮: +```yaml +runner: +- name: runner1 + class: single_train + epochs: 5 # 10->5 +``` + +在Linux环境下,可以使用`vim`等文本编辑工具修改yaml文件: + +```bash +vim ./models/rank/dnn/config.yaml +# 键入 i, 进入编辑模式 +# 修改yaml文件配置 +# 完成修改后,点击esc,退出编辑模式 +# 键入 :wq 保存文件并退出 +``` + +完成dnn模型`models/rank/dnn/config.yaml`的配置修改后,运行`dnn`模型: +```bash +# 使用自定配置进行训练 +python -m paddlerec.run -m ./models/rank/dnn/config.yaml +``` + +### 分布式训练 + +分布式训练需要配置`config.yaml`,加入或修改`engine`选项为`cluster`或`local_cluster`,以进行分布式训练,或本地模拟分布式训练。 + +#### 本地模拟分布式训练 + +我们以dnn模型为例,在paddlerec代码目录下,修改dnn模型的`config.yaml`文件: + +```yaml +runner: +- name: runner1 + class: local_cluster_train # single_train -> local_cluster_train +``` +然后启动paddlerec训练: + +```bash +# 进行本地模拟分布式训练 +python -m paddlerec.run -m ./models/rank/dnn/config.yaml +``` + +#### 集群分布式训练 + +我们以dnn模型为例,在paddlerec代码目录下,首先修改dnn模型`config.yaml`文件: + +```yaml +runner: +- name: runner1 + class: cluster_train # single_train -> cluster_train +``` +再添加分布式启动配置文件`backend.yaml`,具体配置规则在[分布式训练](doc/distributed_train.md)教程中介绍。最后启动paddlerec训练: + +```bash +# 配置好 mpi/k8s/paddlecloud集群环境后 +python -m paddlerec.run -m ./models/rank/dnn/config.yaml -b backend.yaml +``` + + +

支持模型列表

+ + +| 方向 | 模型 | 单机CPU训练 | 单机GPU训练 | 分布式CPU训练 | +| :------: | :-----------------------------------------------------------------------: | :---------: | :---------: | :-----------: | +| 内容理解 | [Text-Classifcation](models/contentunderstanding/classification/model.py) | ✓ | x | ✓ | +| 内容理解 | [TagSpace](models/contentunderstanding/tagspace/model.py) | ✓ | x | ✓ | +| 召回 | [DSSM](models/match/dssm/model.py) | ✓ | x | ✓ | +| 召回 | [MultiView-Simnet](models/match/multiview-simnet/model.py) | ✓ | x | ✓ | +| 召回 | [TDM](models/treebased/tdm/model.py) | ✓ | x | ✓ | +| 召回 | [Word2Vec](models/recall/word2vec/model.py) | ✓ | x | ✓ | +| 召回 | [SSR](models/recall/ssr/model.py) | ✓ | ✓ | ✓ | +| 召回 | [Gru4Rec](models/recall/gru4rec/model.py) | ✓ | ✓ | ✓ | +| 召回 | [Youtube_dnn](models/recall/youtube_dnn/model.py) | ✓ | ✓ | ✓ | +| 召回 | [NCF](models/recall/ncf/model.py) | ✓ | ✓ | ✓ | +| 排序 | [Dnn](models/rank/dnn/model.py) | ✓ | x | ✓ | +| 排序 | [DeepFM](models/rank/deepfm/model.py) | ✓ | x | ✓ | +| 排序 | [xDeepFM](models/rank/xdeepfm/model.py) | ✓ | x | ✓ | +| 排序 | [DIN](models/rank/din/model.py) | ✓ | x | ✓ | +| 排序 | [Wide&Deep](models/rank/wide_deep/model.py) | ✓ | x | ✓ | +| 多任务 | [ESMM](models/multitask/esmm/model.py) | ✓ | ✓ | ✓ | +| 多任务 | [MMOE](models/multitask/mmoe/model.py) | ✓ | ✓ | ✓ | +| 多任务 | [ShareBottom](models/multitask/share-bottom/model.py) | ✓ | ✓ | ✓ | +| 重排序 | [Listwise](models/rerank/listwise/model.py) | ✓ | x | ✓ | + + + + +

文档

+ +### 背景介绍 +* [推荐系统介绍](doc/rec_background.md) +* [分布式深度学习介绍](doc/ps_background.md) + +### 新手教程 +* [环境要求](#环境要求) +* [安装命令](#安装命令) +* [快速开始](#启动内置模型的默认配置) + +### 进阶教程 +* [自定义数据集及Reader](doc/custom_dataset_reader.md) +* [分布式训练](doc/distributed_train.md) + +### 开发者教程 +* [PaddleRec设计文档](doc/design.md) + +### 关于PaddleRec性能 +* [Benchmark](doc/benchmark.md) + +### 开发者教程 +* [PaddleRec设计文档](doc/design.md) +* [二次开发](doc/development.md) + +### 关于PaddleRec性能 +* [Benchmark](doc/benchmark.md) + +### FAQ +* [常见问题FAQ](doc/faq.md) + + +

社区

+ +### 反馈 +如有意见、建议及使用中的BUG,欢迎在[GitHub Issue](https://github.com/PaddlePaddle/PaddleRec/issues)提交 + +### 版本历史 +- 2020.5.14 - PaddleRec v0.1 + +### 许可证书 +本项目的发布受[Apache 2.0 license](LICENSE)许可认证。 + diff --git a/fleet_rec/__init__.py b/__init__.py similarity index 100% rename from fleet_rec/__init__.py rename to __init__.py diff --git a/fleet_rec/core/metrics/__init__.py b/core/__init__.py similarity index 100% rename from fleet_rec/core/metrics/__init__.py rename to core/__init__.py diff --git a/fleet_rec/core/utils/__init__.py b/core/engine/__init__.py similarity index 100% rename from fleet_rec/core/utils/__init__.py rename to core/engine/__init__.py diff --git a/models/rank/tagspace/__init__.py b/core/engine/cluster/__init__.py similarity index 100% rename from models/rank/tagspace/__init__.py rename to core/engine/cluster/__init__.py diff --git a/models/recall/multiview-simnet/__init__.py b/core/engine/cluster/cloud/__init__.py old mode 100755 new mode 100644 similarity index 100% rename from models/recall/multiview-simnet/__init__.py rename to core/engine/cluster/cloud/__init__.py diff --git a/core/engine/cluster/cloud/cluster.sh b/core/engine/cluster/cloud/cluster.sh new file mode 100644 index 0000000000000000000000000000000000000000..1a0605fd9aeefbf87542e5e5156470eb1d81b836 --- /dev/null +++ b/core/engine/cluster/cloud/cluster.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +################################################### +# Usage: submit.sh +# Description: run mpi submit client implement +################################################### + +# ---------------------------------------------------------------------------- # +# variable define # +# ---------------------------------------------------------------------------- # + +#----------------------------------------------------------------------------------------------------------------- +#fun : package +#param : N/A +#return : 0 -- success; not 0 -- failure +#----------------------------------------------------------------------------------------------------------------- +function package_hook() { + g_run_stage="package" + package +} + +#----------------------------------------------------------------------------------------------------------------- +#fun : before hook submit to cluster +#param : N/A +#return : 0 -- success; not 0 -- failure +#----------------------------------------------------------------------------------------------------------------- +function _before_submit() { + echo "before_submit" + before_submit_hook +} + +#----------------------------------------------------------------------------------------------------------------- +#fun : after hook submit to cluster +#param : N/A +#return : 0 -- success; not 0 -- failure +#----------------------------------------------------------------------------------------------------------------- +function _after_submit() { + echo "after_submit" + after_submit_hook +} + +#----------------------------------------------------------------------------------------------------------------- +#fun : submit to cluster +#param : N/A +#return : 0 -- success; not 0 -- failure +#----------------------------------------------------------------------------------------------------------------- +function _submit() { + 
g_run_stage="submit" + + cd ${engine_temp_path} + + paddlecloud job --ak ${engine_submit_ak} --sk ${engine_submit_sk} train --cluster-name ${engine_submit_cluster} \ + --job-version ${engine_submit_version} \ + --mpi-priority ${engine_submit_priority} \ + --mpi-wall-time 300:59:00 \ + --mpi-nodes ${engine_submit_nodes} --is-standalone 0 \ + --mpi-memory 110Gi \ + --job-name ${engine_submit_jobname} \ + --start-cmd "${g_run_cmd}" \ + --group-name ${engine_submit_group} \ + --job-conf ${engine_submit_config} \ + --files ${g_submitfiles} \ + --json + + cd - +} + +function submit_hook() { + _before_submit + _submit + _after_submit +} + +function main() { + source ${engine_submit_scrpit} + + package_hook + submit_hook +} + +main diff --git a/core/engine/cluster/cluster.py b/core/engine/cluster/cluster.py new file mode 100644 index 0000000000000000000000000000000000000000..8c45335799afb165b66c133bd217caf3320f703f --- /dev/null +++ b/core/engine/cluster/cluster.py @@ -0,0 +1,62 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +from __future__ import unicode_literals + +import copy +import os +import subprocess + +from paddlerec.core.engine.engine import Engine +from paddlerec.core.factory import TrainerFactory +from paddlerec.core.utils import envs + + +class ClusterEngine(Engine): + def __init_impl__(self): + abs_dir = os.path.dirname(os.path.abspath(__file__)) + + backend = envs.get_runtime_environ("engine_backend") + if backend == "PaddleCloud": + self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh") + else: + raise ValueError("{} can not be supported now".format(backend)) + + def start_worker_procs(self): + trainer = TrainerFactory.create(self.trainer) + trainer.run() + + def start_master_procs(self): + default_env = os.environ.copy() + current_env = copy.copy(default_env) + current_env.pop("http_proxy", None) + current_env.pop("https_proxy", None) + + cmd = ("bash {}".format(self.submit_script)).split(" ") + proc = subprocess.Popen(cmd, env=current_env, cwd=os.getcwd()) + proc.wait() + + def run(self): + role = envs.get_runtime_environ("engine_role") + + if role == "MASTER": + self.start_master_procs() + + elif role == "WORKER": + self.start_worker_procs() + + else: + raise ValueError("role {} error, must in MASTER/WORKER".format( + role)) diff --git a/core/engine/engine.py b/core/engine/engine.py new file mode 100755 index 0000000000000000000000000000000000000000..492bf8e1c6f83f015be4fbd287ebef7d432e953d --- /dev/null +++ b/core/engine/engine.py @@ -0,0 +1,31 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import abc + + +class Engine: + __metaclass__ = abc.ABCMeta + + def __init__(self, envs, trainer): + self.envs = envs + self.trainer = trainer + self.__init_impl__() + + def __init_impl__(self): + pass + + @abc.abstractmethod + def run(self): + pass diff --git a/fleet_rec/core/engine/local_cluster_engine.py b/core/engine/local_cluster.py similarity index 78% rename from fleet_rec/core/engine/local_cluster_engine.py rename to core/engine/local_cluster.py index 86a8d2497035c8a791f06b22137b067af9923ae1..89ceafa973c9488a727aecb2e01a74f2574a81f9 100755 --- a/fleet_rec/core/engine/local_cluster_engine.py +++ b/core/engine/local_cluster.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -14,13 +14,15 @@ from __future__ import print_function from __future__ import unicode_literals -import subprocess -import sys -import os + import copy +import os +import sys +import subprocess + +from paddlerec.core.engine.engine import Engine +from paddlerec.core.utils import envs -from fleetrec.core.engine.engine import Engine -from fleetrec.core.utils import envs class LocalClusterEngine(Engine): def start_procs(self): @@ -36,7 +38,7 @@ class LocalClusterEngine(Engine): current_env.pop("https_proxy", None) procs = [] log_fns = [] - + for i in range(server_num - 1): while True: new_port = envs.find_free_port() @@ -44,10 +46,15 @@ class LocalClusterEngine(Engine): ports.append(new_port) break user_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports]) - user_endpoints_ips = [x.split(":")[0] for x in user_endpoints.split(",")] - user_endpoints_port = [x.split(":")[1] for x in user_endpoints.split(",")] - factory = "fleetrec.core.factory" + user_endpoints_ips = [ + x.split(":")[0] for x in user_endpoints.split(",") + ] + user_endpoints_port = [ + x.split(":")[1] for x in user_endpoints.split(",") + ] + + factory = "paddlerec.core.factory" cmd = [sys.executable, "-u", "-m", factory, self.trainer] for i in range(server_num): @@ -62,7 +69,8 @@ class LocalClusterEngine(Engine): os.system("mkdir -p {}".format(logs_dir)) fn = open("%s/server.%d" % (logs_dir, i), "w") log_fns.append(fn) - proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd()) + proc = subprocess.Popen( + cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd()) procs.append(proc) for i in range(worker_num): @@ -76,7 +84,8 @@ class LocalClusterEngine(Engine): os.system("mkdir -p {}".format(logs_dir)) fn = open("%s/worker.%d" % (logs_dir, i), "w") log_fns.append(fn) - proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd()) + proc = subprocess.Popen( + cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd()) procs.append(proc) # only 
wait worker to finish here @@ -91,8 +100,10 @@ class LocalClusterEngine(Engine): if len(log_fns) > 0: log_fns[i].close() procs[i].terminate() - print("all workers already completed, you can view logs under the `{}` directory".format(logs_dir), - file=sys.stderr) + print( + "all workers already completed, you can view logs under the `{}` directory". + format(logs_dir), + file=sys.stderr) def run(self): self.start_procs() diff --git a/fleet_rec/core/engine/local_mpi_engine.py b/core/engine/local_mpi.py similarity index 80% rename from fleet_rec/core/engine/local_mpi_engine.py rename to core/engine/local_mpi.py index 84ed6c307ab270b17f8307fd8252fbef3b197474..830bf28c4957e342d317070ab2060cde1de6d6a6 100755 --- a/fleet_rec/core/engine/local_mpi_engine.py +++ b/core/engine/local_mpi.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -14,18 +14,18 @@ from __future__ import print_function from __future__ import unicode_literals -import subprocess -import sys -import os + import copy +import os +import sys +import subprocess -from fleetrec.core.engine.engine import Engine +from paddlerec.core.engine.engine import Engine class LocalMPIEngine(Engine): def start_procs(self): logs_dir = self.envs["log_dir"] - default_env = os.environ.copy() current_env = copy.copy(default_env) current_env.pop("http_proxy", None) @@ -33,7 +33,7 @@ class LocalMPIEngine(Engine): procs = [] log_fns = [] - factory = "fleetrec.core.factory" + factory = "paddlerec.core.factory" cmd = "mpirun -npernode 2 -timestamp-output -tag-output".split(" ") cmd.extend([sys.executable, "-u", "-m", factory, self.trainer]) @@ -41,7 +41,8 @@ class LocalMPIEngine(Engine): os.system("mkdir -p {}".format(logs_dir)) fn = open("%s/job.log" % logs_dir, "w") log_fns.append(fn) - proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd()) + proc = subprocess.Popen( + cmd, env=current_env, stdout=fn, stderr=fn, cwd=os.getcwd()) else: proc = subprocess.Popen(cmd, env=current_env, cwd=os.getcwd()) procs.append(proc) @@ -50,7 +51,9 @@ class LocalMPIEngine(Engine): if len(log_fns) > 0: log_fns[i].close() procs[i].wait() - print("all workers and parameter servers already completed", file=sys.stderr) + print( + "all workers and parameter servers already completed", + file=sys.stderr) def run(self): self.start_procs() diff --git a/fleet_rec/core/factory.py b/core/factory.py similarity index 62% rename from fleet_rec/core/factory.py rename to core/factory.py index c5e6f30f41e854101e902ee31f7313f9ae400c1a..2e2d013bd27eb73abd3b9ad5507b4e9373276a3b 100755 --- a/fleet_rec/core/factory.py +++ b/core/factory.py @@ -17,26 +17,26 @@ import sys import yaml -from fleetrec.core.utils import envs +from paddlerec.core.utils import envs -trainer_abs = os.path.join(os.path.dirname( - os.path.abspath(__file__)), "trainers") +trainer_abs = 
os.path.join( + os.path.dirname(os.path.abspath(__file__)), "trainers") trainers = {} def trainer_registry(): - trainers["SingleTrainer"] = os.path.join( - trainer_abs, "single_trainer.py") - trainers["ClusterTrainer"] = os.path.join( - trainer_abs, "cluster_trainer.py") - trainers["CtrCodingTrainer"] = os.path.join( - trainer_abs, "ctr_coding_trainer.py") - trainers["CtrModulTrainer"] = os.path.join( - trainer_abs, "ctr_modul_trainer.py") - trainers["TDMSingleTrainer"] = os.path.join( - trainer_abs, "tdm_single_trainer.py") - trainers["TDMClusterTrainer"] = os.path.join( - trainer_abs, "tdm_cluster_trainer.py") + trainers["SingleTrainer"] = os.path.join(trainer_abs, "single_trainer.py") + trainers["SingleInfer"] = os.path.join(trainer_abs, "single_infer.py") + trainers["ClusterTrainer"] = os.path.join(trainer_abs, + "cluster_trainer.py") + trainers["CtrCodingTrainer"] = os.path.join(trainer_abs, + "ctr_coding_trainer.py") + trainers["CtrModulTrainer"] = os.path.join(trainer_abs, + "ctr_modul_trainer.py") + trainers["TDMSingleTrainer"] = os.path.join(trainer_abs, + "tdm_single_trainer.py") + trainers["TDMClusterTrainer"] = os.path.join(trainer_abs, + "tdm_cluster_trainer.py") trainer_registry() @@ -55,8 +55,8 @@ class TrainerFactory(object): if trainer_abs is None: if not os.path.isfile(train_mode): - raise IOError( - "trainer {} can not be recognized".format(train_mode)) + raise IOError("trainer {} can not be recognized".format( + train_mode)) trainer_abs = train_mode train_mode = "UserDefineTrainer" @@ -71,7 +71,7 @@ class TrainerFactory(object): with open(config, 'r') as rb: _config = yaml.load(rb.read(), Loader=yaml.FullLoader) else: - raise ValueError("fleetrec's config only support yaml") + raise ValueError("paddlerec's config only support yaml") envs.set_global_envs(_config) envs.update_workspace() diff --git a/core/layer.py b/core/layer.py new file mode 100755 index 0000000000000000000000000000000000000000..07c058c77d3fb947334d5d8fca89b15e2629c724 --- 
/dev/null +++ b/core/layer.py @@ -0,0 +1,32 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import abc + + +class Layer(object): + """R + """ + __metaclass__ = abc.ABCMeta + + def __init__(self, config): + """R + """ + pass + + @abc.abstractmethod + def generate(self, param): + """R + """ + pass diff --git a/fleet_rec/core/metric.py b/core/metric.py similarity index 97% rename from fleet_rec/core/metric.py rename to core/metric.py index 469fd56c8cb97c8e21739b402b4b89daab57bde9..e0f6b24e7e6bfc3e4e1689622019ffd540c8c033 100755 --- a/fleet_rec/core/metric.py +++ b/core/metric.py @@ -53,7 +53,7 @@ class Metric(object): pass @abc.abstractmethod - def get_result_to_string(self): + def __str__(self): """ Return: result(string) : calculate result with string format, for output diff --git a/core/metrics/__init__.py b/core/metrics/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/core/metrics/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/fleet_rec/core/metrics/auc_metrics.py b/core/metrics/auc_metrics.py similarity index 93% rename from fleet_rec/core/metrics/auc_metrics.py rename to core/metrics/auc_metrics.py index f2f68890c79ef35123c131e9064b99e28d8e0aea..085c84990e4a0a3a3e606ef707fef5d90387e8b0 100755 --- a/fleet_rec/core/metrics/auc_metrics.py +++ b/core/metrics/auc_metrics.py @@ -13,14 +13,16 @@ # limitations under the License. import math + import numpy as np import paddle.fluid as fluid -from fleetrec.core.metric import Metric + +from paddlerec.core.metric import Metric class AUCMetric(Metric): """ - Metric For Paddle Model + Metric For Fluid Model """ def __init__(self, config, fleet): @@ -81,7 +83,8 @@ class AUCMetric(Metric): if scope.find_var(metric_item['var'].name) is None: result[metric_name] = None continue - result[metric_name] = self.get_metric(scope, metric_item['var'].name) + result[metric_name] = self.get_metric(scope, + metric_item['var'].name) return result def calculate_auc(self, global_pos, global_neg): @@ -176,14 +179,18 @@ class AUCMetric(Metric): self._result['mean_q'] = 0 return self._result if 'stat_pos' in result and 'stat_neg' in result: - result['auc'] = self.calculate_auc(result['stat_pos'], result['stat_neg']) - result['bucket_error'] = self.calculate_auc(result['stat_pos'], result['stat_neg']) + result['auc'] = self.calculate_auc(result['stat_pos'], + result['stat_neg']) + result['bucket_error'] = self.calculate_auc(result['stat_pos'], + result['stat_neg']) if 'pos_ins_num' in result: - result['actual_ctr'] = result['pos_ins_num'] / 
result['total_ins_num'] + result['actual_ctr'] = result['pos_ins_num'] / result[ + 'total_ins_num'] if 'abserr' in result: result['mae'] = result['abserr'] / result['total_ins_num'] if 'sqrerr' in result: - result['rmse'] = math.sqrt(result['sqrerr'] / result['total_ins_num']) + result['rmse'] = math.sqrt(result['sqrerr'] / + result['total_ins_num']) if 'prob' in result: result['predict_ctr'] = result['prob'] / result['total_ins_num'] if abs(result['predict_ctr']) > 1e-6: @@ -198,7 +205,7 @@ class AUCMetric(Metric): """ """ return self._result - def get_result_to_string(self): + def __str__(self): """ """ result = self.get_result() result_str = "%s AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f RMSE=%.6f " \ diff --git a/core/model.py b/core/model.py new file mode 100755 index 0000000000000000000000000000000000000000..9422dc49ea9492ff958106a1a51ea11f318bd148 --- /dev/null +++ b/core/model.py @@ -0,0 +1,229 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import abc + +import paddle.fluid as fluid + +from paddlerec.core.utils import envs + + +class Model(object): + """Base Model + """ + __metaclass__ = abc.ABCMeta + + def __init__(self, config): + """R + """ + self._cost = None + self._metrics = {} + self._data_var = [] + self._infer_data_var = [] + self._infer_results = {} + self._data_loader = None + self._infer_data_loader = None + self._fetch_interval = 20 + self._namespace = "train.model" + self._platform = envs.get_platform() + self._init_hyper_parameters() + self._env = config + self._slot_inited = False + + def _init_hyper_parameters(self): + pass + + def _init_slots(self, **kargs): + if self._slot_inited: + return + self._slot_inited = True + dataset = {} + model_dict = {} + for i in self._env["executor"]: + if i["name"] == kargs["name"]: + model_dict = i + break + for i in self._env["dataset"]: + if i["name"] == model_dict["dataset_name"]: + dataset = i + break + name = "dataset." + dataset["name"] + "." + sparse_slots = envs.get_global_env(name + "sparse_slots", "").strip() + dense_slots = envs.get_global_env(name + "dense_slots", "").strip() + if sparse_slots != "" or dense_slots != "": + if sparse_slots == "": + sparse_slots = [] + else: + sparse_slots = sparse_slots.strip().split(" ") + if dense_slots == "": + dense_slots = [] + else: + dense_slots = dense_slots.strip().split(" ") + dense_slots_shape = [[ + int(j) for j in i.split(":")[1].strip("[]").split(",") + ] for i in dense_slots] + dense_slots = [i.split(":")[0] for i in dense_slots] + self._dense_data_var = [] + for i in range(len(dense_slots)): + l = fluid.layers.data( + name=dense_slots[i], + shape=dense_slots_shape[i], + dtype="float32") + self._data_var.append(l) + self._dense_data_var.append(l) + self._sparse_data_var = [] + for name in sparse_slots: + l = fluid.layers.data( + name=name, shape=[1], lod_level=1, dtype="int64") + self._data_var.append(l) + self._sparse_data_var.append(l) + + dataset_class = dataset["type"] + if 
dataset_class == "DataLoader": + self._init_dataloader() + + def _init_dataloader(self, is_infer=False): + if is_infer: + data = self._infer_data_var + else: + data = self._data_var + self._data_loader = fluid.io.DataLoader.from_generator( + feed_list=data, + capacity=64, + use_double_buffer=False, + iterable=False) + + def get_inputs(self): + return self._data_var + + def get_infer_inputs(self): + return self._infer_data_var + + def get_infer_results(self): + return self._infer_results + + def get_avg_cost(self): + """R + """ + return self._cost + + def get_metrics(self): + """R + """ + return self._metrics + + def get_fetch_period(self): + return self._fetch_interval + + def _build_optimizer(self, name, lr, strategy=None): + name = name.upper() + optimizers = ["SGD", "ADAM", "ADAGRAD"] + if name not in optimizers: + raise ValueError( + "configured optimizer can only supported SGD/Adam/Adagrad") + + if name == "SGD": + reg = envs.get_global_env("hyper_parameters.reg", 0.0001, + self._namespace) + optimizer_i = fluid.optimizer.SGD( + lr, regularization=fluid.regularizer.L2DecayRegularizer(reg)) + elif name == "ADAM": + optimizer_i = fluid.optimizer.Adam(lr, lazy_mode=True) + elif name == "ADAGRAD": + optimizer_i = fluid.optimizer.Adagrad(lr) + else: + raise ValueError( + "configured optimizer can only supported SGD/Adam/Adagrad") + + return optimizer_i + + def optimizer(self): + learning_rate = envs.get_global_env("hyper_parameters.learning_rate", + None, self._namespace) + optimizer = envs.get_global_env("hyper_parameters.optimizer", None, + self._namespace) + return self._build_optimizer(optimizer, learning_rate) + + def input_data(self, is_infer=False, **kwargs): + name = "dataset." + kwargs.get("dataset_name") + "." 
+ sparse_slots = envs.get_global_env(name + "sparse_slots", "").strip() + dense_slots = envs.get_global_env(name + "dense_slots", "").strip() + self._sparse_data_var_map = {} + self._dense_data_var_map = {} + if sparse_slots != "" or dense_slots != "": + if sparse_slots == "": + sparse_slots = [] + else: + sparse_slots = sparse_slots.strip().split(" ") + if dense_slots == "": + dense_slots = [] + else: + dense_slots = dense_slots.strip().split(" ") + dense_slots_shape = [[ + int(j) for j in i.split(":")[1].strip("[]").split(",") + ] for i in dense_slots] + dense_slots = [i.split(":")[0] for i in dense_slots] + self._dense_data_var = [] + data_var_ = [] + for i in range(len(dense_slots)): + l = fluid.layers.data( + name=dense_slots[i], + shape=dense_slots_shape[i], + dtype="float32") + data_var_.append(l) + self._dense_data_var.append(l) + self._dense_data_var_map[dense_slots[i]] = l + self._sparse_data_var = [] + for name in sparse_slots: + l = fluid.layers.data( + name=name, shape=[1], lod_level=1, dtype="int64") + data_var_.append(l) + self._sparse_data_var.append(l) + self._sparse_data_var_map[name] = l + return data_var_ + + else: + return None + + def net(self, is_infer=False): + return None + + def _construct_reader(self, is_infer=False): + if is_infer: + self._infer_data_loader = fluid.io.DataLoader.from_generator( + feed_list=self._infer_data_var, + capacity=64, + use_double_buffer=False, + iterable=False) + else: + dataset_class = envs.get_global_env("dataset_class", None, + "train.reader") + if dataset_class == "DataLoader": + self._data_loader = fluid.io.DataLoader.from_generator( + feed_list=self._data_var, + capacity=64, + use_double_buffer=False, + iterable=False) + + def train_net(self): + input_data = self.input_data(is_infer=False) + self._data_var = input_data + self._construct_reader(is_infer=False) + self.net(input_data, is_infer=False) + + def infer_net(self): + input_data = self.input_data(is_infer=True) + self._infer_data_var = input_data + 
self._construct_reader(is_infer=True) + self.net(input_data, is_infer=True) diff --git a/core/modules/__init__.py b/core/modules/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/core/modules/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/core/modules/coding/__init__.py b/core/modules/coding/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/core/modules/coding/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/core/modules/coding/layers.py b/core/modules/coding/layers.py new file mode 100755 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/core/modules/coding/layers.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/core/modules/modul/__init__.py b/core/modules/modul/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/core/modules/modul/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/fleet_rec/core/modules/modul/build.py b/core/modules/modul/build.py similarity index 72% rename from fleet_rec/core/modules/modul/build.py rename to core/modules/modul/build.py index 51b83e5f3ddc1b51ad3c990701cd38625772629e..dae777176e49831bde4f6f9938637a3289a0a218 100755 --- a/fleet_rec/core/modules/modul/build.py +++ b/core/modules/modul/build.py @@ -1,10 +1,25 @@ -import yaml +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import copy + import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet +import yaml -from fleetrec.core.model import Model -from fleetrec.core.utils import table +from paddlerec.core.model import Model +from paddlerec.core.utils import table def create(config): @@ -16,6 +31,7 @@ def create(config): Model Instance """ model = None + if config['mode'] == 'fluid': model = YamlModel(config) model.train_net() @@ -35,7 +51,12 @@ class YamlModel(Model): f = open(config['layer_file'], 'r') self._build_nodes = yaml.safe_load(f.read()) self._build_phase = ['input', 'param', 'summary', 'layer'] - self._build_param = {'layer': {}, 'inner_layer': {}, 'layer_extend': {}, 'model': {}} + self._build_param = { + 'layer': {}, + 'inner_layer': {}, + 'layer_extend': {}, + 'model': {} + } self._inference_meta = {'dependency': {}, 'params': {}} def train_net(self): @@ -61,10 +82,12 @@ class YamlModel(Model): if self._build_nodes[phase] is None: continue for node in self._build_nodes[phase]: - exec("""layer=layer.{}(node)""".format(node['class'])) - layer_output, extend_output = layer.generate(self._config['mode'], self._build_param) + exec ("""layer=layer.{}(node)""".format(node['class'])) + layer_output, extend_output = layer.generate( + self._config['mode'], self._build_param) self._build_param['layer'][node['name']] = layer_output - self._build_param['layer_extend'][node['name']] = extend_output + self._build_param['layer_extend'][node[ + 'name']] = extend_output if extend_output is None: continue if 'loss' in extend_output: @@ -74,17 +97,24 @@ class YamlModel(Model): self._cost += extend_output['loss'] if 'data_var' in extend_output: self._data_var += extend_output['data_var'] - if 'metric_label' in extend_output and extend_output['metric_label'] is not None: - self._metrics[extend_output['metric_label']] = extend_output['metric_dict'] + if 'metric_label' in extend_output and extend_output[ + 'metric_label'] is not None: + 
self._metrics[extend_output[ + 'metric_label']] = extend_output['metric_dict'] if 'inference_param' in extend_output: inference_param = extend_output['inference_param'] param_name = inference_param['name'] if param_name not in self._build_param['table']: - self._build_param['table'][param_name] = {'params': []} - table_meta = table.TableMeta.alloc_new_table(inference_param['table_id']) - self._build_param['table'][param_name]['_meta'] = table_meta - self._build_param['table'][param_name]['params'] += inference_param['params'] + self._build_param['table'][param_name] = { + 'params': [] + } + table_meta = table.TableMeta.alloc_new_table( + inference_param['table_id']) + self._build_param['table'][param_name][ + '_meta'] = table_meta + self._build_param['table'][param_name][ + 'params'] += inference_param['params'] pass @classmethod @@ -99,20 +129,25 @@ class YamlModel(Model): metrics = params['metrics'] for name in metrics: model_metrics = metrics[name] - stat_var_names += [model_metrics[metric]['var'].name for metric in model_metrics] + stat_var_names += [ + model_metrics[metric]['var'].name + for metric in model_metrics + ] strategy['stat_var_names'] = list(set(stat_var_names)) optimizer_generator = 'optimizer = fluid.optimizer.' 
+ optimizer_conf['class'] + \ '(learning_rate=' + str(optimizer_conf['learning_rate']) + ')' - exec(optimizer_generator) + exec (optimizer_generator) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) return optimizer def dump_model_program(self, path): """R """ - with open(path + '/' + self._name + '_main_program.pbtxt', "w") as fout: + with open(path + '/' + self._name + '_main_program.pbtxt', + "w") as fout: print >> fout, self._build_param['model']['train_program'] - with open(path + '/' + self._name + '_startup_program.pbtxt', "w") as fout: + with open(path + '/' + self._name + '_startup_program.pbtxt', + "w") as fout: print >> fout, self._build_param['model']['startup_program'] pass @@ -122,7 +157,8 @@ class YamlModel(Model): scope = params['scope'] decay = params['decay'] for param_table in self._build_param['table']: - table_id = self._build_param['table'][param_table]['_meta']._table_id + table_id = self._build_param['table'][param_table][ + '_meta']._table_id fleet.shrink_dense_table(decay, scope=scope, table_id=table_id) def dump_inference_program(self, inference_layer, path): @@ -137,17 +173,25 @@ class YamlModel(Model): executor = params['executor'] program = self._build_param['model']['train_program'] for table_name, table in self._build_param['table'].items(): - fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id, table['params']) + fleet._fleet_ptr.pull_dense(scope, table['_meta']._table_id, + table['params']) for infernce_item in params['inference_list']: - params_name_list = self.inference_params(infernce_item['layer_name']) - params_var_list = [program.global_block().var(i) for i in params_name_list] + params_name_list = self.inference_params(infernce_item[ + 'layer_name']) + params_var_list = [ + program.global_block().var(i) for i in params_name_list + ] params_file_name = infernce_item['save_file_name'] with fluid.scope_guard(scope): if params['save_combine']: fluid.io.save_vars(executor, "./", \ program, 
vars=params_var_list, filename=params_file_name) else: - fluid.io.save_vars(executor, params_file_name, program, vars=params_var_list) + fluid.io.save_vars( + executor, + params_file_name, + program, + vars=params_var_list) def inference_params(self, inference_layer): """ @@ -162,11 +206,13 @@ class YamlModel(Model): return self._inference_meta['params'][layer] self._inference_meta['params'][layer] = [] - self._inference_meta['dependency'][layer] = self.get_dependency(self._build_param['inner_layer'], layer) + self._inference_meta['dependency'][layer] = self.get_dependency( + self._build_param['inner_layer'], layer) for node in self._build_nodes['layer']: if node['name'] not in self._inference_meta['dependency'][layer]: continue - if 'inference_param' in self._build_param['layer_extend'][node['name']]: + if 'inference_param' in self._build_param['layer_extend'][node[ + 'name']]: self._inference_meta['params'][layer] += \ self._build_param['layer_extend'][node['name']]['inference_param']['params'] return self._inference_meta['params'][layer] @@ -184,5 +230,6 @@ class YamlModel(Model): dependencys = copy.deepcopy(layer_graph[dest_layer]['input']) dependency_list = copy.deepcopy(dependencys) for dependency in dependencys: - dependency_list = dependency_list + self.get_dependency(layer_graph, dependency) + dependency_list = dependency_list + self.get_dependency( + layer_graph, dependency) return list(set(dependency_list)) diff --git a/fleet_rec/core/modules/modul/layers.py b/core/modules/modul/layers.py similarity index 69% rename from fleet_rec/core/modules/modul/layers.py rename to core/modules/modul/layers.py index 84a08ee808377175a7b378a95f06181e7fca5173..008ce6e40987a6a3adf6605590aa2b8fe53f034a 100755 --- a/fleet_rec/core/modules/modul/layers.py +++ b/core/modules/modul/layers.py @@ -13,11 +13,12 @@ # limitations under the License. 
import paddle.fluid as fluid -from fleetrec.core.layer import Layer +from paddlerec.core.layer import Layer -class EmbeddingInputLayer(Layer): - """R + +class EmbeddingFuseLayer(Layer): + """embedding + sequence + concat """ def __init__(self, config): @@ -31,7 +32,7 @@ class EmbeddingInputLayer(Layer): self._emb_dim = self._mf_dim + 3 # append show ctr lr self._emb_layers = [] - def generate_fluid(self, param): + def generate(self, param): """R """ show_clk = fluid.layers.concat( @@ -39,7 +40,8 @@ class EmbeddingInputLayer(Layer): show_clk.stop_gradient = True data_var = [] for slot in self._slots: - l = fluid.layers.data(name=slot, shape=[1], dtype="int64", lod_level=1) + l = fluid.layers.data( + name=slot, shape=[1], dtype="int64", lod_level=1) data_var.append(l) emb = fluid.layers.embedding(input=l, size=[10, self._emb_dim], \ is_sparse=True, is_distributed=True, @@ -47,7 +49,8 @@ class EmbeddingInputLayer(Layer): emb = fluid.layers.sequence_pool(input=emb, pool_type='sum') emb = fluid.layers.continuous_value_model(emb, show_clk, self._cvm) self._emb_layers.append(emb) - output = fluid.layers.concat(input=self._emb_layers, axis=1, name=self._name) + output = fluid.layers.concat( + input=self._emb_layers, axis=1, name=self._name) return output, {'data_var': data_var} @@ -63,7 +66,7 @@ class LabelInputLayer(Layer): self._data_type = config.get('data_type', "int64") self._label_idx = config['label_idx'] - def generate_fluid(self, param): + def generate(self, param): """R """ label = fluid.layers.data(name=self._name, shape=[-1, self._dim], \ @@ -85,7 +88,7 @@ class TagInputLayer(Layer): self._dim = config.get('dim', 1) self._data_type = config['data_type'] - def generate_fluid(self, param): + def generate(self, param): """R """ output = fluid.layers.data(name=self._name, shape=[-1, self._dim], \ @@ -107,10 +110,16 @@ class ParamLayer(Layer): self._data_type = config.get('data_type', 'float32') self._config = config - def generate_fluid(self, param): + def 
generate(self, param): """R """ - return self._config, {'inference_param': {'name': 'param', 'params': [], 'table_id': self._table_id}} + return self._config, { + 'inference_param': { + 'name': 'param', + 'params': [], + 'table_id': self._table_id + } + } class SummaryLayer(Layer): @@ -125,13 +134,19 @@ class SummaryLayer(Layer): self._data_type = config.get('data_type', 'float32') self._config = config - def generate_fluid(self, param): + def generate(self, param): """R """ - return self._config, {'inference_param': {'name': 'summary', 'params': [], 'table_id': self._table_id}} + return self._config, { + 'inference_param': { + 'name': 'summary', + 'params': [], + 'table_id': self._table_id + } + } -class NormalizetionLayer(Layer): +class NormalizationLayer(Layer): """R """ @@ -143,7 +158,7 @@ class NormalizetionLayer(Layer): self._summary = config['summary'] self._table_id = config.get('table_id', -1) - def generate_fluid(self, param): + def generate(self, param): """R """ input_layer = param['layer'][self._input[0]] @@ -151,14 +166,24 @@ class NormalizetionLayer(Layer): if len(self._input) > 0: input_list = [param['layer'][i] for i in self._input] input_layer = fluid.layers.concat(input=input_list, axis=1) - bn = fluid.layers.data_norm(input=input_layer, name=self._name, epsilon=1e-4, param_attr={ - "batch_size": 1e4, "batch_sum_default": 0.0, "batch_square": 1e4}) - inference_param = [self._name + '.batch_size', self._name + '.batch_sum', self._name + '.batch_square_sum'] + bn = fluid.layers.data_norm( + input=input_layer, + name=self._name, + epsilon=1e-4, + param_attr={ + "batch_size": 1e4, + "batch_sum_default": 0.0, + "batch_square": 1e4 + }) + inference_param = [ + self._name + '.batch_size', self._name + '.batch_sum', + self._name + '.batch_square_sum' + ] return bn, {'inference_param': {'name': 'summary', \ 'params': inference_param, 'table_id': summary_layer.get('table_id', -1)}} -class NeuralLayer(Layer): +class FCLayer(Layer): """R """ @@ -171,7 +196,7 
@@ class NeuralLayer(Layer): self._bias = config.get('bias', True) self._act_func = config.get('act_func', None) - def generate_fluid(self, param): + def generate(self, param): """R """ param_layer = param['layer'][self._param] @@ -180,11 +205,13 @@ class NeuralLayer(Layer): input_list = [param['layer'][i] for i in self._input] input_layer = fluid.layers.concat(input=input_list, axis=1) input_coln = input_layer.shape[1] - scale = param_layer['init_range'] / (input_coln ** 0.5) + scale = param_layer['init_range'] / (input_coln**0.5) bias = None if self._bias: - bias = fluid.ParamAttr(learning_rate=1.0, - initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=scale)) + bias = fluid.ParamAttr( + learning_rate=1.0, + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale)) fc = fluid.layers.fc( name=self._name, input=input_layer, @@ -199,7 +226,7 @@ class NeuralLayer(Layer): 'table_id': param_layer.get('table_id', -1)}} -class SigmoidLossLayer(Layer): +class LogLossLayer(Layer): """R """ @@ -215,29 +242,60 @@ class SigmoidLossLayer(Layer): self._extend_output = { 'metric_label': self._metric_label, 'metric_dict': { - 'auc': {'var': None}, - 'batch_auc': {'var': None}, - 'stat_pos': {'var': None, 'data_type': 'int64'}, - 'stat_neg': {'var': None, 'data_type': 'int64'}, - 'batch_stat_pos': {'var': None, 'data_type': 'int64'}, - 'batch_stat_neg': {'var': None, 'data_type': 'int64'}, - 'pos_ins_num': {'var': None}, - 'abserr': {'var': None}, - 'sqrerr': {'var': None}, - 'prob': {'var': None}, - 'total_ins_num': {'var': None}, - 'q': {'var': None} + 'auc': { + 'var': None + }, + 'batch_auc': { + 'var': None + }, + 'stat_pos': { + 'var': None, + 'data_type': 'int64' + }, + 'stat_neg': { + 'var': None, + 'data_type': 'int64' + }, + 'batch_stat_pos': { + 'var': None, + 'data_type': 'int64' + }, + 'batch_stat_neg': { + 'var': None, + 'data_type': 'int64' + }, + 'pos_ins_num': { + 'var': None + }, + 'abserr': { + 'var': None + }, + 'sqrerr': { + 'var': 
None + }, + 'prob': { + 'var': None + }, + 'total_ins_num': { + 'var': None + }, + 'q': { + 'var': None + } } } - def generate_fluid(self, param): + def generate(self, param): """R """ input_layer = param['layer'][self._input[0]] label_layer = param['layer'][self._label] - output = fluid.layers.clip(input_layer, self._bound[0], self._bound[1], name=self._name) + output = fluid.layers.clip( + input_layer, self._bound[0], self._bound[1], name=self._name) norm = fluid.layers.sigmoid(output, name=self._name) - output = fluid.layers.log_loss(norm, fluid.layers.cast(x=label_layer, dtype='float32')) + output = fluid.layers.log_loss( + norm, fluid.layers.cast( + x=label_layer, dtype='float32')) if self._weight: weight_layer = param['layer'][self._weight] output = fluid.layers.elementwise_mul(output, weight_layer) @@ -247,7 +305,11 @@ class SigmoidLossLayer(Layer): # For AUC Metric metric = self._extend_output['metric_dict'] binary_predict = fluid.layers.concat( - input=[fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm), norm], axis=1) + input=[ + fluid.layers.elementwise_sub(fluid.layers.ceil(norm), norm), + norm + ], + axis=1) metric['auc']['var'], metric['batch_auc']['var'], [metric['batch_stat_pos']['var'], \ metric['batch_stat_neg']['var'], metric['stat_pos']['var'], metric['stat_neg']['var']] = \ diff --git a/core/reader.py b/core/reader.py new file mode 100755 index 0000000000000000000000000000000000000000..555ae4ba83fa1fd0e1e57e110c199c9cedc1b1cb --- /dev/null +++ b/core/reader.py @@ -0,0 +1,111 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import abc +import os + +import paddle.fluid.incubate.data_generator as dg +import yaml + +from paddlerec.core.utils import envs + + +class Reader(dg.MultiSlotDataGenerator): + __metaclass__ = abc.ABCMeta + + def __init__(self, config): + dg.MultiSlotDataGenerator.__init__(self) + + if os.path.isfile(config): + with open(config, 'r') as rb: + _config = yaml.load(rb.read(), Loader=yaml.FullLoader) + else: + raise ValueError("reader config only support yaml") + + @abc.abstractmethod + def init(self): + """init""" + pass + + @abc.abstractmethod + def generate_sample(self, line): + pass + + +class SlotReader(dg.MultiSlotDataGenerator): + __metaclass__ = abc.ABCMeta + + def __init__(self, config): + dg.MultiSlotDataGenerator.__init__(self) + if os.path.isfile(config): + with open(config, 'r') as rb: + _config = yaml.load(rb.read(), Loader=yaml.FullLoader) + else: + raise ValueError("reader config only support yaml") + + def init(self, sparse_slots, dense_slots, padding=0): + from operator import mul + self.sparse_slots = [] + if sparse_slots.strip() != "#" and sparse_slots.strip( + ) != "?" and sparse_slots.strip() != "": + self.sparse_slots = sparse_slots.strip().split(" ") + self.dense_slots = [] + if dense_slots.strip() != "#" and dense_slots.strip( + ) != "?" 
and dense_slots.strip() != "": + self.dense_slots = dense_slots.strip().split(" ") + self.dense_slots_shape = [ + reduce(mul, + [int(j) for j in i.split(":")[1].strip("[]").split(",")]) + for i in self.dense_slots + ] + self.dense_slots = [i.split(":")[0] for i in self.dense_slots] + self.slots = self.dense_slots + self.sparse_slots + self.slot2index = {} + self.visit = {} + for i in range(len(self.slots)): + self.slot2index[self.slots[i]] = i + self.visit[self.slots[i]] = False + self.padding = padding + + def generate_sample(self, l): + def reader(): + line = l.strip().split(" ") + output = [(i, []) for i in self.slots] + for i in line: + slot_feasign = i.split(":") + slot = slot_feasign[0] + if slot not in self.slots: + continue + if slot in self.sparse_slots: + feasign = int(slot_feasign[1]) + else: + feasign = float(slot_feasign[1]) + output[self.slot2index[slot]][1].append(feasign) + self.visit[slot] = True + for i in self.visit: + slot = i + if not self.visit[slot]: + if i in self.dense_slots: + output[self.slot2index[i]][1].extend( + [self.padding] * + self.dense_slots_shape[self.slot2index[i]]) + else: + output[self.slot2index[i]][1].extend([self.padding]) + else: + self.visit[slot] = False + yield output + + return reader diff --git a/fleet_rec/core/trainer.py b/core/trainer.py similarity index 94% rename from fleet_rec/core/trainer.py rename to core/trainer.py index ce6edf68a10d75b83d5901d6214351a6506a7093..b7c22ea89bd279a2e1e233edeb4d8cf11b8aa5c0 100755 --- a/fleet_rec/core/trainer.py +++ b/core/trainer.py @@ -12,15 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -import sys - import abc +import os import time +import sys import yaml from paddle import fluid -from fleetrec.core.utils import envs + +from paddlerec.core.utils import envs class Trainer(object): @@ -30,8 +30,10 @@ class Trainer(object): def __init__(self, config=None): self._status_processor = {} + self._place = fluid.CPUPlace() self._exe = fluid.Executor(self._place) + self._exector_context = {} self._context = {'status': 'uninit', 'is_exit': False} self._config_yaml = config @@ -95,6 +97,6 @@ def user_define_engine(engine_yaml): train_dirname = os.path.dirname(train_location) base_name = os.path.splitext(os.path.basename(train_location))[0] sys.path.append(train_dirname) - trainer_class = envs.lazy_instance_by_fliename( - base_name, "UserDefineTraining") + trainer_class = envs.lazy_instance_by_fliename(base_name, + "UserDefineTraining") return trainer_class diff --git a/core/trainers/__init__.py b/core/trainers/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..f14704cad8f3859746f95353ba68753f857ff78d --- /dev/null +++ b/core/trainers/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +trainer implement. 
+ + ↗ (single/cluster) CtrTrainer +Trainer + ↗ (for single training) SingleTrainer/TDMSingleTrainer + ↘ TranspilerTrainer → (for cluster training) ClusterTrainer/TDMClusterTrainer + ↘ (for online learning training) OnlineLearningTrainer + +""" diff --git a/fleet_rec/core/trainers/cluster_trainer.py b/core/trainers/cluster_trainer.py similarity index 79% rename from fleet_rec/core/trainers/cluster_trainer.py rename to core/trainers/cluster_trainer.py index 4635d91020888666a3b3449e6dfd8994ff45a6cf..792b897f779b82a0989d6c25dd79663d52d05abd 100755 --- a/fleet_rec/core/trainers/cluster_trainer.py +++ b/core/trainers/cluster_trainer.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ Training use fluid with one node only. """ @@ -19,13 +18,15 @@ Training use fluid with one node only. from __future__ import print_function import os +import time + import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker -from fleetrec.core.utils import envs -from fleetrec.core.trainers.transpiler_trainer import TranspileTrainer +from paddlerec.core.utils import envs +from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer class ClusterTrainer(TranspileTrainer): @@ -41,11 +42,14 @@ class ClusterTrainer(TranspileTrainer): self.regist_context_processor('uninit', self.instance) self.regist_context_processor('init_pass', self.init) self.regist_context_processor('startup_pass', self.startup) - if envs.get_platform() == "LINUX" and envs.get_global_env("dataset_class", None, "train.reader") != "DataLoader": + + if envs.get_platform() == "LINUX" and envs.get_global_env( + 
"dataset_class", None, "train.reader") != "DataLoader": self.regist_context_processor('train_pass', self.dataset_train) else: - self.regist_context_processor( - 'train_pass', self.dataloader_train) + self.regist_context_processor('train_pass', + self.dataloader_train) + self.regist_context_processor('infer_pass', self.infer) self.regist_context_processor('terminal_pass', self.terminal) @@ -73,14 +77,14 @@ class ClusterTrainer(TranspileTrainer): def init(self, context): self.model.train_net() optimizer = self.model.optimizer() - optimizer_name = envs.get_global_env( - "hyper_parameters.optimizer", None, "train.model") + optimizer_name = envs.get_global_env("hyper_parameters.optimizer", + None, "train.model") if optimizer_name not in ["", "sgd", "SGD", "Sgd"]: os.environ["FLAGS_communicator_is_sgd_optimizer"] = '0' strategy = self.build_strategy() optimizer = fleet.distributed_optimizer(optimizer, strategy) - optimizer.minimize(self.model.get_cost_op()) + optimizer.minimize(self.model.get_avg_cost()) if fleet.is_server(): context['status'] = 'server_pass' @@ -112,9 +116,9 @@ class ClusterTrainer(TranspileTrainer): program = fluid.compiler.CompiledProgram( fleet.main_program).with_data_parallel( - loss_name=self.model.get_cost_op().name, - build_strategy=self.strategy.get_build_strategy(), - exec_strategy=self.strategy.get_execute_strategy()) + loss_name=self.model.get_avg_cost().name, + build_strategy=self.strategy.get_build_strategy(), + exec_strategy=self.strategy.get_execute_strategy()) metrics_varnames = [] metrics_format = [] @@ -133,9 +137,8 @@ class ClusterTrainer(TranspileTrainer): batch_id = 0 try: while True: - metrics_rets = self._exe.run( - program=program, - fetch_list=metrics_varnames) + metrics_rets = self._exe.run(program=program, + fetch_list=metrics_varnames) metrics = [epoch, batch_id] metrics.extend(metrics_rets) @@ -154,14 +157,23 @@ class ClusterTrainer(TranspileTrainer): fleet.init_worker() dataset = self._get_dataset() + ins = 
self._get_dataset_ins() + epochs = envs.get_global_env("train.epochs") for i in range(epochs): - self._exe.train_from_dataset(program=fluid.default_main_program(), - dataset=dataset, - fetch_list=self.fetch_vars, - fetch_info=self.fetch_alias, - print_period=self.fetch_period) + begin_time = time.time() + self._exe.train_from_dataset( + program=fluid.default_main_program(), + dataset=dataset, + fetch_list=self.fetch_vars, + fetch_info=self.fetch_alias, + print_period=self.fetch_period) + end_time = time.time() + times = end_time - begin_time + print("epoch {} using time {}, speed {:.2f} lines/s".format( + i, times, ins / times)) + self.save(i, "train", is_fleet=True) fleet.stop_worker() context['status'] = 'infer_pass' diff --git a/core/trainers/online_learning_trainer.py b/core/trainers/online_learning_trainer.py new file mode 100755 index 0000000000000000000000000000000000000000..b285684464ed2cd1d8bfd7710d6f28d30de3f936 --- /dev/null +++ b/core/trainers/online_learning_trainer.py @@ -0,0 +1,189 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Training use fluid with one node only. 
+""" + +from __future__ import print_function + +import datetime +import os +import time + +import paddle.fluid as fluid +from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet +from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory +from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker + +from paddlerec.core.utils import envs +from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer + + +class OnlineLearningTrainer(TranspileTrainer): + def processor_register(self): + role = PaddleCloudRoleMaker() + fleet.init(role) + + if fleet.is_server(): + self.regist_context_processor('uninit', self.instance) + self.regist_context_processor('init_pass', self.init) + self.regist_context_processor('server_pass', self.server) + else: + self.regist_context_processor('uninit', self.instance) + self.regist_context_processor('init_pass', self.init) + self.regist_context_processor('startup_pass', self.startup) + + if envs.get_platform() == "LINUX" and envs.get_global_env( + "dataset_class", None, "train.reader") != "DataLoader": + self.regist_context_processor('train_pass', self.dataset_train) + else: + self.regist_context_processor('train_pass', + self.dataloader_train) + + self.regist_context_processor('infer_pass', self.infer) + self.regist_context_processor('terminal_pass', self.terminal) + + def build_strategy(self): + mode = envs.get_runtime_environ("train.trainer.strategy") + assert mode in ["async", "geo", "sync", "half_async"] + + strategy = None + + if mode == "async": + strategy = StrategyFactory.create_async_strategy() + elif mode == "geo": + push_num = envs.get_global_env("train.strategy.mode.push_num", 100) + strategy = StrategyFactory.create_geo_strategy(push_num) + elif mode == "sync": + strategy = StrategyFactory.create_sync_strategy() + elif mode == "half_async": + strategy = StrategyFactory.create_half_async_strategy() + + assert strategy 
is not None + + self.strategy = strategy + return strategy + + def init(self, context): + self.model.train_net() + optimizer = self.model.optimizer() + strategy = self.build_strategy() + optimizer = fleet.distributed_optimizer(optimizer, strategy) + optimizer.minimize(self.model.get_avg_cost()) + + if fleet.is_server(): + context['status'] = 'server_pass' + else: + self.fetch_vars = [] + self.fetch_alias = [] + self.fetch_period = self.model.get_fetch_period() + + metrics = self.model.get_metrics() + if metrics: + self.fetch_vars = metrics.values() + self.fetch_alias = metrics.keys() + context['status'] = 'startup_pass' + + def server(self, context): + fleet.init_server() + fleet.run_server() + context['is_exit'] = True + + def startup(self, context): + self._exe.run(fleet.startup_program) + context['status'] = 'train_pass' + + def dataloader_train(self, context): + print("online learning can only support LINUX only") + context['status'] = 'terminal_pass' + + def _get_dataset(self, state="TRAIN", hour=None): + if state == "TRAIN": + inputs = self.model.get_inputs() + namespace = "train.reader" + train_data_path = envs.get_global_env("train_data_path", None, + namespace) + else: + inputs = self.model.get_infer_inputs() + namespace = "evaluate.reader" + train_data_path = envs.get_global_env("test_data_path", None, + namespace) + + threads = int(envs.get_runtime_environ("train.trainer.threads")) + batch_size = envs.get_global_env("batch_size", None, namespace) + reader_class = envs.get_global_env("class", None, namespace) + abs_dir = os.path.dirname(os.path.abspath(__file__)) + reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py') + pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state, + self._config_yaml) + + if train_data_path.startswith("paddlerec::"): + package_base = envs.get_runtime_environ("PACKAGE_BASE") + assert package_base is not None + train_data_path = os.path.join(package_base, + train_data_path.split("::")[1]) + + dataset = 
fluid.DatasetFactory().create_dataset() + dataset.set_use_var(inputs) + dataset.set_pipe_command(pipe_cmd) + dataset.set_batch_size(batch_size) + dataset.set_thread(threads) + + if hour is not None: + train_data_path = os.path.join(train_data_path, hour) + + file_list = [ + os.path.join(train_data_path, x) + for x in os.listdir(train_data_path) + ] + + self.files = file_list + dataset.set_filelist(self.files) + return dataset + + def dataset_train(self, context): + fleet.init_worker() + + days = envs.get_global_env("train.days") + begin_day = datetime.datetime.strptime("begin_day_d", '%Y%m%d') + + for day in range(days): + for hour in range(24): + day = begin_day + datetime.timedelta(days=day, hours=hour) + day_s = day.strftime('%Y%m%d/%H') + i = day.strftime('%Y%m%d_%H') + + dataset = self._get_dataset(hour=day_s) + ins = self._get_dataset_ins() + + begin_time = time.time() + self._exe.train_from_dataset( + program=fluid.default_main_program(), + dataset=dataset, + fetch_list=self.fetch_vars, + fetch_info=self.fetch_alias, + print_period=self.fetch_period) + end_time = time.time() + times = end_time - begin_time + print("epoch {} using time {}, speed {:.2f} lines/s".format( + i, times, ins / times)) + self.save(i, "train", is_fleet=True) + + fleet.stop_worker() + context['status'] = 'infer_pass' + + def terminal(self, context): + for model in self.increment_models: + print("epoch :{}, dir: {}".format(model[0], model[1])) + context['is_exit'] = True diff --git a/core/trainers/single_infer.py b/core/trainers/single_infer.py new file mode 100755 index 0000000000000000000000000000000000000000..0f1c92f3f2d948c76d4cd2b0fdcca131b99cfc92 --- /dev/null +++ b/core/trainers/single_infer.py @@ -0,0 +1,355 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Training use fluid with one node only. +""" + +from __future__ import print_function + +import time +import logging +import os +import paddle.fluid as fluid + +from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer +from paddlerec.core.utils import envs +from paddlerec.core.reader import SlotReader +from paddlerec.core.utils import dataloader_instance + +logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger("fluid") +logger.setLevel(logging.INFO) + + +class SingleInfer(TranspileTrainer): + def __init__(self, config=None): + super(TranspileTrainer, self).__init__(config) + self._env = self._config + device = envs.get_global_env("device") + if device == 'gpu': + self._place = fluid.CUDAPlace(0) + elif device == 'cpu': + self._place = fluid.CPUPlace() + self._exe = fluid.Executor(self._place) + self.processor_register() + self._model = {} + self._dataset = {} + envs.set_global_envs(self._config) + envs.update_workspace() + self._runner_name = envs.get_global_env("mode") + device = envs.get_global_env("runner." 
+ self._runner_name + ".device") + if device == 'gpu': + self._place = fluid.CUDAPlace(0) + elif device == 'cpu': + self._place = fluid.CPUPlace() + self._exe = fluid.Executor(self._place) + + def processor_register(self): + self.regist_context_processor('uninit', self.instance) + self.regist_context_processor('init_pass', self.init) + self.regist_context_processor('startup_pass', self.startup) + self.regist_context_processor('train_pass', self.executor_train) + self.regist_context_processor('terminal_pass', self.terminal) + + def instance(self, context): + context['status'] = 'init_pass' + + def _get_dataset(self, dataset_name): + name = "dataset." + dataset_name + "." + thread_num = envs.get_global_env(name + "thread_num") + batch_size = envs.get_global_env(name + "batch_size") + reader_class = envs.get_global_env(name + "data_converter") + abs_dir = os.path.dirname(os.path.abspath(__file__)) + reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py') + sparse_slots = envs.get_global_env(name + "sparse_slots", "").strip() + dense_slots = envs.get_global_env(name + "dense_slots", "").strip() + if sparse_slots == "" and dense_slots == "": + pipe_cmd = "python {} {} {} {}".format(reader, reader_class, + "TRAIN", self._config_yaml) + else: + if sparse_slots == "": + sparse_slots = "?" + if dense_slots == "": + dense_slots = "?" 
+ padding = envs.get_global_env(name + "padding", 0) + pipe_cmd = "python {} {} {} {} {} {} {} {}".format( + reader, "slot", "slot", self._config_yaml, "fake", \ + sparse_slots.replace(" ", "?"), dense_slots.replace(" ", "?"), str(padding)) + + dataset = fluid.DatasetFactory().create_dataset() + dataset.set_batch_size(envs.get_global_env(name + "batch_size")) + dataset.set_pipe_command(pipe_cmd) + train_data_path = envs.get_global_env(name + "data_path") + file_list = [ + os.path.join(train_data_path, x) + for x in os.listdir(train_data_path) + ] + dataset.set_filelist(file_list) + for model_dict in self._env["phase"]: + if model_dict["dataset_name"] == dataset_name: + model = self._model[model_dict["name"]][3] + inputs = model._infer_data_var + dataset.set_use_var(inputs) + break + return dataset + + def _get_dataloader(self, dataset_name, dataloader): + name = "dataset." + dataset_name + "." + thread_num = envs.get_global_env(name + "thread_num") + batch_size = envs.get_global_env(name + "batch_size") + reader_class = envs.get_global_env(name + "data_converter") + abs_dir = os.path.dirname(os.path.abspath(__file__)) + sparse_slots = envs.get_global_env(name + "sparse_slots", "").strip() + dense_slots = envs.get_global_env(name + "dense_slots", "").strip() + if sparse_slots == "" and dense_slots == "": + reader = dataloader_instance.dataloader_by_name( + reader_class, dataset_name, self._config_yaml) + reader_class = envs.lazy_instance_by_fliename(reader_class, + "TrainReader") + reader_ins = reader_class(self._config_yaml) + else: + reader = dataloader_instance.slotdataloader_by_name( + "", dataset_name, self._config_yaml) + reader_ins = SlotReader(self._config_yaml) + if hasattr(reader_ins, 'generate_batch_from_trainfiles'): + dataloader.set_sample_list_generator(reader) + else: + dataloader.set_sample_generator(reader, batch_size) + return dataloader + + def _create_dataset(self, dataset_name): + name = "dataset." + dataset_name + "." 
+ sparse_slots = envs.get_global_env(name + "sparse_slots") + dense_slots = envs.get_global_env(name + "dense_slots") + thread_num = envs.get_global_env(name + "thread_num") + batch_size = envs.get_global_env(name + "batch_size") + type_name = envs.get_global_env(name + "type") + if envs.get_platform() != "LINUX": + print("platform ", envs.get_platform(), + " change reader to DataLoader") + type_name = "DataLoader" + padding = 0 + + if type_name == "DataLoader": + return None + else: + return self._get_dataset(dataset_name) + + def init(self, context): + for model_dict in self._env["phase"]: + self._model[model_dict["name"]] = [None] * 5 + train_program = fluid.Program() + startup_program = fluid.Program() + scope = fluid.Scope() + dataset_name = model_dict["dataset_name"] + opt_name = envs.get_global_env("hyper_parameters.optimizer.class") + opt_lr = envs.get_global_env( + "hyper_parameters.optimizer.learning_rate") + opt_strategy = envs.get_global_env( + "hyper_parameters.optimizer.strategy") + with fluid.program_guard(train_program, startup_program): + with fluid.unique_name.guard(): + with fluid.scope_guard(scope): + model_path = model_dict["model"].replace( + "{workspace}", + envs.path_adapter(self._env["workspace"])) + model = envs.lazy_instance_by_fliename( + model_path, "Model")(self._env) + model._infer_data_var = model.input_data( + dataset_name=model_dict["dataset_name"]) + if envs.get_global_env("dataset." 
+ dataset_name + + ".type") == "DataLoader": + model._init_dataloader(is_infer=True) + self._get_dataloader(dataset_name, + model._data_loader) + model.net(model._infer_data_var, True) + self._model[model_dict["name"]][0] = train_program + self._model[model_dict["name"]][1] = startup_program + self._model[model_dict["name"]][2] = scope + self._model[model_dict["name"]][3] = model + self._model[model_dict["name"]][4] = train_program.clone() + + for dataset in self._env["dataset"]: + if dataset["type"] != "DataLoader": + self._dataset[dataset["name"]] = self._create_dataset(dataset[ + "name"]) + + context['status'] = 'startup_pass' + + def startup(self, context): + for model_dict in self._env["phase"]: + with fluid.scope_guard(self._model[model_dict["name"]][2]): + self._exe.run(self._model[model_dict["name"]][1]) + context['status'] = 'train_pass' + + def executor_train(self, context): + epochs = int( + envs.get_global_env("runner." + self._runner_name + ".epochs")) + for j in range(epochs): + for model_dict in self._env["phase"]: + if j == 0: + with fluid.scope_guard(self._model[model_dict["name"]][2]): + train_prog = self._model[model_dict["name"]][0] + startup_prog = self._model[model_dict["name"]][1] + with fluid.program_guard(train_prog, startup_prog): + self.load() + reader_name = model_dict["dataset_name"] + name = "dataset." + reader_name + "." 
+ begin_time = time.time() + if envs.get_global_env(name + "type") == "DataLoader": + self._executor_dataloader_train(model_dict) + else: + self._executor_dataset_train(model_dict) + with fluid.scope_guard(self._model[model_dict["name"]][2]): + train_prog = self._model[model_dict["name"]][4] + startup_prog = self._model[model_dict["name"]][1] + with fluid.program_guard(train_prog, startup_prog): + self.save(j) + end_time = time.time() + seconds = end_time - begin_time + print("epoch {} done, time elasped: {}".format(j, seconds)) + context['status'] = "terminal_pass" + + def _executor_dataset_train(self, model_dict): + reader_name = model_dict["dataset_name"] + model_name = model_dict["name"] + model_class = self._model[model_name][3] + fetch_vars = [] + fetch_alias = [] + fetch_period = int( + envs.get_global_env("runner." + self._runner_name + + ".print_interval", 20)) + metrics = model_class.get_infer_results() + if metrics: + fetch_vars = metrics.values() + fetch_alias = metrics.keys() + scope = self._model[model_name][2] + program = self._model[model_name][0] + reader = self._dataset[reader_name] + with fluid.scope_guard(scope): + self._exe.infer_from_dataset( + program=program, + dataset=reader, + fetch_list=fetch_vars, + fetch_info=fetch_alias, + print_period=fetch_period) + + def _executor_dataloader_train(self, model_dict): + reader_name = model_dict["dataset_name"] + model_name = model_dict["name"] + model_class = self._model[model_name][3] + program = self._model[model_name][0].clone() + fetch_vars = [] + fetch_alias = [] + metrics = model_class.get_infer_results() + if metrics: + fetch_vars = metrics.values() + fetch_alias = metrics.keys() + metrics_varnames = [] + metrics_format = [] + fetch_period = int( + envs.get_global_env("runner." 
+ self._runner_name + + ".print_interval", 20)) + metrics_format.append("{}: {{}}".format("batch")) + for name, var in metrics.items(): + metrics_varnames.append(var.name) + metrics_format.append("{}: {{}}".format(name)) + metrics_format = ", ".join(metrics_format) + + reader = self._model[model_name][3]._data_loader + reader.start() + batch_id = 0 + scope = self._model[model_name][2] + with fluid.scope_guard(scope): + try: + while True: + metrics_rets = self._exe.run(program=program, + fetch_list=metrics_varnames) + metrics = [batch_id] + metrics.extend(metrics_rets) + + if batch_id % fetch_period == 0 and batch_id != 0: + print(metrics_format.format(*metrics)) + batch_id += 1 + except fluid.core.EOFException: + reader.reset() + + def terminal(self, context): + context['is_exit'] = True + + def load(self, is_fleet=False): + name = "runner." + self._runner_name + "." + dirname = envs.get_global_env(name + "init_model_path", None) + if dirname is None or dirname == "": + return + print("single_infer going to load ", dirname) + if is_fleet: + fleet.load_persistables(self._exe, dirname) + else: + fluid.io.load_persistables(self._exe, dirname) + + def save(self, epoch_id, is_fleet=False): + def need_save(epoch_id, epoch_interval, is_last=False): + if is_last: + return True + if epoch_id == -1: + return False + + return epoch_id % epoch_interval == 0 + + def save_inference_model(): + name = "runner." + self._runner_name + "." 
+ save_interval = int( + envs.get_global_env(name + "save_inference_interval", -1)) + if not need_save(epoch_id, save_interval, False): + return + feed_varnames = envs.get_global_env( + name + "save_inference_feed_varnames", None) + fetch_varnames = envs.get_global_env( + name + "save_inference_fetch_varnames", None) + if feed_varnames is None or fetch_varnames is None or feed_varnames == "": + return + fetch_vars = [ + fluid.default_main_program().global_block().vars[varname] + for varname in fetch_varnames + ] + dirname = envs.get_global_env(name + "save_inference_path", None) + + assert dirname is not None + dirname = os.path.join(dirname, str(epoch_id)) + + if is_fleet: + fleet.save_inference_model(self._exe, dirname, feed_varnames, + fetch_vars) + else: + fluid.io.save_inference_model(dirname, feed_varnames, + fetch_vars, self._exe) + + def save_persistables(): + name = "runner." + self._runner_name + "." + save_interval = int( + envs.get_global_env(name + "save_checkpoint_interval", -1)) + if not need_save(epoch_id, save_interval, False): + return + dirname = envs.get_global_env(name + "save_checkpoint_path", None) + if dirname is None or dirname == "": + return + dirname = os.path.join(dirname, str(epoch_id)) + if is_fleet: + fleet.save_persistables(self._exe, dirname) + else: + fluid.io.save_persistables(self._exe, dirname) + + save_persistables() + save_inference_model() diff --git a/core/trainers/single_trainer.py b/core/trainers/single_trainer.py new file mode 100755 index 0000000000000000000000000000000000000000..66daf40276c0230505c00baa5ddf79449198074c --- /dev/null +++ b/core/trainers/single_trainer.py @@ -0,0 +1,355 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Training use fluid with one node only. +""" + +from __future__ import print_function + +import time +import logging +import os +import paddle.fluid as fluid + +from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer +from paddlerec.core.utils import envs +from paddlerec.core.reader import SlotReader +from paddlerec.core.utils import dataloader_instance + +logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger("fluid") +logger.setLevel(logging.INFO) + + +class SingleTrainer(TranspileTrainer): + def __init__(self, config=None): + super(TranspileTrainer, self).__init__(config) + self._env = self._config + self.processor_register() + self._model = {} + self._dataset = {} + envs.set_global_envs(self._config) + envs.update_workspace() + self._runner_name = envs.get_global_env("mode") + device = envs.get_global_env("runner." + self._runner_name + ".device") + if device == 'gpu': + self._place = fluid.CUDAPlace(0) + elif device == 'cpu': + self._place = fluid.CPUPlace() + self._exe = fluid.Executor(self._place) + + def processor_register(self): + self.regist_context_processor('uninit', self.instance) + self.regist_context_processor('init_pass', self.init) + self.regist_context_processor('startup_pass', self.startup) + self.regist_context_processor('train_pass', self.executor_train) + self.regist_context_processor('terminal_pass', self.terminal) + + def instance(self, context): + context['status'] = 'init_pass' + + def _get_dataset(self, dataset_name): + name = "dataset." + dataset_name + "." 
+ thread_num = envs.get_global_env(name + "thread_num") + batch_size = envs.get_global_env(name + "batch_size") + reader_class = envs.get_global_env(name + "data_converter") + abs_dir = os.path.dirname(os.path.abspath(__file__)) + reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py') + sparse_slots = envs.get_global_env(name + "sparse_slots", "").strip() + dense_slots = envs.get_global_env(name + "dense_slots", "").strip() + if sparse_slots == "" and dense_slots == "": + pipe_cmd = "python {} {} {} {}".format(reader, reader_class, + "TRAIN", self._config_yaml) + else: + if sparse_slots == "": + sparse_slots = "?" + if dense_slots == "": + dense_slots = "?" + padding = envs.get_global_env(name + "padding", 0) + pipe_cmd = "python {} {} {} {} {} {} {} {}".format( + reader, "slot", "slot", self._config_yaml, "fake", \ + sparse_slots.replace(" ", "?"), dense_slots.replace(" ", "?"), str(padding)) + + dataset = fluid.DatasetFactory().create_dataset() + dataset.set_batch_size(envs.get_global_env(name + "batch_size")) + dataset.set_pipe_command(pipe_cmd) + train_data_path = envs.get_global_env(name + "data_path") + file_list = [ + os.path.join(train_data_path, x) + for x in os.listdir(train_data_path) + ] + dataset.set_filelist(file_list) + for model_dict in self._env["phase"]: + if model_dict["dataset_name"] == dataset_name: + model = self._model[model_dict["name"]][3] + inputs = model._data_var + dataset.set_use_var(inputs) + break + return dataset + + def _get_dataloader(self, dataset_name, dataloader): + name = "dataset." + dataset_name + "." 
+ sparse_slots = envs.get_global_env(name + "sparse_slots", "").strip() + dense_slots = envs.get_global_env(name + "dense_slots", "").strip() + thread_num = envs.get_global_env(name + "thread_num") + batch_size = envs.get_global_env(name + "batch_size") + reader_class = envs.get_global_env(name + "data_converter") + abs_dir = os.path.dirname(os.path.abspath(__file__)) + if sparse_slots == "" and dense_slots == "": + reader = dataloader_instance.dataloader_by_name( + reader_class, dataset_name, self._config_yaml) + reader_class = envs.lazy_instance_by_fliename(reader_class, + "TrainReader") + reader_ins = reader_class(self._config_yaml) + else: + reader = dataloader_instance.slotdataloader_by_name( + "", dataset_name, self._config_yaml) + reader_ins = SlotReader(self._config_yaml) + if hasattr(reader_ins, 'generate_batch_from_trainfiles'): + dataloader.set_sample_list_generator(reader) + else: + dataloader.set_sample_generator(reader, batch_size) + return dataloader + + def _create_dataset(self, dataset_name): + name = "dataset." + dataset_name + "." 
+ sparse_slots = envs.get_global_env(name + "sparse_slots", "").strip() + dense_slots = envs.get_global_env(name + "dense_slots", "").strip() + thread_num = envs.get_global_env(name + "thread_num") + batch_size = envs.get_global_env(name + "batch_size") + type_name = envs.get_global_env(name + "type") + if envs.get_platform() != "LINUX": + print("platform ", envs.get_platform(), + " change reader to DataLoader") + type_name = "DataLoader" + padding = 0 + + if type_name == "DataLoader": + return None + else: + return self._get_dataset(dataset_name) + + def init(self, context): + for model_dict in self._env["phase"]: + self._model[model_dict["name"]] = [None] * 5 + train_program = fluid.Program() + startup_program = fluid.Program() + scope = fluid.Scope() + dataset_name = model_dict["dataset_name"] + opt_name = envs.get_global_env("hyper_parameters.optimizer.class") + opt_lr = envs.get_global_env( + "hyper_parameters.optimizer.learning_rate") + opt_strategy = envs.get_global_env( + "hyper_parameters.optimizer.strategy") + with fluid.program_guard(train_program, startup_program): + with fluid.unique_name.guard(): + with fluid.scope_guard(scope): + model_path = model_dict["model"].replace( + "{workspace}", + envs.path_adapter(self._env["workspace"])) + model = envs.lazy_instance_by_fliename( + model_path, "Model")(self._env) + model._data_var = model.input_data( + dataset_name=model_dict["dataset_name"]) + if envs.get_global_env("dataset." 
+ dataset_name + + ".type") == "DataLoader": + model._init_dataloader(is_infer=False) + self._get_dataloader(dataset_name, + model._data_loader) + model.net(model._data_var, False) + optimizer = model._build_optimizer(opt_name, opt_lr, + opt_strategy) + optimizer.minimize(model._cost) + self._model[model_dict["name"]][0] = train_program + self._model[model_dict["name"]][1] = startup_program + self._model[model_dict["name"]][2] = scope + self._model[model_dict["name"]][3] = model + self._model[model_dict["name"]][4] = train_program.clone() + + for dataset in self._env["dataset"]: + if dataset["type"] != "DataLoader": + self._dataset[dataset["name"]] = self._create_dataset(dataset[ + "name"]) + + context['status'] = 'startup_pass' + + def startup(self, context): + for model_dict in self._env["phase"]: + with fluid.scope_guard(self._model[model_dict["name"]][2]): + self._exe.run(self._model[model_dict["name"]][1]) + context['status'] = 'train_pass' + + def executor_train(self, context): + epochs = int( + envs.get_global_env("runner." + self._runner_name + ".epochs")) + for j in range(epochs): + for model_dict in self._env["phase"]: + if j == 0: + with fluid.scope_guard(self._model[model_dict["name"]][2]): + train_prog = self._model[model_dict["name"]][0] + startup_prog = self._model[model_dict["name"]][1] + with fluid.program_guard(train_prog, startup_prog): + self.load() + reader_name = model_dict["dataset_name"] + name = "dataset." + reader_name + "." 
+ begin_time = time.time() + if envs.get_global_env(name + "type") == "DataLoader": + self._executor_dataloader_train(model_dict) + else: + self._executor_dataset_train(model_dict) + with fluid.scope_guard(self._model[model_dict["name"]][2]): + train_prog = self._model[model_dict["name"]][4] + startup_prog = self._model[model_dict["name"]][1] + with fluid.program_guard(train_prog, startup_prog): + self.save(j) + end_time = time.time() + seconds = end_time - begin_time + print("epoch {} done, time elapsed: {}".format(j, seconds)) + context['status'] = "terminal_pass" + + def _executor_dataset_train(self, model_dict): + reader_name = model_dict["dataset_name"] + model_name = model_dict["name"] + model_class = self._model[model_name][3] + fetch_vars = [] + fetch_alias = [] + fetch_period = int( + envs.get_global_env("runner." + self._runner_name + + ".print_interval", 20)) + metrics = model_class.get_metrics() + if metrics: + fetch_vars = metrics.values() + fetch_alias = metrics.keys() + scope = self._model[model_name][2] + program = self._model[model_name][0] + reader = self._dataset[reader_name] + with fluid.scope_guard(scope): + self._exe.train_from_dataset( + program=program, + dataset=reader, + fetch_list=fetch_vars, + fetch_info=fetch_alias, + print_period=fetch_period) + + def _executor_dataloader_train(self, model_dict): + reader_name = model_dict["dataset_name"] + model_name = model_dict["name"] + model_class = self._model[model_name][3] + program = self._model[model_name][0].clone() + program = fluid.compiler.CompiledProgram(program).with_data_parallel( + loss_name=model_class.get_avg_cost().name) + fetch_vars = [] + fetch_alias = [] + fetch_period = int( + envs.get_global_env("runner." 
+ self._runner_name + + ".print_interval", 20)) + metrics = model_class.get_metrics() + if metrics: + fetch_vars = metrics.values() + fetch_alias = metrics.keys() + metrics_varnames = [] + metrics_format = [] + metrics_format.append("{}: {{}}".format("batch")) + for name, var in metrics.items(): + metrics_varnames.append(var.name) + metrics_format.append("{}: {{}}".format(name)) + metrics_format = ", ".join(metrics_format) + + reader = self._model[model_name][3]._data_loader + reader.start() + batch_id = 0 + scope = self._model[model_name][2] + with fluid.scope_guard(scope): + try: + while True: + metrics_rets = self._exe.run(program=program, + fetch_list=metrics_varnames) + metrics = [batch_id] + metrics.extend(metrics_rets) + + if batch_id % fetch_period == 0 and batch_id != 0: + print(metrics_format.format(*metrics)) + batch_id += 1 + except fluid.core.EOFException: + reader.reset() + + def terminal(self, context): + context['is_exit'] = True + + def load(self, is_fleet=False): + dirname = envs.get_global_env( + "runner." + self._runner_name + ".init_model_path", None) + if dirname is None or dirname == "": + return + print("going to load ", dirname) + if is_fleet: + fleet.load_persistables(self._exe, dirname) + else: + fluid.io.load_persistables(self._exe, dirname) + + def save(self, epoch_id, is_fleet=False): + def need_save(epoch_id, epoch_interval, is_last=False): + if is_last: + return True + if epoch_id == -1: + return False + + return epoch_id % epoch_interval == 0 + + def save_inference_model(): + name = "runner." + self._runner_name + "." 
+ save_interval = int( + envs.get_global_env(name + "save_inference_interval", -1)) + if not need_save(epoch_id, save_interval, False): + return + feed_varnames = envs.get_global_env( + name + "save_inference_feed_varnames", []) + fetch_varnames = envs.get_global_env( + name + "save_inference_fetch_varnames", []) + if feed_varnames is None or fetch_varnames is None or feed_varnames == "" or fetch_varnames == "" or \ + len(feed_varnames) == 0 or len(fetch_varnames) == 0: + return + fetch_vars = [ + fluid.default_main_program().global_block().vars[varname] + for varname in fetch_varnames + ] + dirname = envs.get_global_env(name + "save_inference_path", None) + + assert dirname is not None + dirname = os.path.join(dirname, str(epoch_id)) + + if is_fleet: + fleet.save_inference_model(self._exe, dirname, feed_varnames, + fetch_vars) + else: + fluid.io.save_inference_model(dirname, feed_varnames, + fetch_vars, self._exe) + + def save_persistables(): + name = "runner." + self._runner_name + "." + save_interval = int( + envs.get_global_env(name + "save_checkpoint_interval", -1)) + if not need_save(epoch_id, save_interval, False): + return + dirname = envs.get_global_env(name + "save_checkpoint_path", None) + if dirname is None or dirname == "": + return + dirname = os.path.join(dirname, str(epoch_id)) + if is_fleet: + fleet.save_persistables(self._exe, dirname) + else: + fluid.io.save_persistables(self._exe, dirname) + + save_persistables() + save_inference_model() diff --git a/fleet_rec/core/trainers/tdm_cluster_trainer.py b/core/trainers/tdm_cluster_trainer.py similarity index 66% rename from fleet_rec/core/trainers/tdm_cluster_trainer.py rename to core/trainers/tdm_cluster_trainer.py index 40a46d92fd7aae583c313be9e27865d700b6ca73..a7e8f97e446bc266a733fc12a798c505ee4d9ec5 100755 --- a/fleet_rec/core/trainers/tdm_cluster_trainer.py +++ b/core/trainers/tdm_cluster_trainer.py @@ -1,4 +1,3 @@ -# -*- coding=utf-8 -*- # Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,22 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ Training use fluid with one node only. """ from __future__ import print_function + import logging + import numpy as np import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet -from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory -from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker - -from fleetrec.core.utils import envs -from fleetrec.core.trainers.cluster_trainer import ClusterTrainer +from paddlerec.core.utils import envs +from paddlerec.core.trainers.cluster_trainer import ClusterTrainer logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger("fluid") @@ -38,8 +35,8 @@ special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info"] class TDMClusterTrainer(ClusterTrainer): def server(self, context): namespace = "train.startup" - init_model_path = envs.get_global_env( - "cluster.init_model_path", "", namespace) + init_model_path = envs.get_global_env("cluster.init_model_path", "", + namespace) assert init_model_path != "", "Cluster train must has init_model for TDM" fleet.init_server(init_model_path) logger.info("TDM: load model from {}".format(init_model_path)) @@ -50,26 +47,28 @@ class TDMClusterTrainer(ClusterTrainer): self._exe.run(fleet.startup_program) namespace = "train.startup" - load_tree = envs.get_global_env( - "tree.load_tree", True, namespace) - self.tree_layer_path = envs.get_global_env( - "tree.tree_layer_path", "", namespace) - self.tree_travel_path = envs.get_global_env( - "tree.tree_travel_path", "", namespace) - self.tree_info_path = envs.get_global_env( - 
"tree.tree_info_path", "", namespace) - - save_init_model = envs.get_global_env( - "cluster.save_init_model", False, namespace) - init_model_path = envs.get_global_env( - "cluster.init_model_path", "", namespace) + load_tree = envs.get_global_env("tree.load_tree", True, namespace) + + self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "", + namespace) + + self.tree_travel_path = envs.get_global_env("tree.tree_travel_path", + "", namespace) + + self.tree_info_path = envs.get_global_env("tree.tree_info_path", "", + namespace) + + save_init_model = envs.get_global_env("cluster.save_init_model", False, + namespace) + init_model_path = envs.get_global_env("cluster.init_model_path", "", + namespace) if load_tree: - # 将明文树结构及数据,set到组网中的Variale中 - # 不使用NumpyInitialize方法是考虑到树结构相关数据size过大,有性能风险 + # covert tree to tensor, set it into Fluid's variable. for param_name in special_param: - param_t = fluid.global_scope().find_var(param_name).get_tensor() - param_array = self.tdm_prepare(param_name) + param_t = fluid.global_scope().find_var(param_name).get_tensor( + ) + param_array = self._tdm_prepare(param_name) param_t.set(param_array.astype('int32'), self._place) if save_init_model: @@ -80,27 +79,27 @@ class TDMClusterTrainer(ClusterTrainer): context['status'] = 'train_pass' - def tdm_prepare(self, param_name): + def _tdm_prepare(self, param_name): if param_name == "TDM_Tree_Travel": - travel_array = self.tdm_travel_prepare() + travel_array = self._tdm_travel_prepare() return travel_array elif param_name == "TDM_Tree_Layer": - layer_array, _ = self.tdm_layer_prepare() + layer_array, _ = self._tdm_layer_prepare() return layer_array elif param_name == "TDM_Tree_Info": - info_array = self.tdm_info_prepare() + info_array = self._tdm_info_prepare() return info_array else: raise " {} is not a special tdm param name".format(param_name) - def tdm_travel_prepare(self): + def _tdm_travel_prepare(self): """load tdm tree param from npy/list file""" travel_array = 
np.load(self.tree_travel_path) - logger.info("TDM Tree leaf node nums: {}".format( - travel_array.shape[0])) + logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[ + 0])) return travel_array - def tdm_layer_prepare(self): + def _tdm_layer_prepare(self): """load tdm tree param from npy/list file""" layer_list = [] layer_list_flat = [] @@ -120,7 +119,7 @@ class TDMClusterTrainer(ClusterTrainer): [len(i) for i in layer_list])) return layer_array, layer_list - def tdm_info_prepare(self): + def _tdm_info_prepare(self): """load tdm tree param from list file""" info_array = np.load(self.tree_info_path) return info_array diff --git a/fleet_rec/core/trainers/tdm_single_trainer.py b/core/trainers/tdm_single_trainer.py similarity index 65% rename from fleet_rec/core/trainers/tdm_single_trainer.py rename to core/trainers/tdm_single_trainer.py index 6d010999d0f71f7634ee6913c911461b6180e89d..c0f23fc361e907ca5732a3531fc7c460ddc5aad3 100755 --- a/fleet_rec/core/trainers/tdm_single_trainer.py +++ b/core/trainers/tdm_single_trainer.py @@ -1,4 +1,3 @@ -# -*- coding=utf-8 -*- # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,50 +11,53 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ Training use fluid with one node only. 
""" from __future__ import print_function import logging -import paddle.fluid as fluid -from fleetrec.core.trainers.transpiler_trainer import TranspileTrainer -from fleetrec.core.trainers.single_trainer import SingleTrainer -from fleetrec.core.utils import envs import numpy as np +import paddle.fluid as fluid +from paddlerec.core.trainers.single_trainer import SingleTrainer +from paddlerec.core.utils import envs logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger("fluid") logger.setLevel(logging.INFO) -special_param = ["TDM_Tree_Travel", "TDM_Tree_Layer", - "TDM_Tree_Info", "TDM_Tree_Emb"] +special_param = [ + "TDM_Tree_Travel", "TDM_Tree_Layer", "TDM_Tree_Info", "TDM_Tree_Emb" +] class TDMSingleTrainer(SingleTrainer): def startup(self, context): namespace = "train.startup" - load_persistables = envs.get_global_env( - "single.load_persistables", False, namespace) + load_persistables = envs.get_global_env("single.load_persistables", + False, namespace) + persistables_model_path = envs.get_global_env( "single.persistables_model_path", "", namespace) - load_tree = envs.get_global_env( - "tree.load_tree", False, namespace) - self.tree_layer_path = envs.get_global_env( - "tree.tree_layer_path", "", namespace) - self.tree_travel_path = envs.get_global_env( - "tree.tree_travel_path", "", namespace) - self.tree_info_path = envs.get_global_env( - "tree.tree_info_path", "", namespace) - self.tree_emb_path = envs.get_global_env( - "tree.tree_emb_path", "", namespace) - - save_init_model = envs.get_global_env( - "single.save_init_model", False, namespace) - init_model_path = envs.get_global_env( - "single.init_model_path", "", namespace) + load_tree = envs.get_global_env("tree.load_tree", False, namespace) + + self.tree_layer_path = envs.get_global_env("tree.tree_layer_path", "", + namespace) + + self.tree_travel_path = envs.get_global_env("tree.tree_travel_path", + "", namespace) + + self.tree_info_path = 
envs.get_global_env("tree.tree_info_path", "", + namespace) + + self.tree_emb_path = envs.get_global_env("tree.tree_emb_path", "", + namespace) + + save_init_model = envs.get_global_env("single.save_init_model", False, + namespace) + init_model_path = envs.get_global_env("single.init_model_path", "", + namespace) self._exe.run(fluid.default_startup_program()) if load_persistables: @@ -68,11 +70,11 @@ class TDMSingleTrainer(SingleTrainer): persistables_model_path)) if load_tree: - # 将明文树结构及数据,set到组网中的Variale中 - # 不使用NumpyInitialize方法是考虑到树结构相关数据size过大,有性能风险 + # covert tree to tensor, set it into Fluid's variable. for param_name in special_param: - param_t = fluid.global_scope().find_var(param_name).get_tensor() - param_array = self.tdm_prepare(param_name) + param_t = fluid.global_scope().find_var(param_name).get_tensor( + ) + param_array = self._tdm_prepare(param_name) if param_name == 'TDM_Tree_Emb': param_t.set(param_array.astype('float32'), self._place) else: @@ -86,37 +88,37 @@ class TDMSingleTrainer(SingleTrainer): context['status'] = 'train_pass' - def tdm_prepare(self, param_name): + def _tdm_prepare(self, param_name): if param_name == "TDM_Tree_Travel": - travel_array = self.tdm_travel_prepare() + travel_array = self._tdm_travel_prepare() return travel_array elif param_name == "TDM_Tree_Layer": - layer_array, _ = self.tdm_layer_prepare() + layer_array, _ = self._tdm_layer_prepare() return layer_array elif param_name == "TDM_Tree_Info": - info_array = self.tdm_info_prepare() + info_array = self._tdm_info_prepare() return info_array elif param_name == "TDM_Tree_Emb": - emb_array = self.tdm_emb_prepare() + emb_array = self._tdm_emb_prepare() return emb_array else: raise " {} is not a special tdm param name".format(param_name) - def tdm_travel_prepare(self): + def _tdm_travel_prepare(self): """load tdm tree param from npy/list file""" travel_array = np.load(self.tree_travel_path) - logger.info("TDM Tree leaf node nums: {}".format( - travel_array.shape[0])) + 
logger.info("TDM Tree leaf node nums: {}".format(travel_array.shape[ + 0])) return travel_array - def tdm_emb_prepare(self): + def _tdm_emb_prepare(self): """load tdm tree param from npy/list file""" emb_array = np.load(self.tree_emb_path) - logger.info("TDM Tree node nums from emb: {}".format( - emb_array.shape[0])) + logger.info("TDM Tree node nums from emb: {}".format(emb_array.shape[ + 0])) return emb_array - def tdm_layer_prepare(self): + def _tdm_layer_prepare(self): """load tdm tree param from npy/list file""" layer_list = [] layer_list_flat = [] @@ -136,7 +138,7 @@ class TDMSingleTrainer(SingleTrainer): [len(i) for i in layer_list])) return layer_array, layer_list - def tdm_info_prepare(self): + def _tdm_info_prepare(self): """load tdm tree param from list file""" info_array = np.load(self.tree_info_path) return info_array diff --git a/fleet_rec/core/trainers/transpiler_trainer.py b/core/trainers/transpiler_trainer.py similarity index 55% rename from fleet_rec/core/trainers/transpiler_trainer.py rename to core/trainers/transpiler_trainer.py index 96afab2164f9f1e815ae0c1c48249ea0fa109dbf..b3c09797b6e023bc5968fabbd70676522d885f19 100755 --- a/fleet_rec/core/trainers/transpiler_trainer.py +++ b/core/trainers/transpiler_trainer.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- """ Training use fluid with DistributeTranspiler """ @@ -20,23 +19,31 @@ import os import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet -from fleetrec.core.trainer import Trainer -from fleetrec.core.utils import envs -from fleetrec.core.utils import dataloader_instance +from paddlerec.core.trainer import Trainer +from paddlerec.core.utils import envs +from paddlerec.core.utils import dataloader_instance +from paddlerec.core.reader import SlotReader class TranspileTrainer(Trainer): def __init__(self, config=None): Trainer.__init__(self, config) + device = envs.get_global_env("train.device", "cpu") + if device == 'gpu': + self._place = fluid.CUDAPlace(0) + self._exe = fluid.Executor(self._place) self.processor_register() self.model = None self.inference_models = [] self.increment_models = [] def processor_register(self): - print("Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first") + print( + "Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first" + ) def _get_dataloader(self, state="TRAIN"): + if state == "TRAIN": dataloader = self.model._data_loader namespace = "train.reader" @@ -46,45 +53,86 @@ class TranspileTrainer(Trainer): namespace = "evaluate.reader" class_name = "EvaluateReader" + sparse_slots = envs.get_global_env("sparse_slots", None, namespace) + dense_slots = envs.get_global_env("dense_slots", None, namespace) + batch_size = envs.get_global_env("batch_size", None, namespace) - reader_class = envs.get_global_env("class", None, namespace) print("batch_size: {}".format(batch_size)) - reader = dataloader_instance.dataloader( - reader_class, state, self._config_yaml) - reader_class = envs.lazy_instance_by_fliename(reader_class, class_name) - reader_ins = reader_class(self._config_yaml) + if sparse_slots is None and dense_slots is None: + reader_class = envs.get_global_env("class", None, namespace) + 
reader = dataloader_instance.dataloader(reader_class, state, + self._config_yaml) + reader_class = envs.lazy_instance_by_fliename(reader_class, + class_name) + reader_ins = reader_class(self._config_yaml) + else: + reader = dataloader_instance.slotdataloader("", state, + self._config_yaml) + reader_ins = SlotReader(self._config_yaml) + if hasattr(reader_ins, 'generate_batch_from_trainfiles'): dataloader.set_sample_list_generator(reader) else: dataloader.set_sample_generator(reader, batch_size) + + debug_mode = envs.get_global_env("reader_debug_mode", False, namespace) + if debug_mode: + print("--- DataLoader Debug Mode Begin , show pre 10 data ---") + for idx, line in enumerate(reader()): + print(line) + if idx >= 9: + break + print("--- DataLoader Debug Mode End , show pre 10 data ---") + exit(0) return dataloader + def _get_dataset_ins(self): + count = 0 + for f in self.files: + for _, _ in enumerate(open(f, 'r')): + count += 1 + return count + def _get_dataset(self, state="TRAIN"): if state == "TRAIN": inputs = self.model.get_inputs() namespace = "train.reader" - train_data_path = envs.get_global_env( - "train_data_path", None, namespace) + train_data_path = envs.get_global_env("train_data_path", None, + namespace) else: inputs = self.model.get_infer_inputs() namespace = "evaluate.reader" - train_data_path = envs.get_global_env( - "test_data_path", None, namespace) + train_data_path = envs.get_global_env("test_data_path", None, + namespace) + + sparse_slots = envs.get_global_env("sparse_slots", None, namespace) + dense_slots = envs.get_global_env("dense_slots", None, namespace) threads = int(envs.get_runtime_environ("train.trainer.threads")) batch_size = envs.get_global_env("batch_size", None, namespace) reader_class = envs.get_global_env("class", None, namespace) abs_dir = os.path.dirname(os.path.abspath(__file__)) reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py') - pipe_cmd = "python {} {} {} {}".format( - reader, reader_class, state, 
self._config_yaml) - if train_data_path.startswith("fleetrec::"): + if sparse_slots is None and dense_slots is None: + pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state, + self._config_yaml) + else: + if sparse_slots is None: + sparse_slots = "#" + if dense_slots is None: + dense_slots = "#" + padding = envs.get_global_env("padding", 0, namespace) + pipe_cmd = "python {} {} {} {} {} {} {} {}".format( + reader, "slot", "slot", self._config_yaml, namespace, \ + sparse_slots.replace(" ", "#"), dense_slots.replace(" ", "#"), str(padding)) + + if train_data_path.startswith("paddlerec::"): package_base = envs.get_runtime_environ("PACKAGE_BASE") assert package_base is not None - train_data_path = os.path.join( - package_base, train_data_path.split("::")[1]) + train_data_path = os.path.join(package_base, + train_data_path.split("::")[1]) dataset = fluid.DatasetFactory().create_dataset() dataset.set_use_var(inputs) @@ -95,8 +143,18 @@ class TranspileTrainer(Trainer): os.path.join(train_data_path, x) for x in os.listdir(train_data_path) ] + self.files = file_list + dataset.set_filelist(self.files) + + debug_mode = envs.get_global_env("reader_debug_mode", False, namespace) + if debug_mode: + print("--- Dataset Debug Mode Begin , show pre 10 data of {}---". + format(file_list[0])) + os.system("cat {} | {} | head -10".format(file_list[0], pipe_cmd)) + print("--- Dataset Debug Mode End , show pre 10 data of {}---". 
+ format(file_list[0])) + exit(0) - dataset.set_filelist(file_list) return dataset def save(self, epoch_id, namespace, is_fleet=False): @@ -115,26 +173,30 @@ class TranspileTrainer(Trainer): if not need_save(epoch_id, save_interval, False): return - - # print("save inference model is not supported now.") - # return - feed_varnames = envs.get_global_env("save.inference.feed_varnames", None, namespace) - fetch_varnames = envs.get_global_env("save.inference.fetch_varnames", None, namespace) + feed_varnames = envs.get_global_env("save.inference.feed_varnames", + None, namespace) + fetch_varnames = envs.get_global_env( + "save.inference.fetch_varnames", None, namespace) if feed_varnames is None or fetch_varnames is None: return - fetch_vars = [fluid.default_main_program().global_block().vars[varname] for varname in fetch_varnames] - dirname = envs.get_global_env("save.inference.dirname", None, namespace) + fetch_vars = [ + fluid.default_main_program().global_block().vars[varname] + for varname in fetch_varnames + ] + dirname = envs.get_global_env("save.inference.dirname", None, + namespace) assert dirname is not None dirname = os.path.join(dirname, str(epoch_id)) if is_fleet: - fleet.save_inference_model(self._exe, dirname, feed_varnames, fetch_vars) + fleet.save_inference_model(self._exe, dirname, feed_varnames, + fetch_vars) else: - fluid.io.save_inference_model( - dirname, feed_varnames, fetch_vars, self._exe) + fluid.io.save_inference_model(dirname, feed_varnames, + fetch_vars, self._exe) self.inference_models.append((epoch_id, dirname)) def save_persistables(): @@ -144,8 +206,8 @@ class TranspileTrainer(Trainer): if not need_save(epoch_id, save_interval, False): return - dirname = envs.get_global_env( - "save.increment.dirname", None, namespace) + dirname = envs.get_global_env("save.increment.dirname", None, + namespace) assert dirname is not None dirname = os.path.join(dirname, str(epoch_id)) @@ -203,17 +265,29 @@ class TranspileTrainer(Trainer): metrics_format = 
", ".join(metrics_format) self._exe.run(startup_program) - for (epoch, model_dir) in self.increment_models: - print("Begin to infer epoch {}, model_dir: {}".format(epoch, model_dir)) + model_list = self.increment_models + + evaluate_only = envs.get_global_env( + 'evaluate_only', False, namespace='evaluate') + if evaluate_only: + model_list = [(0, envs.get_global_env( + 'evaluate_model_path', "", namespace='evaluate'))] + + is_return_numpy = envs.get_global_env( + 'is_return_numpy', True, namespace='evaluate') + + for (epoch, model_dir) in model_list: + print("Begin to infer No.{} model, model_dir: {}".format( + epoch, model_dir)) program = infer_program.clone() fluid.io.load_persistables(self._exe, model_dir, program) reader.start() batch_id = 0 try: while True: - metrics_rets = self._exe.run( - program=program, - fetch_list=metrics_varnames) + metrics_rets = self._exe.run(program=program, + fetch_list=metrics_varnames, + return_numpy=is_return_numpy) metrics = [epoch, batch_id] metrics.extend(metrics_rets) diff --git a/core/utils/__init__.py b/core/utils/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/core/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/core/utils/dataloader_instance.py b/core/utils/dataloader_instance.py new file mode 100755 index 0000000000000000000000000000000000000000..a26e2df20876e48bec5a1f30492987605613ef8a --- /dev/null +++ b/core/utils/dataloader_instance.py @@ -0,0 +1,195 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function + +import os +from paddlerec.core.utils.envs import lazy_instance_by_fliename +from paddlerec.core.utils.envs import get_global_env +from paddlerec.core.utils.envs import get_runtime_environ +from paddlerec.core.reader import SlotReader + + +def dataloader_by_name(readerclass, dataset_name, yaml_file): + reader_class = lazy_instance_by_fliename(readerclass, "TrainReader") + name = "dataset." + dataset_name + "." 
+ data_path = get_global_env(name + "data_path") + + if data_path.startswith("paddlerec::"): + package_base = get_runtime_environ("PACKAGE_BASE") + assert package_base is not None + data_path = os.path.join(package_base, data_path.split("::")[1]) + + files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)] + reader = reader_class(yaml_file) + reader.init() + + def gen_reader(): + for file in files: + with open(file, 'r') as f: + for line in f: + line = line.rstrip('\n') + iter = reader.generate_sample(line) + for parsed_line in iter(): + if parsed_line is None: + continue + else: + values = [] + for pased in parsed_line: + values.append(pased[1]) + yield values + + def gen_batch_reader(): + return reader.generate_batch_from_trainfiles(files) + + if hasattr(reader, 'generate_batch_from_trainfiles'): + return gen_batch_reader() + return gen_reader + + +def slotdataloader_by_name(readerclass, dataset_name, yaml_file): + name = "dataset." + dataset_name + "." + reader_name = "SlotReader" + data_path = get_global_env(name + "data_path") + + if data_path.startswith("paddlerec::"): + package_base = get_runtime_environ("PACKAGE_BASE") + assert package_base is not None + data_path = os.path.join(package_base, data_path.split("::")[1]) + + files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)] + sparse = get_global_env(name + "sparse_slots", "#") + if sparse == "": + sparse = "#" + dense = get_global_env(name + "dense_slots", "#") + if dense == "": + dense = "#" + padding = get_global_env(name + "padding", 0) + reader = SlotReader(yaml_file) + reader.init(sparse, dense, int(padding)) + + def gen_reader(): + for file in files: + with open(file, 'r') as f: + for line in f: + line = line.rstrip('\n') + iter = reader.generate_sample(line) + for parsed_line in iter(): + if parsed_line is None: + continue + else: + values = [] + for pased in parsed_line: + values.append(pased[1]) + yield values + + def gen_batch_reader(): + return 
reader.generate_batch_from_trainfiles(files) + + if hasattr(reader, 'generate_batch_from_trainfiles'): + return gen_batch_reader() + return gen_reader + + +def dataloader(readerclass, train, yaml_file): + if train == "TRAIN": + reader_name = "TrainReader" + namespace = "train.reader" + data_path = get_global_env("train_data_path", None, namespace) + else: + reader_name = "EvaluateReader" + namespace = "evaluate.reader" + data_path = get_global_env("test_data_path", None, namespace) + + if data_path.startswith("paddlerec::"): + package_base = get_runtime_environ("PACKAGE_BASE") + assert package_base is not None + data_path = os.path.join(package_base, data_path.split("::")[1]) + + files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)] + + reader_class = lazy_instance_by_fliename(readerclass, reader_name) + reader = reader_class(yaml_file) + reader.init() + + def gen_reader(): + for file in files: + with open(file, 'r') as f: + for line in f: + line = line.rstrip('\n') + iter = reader.generate_sample(line) + for parsed_line in iter(): + if parsed_line is None: + continue + else: + values = [] + for pased in parsed_line: + values.append(pased[1]) + yield values + + def gen_batch_reader(): + return reader.generate_batch_from_trainfiles(files) + + if hasattr(reader, 'generate_batch_from_trainfiles'): + return gen_batch_reader() + return gen_reader + + +def slotdataloader(readerclass, train, yaml_file): + if train == "TRAIN": + reader_name = "SlotReader" + namespace = "train.reader" + data_path = get_global_env("train_data_path", None, namespace) + else: + reader_name = "SlotReader" + namespace = "evaluate.reader" + data_path = get_global_env("test_data_path", None, namespace) + + if data_path.startswith("paddlerec::"): + package_base = get_runtime_environ("PACKAGE_BASE") + assert package_base is not None + data_path = os.path.join(package_base, data_path.split("::")[1]) + + files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)] + + sparse = 
get_global_env("sparse_slots", "#", namespace) + if sparse == "": + sparse = "#" + dense = get_global_env("dense_slots", "#", namespace) + if dense == "": + dense = "#" + padding = get_global_env("padding", 0, namespace) + reader = SlotReader(yaml_file) + reader.init(sparse, dense, int(padding)) + + def gen_reader(): + for file in files: + with open(file, 'r') as f: + for line in f: + line = line.rstrip('\n') + iter = reader.generate_sample(line) + for parsed_line in iter(): + if parsed_line is None: + continue + else: + values = [] + for pased in parsed_line: + values.append(pased[1]) + yield values + + def gen_batch_reader(): + return reader.generate_batch_from_trainfiles(files) + + if hasattr(reader, 'generate_batch_from_trainfiles'): + return gen_batch_reader() + return gen_reader diff --git a/fleet_rec/core/utils/dataset.py b/core/utils/dataset_holder.py similarity index 74% rename from fleet_rec/core/utils/dataset.py rename to core/utils/dataset_holder.py index 92cbe8911a327f07d667e0e9f6d35bcb18140914..a75d52b60440f924acbb45ff7ff9125eaa121e36 100755 --- a/fleet_rec/core/utils/dataset.py +++ b/core/utils/dataset_holder.py @@ -13,18 +13,18 @@ # limitations under the License. import abc -import time import datetime +import time import paddle.fluid as fluid -from fleetrec.core.utils import fs as fs -from fleetrec.core.utils import util as util +from paddlerec.core.utils import fs as fs +from paddlerec.core.utils import util as util -class Dataset(object): +class DatasetHolder(object): """ - Dataset Base + Dataset Holder """ __metaclass__ = abc.ABCMeta @@ -62,7 +62,7 @@ class Dataset(object): pass -class TimeSplitDataset(Dataset): +class TimeSplitDatasetHolder(DatasetHolder): """ Dataset with time split dir. 
root_path/$DAY/$HOUR """ @@ -74,11 +74,17 @@ class TimeSplitDataset(Dataset): Dataset.__init__(self, config) if 'data_donefile' not in config or config['data_donefile'] is None: config['data_donefile'] = config['data_path'] + "/to.hadoop.done" - self._path_generator = util.PathGenerator({'templates': [ - {'name': 'data_path', 'template': config['data_path']}, - {'name': 'donefile_path', 'template': config['data_donefile']} - ]}) - self._split_interval = config['split_interval'] # data split N mins per dir + self._path_generator = util.PathGenerator({ + 'templates': [{ + 'name': 'data_path', + 'template': config['data_path'] + }, { + 'name': 'donefile_path', + 'template': config['data_donefile'] + }] + }) + self._split_interval = config[ + 'split_interval'] # data split N mins per dir self._data_file_handler = fs.FileHandler(config) def _format_data_time(self, daytime_str, time_window_mins): @@ -91,7 +97,8 @@ class TimeSplitDataset(Dataset): return None, 0 if mins_of_day % self._split_interval != 0: - skip_mins = self._split_interval - (mins_of_day % self._split_interval) + skip_mins = self._split_interval - (mins_of_day % + self._split_interval) data_time = data_time + datetime.timedelta(minutes=skip_mins) time_window_mins = time_window_mins - skip_mins return data_time, time_window_mins @@ -106,17 +113,24 @@ class TimeSplitDataset(Dataset): True/False """ is_ready = True - data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins) + data_time, windows_mins = self._format_data_time(daytime_str, + time_window_mins) while time_window_mins > 0: - file_path = self._path_generator.generate_path('donefile_path', {'time_format': data_time}) + file_path = self._path_generator.generate_path( + 'donefile_path', {'time_format': data_time}) if not self._data_file_handler.is_exist(file_path): is_ready = False break time_window_mins = time_window_mins - self._split_interval - data_time = data_time + datetime.timedelta(minutes=self._split_interval) + 
data_time = data_time + datetime.timedelta( + minutes=self._split_interval) return is_ready - def get_file_list(self, daytime_str, time_window_mins, node_num=1, node_idx=0): + def get_file_list(self, + daytime_str, + time_window_mins, + node_num=1, + node_idx=0): """ data in [daytime_str, daytime_str + time_window_mins], random shard to node_num, return shard[node_idx] Args: @@ -128,36 +142,32 @@ class TimeSplitDataset(Dataset): list, data_shard[node_idx] """ data_file_list = [] - data_time, windows_mins = self._format_data_time(daytime_str, time_window_mins) + data_time, windows_mins = self._format_data_time(daytime_str, + time_window_mins) while time_window_mins > 0: - file_path = self._path_generator.generate_path('data_path', {'time_format': data_time}) + file_path = self._path_generator.generate_path( + 'data_path', {'time_format': data_time}) sub_file_list = self._data_file_handler.ls(file_path) for sub_file in sub_file_list: sub_file_name = self._data_file_handler.get_file_name(sub_file) - if not sub_file_name.startswith(self._config['filename_prefix']): + if not sub_file_name.startswith(self._config[ + 'filename_prefix']): continue if hash(sub_file_name) % node_num == node_idx: data_file_list.append(sub_file) time_window_mins = time_window_mins - self._split_interval - data_time = data_time + datetime.timedelta(minutes=self._split_interval) + data_time = data_time + datetime.timedelta( + minutes=self._split_interval) return data_file_list - -class FluidTimeSplitDataset(TimeSplitDataset): - """ - A Dataset with time split for PaddleFluid - """ - - def __init__(self, config): - """ """ - TimeSplitDataset.__init__(self, config) - def _alloc_dataset(self, file_list): """ """ - dataset = fluid.DatasetFactory().create_dataset(self._config['dataset_type']) + dataset = fluid.DatasetFactory().create_dataset(self._config[ + 'dataset_type']) dataset.set_batch_size(self._config['batch_size']) dataset.set_thread(self._config['load_thread']) - 
dataset.set_hdfs_config(self._config['fs_name'], self._config['fs_ugi']) + dataset.set_hdfs_config(self._config['fs_name'], + self._config['fs_ugi']) dataset.set_pipe_command(self._config['data_converter']) dataset.set_filelist(file_list) dataset.set_use_var(self._config['data_vars']) @@ -173,7 +183,9 @@ class FluidTimeSplitDataset(TimeSplitDataset): while self.check_ready(begin_time, windown_min) == False: print("dataset not ready, time:" + begin_time) time.sleep(30) - file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx']) + file_list = self.get_file_list(begin_time, windown_min, + params['node_num'], + params['node_idx']) self._datasets[begin_time] = self._alloc_dataset(file_list) self._datasets[begin_time].load_into_memory() else: @@ -186,9 +198,12 @@ class FluidTimeSplitDataset(TimeSplitDataset): windown_min = params['time_window_min'] if begin_time not in self._datasets: if self.check_ready(begin_time, windown_min): - file_list = self.get_file_list(begin_time, windown_min, params['node_num'], params['node_idx']) + file_list = self.get_file_list(begin_time, windown_min, + params['node_num'], + params['node_idx']) self._datasets[begin_time] = self._alloc_dataset(file_list) - self._datasets[begin_time].preload_into_memory(self._config['preload_thread']) + self._datasets[begin_time].preload_into_memory(self._config[ + 'preload_thread']) return True return False diff --git a/core/utils/dataset_instance.py b/core/utils/dataset_instance.py new file mode 100755 index 0000000000000000000000000000000000000000..3f0a3a484dfccfbc26216cc9eb09fe8443401078 --- /dev/null +++ b/core/utils/dataset_instance.py @@ -0,0 +1,49 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import sys + +from paddlerec.core.utils.envs import lazy_instance_by_fliename +from paddlerec.core.reader import SlotReader + +if len(sys.argv) < 4: + raise ValueError( + "reader only accept 3 argument: 1. reader_class 2.train/evaluate/slotreader 3.yaml_abs_path" + ) + +reader_package = sys.argv[1] + +if sys.argv[2].upper() == "TRAIN": + reader_name = "TrainReader" +elif sys.argv[2].upper() == "EVALUATE": + reader_name = "EvaluateReader" +else: + reader_name = "SlotReader" + namespace = sys.argv[4] + sparse_slots = sys.argv[5].replace("?", " ") + dense_slots = sys.argv[6].replace("?", " ") + padding = int(sys.argv[7]) + +yaml_abs_path = sys.argv[3] + +if reader_name != "SlotReader": + reader_class = lazy_instance_by_fliename(reader_package, reader_name) + reader = reader_class(yaml_abs_path) + reader.init() + reader.run_from_stdin() +else: + reader = SlotReader(yaml_abs_path) + reader.init(sparse_slots, dense_slots, padding) + reader.run_from_stdin() diff --git a/fleet_rec/core/utils/envs.py b/core/utils/envs.py similarity index 70% rename from fleet_rec/core/utils/envs.py rename to core/utils/envs.py index 2d4cfadea305e2d42bd454f3ddc4048dd593945a..f432950dfa50571cd307d4a370484e35ff77b408 100755 --- a/fleet_rec/core/utils/envs.py +++ b/core/utils/envs.py @@ -12,21 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os +from contextlib import closing import copy -import sys +import os import socket -from contextlib import closing +import sys + global_envs = {} -def flatten_environs(envs): +def flatten_environs(envs, separator="."): flatten_dict = {} assert isinstance(envs, dict) def fatten_env_namespace(namespace_nests, local_envs): if not isinstance(local_envs, dict): - global_k = ".".join(namespace_nests) + global_k = separator.join(namespace_nests) flatten_dict[global_k] = str(local_envs) else: for k, v in local_envs.items(): @@ -35,7 +36,7 @@ def flatten_environs(envs): nests.append(k) fatten_env_namespace(nests, v) else: - global_k = ".".join(namespace_nests + [k]) + global_k = separator.join(namespace_nests + [k]) flatten_dict[global_k] = str(v) for k, v in envs.items(): @@ -67,19 +68,28 @@ def set_global_envs(envs): nests = copy.deepcopy(namespace_nests) nests.append(k) fatten_env_namespace(nests, v) + elif (k == "dataset" or k == "phase" or + k == "runner") and isinstance(v, list): + for i in v: + if i.get("name") is None: + raise ValueError("name must be in dataset list ", v) + nests = copy.deepcopy(namespace_nests) + nests.append(k) + nests.append(i["name"]) + fatten_env_namespace(nests, i) else: global_k = ".".join(namespace_nests + [k]) global_envs[global_k] = v - for k, v in envs.items(): - fatten_env_namespace([k], v) + fatten_env_namespace([], envs) def get_global_env(env_name, default_value=None, namespace=None): """ get os environment value """ - _env_name = env_name if namespace is None else ".".join([namespace, env_name]) + _env_name = env_name if namespace is None else ".".join( + [namespace, env_name]) return global_envs.get(_env_name, default_value) @@ -87,22 +97,32 @@ def get_global_envs(): return global_envs +def path_adapter(path): + if path.startswith("paddlerec."): + package = get_runtime_environ("PACKAGE_BASE") + l_p = path.split("paddlerec.")[1].replace(".", "/") + return os.path.join(package, l_p) + else: + return path + + +def 
windows_path_converter(path): + if get_platform() == "WINDOWS": + return path.replace("/", "\\") + else: + return path.replace("\\", "/") + + def update_workspace(): - workspace = global_envs.get("train.workspace", None) + workspace = global_envs.get("workspace") if not workspace: return - - # is fleet inner models - if workspace.startswith("fleetrec."): - fleet_package = get_runtime_environ("PACKAGE_BASE") - workspace_dir = workspace.split("fleetrec.")[1].replace(".", "/") - path = os.path.join(fleet_package, workspace_dir) - else: - path = workspace + workspace = path_adapter(workspace) for name, value in global_envs.items(): if isinstance(value, str): - value = value.replace("{workspace}", path) + value = value.replace("{workspace}", workspace) + value = windows_path_converter(value) global_envs[name] = value @@ -127,7 +147,7 @@ def pretty_print_envs(envs, header=None): if header: draws += h_format.format(header[0], header[1]) else: - draws += h_format.format("fleetrec Global Envs", "Value") + draws += h_format.format("paddlerec Global Envs", "Value") draws += line + "\n" @@ -147,7 +167,8 @@ def pretty_print_envs(envs, header=None): def lazy_instance_by_package(package, class_name): models = get_global_env("train.model.models") - model_package = __import__(package, globals(), locals(), package.split(".")) + model_package = __import__(package, + globals(), locals(), package.split(".")) instance = getattr(model_package, class_name) return instance @@ -157,7 +178,8 @@ def lazy_instance_by_fliename(abs, class_name): sys.path.append(dirname) package = os.path.splitext(os.path.basename(abs))[0] - model_package = __import__(package, globals(), locals(), package.split(".")) + model_package = __import__(package, + globals(), locals(), package.split(".")) instance = getattr(model_package, class_name) return instance @@ -172,11 +194,12 @@ def get_platform(): if 'Windows' in plats: return "WINDOWS" + def find_free_port(): def __free_port(): - with 
closing(socket.socket(socket.AF_INET, - socket.SOCK_STREAM)) as s: + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: s.bind(('', 0)) return s.getsockname()[1] + new_port = __free_port() return new_port diff --git a/fleet_rec/core/utils/fs.py b/core/utils/fs.py similarity index 90% rename from fleet_rec/core/utils/fs.py rename to core/utils/fs.py index df2379721041197170945ebfd03ccf8f8cc61d0a..fab84496c5761e4214f4e5bb3666960408abf68c 100755 --- a/fleet_rec/core/utils/fs.py +++ b/core/utils/fs.py @@ -13,11 +13,12 @@ # limitations under the License. import os + from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient def is_afs_path(path): - """R + """is_afs_path """ if path.startswith("afs") or path.startswith("hdfs"): return True @@ -28,12 +29,12 @@ class LocalFSClient(object): """ Util for local disk file_system io """ - + def __init__(self): """R """ pass - + def write(self, content, path, mode): """ write to file @@ -43,7 +44,7 @@ class LocalFSClient(object): mode(string): w/a w:clear_write a:append_write """ temp_dir = os.path.dirname(path) - if not os.path.exists(temp_dir): + if not os.path.exists(temp_dir): os.makedirs(temp_dir) f = open(path, mode) f.write(content) @@ -75,7 +76,7 @@ class LocalFSClient(object): """R """ os.system("rm -rf " + path) - + def is_exist(self, path): """R """ @@ -94,13 +95,14 @@ class FileHandler(object): """ A Smart file handler. 
auto judge local/afs by path """ + def __init__(self, config): """R """ if 'fs_name' in config: - hadoop_home="$HADOOP_HOME" + hadoop_home = "$HADOOP_HOME" hdfs_configs = { - "hadoop.job.ugi": config['fs_ugi'], + "hadoop.job.ugi": config['fs_ugi'], "fs.default.name": config['fs_name'] } self._hdfs_client = HDFSClient(hadoop_home, hdfs_configs) @@ -131,7 +133,9 @@ class FileHandler(object): if mode.find('a') >= 0: org_content = self._hdfs_client.cat(dest_path) content = content + org_content - self._local_fs_client.write(content, temp_local_file, mode) #fleet hdfs_client only support upload, so write tmp file + self._local_fs_client.write( + content, temp_local_file, mode + ) # fleet hdfs_client only support upload, so write tmp file self._hdfs_client.delete(dest_path + ".tmp") self._hdfs_client.upload(dest_path + ".tmp", temp_local_file) self._hdfs_client.delete(dest_path + ".bak") @@ -139,7 +143,7 @@ class FileHandler(object): self._hdfs_client.rename(dest_path + ".tmp", dest_path) else: self._local_fs_client.write(content, dest_path, mode) - + def cat(self, path): """R """ @@ -148,19 +152,20 @@ class FileHandler(object): return hdfs_cat else: return self._local_fs_client.cat(path) - + def ls(self, path): """R """ files = [] if is_afs_path(path): files = self._hdfs_client.ls(path) - files = [path + '/' + self.get_file_name(fi) for fi in files] # absulte path + files = [path + '/' + self.get_file_name(fi) + for fi in files] # absulte path else: files = self._local_fs_client.ls(path) files = [path + '/' + fi for fi in files] # absulte path return files - + def cp(self, org_path, dest_path): """R """ @@ -170,6 +175,6 @@ class FileHandler(object): return self._local_fs_client.cp(org_path, dest_path) if not org_is_afs and dest_is_afs: return self._hdfs_client.upload(dest_path, org_path) - if org_is_afs and not dest_is_afs: + if org_is_afs and not dest_is_afs: return self._hdfs_client.download(org_path, dest_path) print("Not Suppor hdfs cp currently") diff --git 
a/fleet_rec/core/utils/table.py b/core/utils/table.py similarity index 97% rename from fleet_rec/core/utils/table.py rename to core/utils/table.py index 7c86d0f03edb47a7835ab7364ebe318985d091e2..558cd26d61b1be165e964b4dea3a1f3dfe82e0ba 100755 --- a/fleet_rec/core/utils/table.py +++ b/core/utils/table.py @@ -12,16 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy -import yaml - class TableMeta(object): """ Simple ParamTable Meta, Contain table_id """ TableId = 1 - + @staticmethod def alloc_new_table(table_id): """ diff --git a/fleet_rec/core/utils/util.py b/core/utils/util.py similarity index 88% rename from fleet_rec/core/utils/util.py rename to core/utils/util.py index b410df1d8bb31fe7e613925e2a731d61924d8835..34f26c6d113faf4739ff621d1087da475414c46f 100755 --- a/fleet_rec/core/utils/util.py +++ b/core/utils/util.py @@ -12,15 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import datetime import os import time -import datetime from paddle import fluid -from fleetrec.core.utils import fs as fs + +from paddlerec.core.utils import fs as fs def save_program_proto(path, program=None): + if program is None: _program = fluid.default_main_program() else: @@ -174,7 +176,8 @@ class PathGenerator(object): """ if template_name in self._templates: if 'time_format' in param: - str = param['time_format'].strftime(self._templates[template_name]) + str = param['time_format'].strftime(self._templates[ + template_name]) return str.format(**param) return self._templates[template_name].format(**param) else: @@ -197,31 +200,39 @@ class TimeTrainPass(object): self._begin_day = make_datetime(day_fields[0].strip()) if len(day_fields) == 1 or len(day_fields[1]) == 0: # 100 years, meaning to continuous running - self._end_day = self._begin_day + datetime.timedelta(days=36500) + self._end_day = self._begin_day + datetime.timedelta( + days=36500) else: # example: 2020212+10 run_day = int(day_fields[1].strip()) - self._end_day = self._begin_day + datetime.timedelta(days=run_day) + self._end_day = self._begin_day + datetime.timedelta( + days=run_day) else: # example: {20191001..20191031} - days = os.popen("echo -n " + self._config['days']).read().split(" ") + days = os.popen("echo -n " + self._config['days']).read().split( + " ") self._begin_day = make_datetime(days[0]) self._end_day = make_datetime(days[len(days) - 1]) self._checkpoint_interval = self._config['checkpoint_interval'] self._dump_inference_interval = self._config['dump_inference_interval'] - self._interval_per_pass = self._config['train_time_interval'] # train N min data per pass + self._interval_per_pass = self._config[ + 'train_time_interval'] # train N min data per pass self._pass_id = 0 self._inference_pass_id = 0 self._pass_donefile_handler = None if 'pass_donefile_name' in self._config: - self._train_pass_donefile = global_config['output_path'] + '/' + self._config['pass_donefile_name'] + 
self._train_pass_donefile = global_config[ + 'output_path'] + '/' + self._config['pass_donefile_name'] if fs.is_afs_path(self._train_pass_donefile): - self._pass_donefile_handler = fs.FileHandler(global_config['io']['afs']) + self._pass_donefile_handler = fs.FileHandler(global_config[ + 'io']['afs']) else: - self._pass_donefile_handler = fs.FileHandler(global_config['io']['local_fs']) + self._pass_donefile_handler = fs.FileHandler(global_config[ + 'io']['local_fs']) - last_done = self._pass_donefile_handler.cat(self._train_pass_donefile).strip().split('\n')[-1] + last_done = self._pass_donefile_handler.cat( + self._train_pass_donefile).strip().split('\n')[-1] done_fileds = last_done.split('\t') if len(done_fileds) > 4: self._base_key = done_fileds[1] @@ -235,15 +246,18 @@ class TimeTrainPass(object): """ return 24 * 60 / self._interval_per_pass - def save_train_progress(self, day, pass_id, base_key, model_path, is_checkpoint): + def save_train_progress(self, day, pass_id, base_key, model_path, + is_checkpoint): """R """ if is_checkpoint: self._checkpoint_pass_id = pass_id self._checkpoint_model_path = model_path - done_content = "%s\t%s\t%s\t%s\t%d\n" % (day, base_key, - self._checkpoint_model_path, self._checkpoint_pass_id, pass_id) - self._pass_donefile_handler.write(done_content, self._train_pass_donefile, 'a') + done_content = "%s\t%s\t%s\t%s\t%d\n" % ( + day, base_key, self._checkpoint_model_path, + self._checkpoint_pass_id, pass_id) + self._pass_donefile_handler.write(done_content, + self._train_pass_donefile, 'a') pass def init_pass_by_id(self, date_str, pass_id): @@ -285,12 +299,14 @@ class TimeTrainPass(object): if self._pass_id < 1: self.init_pass_by_time(self._begin_day.strftime("%Y%m%d%H%M")) else: - next_time = self._current_train_time + datetime.timedelta(minutes=self._interval_per_pass) + next_time = self._current_train_time + datetime.timedelta( + minutes=self._interval_per_pass) if (next_time - self._end_day).total_seconds() > 0: has_next = False 
else: self.init_pass_by_time(next_time.strftime("%Y%m%d%H%M")) - if has_next and (self._inference_pass_id < self._pass_id or self._pass_id < old_pass_id): + if has_next and (self._inference_pass_id < self._pass_id or + self._pass_id < old_pass_id): self._inference_pass_id = self._pass_id - 1 return has_next @@ -318,9 +334,11 @@ class TimeTrainPass(object): Return: date(current_train_time + delta_day) """ - return (self._current_train_time + datetime.timedelta(days=delta_day)).strftime("%Y%m%d") + return (self._current_train_time + datetime.timedelta(days=delta_day) + ).strftime("%Y%m%d") def timestamp(self, delta_day=0): """R """ - return (self._current_train_time + datetime.timedelta(days=delta_day)).timestamp() + return (self._current_train_time + datetime.timedelta(days=delta_day) + ).timestamp() diff --git a/doc/.DS_Store b/doc/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5e5553cf5f7f3c1c2b23fba7fb45834c3a521779 Binary files /dev/null and b/doc/.DS_Store differ diff --git a/doc/__init__.py b/doc/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100755 --- a/doc/__init__.py +++ b/doc/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/doc/benchmark.md b/doc/benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..969d74408070b02ab08bb98d397e4806eb698ff7 --- /dev/null +++ b/doc/benchmark.md @@ -0,0 +1,13 @@ +# PaddleRec Benchmark + +`PaddleRec`中各模型在各种模式下的效果及性能数据将随版本迭代不断更新,欢迎持续关注并监督,如有任何问题,欢迎在[Github Issue](https://github.com/PaddlePaddle/PaddleRec/issues)提出。 + +## [召回模型介绍及Benchmark](../models/recall/readme.md) + +## [排序模型介绍及Benchmark](../models/rank/readme.md) + +## [内容理解模型介绍及Benchmark](../models/contentunderstanding/readme.md) + +## [多任务模型介绍及Benchmark](../models/multitask/readme.md) + +## [树模型介绍及Benchamrk](../models/treebased/README.md) diff --git a/doc/contribute.md b/doc/contribute.md new file mode 100644 index 0000000000000000000000000000000000000000..26770d8ac0b64e9835f7398768d0f81de9383132 --- /dev/null +++ b/doc/contribute.md @@ -0,0 +1,2 @@ +# PaddleRec 贡献代码 +> 占位 diff --git a/doc/custom_dataset_reader.md b/doc/custom_dataset_reader.md new file mode 100644 index 0000000000000000000000000000000000000000..82b0fd12d4f52fe83155fd371f6041be52c8bcba --- /dev/null +++ b/doc/custom_dataset_reader.md @@ -0,0 +1,432 @@ +# PaddleRec 推荐数据集格式 + +当你的数据集格式为[slot:feasign]*这种模式,或者可以预处理为这种格式时,可以直接使用PaddleRec内置的Reader。 +好处是不用自己写Reader了,各个model之间的数据格式也都可以统一成一样的格式。 + +## 数据格式说明 + +假如你的原始数据格式为 + +```bash +