fix demo (#213)

* fix demo * fix * fix * fix code style

fix demo (#213)
* fix demo * fix * fix * fix code style
ddf6ec25 · Chengmo · GitHub · b1f708fc · ddf6ec25 · ddf6ec25
14 changed files
--- a/core/trainers/framework/runner.py
+++ b/core/trainers/framework/runner.py
@@ -19,6 +19,7 @@ import time
 import warnings
 import numpy as np
 import random
+import json
 import logging
 import paddle.fluid as fluid

@@ -147,17 +148,22 @@ class RunnerBase(object):
        metrics_format = []

        if context["is_infer"]:
-            metrics_format.append("\t[Infer]\t{}: {{}}".format("batch"))
+            metrics_format.append("\t[Infer] {}: {{}}".format("batch"))
        else:
-            metrics_format.append("\t[Train]\t{}: {{}}".format("batch"))
+            metrics_format.append("\t[Train]")
+            if "current_epoch" in context:
+                metrics_format.append(" epoch: {}".format(context[
+                    "current_epoch"]))
+            metrics_format.append(" {}: {{}}".format("batch"))

        metrics_format.append("{}: {{:.2f}}s".format("time_each_interval"))

        metrics_names = ["total_batch"]
-
+        metrics_indexes = dict()
        for name, var in metrics.items():
            metrics_names.append(name)
            metrics_varnames.append(var.name)
+            metrics_indexes[var.name] = len(metrics_varnames) - 1
            metrics_format.append("{}: {{}}".format(name))
        metrics_format = ", ".join(metrics_format)

@@ -166,6 +172,7 @@ class RunnerBase(object):
        batch_id = 0
        begin_time = time.time()
        scope = context["model"][model_name]["scope"]
+        runner_results = []
        result = None
        with fluid.scope_guard(scope):
            try:
@@ -182,18 +189,35 @@ class RunnerBase(object):
                    ]
                    metrics.extend(metrics_rets)

+                    batch_runner_result = {}
+                    for k, v in metrics_indexes.items():
+                        batch_runner_result[k] = np.array(metrics_rets[
+                            v]).tolist()
+                    runner_results.append(batch_runner_result)
+
                    if batch_id % fetch_period == 0 and batch_id != 0:
                        end_time = time.time()
                        seconds = end_time - begin_time
                        metrics_logging = metrics[:]
                        metrics_logging = metrics.insert(1, seconds)
                        begin_time = end_time
-
                        logging.info(metrics_format.format(*metrics))
                    batch_id += 1
            except fluid.core.EOFException:
                reader.reset()

+        runner_result_save_path = envs.get_global_env(
+            "runner." + context["runner_name"] + ".runner_result_dump_path",
+            None)
+        if runner_result_save_path:
+            if "current_epoch" in context:
+                runner_result_save_path = runner_result_save_path + "_epoch_{}".format(
+                    context["current_epoch"])
+            logging.info("Dump runner result in {}".format(
+                runner_result_save_path))
+            with open(runner_result_save_path, 'w+') as fout:
+                json.dump(runner_results, fout)
+
        if batch_id > 0:
            result = dict(zip(metrics_names, metrics))
        return result
@@ -402,6 +426,7 @@ class SingleRunner(RunnerBase):
                    filelist = context["file_list"]
                    context["file_list"] = shuffle_files(need_shuffle_files,
                                                         filelist)
+                context["current_epoch"] = epoch
                begin_time = time.time()
                result = self._run(context, model_dict)
                end_time = time.time()
@@ -450,6 +475,7 @@ class PSRunner(RunnerBase):
                filelist = context["file_list"]
                context["file_list"] = shuffle_files(need_shuffle_files,
                                                     filelist)
+            context["current_epoch"] = epoch
            begin_time = time.time()
            result = self._run(context, model_dict)
            end_time = time.time()
@@ -500,6 +526,7 @@ class CollectiveRunner(RunnerBase):
                filelist = context["file_list"]
                context["file_list"] = shuffle_files(need_shuffle_files,
                                                     filelist)
+            context["current_epoch"] = epoch
            begin_time = time.time()
            self._run(context, model_dict)
            end_time = time.time()
@@ -533,6 +560,7 @@ class PslibRunner(RunnerBase):
                filelist = context["file_list"]
                context["file_list"] = shuffle_files(need_shuffle_files,
                                                     filelist)
+            context["current_epoch"] = epoch
            begin_time = time.time()
            self._run(context, model_dict)
            end_time = time.time()

--- a/doc/yaml.md
+++ b/doc/yaml.md
@@ -38,6 +38,7 @@
 |       runner_class_path       |    string    |                           路径                            |    否    |                      自定义runner流程实现的地址                      |
 |      terminal_class_path      |    string    |                           路径                            |    否    |                     自定义terminal流程实现的地址                     |
 |  init_pretraining_model_path  |    string    |                           路径                            |    否    |自定义的startup流程中需要传入这个参数，finetune中需要加载的参数的地址 |
+|  runner_result_dump_path  |    string    |                           路径                            |    否    | 运行中metrics的结果使用json.dump到文件的地址，若是在训练的runner中使用, 会自动加上epoch后缀 |




--- a/models/demo/movie_recommand/README.md
+++ b/models/demo/movie_recommand/README.md
+# PaddleRec 基于 Movielens 数据集的全流程示例
+
+## 模型的详细教程可以查阅： [十分钟！全流程！从零搭建推荐系统](https://aistudio.baidu.com/aistudio/projectdetail/559336)
+
+## 本地运行流程
+
+在本地需要安装`PaddleRec`及`PaddlePaddle`，推荐在`Linux` + `python2.7` 环境下执行此demo
+
+本地运行流程与AiStudio流程基本一致，细节略有区别
+
+### 离线训练
+```shell
+sh train.sh
+```
+
+### 离线测试
+```shell
+sh offline_test.sh
+```
+
+### 模拟在线召回
+```shell
+sh online_recall.sh
+```
+
+### 模拟在线排序
+```shell
+sh online_rank.sh
+```
--- a/models/demo/movie_recommand/data_prepare.sh
+++ b/models/demo/movie_recommand/data_prepare.sh
 cd data

+echo "---> Download movielens 1M data ..."
 wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
+echo "---> Unzip ml-1m.zip ..."
 unzip ml-1m.zip
+rm ml-1m.zip

+echo "---> Split movielens data ..."
 python split.py

-mkdir train/
-mkdir test/
+mkdir -p train/
+mkdir -p test/

+echo "---> Process train & test data ..."
 python process_ml_1m.py process_raw ./ml-1m/train.dat | sort -t $'\t' -k 9 -n > log.data.train
 python process_ml_1m.py process_raw ./ml-1m/test.dat | sort -t $'\t' -k 9 -n > log.data.test
 python process_ml_1m.py hash log.data.train > ./train/data.txt
@@ -15,4 +20,6 @@ python process_ml_1m.py hash log.data.test > ./test/data.txt

 rm log.data.train
 rm log.data.test
-cd ../
+cd ..
+
+echo "---> Finish data process"
--- a/models/demo/movie_recommand/offline_test.sh
+++ b/models/demo/movie_recommand/offline_test.sh
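-## modify config.yaml to infer mode at first
+## Offline test of the recall and rank models; each stage reads its own config_test_offline.yaml, so config.yaml does not need to be switched to infer mode first.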

-cd recall
-python -m paddlerec.run -m ./config.yaml
-cd ../rank
-python -m paddlerec.run -m ./config.yaml
-cd ..
+# Run inference for both stages from the demo root; the echoed paths name the
+# exact config files passed to paddlerec.run below.
+echo "Recall offline test ..."
+echo "Model config at models/demo/movie_recommand/recall/config_test_offline.yaml"
+python -m paddlerec.run -m ./recall/config_test_offline.yaml
+
+echo "Rank offline test ..."
+echo "Model config at models/demo/movie_recommand/rank/config_test_offline.yaml"
+python -m paddlerec.run -m ./rank/config_test_offline.yaml

 echo "recall offline test result:"
 python parse.py recall_offline recall/infer_result
+
 echo "rank offline test result:"
 python parse.py rank_offline rank/infer_result
--- a/models/demo/movie_recommand/online_rank.sh
+++ b/models/demo/movie_recommand/online_rank.sh
 cd data
+echo "Create online test data ..."
+# online_recall.sh also creates this directory; create it here as well so the
+# redirect below does not fail when this script is run on its own.
+mkdir -p online_user/test
 python process_ml_1m.py data_rank > online_user/test/data.txt

-## modify recall/config.yaml to online_infer mode
-cd ../rank
-python -m paddlerec.run -m ./config.yaml
-cd ../
-python parse.py rank_online rank/infer_result
+cd ..
+# The echoed path names the exact config file passed to paddlerec.run below.
+echo "Rank online test ..."
+echo "Model config at models/demo/movie_recommand/rank/config_test_online.yaml"
+python -m paddlerec.run -m ./rank/config_test_online.yaml
+python parse.py rank_online ./rank/infer_result
--- a/models/demo/movie_recommand/online_recall.sh
+++ b/models/demo/movie_recommand/online_recall.sh
 cd data
-mkdir online_user/test
+echo "Create online test data ..."
+# -p: do not fail when the directory is left over from a previous run
+# (same convention as data_prepare.sh).
+mkdir -p online_user/test
 python process_ml_1m.py data_recall > online_user/test/data.txt

-## modify recall/config.yaml to online_infer mode
-cd ../recall
-python -m paddlerec.run -m ./config.yaml
-cd ../
+cd ..
+# The echoed path names the exact config file passed to paddlerec.run below.
+echo "Recall online test ..."
+echo "Model config at models/demo/movie_recommand/recall/config_test_online.yaml"
+python -m paddlerec.run -m ./recall/config_test_online.yaml
 python parse.py recall_online recall/infer_result
--- a/models/demo/movie_recommand/rank/config.yaml
+++ b/models/demo/movie_recommand/rank/config.yaml
@@ -12,26 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-workspace: "models/demo/movie_recommand"
+workspace: "./"

 # list of dataset
 dataset:
 - name: dataset_train # name of dataset to distinguish different datasets
-  batch_size: 128
-  type: QueueDataset 
-  data_path: "{workspace}/data/train"
-  sparse_slots: "logid time userid gender age occupation movieid title genres label"
-  dense_slots: ""
- name: dataset_infer # name
  batch_size: 128
  type: DataLoader 
-  data_path: "{workspace}/data/test"
-  sparse_slots: "logid time userid gender age occupation movieid title genres label"
-  dense_slots: ""
- name: dataset_online_infer # name
-  batch_size: 10
-  type: DataLoader
-  data_path: "{workspace}/data/online_user/test"
+  data_path: "{workspace}/data/train"
  sparse_slots: "logid time userid gender age occupation movieid title genres label"
  dense_slots: ""

@@ -51,42 +39,17 @@ hyper_parameters:
 # train
 mode: runner_train

-## online or offline infer
-#mode: runner_infer
 runner:
 - name: runner_train
  class: train
  save_checkpoint_interval: 1 # save model interval of epochs
-  save_inference_interval: 1 # save inference
-  save_checkpoint_path: "increment" # save checkpoint path
-  save_inference_path: "inference" # save inference path
+  save_checkpoint_path: "increment_rank" # save checkpoint path
  epochs: 10
  device: cpu

- name: runner_infer
-  class: infer
-  print_interval: 10000
-  init_model_path: "increment/9" # load model path
-
 #train
 phase:
 - name: phase1
-  model: "{workspace}/model.py" # user-defined model
+  model: "{workspace}/rank/model.py" # user-defined model
  dataset_name: dataset_train # select dataset by name
-  thread_num: 12
-
-##offline infer
-#phase:
-#- name: phase1
-#  model: "{workspace}/model.py" # user-defined model
-#  dataset_name: dataset_infer # select dataset by name
-#  save_path: "./infer_result"
-#  thread_num: 1
-
-##offline infer
-#phase:
-#- name: phase1
-#  model: "{workspace}/model.py" # user-defined model
-#  dataset_name: dataset_online_infer # select dataset by name
-#  save_path: "./infer_result"
-#  thread_num: 1
+  thread_num: 4
--- a/models/demo/movie_recommand/rank/config_test_offline.yaml
+++ b/models/demo/movie_recommand/rank/config_test_offline.yaml
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#workspace: "paddlerec.models.demo.movie_recommand"
+workspace: "./"
+
+# list of dataset
+dataset:
+- name: dataset_infer # name
+  batch_size: 128
+  type: DataLoader
+  data_path: "{workspace}/data/test"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+
+# hyper parameters of user-defined network
+hyper_parameters:
+  # optimizer config
+  optimizer:
+    class: Adam
+    learning_rate: 0.001
+    strategy: async
+  # user-defined <key, value> pairs
+  sparse_feature_number: 60000000
+  sparse_feature_dim: 9
+  dense_input_dim: 13
+  fc_sizes: [512, 256, 128, 32]
+
+# infer
+mode: runner_infer
+
+## online or offline infer
+#mode: runner_infer
+runner:
+- name: runner_infer
+  epochs: 1
+  device: cpu
+  class: infer
+  print_interval: 10000
+  runner_result_dump_path: "{workspace}/rank/infer_result"
+  init_model_path: "increment_rank/9" # load model path
+
+#offline infer
+phase:
+- name: phase1
+  model: "{workspace}/rank/model.py" # user-defined model
+  dataset_name: dataset_infer # select dataset by name
+  thread_num: 1
+  
--- a/models/demo/movie_recommand/rank/config_test_online.yaml
+++ b/models/demo/movie_recommand/rank/config_test_online.yaml
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+workspace: "./"
+
+# list of dataset
+dataset:
+- name: dataset_online_infer # name
+  batch_size: 10
+  type: DataLoader
+  data_path: "{workspace}/data/online_user/test"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+
+# hyper parameters of user-defined network
+hyper_parameters:
+  # optimizer config
+  optimizer:
+    class: Adam
+    learning_rate: 0.001
+    strategy: async
+  # user-defined <key, value> pairs
+  sparse_feature_number: 60000000
+  sparse_feature_dim: 9
+  dense_input_dim: 13
+  fc_sizes: [512, 256, 128, 32]
+
+# infer
+mode: runner_infer
+
+runner:
+- name: runner_infer
+  epochs: 1
+  device: cpu
+  class: infer
+  print_interval: 10000
+  runner_result_dump_path: "{workspace}/rank/infer_result"
+  init_model_path: "increment_rank/9" # load model path
+
+# online infer
+phase:
+- name: phase1
+  model: "{workspace}/rank/model.py" # user-defined model
+  dataset_name: dataset_online_infer # select dataset by name
+  thread_num: 1
+  
--- a/models/demo/movie_recommand/recall/config.yaml
+++ b/models/demo/movie_recommand/recall/config.yaml
@@ -12,26 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-workspace: "models/demo/movie_recommand"
+workspace: "./"

 # list of dataset
 dataset:
 - name: dataset_train # name of dataset to distinguish different datasets
-  batch_size: 128
-  type: QueueDataset 
-  data_path: "{workspace}/data/train"
-  sparse_slots: "logid time userid gender age occupation movieid title genres label"
-  dense_slots: ""
- name: dataset_infer # name
  batch_size: 128
  type: DataLoader 
-  data_path: "{workspace}/data/test"
-  sparse_slots: "logid time userid gender age occupation movieid title genres label"
-  dense_slots: ""
- name: dataset_online_infer # name
-  batch_size: 128
-  type: DataLoader
-  data_path: "{workspace}/data/online_user/test"
+  data_path: "{workspace}/data/train"
  sparse_slots: "logid time userid gender age occupation movieid title genres label"
  dense_slots: ""

@@ -50,43 +38,17 @@ hyper_parameters:

 # train
 mode: runner_train
-
-## online or offline infer
-#mode: runner_infer
 runner:
 - name: runner_train
  class: train
  save_checkpoint_interval: 1 # save model interval of epochs
-  save_inference_interval: 1 # save inference
-  save_checkpoint_path: "increment" # save checkpoint path
-  save_inference_path: "inference" # save inference path
+  save_checkpoint_path: "increment_recall" # save checkpoint path
  epochs: 10
  device: cpu

- name: runner_infer
-  class: infer
-  print_interval: 10000
-  init_model_path: "increment/9" # load model path
-
 #train
 phase:
 - name: phase1
-  model: "{workspace}/model.py" # user-defined model
+  model: "{workspace}/recall/model.py" # user-defined model
  dataset_name: dataset_train # select dataset by name
-  thread_num: 12
-
-##offline infer
-#phase:
-#- name: phase1
-#  model: "{workspace}/model.py" # user-defined model
-#  dataset_name: dataset_infer # select dataset by name
-#  save_path: "./infer_result"
-#  thread_num: 1
-
-##offline infer
-#phase:
-#- name: phase1
-#  model: "{workspace}/model.py" # user-defined model
-#  dataset_name: dataset_online_infer # select dataset by name
-#  save_path: "./infer_result"
-#  thread_num: 1
+  thread_num: 4
--- a/models/demo/movie_recommand/recall/config_test_offline.yaml
+++ b/models/demo/movie_recommand/recall/config_test_offline.yaml
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#workspace: "paddlerec.models.demo.movie_recommand"
+workspace: "./"
+# list of dataset
+dataset:
+- name: dataset_infer # name
+  batch_size: 128
+  type: DataLoader
+  data_path: "{workspace}/data/test"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+
+# hyper parameters of user-defined network
+hyper_parameters:
+  # optimizer config
+  optimizer:
+    class: Adam
+    learning_rate: 0.001
+    strategy: async
+  # user-defined <key, value> pairs
+  sparse_feature_number: 60000000
+  sparse_feature_dim: 9
+  dense_input_dim: 13
+  fc_sizes: [512, 256, 128, 32]
+
+# infer
+mode: runner_infer
+
+runner:
+- name: runner_infer
+  epochs: 1
+  device: cpu
+  class: infer
+  print_interval: 100000
+  runner_result_dump_path: "{workspace}/recall/infer_result"
+  init_model_path: "increment_recall/9" # load model path
+
+
+#offline infer
+phase:
+- name: phase1
+  model: "{workspace}/recall/model.py" # user-defined model
+  dataset_name: dataset_infer 
+  thread_num: 1
--- a/models/demo/movie_recommand/recall/config_test_online.yaml
+++ b/models/demo/movie_recommand/recall/config_test_online.yaml
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#workspace: "paddlerec.models.demo.movie_recommand"
+workspace: ./
+# list of dataset
+dataset:
+- name: dataset_online_infer # name
+  batch_size: 128
+  type: DataLoader
+  data_path: "{workspace}/data/online_user/test"
+  sparse_slots: "logid time userid gender age occupation movieid title genres label"
+  dense_slots: ""
+
+# hyper parameters of user-defined network
+hyper_parameters:
+  # optimizer config
+  optimizer:
+    class: Adam
+    learning_rate: 0.001
+    strategy: async
+  # user-defined <key, value> pairs
+  sparse_feature_number: 60000000
+  sparse_feature_dim: 9
+  dense_input_dim: 13
+  fc_sizes: [512, 256, 128, 32]
+
+# infer
+mode: runner_infer
+
+## online or offline infer
+#mode: runner_infer
+runner:
+- name: runner_infer
+  epochs: 1
+  device: cpu
+  class: infer
+  print_interval: 10000
+  runner_result_dump_path: "{workspace}/recall/infer_result"
+  init_model_path: "increment_recall/9" # load model path
+
+# online infer
+phase:
+- name: phase1
+  model: "{workspace}/recall/model.py" # user-defined model
+  dataset_name: dataset_online_infer # select dataset by name
+  thread_num: 1
+  
--- a/models/demo/movie_recommand/train.sh
+++ b/models/demo/movie_recommand/train.sh
-cd recall
-python -m paddlerec.run -m ./config.yaml &> log &
-cd ../rank
-python -m paddlerec.run -m ./config.yaml &> log &
-cd ..
+# Train the recall model and then the rank model, one after the other in the
+# foreground (the previous version launched both in the background and
+# redirected their output to ./log). Run from the demo root so the relative
+# config paths below resolve.
+echo "Recall offline training ..."
+echo "Model config at models/demo/movie_recommand/recall/config.yaml"
+python -m paddlerec.run -m ./recall/config.yaml 
+
+echo "----------------------------------------"
+echo "Rank offline training ..."
+echo "Model config at models/demo/movie_recommand/rank/config.yaml"
+python -m paddlerec.run -m ./rank/config.yaml