diff --git a/core/trainers/framework/runner.py b/core/trainers/framework/runner.py index 0229a3a8c6e245df5d6530c48dcca0d8a0638306..8bf64a71eceaf9f9df204e487934d5129b8a32a5 100644 --- a/core/trainers/framework/runner.py +++ b/core/trainers/framework/runner.py @@ -19,6 +19,7 @@ import time import warnings import numpy as np import random +import json import logging import paddle.fluid as fluid @@ -147,17 +148,22 @@ class RunnerBase(object): metrics_format = [] if context["is_infer"]: - metrics_format.append("\t[Infer]\t{}: {{}}".format("batch")) + metrics_format.append("\t[Infer] {}: {{}}".format("batch")) else: - metrics_format.append("\t[Train]\t{}: {{}}".format("batch")) + metrics_format.append("\t[Train]") + if "current_epoch" in context: + metrics_format.append(" epoch: {}".format(context[ + "current_epoch"])) + metrics_format.append(" {}: {{}}".format("batch")) metrics_format.append("{}: {{:.2f}}s".format("time_each_interval")) metrics_names = ["total_batch"] - + metrics_indexes = dict() for name, var in metrics.items(): metrics_names.append(name) metrics_varnames.append(var.name) + metrics_indexes[var.name] = len(metrics_varnames) - 1 metrics_format.append("{}: {{}}".format(name)) metrics_format = ", ".join(metrics_format) @@ -166,6 +172,7 @@ class RunnerBase(object): batch_id = 0 begin_time = time.time() scope = context["model"][model_name]["scope"] + runner_results = [] result = None with fluid.scope_guard(scope): try: @@ -182,18 +189,35 @@ class RunnerBase(object): ] metrics.extend(metrics_rets) + batch_runner_result = {} + for k, v in metrics_indexes.items(): + batch_runner_result[k] = np.array(metrics_rets[ + v]).tolist() + runner_results.append(batch_runner_result) + if batch_id % fetch_period == 0 and batch_id != 0: end_time = time.time() seconds = end_time - begin_time metrics_logging = metrics[:] metrics_logging = metrics.insert(1, seconds) begin_time = end_time - logging.info(metrics_format.format(*metrics)) batch_id += 1 except fluid.core.EOFException: reader.reset() + runner_result_save_path = envs.get_global_env( + "runner." 
+ context["runner_name"] + ".runner_result_dump_path", + None) + if runner_result_save_path: + if "current_epoch" in context: + runner_result_save_path = runner_result_save_path + "_epoch_{}".format( + context["current_epoch"]) + logging.info("Dump runner result in {}".format( + runner_result_save_path)) + with open(runner_result_save_path, 'w+') as fout: + json.dump(runner_results, fout) + if batch_id > 0: result = dict(zip(metrics_names, metrics)) return result @@ -402,6 +426,7 @@ class SingleRunner(RunnerBase): filelist = context["file_list"] context["file_list"] = shuffle_files(need_shuffle_files, filelist) + context["current_epoch"] = epoch begin_time = time.time() result = self._run(context, model_dict) end_time = time.time() @@ -450,6 +475,7 @@ class PSRunner(RunnerBase): filelist = context["file_list"] context["file_list"] = shuffle_files(need_shuffle_files, filelist) + context["current_epoch"] = epoch begin_time = time.time() result = self._run(context, model_dict) end_time = time.time() @@ -500,6 +526,7 @@ class CollectiveRunner(RunnerBase): filelist = context["file_list"] context["file_list"] = shuffle_files(need_shuffle_files, filelist) + context["current_epoch"] = epoch begin_time = time.time() self._run(context, model_dict) end_time = time.time() @@ -533,6 +560,7 @@ class PslibRunner(RunnerBase): filelist = context["file_list"] context["file_list"] = shuffle_files(need_shuffle_files, filelist) + context["current_epoch"] = epoch begin_time = time.time() self._run(context, model_dict) end_time = time.time() diff --git a/doc/yaml.md b/doc/yaml.md index 3652e935922c2d6bf4e9054e02ebfe221a86fc1b..4c517f43e76872246bc1919d01f77c55e56104ea 100644 --- a/doc/yaml.md +++ b/doc/yaml.md @@ -38,6 +38,7 @@ | runner_class_path | string | 路径 | 否 | 自定义runner流程实现的地址 | | terminal_class_path | string | 路径 | 否 | 自定义terminal流程实现的地址 | | init_pretraining_model_path | string | 路径 | 否 |自定义的startup流程中需要传入这个参数,finetune中需要加载的参数的地址 | +| runner_result_dump_path | string | 路径 | 否 | 运行中metrics的结果使用json.dump到文件的地址,若是在训练的runner中使用, 会自动加上epoch后缀 | diff --git a/models/demo/movie_recommand/README.md b/models/demo/movie_recommand/README.md new file mode 100644 index 0000000000000000000000000000000000000000..552806840877c356a26d4a535ffee927a7ae0ed4 --- /dev/null +++ b/models/demo/movie_recommand/README.md @@ -0,0 +1,29 @@ +# PaddleRec 基于 Movielens 数据集的全流程示例 + +## 模型的详细教程可以查阅: [十分钟!全流程!从零搭建推荐系统](https://aistudio.baidu.com/aistudio/projectdetail/559336) + +## 本地运行流程 + +在本地需要安装`PaddleRec`及`PaddlePaddle`,推荐在`Linux` + `python2.7` 环境下执行此demo + +本地运行流程与AiStudio流程基本一致,细节略有区别 + +### 离线训练 +```shell +sh train.sh +``` + +### 离线测试 +```shell +sh offline_test.sh +``` + +### 模拟在线召回 +```shell +sh online_recall.sh +``` + +### 模拟在线排序 +```shell +sh online_rank.sh +``` diff --git a/models/demo/movie_recommand/data_prepare.sh b/models/demo/movie_recommand/data_prepare.sh index f99b5b273b4ed496030cfe46bf228ae32159ee26..bf9812c352dff030e358a78e3bbf9a646058c89c 100644 --- a/models/demo/movie_recommand/data_prepare.sh +++ b/models/demo/movie_recommand/data_prepare.sh @@ -1,13 +1,18 @@ cd data +echo "---> Download movielens 1M data ..." wget http://files.grouplens.org/datasets/movielens/ml-1m.zip +echo "---> Unzip ml-1m.zip ..." unzip ml-1m.zip +rm ml-1m.zip +echo "---> Split movielens data ..." python split.py -mkdir train/ -mkdir test/ +mkdir -p train/ +mkdir -p test/ +echo "---> Process train & test data ..." 
 python process_ml_1m.py process_raw ./ml-1m/train.dat | sort -t $'\t' -k 9 -n > log.data.train
 python process_ml_1m.py process_raw ./ml-1m/test.dat | sort -t $'\t' -k 9 -n > log.data.test
 python process_ml_1m.py hash log.data.train > ./train/data.txt
@@ -15,4 +20,6 @@
 python process_ml_1m.py hash log.data.test > ./test/data.txt
 rm log.data.train
 rm log.data.test
-cd ../
+cd ..
+
+echo "---> Finish data process"
diff --git a/models/demo/movie_recommand/offline_test.sh b/models/demo/movie_recommand/offline_test.sh
index 88bf29cebf25b185bcdbb13cf64db5b0984b7704..98a04fd1712e53e11633cc5e87327492a47e6213 100644
--- a/models/demo/movie_recommand/offline_test.sh
+++ b/models/demo/movie_recommand/offline_test.sh
@@ -1,12 +1,15 @@
 ## modify config.yaml to infer mode at first
-cd recall
-python -m paddlerec.run -m ./config.yaml
-cd ../rank
-python -m paddlerec.run -m ./config.yaml
-cd ..
+echo "Recall offline test ..."
+echo "Model config at models/demo/movie_recommand/recall/config_test_offline.yaml"
+python -m paddlerec.run -m ./recall/config_test_offline.yaml
+
+echo "Rank offline test ..."
+echo "Model config at models/demo/movie_recommand/rank/config_test_offline.yaml"
+python -m paddlerec.run -m ./rank/config_test_offline.yaml
 echo "recall offline test result:"
 python parse.py recall_offline recall/infer_result
+
 echo "rank offline test result:"
 python parse.py rank_offline rank/infer_result
diff --git a/models/demo/movie_recommand/online_rank.sh b/models/demo/movie_recommand/online_rank.sh
index f2f5f167493e1c35f824f0bd87a922d25f832191..9a9c376ffcec4581c2c5212f645d1a9aafbdf7a3 100644
--- a/models/demo/movie_recommand/online_rank.sh
+++ b/models/demo/movie_recommand/online_rank.sh
@@ -1,8 +1,9 @@
 cd data
+echo "Create online test data ..."
 python process_ml_1m.py data_rank > online_user/test/data.txt
-## modify recall/config.yaml to online_infer mode
-cd ../rank
-python -m paddlerec.run -m ./config.yaml
-cd ../
-python parse.py rank_online rank/infer_result
+cd ..
+echo "Rank online test ..."
+echo "Model config at models/demo/movie_recommand/rank/config_test_online.yaml"
+python -m paddlerec.run -m ./rank/config_test_online.yaml
+python parse.py rank_online ./rank/infer_result
diff --git a/models/demo/movie_recommand/online_recall.sh b/models/demo/movie_recommand/online_recall.sh
index 23fa7912c2f173310da7f73694833aeaa59646df..2cd47aa321f213313e2edf279f5e8c9ce8fcdd34 100644
--- a/models/demo/movie_recommand/online_recall.sh
+++ b/models/demo/movie_recommand/online_recall.sh
@@ -1,9 +1,10 @@
 cd data
+echo "Create online test data ..."
 mkdir online_user/test
 python process_ml_1m.py data_recall > online_user/test/data.txt
-## modify recall/config.yaml to online_infer mode
-cd ../recall
-python -m paddlerec.run -m ./config.yaml
-cd ../
+cd ..
+echo "Recall online test ..."
+echo "Model config at models/demo/movie_recommand/recall/config_test_online.yaml"
+python -m paddlerec.run -m ./recall/config_test_online.yaml
 python parse.py recall_online recall/infer_result
diff --git a/models/demo/movie_recommand/rank/config.yaml b/models/demo/movie_recommand/rank/config.yaml
index e5834178a98e7132fc85ed25f4a2a509dc979e9c..bce49150488330d0d42e6ad45657f0d4bae3cdba 100644
--- a/models/demo/movie_recommand/rank/config.yaml
+++ b/models/demo/movie_recommand/rank/config.yaml
@@ -12,28 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-workspace: "models/demo/movie_recommand" +workspace: "./" # list of dataset dataset: - name: dataset_train # name of dataset to distinguish different datasets batch_size: 128 - type: QueueDataset + type: DataLoader data_path: "{workspace}/data/train" sparse_slots: "logid time userid gender age occupation movieid title genres label" dense_slots: "" -- name: dataset_infer # name - batch_size: 128 - type: DataLoader - data_path: "{workspace}/data/test" - sparse_slots: "logid time userid gender age occupation movieid title genres label" - dense_slots: "" -- name: dataset_online_infer # name - batch_size: 10 - type: DataLoader - data_path: "{workspace}/data/online_user/test" - sparse_slots: "logid time userid gender age occupation movieid title genres label" - dense_slots: "" # hyper parameters of user-defined network hyper_parameters: @@ -51,42 +39,17 @@ hyper_parameters: # train mode: runner_train -## online or offline infer -#mode: runner_infer runner: - name: runner_train class: train save_checkpoint_interval: 1 # save model interval of epochs - save_inference_interval: 1 # save inference - save_checkpoint_path: "increment" # save checkpoint path - save_inference_path: "inference" # save inference path + save_checkpoint_path: "increment_rank" # save checkpoint path epochs: 10 device: cpu -- name: runner_infer - class: infer - print_interval: 10000 - init_model_path: "increment/9" # load model path - #train phase: - name: phase1 - model: "{workspace}/model.py" # user-defined model + model: "{workspace}/rank/model.py" # user-defined model dataset_name: dataset_train # select dataset by name - thread_num: 12 - -##offline infer -#phase: -#- name: phase1 -# model: "{workspace}/model.py" # user-defined model -# dataset_name: dataset_infer # select dataset by name -# save_path: "./infer_result" -# thread_num: 1 - -##offline infer -#phase: -#- name: phase1 -# model: "{workspace}/model.py" # user-defined model -# dataset_name: dataset_online_infer # select dataset by name -# save_path: "./infer_result" -# thread_num: 1 + thread_num: 4 diff --git a/models/demo/movie_recommand/rank/config_test_offline.yaml b/models/demo/movie_recommand/rank/config_test_offline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9281e844c03ae358a1af45b3aa78d1295c0d8b12 --- /dev/null +++ b/models/demo/movie_recommand/rank/config_test_offline.yaml @@ -0,0 +1,60 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#workspace: "paddlerec.models.demo.movie_recommand" +workspace: "./" + +# list of dataset +dataset: +- name: dataset_infer # name + batch_size: 128 + type: DataLoader + data_path: "{workspace}/data/test" + sparse_slots: "logid time userid gender age occupation movieid title genres label" + dense_slots: "" + +# hyper parameters of user-defined network +hyper_parameters: + # optimizer config + optimizer: + class: Adam + learning_rate: 0.001 + strategy: async + # user-defined pairs + sparse_feature_number: 60000000 + sparse_feature_dim: 9 + dense_input_dim: 13 + fc_sizes: [512, 256, 128, 32] + +# train +mode: runner_infer + +## online or offline infer +#mode: runner_infer +runner: +- name: runner_infer + epochs: 1 + device: cpu + class: infer + print_interval: 10000 + runner_result_dump_path: "{workspace}/rank/infer_result" + init_model_path: "increment_rank/9" # load model path + +#offline infer +phase: +- name: phase1 + model: "{workspace}/rank/model.py" # user-defined model + dataset_name: dataset_infer # select dataset by name + thread_num: 1 + diff --git a/models/demo/movie_recommand/rank/config_test_online.yaml b/models/demo/movie_recommand/rank/config_test_online.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ade082c4731b9d0d1f81975aa719cd873e150ab --- /dev/null +++ b/models/demo/movie_recommand/rank/config_test_online.yaml @@ -0,0 +1,57 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +workspace: "./" + +# list of dataset +dataset: +- name: dataset_online_infer # name + batch_size: 10 + type: DataLoader + data_path: "{workspace}/data/online_user/test" + sparse_slots: "logid time userid gender age occupation movieid title genres label" + dense_slots: "" + +# hyper parameters of user-defined network +hyper_parameters: + # optimizer config + optimizer: + class: Adam + learning_rate: 0.001 + strategy: async + # user-defined pairs + sparse_feature_number: 60000000 + sparse_feature_dim: 9 + dense_input_dim: 13 + fc_sizes: [512, 256, 128, 32] + +# train +mode: runner_infer + +runner: +- name: runner_infer + epochs: 1 + device: cpu + class: infer + print_interval: 10000 + runner_result_dump_path: "{workspace}/rank/infer_result" + init_model_path: "increment_rank/9" # load model path + +#offline infer +phase: +- name: phase1 + model: "{workspace}/rank/model.py" # user-defined model + dataset_name: dataset_online_infer # select dataset by name + thread_num: 1 + diff --git a/models/demo/movie_recommand/recall/config.yaml b/models/demo/movie_recommand/recall/config.yaml index 63ca1c9c42cc232c4873578991b4534f1aa5f325..852241f3a5f24654cac340d088848faf23c597f1 100644 --- a/models/demo/movie_recommand/recall/config.yaml +++ b/models/demo/movie_recommand/recall/config.yaml @@ -12,28 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-workspace: "models/demo/movie_recommand" +workspace: "./" # list of dataset dataset: - name: dataset_train # name of dataset to distinguish different datasets batch_size: 128 - type: QueueDataset + type: DataLoader data_path: "{workspace}/data/train" sparse_slots: "logid time userid gender age occupation movieid title genres label" dense_slots: "" -- name: dataset_infer # name - batch_size: 128 - type: DataLoader - data_path: "{workspace}/data/test" - sparse_slots: "logid time userid gender age occupation movieid title genres label" - dense_slots: "" -- name: dataset_online_infer # name - batch_size: 128 - type: DataLoader - data_path: "{workspace}/data/online_user/test" - sparse_slots: "logid time userid gender age occupation movieid title genres label" - dense_slots: "" # hyper parameters of user-defined network hyper_parameters: @@ -50,43 +38,17 @@ hyper_parameters: # train mode: runner_train - -## online or offline infer -#mode: runner_infer runner: - name: runner_train class: train save_checkpoint_interval: 1 # save model interval of epochs - save_inference_interval: 1 # save inference - save_checkpoint_path: "increment" # save checkpoint path - save_inference_path: "inference" # save inference path + save_checkpoint_path: "increment_recall" # save checkpoint path epochs: 10 device: cpu -- name: runner_infer - class: infer - print_interval: 10000 - init_model_path: "increment/9" # load model path - #train phase: - name: phase1 - model: "{workspace}/model.py" # user-defined model + model: "{workspace}/recall/model.py" # user-defined model dataset_name: dataset_train # select dataset by name - thread_num: 12 - -##offline infer -#phase: -#- name: phase1 -# model: "{workspace}/model.py" # user-defined model -# dataset_name: dataset_infer # select dataset by name -# save_path: "./infer_result" -# thread_num: 1 - -##offline infer -#phase: -#- name: phase1 -# model: "{workspace}/model.py" # user-defined model -# dataset_name: dataset_online_infer # select dataset by name -# save_path: "./infer_result" -# thread_num: 1 + thread_num: 4 diff --git a/models/demo/movie_recommand/recall/config_test_offline.yaml b/models/demo/movie_recommand/recall/config_test_offline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c246ad15a767634e569aa6a3fd851ea210d519d2 --- /dev/null +++ b/models/demo/movie_recommand/recall/config_test_offline.yaml @@ -0,0 +1,57 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#workspace: "paddlerec.models.demo.movie_recommand" +workspace: "./" +# list of dataset +dataset: +- name: dataset_infer # name + batch_size: 128 + type: DataLoader + data_path: "{workspace}/data/test" + sparse_slots: "logid time userid gender age occupation movieid title genres label" + dense_slots: "" + +# hyper parameters of user-defined network +hyper_parameters: + # optimizer config + optimizer: + class: Adam + learning_rate: 0.001 + strategy: async + # user-defined pairs + sparse_feature_number: 60000000 + sparse_feature_dim: 9 + dense_input_dim: 13 + fc_sizes: [512, 256, 128, 32] + +# train +mode: runner_infer + +runner: +- name: runner_infer + epochs: 1 + device: cpu + class: infer + print_interval: 100000 + runner_result_dump_path: "{workspace}/recall/infer_result" + init_model_path: "increment_recall/9" # load model path + + +#offline infer +phase: +- name: phase1 + model: "{workspace}/recall/model.py" # user-defined model + dataset_name: dataset_infer + thread_num: 1 diff --git a/models/demo/movie_recommand/recall/config_test_online.yaml b/models/demo/movie_recommand/recall/config_test_online.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d21f7fe49ef99dc24748209386f9f28fc43be083 --- /dev/null +++ b/models/demo/movie_recommand/recall/config_test_online.yaml @@ -0,0 +1,59 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#workspace: "paddlerec.models.demo.movie_recommand" +workspace: ./ +# list of dataset +dataset: +- name: dataset_online_infer # name + batch_size: 128 + type: DataLoader + data_path: "{workspace}/data/online_user/test" + sparse_slots: "logid time userid gender age occupation movieid title genres label" + dense_slots: "" + +# hyper parameters of user-defined network +hyper_parameters: + # optimizer config + optimizer: + class: Adam + learning_rate: 0.001 + strategy: async + # user-defined pairs + sparse_feature_number: 60000000 + sparse_feature_dim: 9 + dense_input_dim: 13 + fc_sizes: [512, 256, 128, 32] + +# train +mode: runner_infer + +## online or offline infer +#mode: runner_infer +runner: +- name: runner_infer + epochs: 1 + device: cpu + class: infer + print_interval: 10000 + runner_result_dump_path: "{workspace}/recall/infer_result" + init_model_path: "increment_recall/9" # load model path + +#offline infer +phase: +- name: phase1 + model: "{workspace}/recall/model.py" # user-defined model + dataset_name: dataset_online_infer # select dataset by name + thread_num: 1 + diff --git a/models/demo/movie_recommand/train.sh b/models/demo/movie_recommand/train.sh index 47756c1414030bf3cd5da0532198eedf19eff3e0..ad32edda3036e22ef9eee86946456d67e1f42f59 100644 --- a/models/demo/movie_recommand/train.sh +++ b/models/demo/movie_recommand/train.sh @@ -1,5 +1,8 @@ -cd recall -python -m paddlerec.run -m ./config.yaml &> log & -cd ../rank -python -m paddlerec.run -m ./config.yaml &> log & -cd .. +echo "Recall offline training ..." 
+echo "Model config at models/demo/movie_recommand/recall/config.yaml" +python -m paddlerec.run -m ./recall/config.yaml + +echo "----------------------------------------" +echo "Rank offline training ..." +echo "Model config at models/demo/movie_recommand/rank/config.yaml" +python -m paddlerec.run -m ./rank/config.yaml