diff --git a/core/trainers/framework/dataset.py b/core/trainers/framework/dataset.py index 00652e358e9919f85ed9e8938dda8e2122de2fb6..273e3a2ab4823fb5dd3ee1adcb5eb2b50e2f4bd2 100644 --- a/core/trainers/framework/dataset.py +++ b/core/trainers/framework/dataset.py @@ -15,13 +15,13 @@ from __future__ import print_function import os -import warnings import paddle.fluid as fluid from paddlerec.core.utils import envs from paddlerec.core.utils import dataloader_instance from paddlerec.core.reader import SlotReader from paddlerec.core.trainer import EngineMode +from paddlerec.core.utils.util import split_files __all__ = ["DatasetBase", "DataLoader", "QueueDataset"] @@ -123,7 +123,8 @@ class QueueDataset(DatasetBase): for x in os.listdir(train_data_path) ] if context["engine"] == EngineMode.LOCAL_CLUSTER: - file_list = context["fleet"].split_files(file_list) + file_list = split_files(file_list, context["fleet"].worker_index(), + context["fleet"].worker_num()) dataset.set_filelist(file_list) for model_dict in context["phases"]: diff --git a/core/utils/dataloader_instance.py b/core/utils/dataloader_instance.py index e3062bb7441e907160c7186a7c4ee883dcd964dd..c66d1b36571df0331b8319798cdc692fa825a481 100755 --- a/core/utils/dataloader_instance.py +++ b/core/utils/dataloader_instance.py @@ -19,6 +19,7 @@ from paddlerec.core.utils.envs import get_global_env from paddlerec.core.utils.envs import get_runtime_environ from paddlerec.core.reader import SlotReader from paddlerec.core.trainer import EngineMode +from paddlerec.core.utils.util import split_files def dataloader_by_name(readerclass, @@ -39,7 +40,8 @@ def dataloader_by_name(readerclass, files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)] if context["engine"] == EngineMode.LOCAL_CLUSTER: - files = context["fleet"].split_files(files) + files = split_files(files, context["fleet"].worker_index(), + context["fleet"].worker_num()) print("file_list : {}".format(files)) reader = reader_class(yaml_file) @@ -80,7 +82,8 
@@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context): files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)] if context["engine"] == EngineMode.LOCAL_CLUSTER: - files = context["fleet"].split_files(files) + files = split_files(files, context["fleet"].worker_index(), + context["fleet"].worker_num()) print("file_list: {}".format(files)) sparse = get_global_env(name + "sparse_slots", "#") @@ -133,7 +136,8 @@ def slotdataloader(readerclass, train, yaml_file, context): files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)] if context["engine"] == EngineMode.LOCAL_CLUSTER: - files = context["fleet"].split_files(files) + files = split_files(files, context["fleet"].worker_index(), + context["fleet"].worker_num()) print("file_list: {}".format(files)) sparse = get_global_env("sparse_slots", "#", namespace) diff --git a/core/utils/envs.py b/core/utils/envs.py index bfc18b148e9db719f0dff6cda7e5fee4f7ee2d2d..ddcc9a94b3adc47cda2023c4d9e196b9fb16faeb 100755 --- a/core/utils/envs.py +++ b/core/utils/envs.py @@ -18,7 +18,9 @@ import copy import os import socket import sys +import six import traceback +import six global_envs = {} global_envs_flatten = {} @@ -101,6 +103,12 @@ def set_global_envs(envs): name = ".".join(["dataset", dataset["name"], "type"]) global_envs[name] = "DataLoader" + if get_platform() == "LINUX" and six.PY3: + print("QueueDataset can not support PY3, change to DataLoader") + for dataset in envs["dataset"]: + name = ".".join(["dataset", dataset["name"], "type"]) + global_envs[name] = "DataLoader" + def get_global_env(env_name, default_value=None, namespace=None): """ @@ -253,11 +261,19 @@ def load_yaml(config): use_full_loader = False if os.path.isfile(config): - with open(config, 'r') as rb: - if use_full_loader: - _config = yaml.load(rb.read(), Loader=yaml.FullLoader) - else: - _config = yaml.load(rb.read()) - return _config + if six.PY2: + with open(config, 'r') as rb: + if use_full_loader: + _config = 
def split_files(files, trainer_id, trainers):
    """
    Split files before distributed training and return this worker's share.

    The list is cut into ``trainers`` contiguous blocks whose sizes differ by
    at most one; the first ``len(files) % trainers`` workers get the extra
    file.

    example 1: files is [a, b, c, d, e] and trainers = 2, then trainer
               0 gets [a, b, c] and trainer 1 gets [d, e].
    example 2: files is [a, b], and trainers = 3, then trainer 0 gets
               [a], trainer 1 gets [b], trainer 2 gets []

    Args:
        files(list): file list need to be read.
        trainer_id(int): zero-based index of the current worker.
        trainers(int): total number of workers, must be >= 1.

    Returns:
        list: files belongs to this worker (a new list, possibly empty).

    Raises:
        TypeError: if ``files`` is not a list.
        ValueError: if ``trainers`` is smaller than 1 or ``trainer_id`` is
            outside ``[0, trainers)``.
    """
    if not isinstance(files, list):
        raise TypeError("files should be a list of file need to be read.")

    # Fail loudly instead of dividing by zero on a bad worker count.
    if trainers < 1:
        raise ValueError("trainers should be a positive integer, but got {}".
                         format(trainers))

    # A negative id would otherwise wrap around and silently hand this worker
    # another worker's shard, so reject anything outside [0, trainers).
    if trainer_id < 0 or trainer_id >= trainers:
        raise ValueError("trainer_id should be in [0, {}), but got {}".format(
            trainers, trainer_id))

    # Integer arithmetic only: no float rounding on very long file lists.
    blocksize, remainder = divmod(len(files), trainers)

    # Every worker before this one owns `blocksize` files, plus one extra
    # file for each of the first `remainder` workers.
    begin = trainer_id * blocksize + min(trainer_id, remainder)
    end = begin + blocksize + (1 if trainer_id < remainder else 0)

    return files[begin:end]
a/models/demo/movie_recommand/recall/config.yaml b/models/demo/movie_recommand/recall/config.yaml index ca24df12d872bfe0937aecedce332e2219f783f1..4b683c1ccecffc81b792c40fbe450979ad5a6ffb 100644 --- a/models/demo/movie_recommand/recall/config.yaml +++ b/models/demo/movie_recommand/recall/config.yaml @@ -64,8 +64,7 @@ runner: device: cpu - name: runner_infer - epochs: 1 - class: single_infer + class: infer print_interval: 10000 init_model_path: "increment/9" # load model path diff --git a/models/match/dssm/config.yaml b/models/match/dssm/config.yaml index 7d28f3ded2324dc1bd712652551e92b8d3d53f1e..8f97c496739d820d37fd6878ef5ddb669b671ad7 100755 --- a/models/match/dssm/config.yaml +++ b/models/match/dssm/config.yaml @@ -56,9 +56,7 @@ runner: init_model_path: "" # load model path print_interval: 2 - name: infer_runner - class: single_infer - # num of epochs - epochs: 1 + class: infer # device to run training or infer device: cpu print_interval: 1 diff --git a/models/match/multiview-simnet/config.yaml b/models/match/multiview-simnet/config.yaml index 3cf6c97f1ccda281f24eafd4babd81075ad28fdf..bff01ae660bb5262e099a53c594cd0244a3ccf06 100755 --- a/models/match/multiview-simnet/config.yaml +++ b/models/match/multiview-simnet/config.yaml @@ -63,9 +63,7 @@ runner: init_model_path: "" # load model path print_interval: 1 - name: infer_runner - class: single_infer - # num of epochs - epochs: 1 + class: infer # device to run training or infer device: cpu print_interval: 1 diff --git a/models/match/readme.md b/models/match/readme.md index f8f000880836ac604bf208902da57fb0281f19bb..38e72229a76de7c175d5177ee895bb114625ab48 100755 --- a/models/match/readme.md +++ b/models/match/readme.md @@ -43,7 +43,7 @@ python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-s # 修改对应模型的config.yaml, workspace配置为当前目录的绝对路径 # 修改对应模型的config.yaml,mode配置infer_runner # 示例: mode: train_runner -> mode: infer_runner -# infer_runner中 class配置为 class: single_infer +# infer_runner中 class配置为 
class: infer # 修改phase阶段为infer的配置,参照config注释 # 修改完config.yaml后 执行: diff --git a/models/multitask/esmm/config.yaml b/models/multitask/esmm/config.yaml index 9606ca79320a20bd27175c5ce91c1c1c7f3513fe..d160f164455862912bc142d2b0bbd22042168d42 100644 --- a/models/multitask/esmm/config.yaml +++ b/models/multitask/esmm/config.yaml @@ -16,21 +16,21 @@ workspace: "paddlerec.models.multitask.esmm" dataset: -- name: dataset_train - batch_size: 1 - type: QueueDataset - data_path: "{workspace}/data/train" - data_converter: "{workspace}/esmm_reader.py" -- name: dataset_infer - batch_size: 1 - type: QueueDataset - data_path: "{workspace}/data/test" - data_converter: "{workspace}/esmm_reader.py" + - name: dataset_train + batch_size: 1 + type: QueueDataset + data_path: "{workspace}/data/train" + data_converter: "{workspace}/esmm_reader.py" + - name: dataset_infer + batch_size: 1 + type: QueueDataset + data_path: "{workspace}/data/test" + data_converter: "{workspace}/esmm_reader.py" hyper_parameters: vocab_size: 10000 embed_size: 128 - optimizer: + optimizer: class: adam learning_rate: 0.001 strategy: async @@ -39,30 +39,29 @@ hyper_parameters: mode: [train_runner, infer_runner] runner: -- name: train_runner - class: train - device: cpu - epochs: 3 - save_checkpoint_interval: 2 - save_inference_interval: 4 - save_checkpoint_path: "increment" - save_inference_path: "inference" - print_interval: 10 - phases: [train] -- name: infer_runner - class: infer - init_model_path: "increment/0" - device: cpu - epochs: 1 - print_interval: 1 - phases: [infer] + - name: train_runner + class: train + device: cpu + epochs: 3 + save_checkpoint_interval: 2 + save_inference_interval: 4 + save_checkpoint_path: "increment" + save_inference_path: "inference" + print_interval: 10 + phases: [train] + - name: infer_runner + class: infer + init_model_path: "increment/1" + device: cpu + print_interval: 1 + phases: [infer] phase: -- name: train - model: "{workspace}/model.py" - dataset_name: dataset_train - 
thread_num: 1 -- name: infer - model: "{workspace}/model.py" - dataset_name: dataset_infer - thread_num: 1 + - name: train + model: "{workspace}/model.py" + dataset_name: dataset_train + thread_num: 1 + - name: infer + model: "{workspace}/model.py" + dataset_name: dataset_infer + thread_num: 1 diff --git a/models/multitask/mmoe/config.yaml b/models/multitask/mmoe/config.yaml index 751ed1986d769e409db61ef54e528d6af4a8f6e2..63f052be105d969c2efa7a9c328fcfb05afdd3b9 100644 --- a/models/multitask/mmoe/config.yaml +++ b/models/multitask/mmoe/config.yaml @@ -52,10 +52,9 @@ runner: save_inference_path: "inference" print_interval: 10 - name: infer_runner - class: single_infer + class: infer init_model_path: "increment/0" device: cpu - epochs: 3 phase: - name: train diff --git a/models/multitask/readme.md b/models/multitask/readme.md index 4a377da6d1f9c081beecb1494ae8a680bc30a523..7bf23ae3c626797db8ab7c13148b24a6904da355 100755 --- a/models/multitask/readme.md +++ b/models/multitask/readme.md @@ -77,7 +77,7 @@ python -m paddlerec.run -m ./config.yaml # 自定义修改超参后,指定配 # 修改对应模型的config.yaml, workspace配置为当前目录的绝对路径 # 修改对应模型的config.yaml,mode配置infer_runner # 示例: mode: train_runner -> mode: infer_runner -# infer_runner中 class配置为 class: single_infer +# infer_runner中 class配置为 class: infer # 修改phase阶段为infer的配置,参照config注释 # 修改完config.yaml后 执行: diff --git a/models/multitask/share-bottom/config.yaml b/models/multitask/share-bottom/config.yaml index ebbdfcc20b356f61f5b469f9e46e3e79415fe362..9abb67dbe90fd4654ac949290960cf0cb0f02cf5 100644 --- a/models/multitask/share-bottom/config.yaml +++ b/models/multitask/share-bottom/config.yaml @@ -51,10 +51,9 @@ runner: save_inference_path: "inference" print_interval: 5 - name: infer_runner - class: single_infer + class: infer init_model_path: "increment/0" device: cpu - epochs: 3 phase: - name: train diff --git a/models/rank/afm/config.yaml b/models/rank/afm/config.yaml index 
81cc01a6ea095a1f9118e251ccc006569105f985..c55a96948d52388c94c6a614f448bda2c883f609 100644 --- a/models/rank/afm/config.yaml +++ b/models/rank/afm/config.yaml @@ -59,8 +59,7 @@ runner: save_inference_path: "inference" print_interval: 1 - name: infer_runner - class: single_infer - epochs: 1 + class: infer device: cpu init_model_path: "increment/0" print_interval: 1 diff --git a/models/rank/dcn/config.yaml b/models/rank/dcn/config.yaml index e7538744f264301c7974e1f4e20e9901aa7be76f..2f8a1be4209cf4350c35d8e40d54e04ce0682ed4 100755 --- a/models/rank/dcn/config.yaml +++ b/models/rank/dcn/config.yaml @@ -60,7 +60,7 @@ runner: save_inference_path: "inference" print_interval: 1 - name: infer_runner - class: single_infer + class: infer epochs: 1 device: cpu init_model_path: "increment/0" diff --git a/models/rank/deep_crossing/config.yaml b/models/rank/deep_crossing/config.yaml index 5033cd5627fb2e3a8fcce7af32a1b9fb73fca6fb..54a4a895a1de8c63702df13b648577a0d2f17d7f 100755 --- a/models/rank/deep_crossing/config.yaml +++ b/models/rank/deep_crossing/config.yaml @@ -58,8 +58,7 @@ runner: save_inference_path: "inference" print_interval: 1 - name: infer_runner - class: single_infer - epochs: 1 + class: infer device: cpu init_model_path: "increment/0" print_interval: 1 diff --git a/models/rank/deepfm/config.yaml b/models/rank/deepfm/config.yaml index 8443d45ae269320fa7af8f4a8ed4827bd55d03d4..10c6fa35336ee333c47ceae0d3840f2c4dc89d14 100755 --- a/models/rank/deepfm/config.yaml +++ b/models/rank/deepfm/config.yaml @@ -58,8 +58,7 @@ runner: save_inference_path: "inference" print_interval: 1 - name: infer_runner - class: single_infer - epochs: 1 + class: infer device: cpu init_model_path: "increment/0" print_interval: 1 diff --git a/models/rank/din/config.yaml b/models/rank/din/config.yaml index ab327885fba35d08aca86799b5c14fb5b159d03d..95693c6de7e293f1f7f12e2d52684d8e1f59475b 100755 --- a/models/rank/din/config.yaml +++ b/models/rank/din/config.yaml @@ -56,8 +56,7 @@ runner: 
save_inference_path: "inference" print_interval: 1 - name: infer_runner - class: single_infer - epochs: 1 + class: infer device: cpu init_model_path: "increment/0" print_interval: 1 diff --git a/models/rank/ffm/config.yaml b/models/rank/ffm/config.yaml index 863ff6c01e87e43e0cdab72da9d96ab41c351117..262407062f18aaf6544f00887cc8999ec35433ec 100644 --- a/models/rank/ffm/config.yaml +++ b/models/rank/ffm/config.yaml @@ -57,8 +57,7 @@ runner: save_inference_path: "inference" print_interval: 1 - name: infer_runner - class: single_infer - epochs: 1 + class: infer device: cpu init_model_path: "increment/0" print_interval: 1 diff --git a/models/rank/fgcnn/config.yaml b/models/rank/fgcnn/config.yaml index 24ee2636bc0e8edec50b5d2c021235f847ccca56..c329c7eae0224583a42f5d1dbc454d5e712c1de0 100755 --- a/models/rank/fgcnn/config.yaml +++ b/models/rank/fgcnn/config.yaml @@ -62,8 +62,7 @@ runner: save_inference_path: "inference" print_interval: 1 - name: infer_runner - class: single_infer - epochs: 1 + class: infer device: cpu init_model_path: "increment/0" print_interval: 1 diff --git a/models/rank/fm/config.yaml b/models/rank/fm/config.yaml index 617d727c7b2086c8f1eabb01d0f2829a933526f6..e9f30573177708c470162b8f8c2a35a656a4e5c2 100644 --- a/models/rank/fm/config.yaml +++ b/models/rank/fm/config.yaml @@ -57,8 +57,7 @@ runner: save_inference_path: "inference" print_interval: 1 - name: infer_runner - class: single_infer - epochs: 1 + class: infer device: cpu init_model_path: "increment/0" print_interval: 1 diff --git a/models/rank/fnn/config.yaml b/models/rank/fnn/config.yaml index 16e985e24fa5d1dc4d144098ebd768dd1940d4da..6f3995d8c0eb7f1dc98834719055015f5ac6fecd 100755 --- a/models/rank/fnn/config.yaml +++ b/models/rank/fnn/config.yaml @@ -68,8 +68,7 @@ runner: save_inference_path: "inference_fnn" print_interval: 1 - name: infer_runner - trainer_class: single_infer - epochs: 1 + trainer_class: infer device: cpu init_model_path: "increment/0" print_interval: 1 diff --git 
a/models/rank/logistic_regression/config.yaml b/models/rank/logistic_regression/config.yaml index 4dbb48cab1c343a8df74330adf511238c2861e8b..8e88ee1b68b7110dad731365b9324dc31caad1aa 100644 --- a/models/rank/logistic_regression/config.yaml +++ b/models/rank/logistic_regression/config.yaml @@ -56,8 +56,7 @@ runner: save_inference_path: "inference" print_interval: 1 - name: infer_runner - class: single_infer - epochs: 1 + class: infer device: cpu init_model_path: "increment/0" print_interval: 1 diff --git a/models/rank/nfm/config.yaml b/models/rank/nfm/config.yaml index ffea34621030917b848b5ba2ab8a65f5ddae3d7e..266cdfbeb6fd2a82c18409e004c93b1cfbfcb0ea 100644 --- a/models/rank/nfm/config.yaml +++ b/models/rank/nfm/config.yaml @@ -63,8 +63,7 @@ runner: save_inference_path: "inference" print_interval: 1 - name: infer_runner - class: single_infer - epochs: 1 + class: infer device: cpu init_model_path: "increment/0" print_interval: 1 diff --git a/models/rank/pnn/config.yaml b/models/rank/pnn/config.yaml index 836e3175fd580f424208d39106438ff916e4149a..4624d9388677b4c83fe133ebfc5b4b595fc915e9 100644 --- a/models/rank/pnn/config.yaml +++ b/models/rank/pnn/config.yaml @@ -60,8 +60,7 @@ runner: save_inference_path: "inference" print_interval: 1 - name: infer_runner - class: single_infer - epochs: 1 + class: infer device: cpu init_model_path: "increment/0" print_interval: 1 diff --git a/models/rank/readme.md b/models/rank/readme.md index d94f359faf766d7629a0de56c7c06acb83f588ac..51438fd65c5d36c351815ab903b9864db3cdd2c1 100755 --- a/models/rank/readme.md +++ b/models/rank/readme.md @@ -98,7 +98,7 @@ python -m paddlerec.run -m ./config.yaml ``` # 修改对应模型的config.yaml,mode配置infer_runner # 示例: mode: runner1 -> mode: infer_runner -# infer_runner中 class配置为 class: single_infer +# infer_runner中 class配置为 class: infer # 如果训练阶段和预测阶段的模型输入一致,phase不需要改动,复用train的即可 # 修改完config.yaml后 执行: diff --git a/models/rank/wide_deep/config.yaml b/models/rank/wide_deep/config.yaml index 
16c112050a2b6a7a8e9193c5124327695b72afb1..1ff5232e727b9f4af28639be15c36611c29b4ee7 100755 --- a/models/rank/wide_deep/config.yaml +++ b/models/rank/wide_deep/config.yaml @@ -54,8 +54,7 @@ runner: save_checkpoint_path: "increment" save_inference_path: "inference" - name: infer_runner - class: single_infer - epochs: 1 + class: infer device: cpu init_model_path: "increment/0" diff --git a/models/rank/xdeepfm/config.yaml b/models/rank/xdeepfm/config.yaml index 0571a88bd7da3d5c5661d2bee4246bd942d48300..716513d405431dcf2ad55011d8e0a68b8daecc43 100755 --- a/models/rank/xdeepfm/config.yaml +++ b/models/rank/xdeepfm/config.yaml @@ -55,8 +55,7 @@ runner: save_checkpoint_path: "increment" save_inference_path: "inference" - name: infer_runner - class: single_infer - epochs: 1 + class: infer device: cpu init_model_path: "increment/0" diff --git a/models/recall/gnn/config.yaml b/models/recall/gnn/config.yaml index 88ff55f5ef50ad1a34081c5c47c12fe233738cb2..ed290b2f81e530def392b2e851a81e1ff74cb8a2 100755 --- a/models/recall/gnn/config.yaml +++ b/models/recall/gnn/config.yaml @@ -61,9 +61,7 @@ runner: init_model_path: "" # load model path print_interval: 1 - name: infer_runner - class: single_infer - # num of epochs - epochs: 1 + class: infer # device to run training or infer device: cpu print_interval: 1 diff --git a/models/recall/gru4rec/config.yaml b/models/recall/gru4rec/config.yaml index b74db3dddcda54fff83740280e5b8b9a159d9d95..98250ae04cf6d76a6a9e61009fad079f0773b05d 100644 --- a/models/recall/gru4rec/config.yaml +++ b/models/recall/gru4rec/config.yaml @@ -54,10 +54,9 @@ runner: save_inference_path: "inference" print_interval: 10 - name: infer_runner - class: single_infer + class: infer init_model_path: "increment/0" device: cpu - epochs: 1 phase: - name: train diff --git a/models/recall/ncf/config.yaml b/models/recall/ncf/config.yaml index 2d603397ae8e4dd82dcf1659af70759228f75a25..3c87eb3b4ea76479348810f3f6ea9ec1f6644a32 100644 --- a/models/recall/ncf/config.yaml +++ 
b/models/recall/ncf/config.yaml @@ -51,10 +51,9 @@ runner: save_inference_path: "inference" print_interval: 10 - name: infer_runner - class: single_infer + class: infer init_model_path: "increment/0" device: cpu - epochs: 1 phase: - name: train diff --git a/models/recall/readme.md b/models/recall/readme.md index bb0539b268ce41e41c29fa8f5cf6d9bfdef00a40..e5589188858a423a40adb28d7c70f8be800cdc86 100755 --- a/models/recall/readme.md +++ b/models/recall/readme.md @@ -95,7 +95,7 @@ python -m paddlerec.run -m ./config.yaml # 自定义修改超参后,指定配 # 修改对应模型的config.yaml, workspace配置为当前目录的绝对路径 # 修改对应模型的config.yaml,mode配置infer_runner # 示例: mode: train_runner -> mode: infer_runner -# infer_runner中 class配置为 class: single_infer +# infer_runner中 class配置为 class: infer # 修改phase阶段为infer的配置,参照config注释 # 修改完config.yaml后 执行: diff --git a/models/recall/ssr/config.yaml b/models/recall/ssr/config.yaml index ae23609ec939fdd7c64f2fa9337e80426ced0aea..5152c20c04cf03476ed5b4ad18d30b823807a2ac 100644 --- a/models/recall/ssr/config.yaml +++ b/models/recall/ssr/config.yaml @@ -50,10 +50,9 @@ runner: save_inference_path: "inference" print_interval: 10 - name: infer_runner - class: single_infer + class: infer init_model_path: "increment/0" device: cpu - epochs: 1 phase: - name: train diff --git a/models/recall/word2vec/config.yaml b/models/recall/word2vec/config.yaml index 3626198626ceed12bb18d05ba9f51dbd8d6f7cdd..34a25e59ecfa4ccd292a3b6e358c83ac827ed59f 100755 --- a/models/recall/word2vec/config.yaml +++ b/models/recall/word2vec/config.yaml @@ -61,9 +61,7 @@ runner: init_model_path: "" # load model path print_interval: 1 - name: infer_runner - class: single_infer - # num of epochs - epochs: 1 + class: infer # device to run training or infer device: cpu init_model_path: "increment/0" # load model path diff --git a/models/rerank/listwise/config.yaml b/models/rerank/listwise/config.yaml index 8891a057e2e93b026653642ed97dd8325f5f0afa..6d06ab09a58e44976af5219fd34dd9fd41525eff 100644 --- 
a/models/rerank/listwise/config.yaml +++ b/models/rerank/listwise/config.yaml @@ -51,10 +51,9 @@ runner: save_checkpoint_path: "increment" save_inference_path: "inference" - name: infer_runner - class: single_infer + class: infer init_model_path: "increment/0" device: cpu - epochs: 3 phase: - name: train diff --git a/models/treebased/tdm/config.yaml b/models/treebased/tdm/config.yaml index 3ed4a2572163eb44815407af4062973077e69d66..e5920803a7d7aeec20dc0a3375273952559733d6 100755 --- a/models/treebased/tdm/config.yaml +++ b/models/treebased/tdm/config.yaml @@ -80,10 +80,8 @@ runner: print_interval: 10 - name: runner2 - class: single_infer + class: infer startup_class_path: "{workspace}/tdm_startup.py" - # num of epochs - epochs: 1 # device to run training or infer device: cpu init_model_path: "increment/0" # load model path diff --git a/run.py b/run.py index 699d48f9addfb5baf294c082eb924caa537b0536..b9e15a50ea40393a1f49c1d1e1c876947bc1ef10 100755 --- a/run.py +++ b/run.py @@ -139,8 +139,8 @@ def get_engine(args, running_config, mode): engine = "LOCAL_CLUSTER_TRAIN" if engine not in engine_choices: - raise ValueError("{} can not be chosen in {}".format(engine_class, - engine_choices)) + raise ValueError("{} can only be chosen in {}".format(engine_class, + engine_choices)) run_engine = engines[transpiler].get(engine, None) return run_engine @@ -439,8 +439,8 @@ def local_cluster_engine(args): if fleet_mode == "COLLECTIVE": cluster_envs["selected_gpus"] = selected_gpus gpus = selected_gpus.split(",") - gpu_num = get_worker_num(run_extras, len(gpus)) - cluster_envs["selected_gpus"] = ','.join(gpus[:gpu_num]) + worker_num = get_worker_num(run_extras, len(gpus)) + cluster_envs["selected_gpus"] = ','.join(gpus[:worker_num]) cluster_envs["server_num"] = server_num cluster_envs["worker_num"] = worker_num diff --git a/tools/build_script.sh b/tools/build_script.sh index 9e5e27e3a1be524ef0a4a02dbbadd0ff8283aa8d..b39b97f829a23fd0e79b0d50aad5996bb76941f3 100755 --- 
a/tools/build_script.sh +++ b/tools/build_script.sh @@ -49,7 +49,7 @@ function model_test() { root_dir=`pwd` all_model=$(find ${root_dir} -name config.yaml) - special_models=("demo" "pnn" "fgcnn" "esmm") + special_models=("demo" "pnn" "fgcnn" "gru4rec" "tagspace") for model in ${all_model} do