Commit 0244a9db, authored by: C chengmo

Merge remote-tracking branch 'chengmo/fix_setup_print' into fix_setup_print

@@ -15,13 +15,13 @@
 from __future__ import print_function
 import os
-import warnings
 import paddle.fluid as fluid
 from paddlerec.core.utils import envs
 from paddlerec.core.utils import dataloader_instance
 from paddlerec.core.reader import SlotReader
 from paddlerec.core.trainer import EngineMode
+from paddlerec.core.utils.util import split_files
 __all__ = ["DatasetBase", "DataLoader", "QueueDataset"]
@@ -123,7 +123,8 @@ class QueueDataset(DatasetBase):
             for x in os.listdir(train_data_path)
         ]
         if context["engine"] == EngineMode.LOCAL_CLUSTER:
-            file_list = context["fleet"].split_files(file_list)
+            file_list = split_files(file_list, context["fleet"].worker_index(),
+                                    context["fleet"].worker_num())
         dataset.set_filelist(file_list)
         for model_dict in context["phases"]:
...
@@ -19,6 +19,7 @@ from paddlerec.core.utils.envs import get_global_env
 from paddlerec.core.utils.envs import get_runtime_environ
 from paddlerec.core.reader import SlotReader
 from paddlerec.core.trainer import EngineMode
+from paddlerec.core.utils.util import split_files
 def dataloader_by_name(readerclass,
@@ -39,7 +40,8 @@ def dataloader_by_name(readerclass,
     files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
     if context["engine"] == EngineMode.LOCAL_CLUSTER:
-        files = context["fleet"].split_files(files)
+        files = split_files(files, context["fleet"].worker_index(),
+                            context["fleet"].worker_num())
     print("file_list : {}".format(files))
     reader = reader_class(yaml_file)
@@ -80,7 +82,8 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context):
     files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
     if context["engine"] == EngineMode.LOCAL_CLUSTER:
-        files = context["fleet"].split_files(files)
+        files = split_files(files, context["fleet"].worker_index(),
+                            context["fleet"].worker_num())
     print("file_list: {}".format(files))
     sparse = get_global_env(name + "sparse_slots", "#")
@@ -133,7 +136,8 @@ def slotdataloader(readerclass, train, yaml_file, context):
     files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
     if context["engine"] == EngineMode.LOCAL_CLUSTER:
-        files = context["fleet"].split_files(files)
+        files = split_files(files, context["fleet"].worker_index(),
+                            context["fleet"].worker_num())
     print("file_list: {}".format(files))
     sparse = get_global_env("sparse_slots", "#", namespace)
...
@@ -18,7 +18,9 @@ import copy
 import os
 import socket
 import sys
+import six
 import traceback
+import six
 global_envs = {}
 global_envs_flatten = {}
@@ -101,6 +103,12 @@ def set_global_envs(envs):
             name = ".".join(["dataset", dataset["name"], "type"])
             global_envs[name] = "DataLoader"
+    if get_platform() == "LINUX" and six.PY3:
+        print("QueueDataset can not support PY3, change to DataLoader")
+        for dataset in envs["dataset"]:
+            name = ".".join(["dataset", dataset["name"], "type"])
+            global_envs[name] = "DataLoader"
 def get_global_env(env_name, default_value=None, namespace=None):
     """
@@ -253,11 +261,19 @@ def load_yaml(config):
     use_full_loader = False
     if os.path.isfile(config):
-        with open(config, 'r') as rb:
-            if use_full_loader:
-                _config = yaml.load(rb.read(), Loader=yaml.FullLoader)
-            else:
-                _config = yaml.load(rb.read())
-            return _config
+        if six.PY2:
+            with open(config, 'r') as rb:
+                if use_full_loader:
+                    _config = yaml.load(rb.read(), Loader=yaml.FullLoader)
+                else:
+                    _config = yaml.load(rb.read())
+                return _config
+        else:
+            with open(config, 'r', encoding="utf-8") as rb:
+                if use_full_loader:
+                    _config = yaml.load(rb.read(), Loader=yaml.FullLoader)
+                else:
+                    _config = yaml.load(rb.read())
+                return _config
     else:
         raise ValueError("config {} can not be supported".format(config))
@@ -19,11 +19,8 @@ import time
 import numpy as np
 from paddle import fluid
-from paddlerec.core.utils import fs as fs
 def save_program_proto(path, program=None):
     if program is None:
         _program = fluid.default_main_program()
     else:
@@ -171,6 +168,39 @@ def print_cost(cost, params):
     return log_str
+def split_files(files, trainer_id, trainers):
+    """
+    split files before distributed training,
+    example 1: files is [a, b, c ,d, e] and trainer_num = 2, then trainer
+    0 gets [a, b, c] and trainer 1 gets [d, e].
+    example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets
+    [a], trainer 1 gets [b], trainer 2 gets []
+    Args:
+        files(list): file list need to be read.
+    Returns:
+        list: files belongs to this worker.
+    """
+    if not isinstance(files, list):
+        raise TypeError("files should be a list of file need to be read.")
+    remainder = len(files) % trainers
+    blocksize = int(len(files) / trainers)
+    blocks = [blocksize] * trainers
+    for i in range(remainder):
+        blocks[i] += 1
+    trainer_files = [[]] * trainers
+    begin = 0
+    for i in range(trainers):
+        trainer_files[i] = files[begin:begin + blocks[i]]
+        begin += blocks[i]
+    return trainer_files[trainer_id]
 class CostPrinter(object):
     """
     For count cost time && print cost log
...
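A short usage sketch of the split_files helper added above (assuming the paddlerec package is importable; the file names are made up), reproducing the two examples from its docstring:

```python
from paddlerec.core.utils.util import split_files

files = ["a", "b", "c", "d", "e"]      # hypothetical file list
print(split_files(files, 0, 2))        # ['a', 'b', 'c'] -> worker 0
print(split_files(files, 1, 2))        # ['d', 'e']      -> worker 1
print(split_files(["a", "b"], 2, 3))   # []              -> worker 2 gets nothing
```

The first `len(files) % trainers` workers each receive one extra file, so the shards differ in size by at most one.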
@@ -86,7 +86,7 @@ python -m paddlerec.run -m ./config.yaml # after customizing hyperparameters, launch with the specified config
 # In the model's config.yaml, set workspace to the absolute path of the current directory
 # In the model's config.yaml, set mode to infer_runner
 # Example: mode: train_runner -> mode: infer_runner
-# In infer_runner, set class to class: single_infer
+# In infer_runner, set class to class: infer
 # Update the config of the infer phase, following the comments in config
 # After editing config.yaml, run:
@@ -106,7 +106,7 @@ python -m paddlerec.run -m ./config.yaml # after customizing hyperparameters, launch with the specified config
 # In the model's config.yaml, set workspace to the absolute path of the current directory
 # In the model's config.yaml, set mode to infer_runner
 # Example: mode: train_runner -> mode: infer_runner
-# In infer_runner, set class to class: single_infer
+# In infer_runner, set class to class: infer
 # Update the config of the infer phase, following the comments in config
 # After editing config.yaml, run:
...
@@ -64,8 +64,7 @@ runner:
   device: cpu
 - name: runner_infer
-  epochs: 1
-  class: single_infer
+  class: infer
   print_interval: 10000
   init_model_path: "increment/9" # load model path
...
@@ -64,8 +64,7 @@ runner:
   device: cpu
 - name: runner_infer
-  epochs: 1
-  class: single_infer
+  class: infer
   print_interval: 10000
   init_model_path: "increment/9" # load model path
...
@@ -56,9 +56,7 @@ runner:
   init_model_path: "" # load model path
   print_interval: 2
 - name: infer_runner
-  class: single_infer
-  # num of epochs
-  epochs: 1
+  class: infer
   # device to run training or infer
   device: cpu
   print_interval: 1
...
@@ -63,9 +63,7 @@ runner:
   init_model_path: "" # load model path
   print_interval: 1
 - name: infer_runner
-  class: single_infer
-  # num of epochs
-  epochs: 1
+  class: infer
   # device to run training or infer
   device: cpu
   print_interval: 1
...
@@ -43,7 +43,7 @@ python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-s
 # In the model's config.yaml, set workspace to the absolute path of the current directory
 # In the model's config.yaml, set mode to infer_runner
 # Example: mode: train_runner -> mode: infer_runner
-# In infer_runner, set class to class: single_infer
+# In infer_runner, set class to class: infer
 # Update the config of the infer phase, following the comments in config
 # After editing config.yaml, run:
...
@@ -16,21 +16,21 @@
 workspace: "paddlerec.models.multitask.esmm"
 dataset:
 - name: dataset_train
   batch_size: 1
   type: QueueDataset
   data_path: "{workspace}/data/train"
   data_converter: "{workspace}/esmm_reader.py"
 - name: dataset_infer
   batch_size: 1
   type: QueueDataset
   data_path: "{workspace}/data/test"
   data_converter: "{workspace}/esmm_reader.py"
 hyper_parameters:
   vocab_size: 10000
   embed_size: 128
   optimizer:
     class: adam
     learning_rate: 0.001
     strategy: async
@@ -39,30 +39,29 @@ hyper_parameters:
 mode: [train_runner, infer_runner]
 runner:
 - name: train_runner
   class: train
   device: cpu
   epochs: 3
   save_checkpoint_interval: 2
   save_inference_interval: 4
   save_checkpoint_path: "increment"
   save_inference_path: "inference"
   print_interval: 10
   phases: [train]
 - name: infer_runner
   class: infer
-  init_model_path: "increment/0"
+  init_model_path: "increment/1"
   device: cpu
-  epochs: 1
   print_interval: 1
   phases: [infer]
 phase:
 - name: train
   model: "{workspace}/model.py"
   dataset_name: dataset_train
   thread_num: 1
 - name: infer
   model: "{workspace}/model.py"
   dataset_name: dataset_infer
   thread_num: 1
@@ -52,10 +52,9 @@ runner:
   save_inference_path: "inference"
   print_interval: 10
 - name: infer_runner
-  class: single_infer
+  class: infer
   init_model_path: "increment/0"
   device: cpu
-  epochs: 3
 phase:
 - name: train
...
@@ -77,7 +77,7 @@ python -m paddlerec.run -m ./config.yaml # after customizing hyperparameters, launch with the specified config
 # In the model's config.yaml, set workspace to the absolute path of the current directory
 # In the model's config.yaml, set mode to infer_runner
 # Example: mode: train_runner -> mode: infer_runner
-# In infer_runner, set class to class: single_infer
+# In infer_runner, set class to class: infer
 # Update the config of the infer phase, following the comments in config
 # After editing config.yaml, run:
...
@@ -51,10 +51,9 @@ runner:
   save_inference_path: "inference"
   print_interval: 5
 - name: infer_runner
-  class: single_infer
+  class: infer
   init_model_path: "increment/0"
   device: cpu
-  epochs: 3
 phase:
 - name: train
...
@@ -59,8 +59,7 @@ runner:
   save_inference_path: "inference"
   print_interval: 1
 - name: infer_runner
-  class: single_infer
-  epochs: 1
+  class: infer
   device: cpu
   init_model_path: "increment/0"
   print_interval: 1
...
@@ -60,7 +60,7 @@ runner:
   save_inference_path: "inference"
   print_interval: 1
 - name: infer_runner
-  class: single_infer
+  class: infer
   epochs: 1
   device: cpu
   init_model_path: "increment/0"
...
@@ -58,8 +58,7 @@ runner:
   save_inference_path: "inference"
   print_interval: 1
 - name: infer_runner
-  class: single_infer
-  epochs: 1
+  class: infer
   device: cpu
   init_model_path: "increment/0"
   print_interval: 1
...
@@ -58,8 +58,7 @@ runner:
   save_inference_path: "inference"
   print_interval: 1
 - name: infer_runner
-  class: single_infer
-  epochs: 1
+  class: infer
   device: cpu
   init_model_path: "increment/0"
   print_interval: 1
...
@@ -56,8 +56,7 @@ runner:
   save_inference_path: "inference"
   print_interval: 1
 - name: infer_runner
-  class: single_infer
-  epochs: 1
+  class: infer
   device: cpu
   init_model_path: "increment/0"
   print_interval: 1
...
@@ -57,8 +57,7 @@ runner:
   save_inference_path: "inference"
   print_interval: 1
 - name: infer_runner
-  class: single_infer
-  epochs: 1
+  class: infer
   device: cpu
   init_model_path: "increment/0"
   print_interval: 1
...
@@ -62,8 +62,7 @@ runner:
   save_inference_path: "inference"
   print_interval: 1
 - name: infer_runner
-  class: single_infer
-  epochs: 1
+  class: infer
   device: cpu
   init_model_path: "increment/0"
   print_interval: 1
...
@@ -57,8 +57,7 @@ runner:
   save_inference_path: "inference"
   print_interval: 1
 - name: infer_runner
-  class: single_infer
-  epochs: 1
+  class: infer
   device: cpu
   init_model_path: "increment/0"
   print_interval: 1
...
@@ -68,8 +68,7 @@ runner:
   save_inference_path: "inference_fnn"
   print_interval: 1
 - name: infer_runner
-  trainer_class: single_infer
-  epochs: 1
+  trainer_class: infer
   device: cpu
   init_model_path: "increment/0"
   print_interval: 1
...
@@ -56,8 +56,7 @@ runner:
   save_inference_path: "inference"
   print_interval: 1
 - name: infer_runner
-  class: single_infer
-  epochs: 1
+  class: infer
   device: cpu
   init_model_path: "increment/0"
   print_interval: 1
...
@@ -63,8 +63,7 @@ runner:
   save_inference_path: "inference"
   print_interval: 1
 - name: infer_runner
-  class: single_infer
-  epochs: 1
+  class: infer
   device: cpu
   init_model_path: "increment/0"
   print_interval: 1
...
@@ -60,8 +60,7 @@ runner:
   save_inference_path: "inference"
   print_interval: 1
 - name: infer_runner
-  class: single_infer
-  epochs: 1
+  class: infer
   device: cpu
   init_model_path: "increment/0"
   print_interval: 1
...
@@ -98,7 +98,7 @@ python -m paddlerec.run -m ./config.yaml
 ```
 # In the model's config.yaml, set mode to infer_runner
 # Example: mode: runner1 -> mode: infer_runner
-# In infer_runner, set class to class: single_infer
+# In infer_runner, set class to class: infer
 # If the model inputs are identical for training and inference, phase needs no change; reuse the train phase
 # After editing config.yaml, run:
...
@@ -54,8 +54,7 @@ runner:
   save_checkpoint_path: "increment"
   save_inference_path: "inference"
 - name: infer_runner
-  class: single_infer
-  epochs: 1
+  class: infer
   device: cpu
   init_model_path: "increment/0"
...
@@ -55,8 +55,7 @@ runner:
   save_checkpoint_path: "increment"
   save_inference_path: "inference"
 - name: infer_runner
-  class: single_infer
-  epochs: 1
+  class: infer
   device: cpu
   init_model_path: "increment/0"
...
@@ -61,9 +61,7 @@ runner:
   init_model_path: "" # load model path
   print_interval: 1
 - name: infer_runner
-  class: single_infer
-  # num of epochs
-  epochs: 1
+  class: infer
   # device to run training or infer
   device: cpu
   print_interval: 1
...
@@ -54,10 +54,9 @@ runner:
   save_inference_path: "inference"
   print_interval: 10
 - name: infer_runner
-  class: single_infer
+  class: infer
   init_model_path: "increment/0"
   device: cpu
-  epochs: 1
 phase:
 - name: train
...
@@ -51,10 +51,9 @@ runner:
   save_inference_path: "inference"
   print_interval: 10
 - name: infer_runner
-  class: single_infer
+  class: infer
   init_model_path: "increment/0"
   device: cpu
-  epochs: 1
 phase:
 - name: train
...
@@ -95,7 +95,7 @@ python -m paddlerec.run -m ./config.yaml # after customizing hyperparameters, launch with the specified config
 # In the model's config.yaml, set workspace to the absolute path of the current directory
 # In the model's config.yaml, set mode to infer_runner
 # Example: mode: train_runner -> mode: infer_runner
-# In infer_runner, set class to class: single_infer
+# In infer_runner, set class to class: infer
 # Update the config of the infer phase, following the comments in config
 # After editing config.yaml, run:
...
@@ -50,10 +50,9 @@ runner:
   save_inference_path: "inference"
   print_interval: 10
 - name: infer_runner
-  class: single_infer
+  class: infer
   init_model_path: "increment/0"
   device: cpu
-  epochs: 1
 phase:
 - name: train
...
@@ -61,9 +61,7 @@ runner:
   init_model_path: "" # load model path
   print_interval: 1
 - name: infer_runner
-  class: single_infer
-  # num of epochs
-  epochs: 1
+  class: infer
   # device to run training or infer
   device: cpu
   init_model_path: "increment/0" # load model path
...
@@ -51,10 +51,9 @@ runner:
   save_checkpoint_path: "increment"
   save_inference_path: "inference"
 - name: infer_runner
-  class: single_infer
+  class: infer
   init_model_path: "increment/0"
   device: cpu
-  epochs: 3
 phase:
 - name: train
...
@@ -80,10 +80,8 @@ runner:
   print_interval: 10
 - name: runner2
-  class: single_infer
+  class: infer
   startup_class_path: "{workspace}/tdm_startup.py"
-  # num of epochs
-  epochs: 1
   # device to run training or infer
   device: cpu
   init_model_path: "increment/0" # load model path
...
@@ -139,8 +139,8 @@ def get_engine(args, running_config, mode):
         engine = "LOCAL_CLUSTER_TRAIN"
     if engine not in engine_choices:
-        raise ValueError("{} can not be chosen in {}".format(engine_class,
+        raise ValueError("{} can only be chosen in {}".format(engine_class,
                                                               engine_choices))
     run_engine = engines[transpiler].get(engine, None)
     return run_engine
@@ -439,8 +439,8 @@ def local_cluster_engine(args):
     if fleet_mode == "COLLECTIVE":
         cluster_envs["selected_gpus"] = selected_gpus
         gpus = selected_gpus.split(",")
-        gpu_num = get_worker_num(run_extras, len(gpus))
-        cluster_envs["selected_gpus"] = ','.join(gpus[:gpu_num])
+        worker_num = get_worker_num(run_extras, len(gpus))
+        cluster_envs["selected_gpus"] = ','.join(gpus[:worker_num])
     cluster_envs["server_num"] = server_num
     cluster_envs["worker_num"] = worker_num
...
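A minimal illustration of the renamed variable's effect in the COLLECTIVE branch above (the values below are made up): the worker count returned by get_worker_num now decides how many of the selected GPUs are kept, rather than a separately named gpu_num.

```python
selected_gpus = "0,1,2,3"           # hypothetical user-configured GPU list
gpus = selected_gpus.split(",")
worker_num = 2                      # e.g. what get_worker_num(run_extras, len(gpus)) returns

# Only the first worker_num GPUs end up in cluster_envs["selected_gpus"].
print(",".join(gpus[:worker_num]))  # -> "0,1"
```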
@@ -49,7 +49,7 @@ function model_test() {
 root_dir=`pwd`
 all_model=$(find ${root_dir} -name config.yaml)
-special_models=("demo" "pnn" "fgcnn" "esmm")
+special_models=("demo" "pnn" "fgcnn" "gru4rec" "tagspace")
 for model in ${all_model}
 do
...