diff --git a/core/factory.py b/core/factory.py index 2e2d013bd27eb73abd3b9ad5507b4e9373276a3b..ec6ec2ea28831c92ed59a3ada8169ce1dc433b5d 100755 --- a/core/factory.py +++ b/core/factory.py @@ -14,9 +14,7 @@ import os import sys - import yaml - from paddlerec.core.utils import envs trainer_abs = os.path.join( @@ -66,16 +64,9 @@ class TrainerFactory(object): @staticmethod def create(config): - _config = None - if os.path.isfile(config): - with open(config, 'r') as rb: - _config = yaml.load(rb.read(), Loader=yaml.FullLoader) - else: - raise ValueError("paddlerec's config only support yaml") - + _config = envs.load_yaml(config) envs.set_global_envs(_config) envs.update_workspace() - trainer = TrainerFactory._build_trainer(config) return trainer diff --git a/core/reader.py b/core/reader.py index 555ae4ba83fa1fd0e1e57e110c199c9cedc1b1cb..6565471e117902fbc747bd4cbe9a79259466b42a 100755 --- a/core/reader.py +++ b/core/reader.py @@ -13,13 +13,11 @@ # limitations under the License. from __future__ import print_function - import abc import os - +from functools import reduce import paddle.fluid.incubate.data_generator as dg import yaml - from paddlerec.core.utils import envs @@ -28,12 +26,9 @@ class Reader(dg.MultiSlotDataGenerator): def __init__(self, config): dg.MultiSlotDataGenerator.__init__(self) - - if os.path.isfile(config): - with open(config, 'r') as rb: - _config = yaml.load(rb.read(), Loader=yaml.FullLoader) - else: - raise ValueError("reader config only support yaml") + _config = envs.load_yaml(config) + envs.set_global_envs(_config) + envs.update_workspace() @abc.abstractmethod def init(self): @@ -50,11 +45,9 @@ class SlotReader(dg.MultiSlotDataGenerator): def __init__(self, config): dg.MultiSlotDataGenerator.__init__(self) - if os.path.isfile(config): - with open(config, 'r') as rb: - _config = yaml.load(rb.read(), Loader=yaml.FullLoader) - else: - raise ValueError("reader config only support yaml") + _config = envs.load_yaml(config) + envs.set_global_envs(_config) 
def load_yaml(config):
    """Parse the YAML file at ``config`` and return the loaded object.

    Args:
        config: filesystem path of the YAML file to read.

    Returns:
        Whatever ``yaml.load`` produces for the file content.

    Raises:
        ValueError: if ``config`` is not the path of an existing regular file.
    """
    # Reject bad paths first so the error does not depend on PyYAML at all.
    if not os.path.isfile(config):
        raise ValueError("config {} can not be supported".format(config))

    # Imported here because the visible part of this module does not show a
    # top-level ``import yaml``; a function-scope import is always safe.
    import yaml

    # ``FullLoader`` was introduced in PyYAML 5.1.  Probing for the attribute
    # replaces parsing ``yaml.__version__``, which raised ValueError on
    # pre-release tags such as "5.1b1" (``int("1b1")``).
    loader = getattr(yaml, "FullLoader", None)

    with open(config, 'r') as rb:
        content = rb.read()

    if loader is not None:
        return yaml.load(content, Loader=loader)

    # NOTE(review): on PyYAML < 5.1 the default loader can construct
    # arbitrary Python objects; only feed it trusted config files.
    return yaml.load(content)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Validation of the flattened training yaml before an engine is started."""

# Type tags understood by ValueFormat, mapped to the Python type they require.
_SCALAR_TYPES = {"int": int, "str": str}
_LIST_TYPES = {"ints": int, "strs": str}


def _matches(value, expected):
    """Return True if ``value`` is an ``expected`` instance (bool is not int)."""
    # bool is a subclass of int, so `epochs: true` would otherwise pass as int.
    if expected is int and isinstance(value, bool):
        return False
    return isinstance(value, expected)


class ValueFormat(object):
    """Expected type and value constraint of one config attribute.

    Args:
        type: type tag, one of "int", "str", "ints" or "strs".
        value: reference value handed to ``value_handler``; ``None`` means
            the attribute has no value constraint.
        value_handler: callable ``(name, value, expected) -> bool`` such as
            ``in_value_handler``; ``None`` disables the value check.
        help: optional human-readable description of the attribute.
    """

    def __init__(self, type, value, value_handler, help=None):
        self.type = type
        self.value = value
        self.value_handler = value_handler
        # Previously this stored the builtin ``help`` function by accident.
        self.help = help

    def is_valid(self, name, value):
        """Return True if ``value`` passes the type check and the value check."""
        if not self.is_type_valid(name, value):
            return False
        return self.is_value_valid(name, value)

    def is_type_valid(self, name, value):
        """Return True if ``value`` has the type named by ``self.type``.

        Prints a diagnostic naming the actual type and returns False
        otherwise, including when ``self.type`` is not a supported tag.
        """
        if self.type in _SCALAR_TYPES:
            if not _matches(value, _SCALAR_TYPES[self.type]):
                print("\nattr {} should be {}, but {} now\n".format(
                    name, self.type, type(value)))
                return False
            return True

        if self.type in _LIST_TYPES:
            # "ints" -> "list(int)", "strs" -> "list(str)"
            label = "list({})".format(self.type[:-1])
            if not isinstance(value, list):
                print("\nattr {} should be {}, but {} now\n".format(
                    name, label, type(value)))
                return False
            for v in value:
                if not _matches(v, _LIST_TYPES[self.type]):
                    print("\nattr {} should be {}, but list({}) now\n".format(
                        name, label, type(v)))
                    return False
            return True

        print("\nattr {}'s type is {}, can not be supported now\n".format(
            name, self.type))
        return False

    def is_value_valid(self, name, value):
        """Return True if ``value`` satisfies the configured value constraint."""
        # No reference value or no handler means "any value of the right type";
        # without this, ("str", None, eq_value_handler) rejected every string.
        if self.value is None or self.value_handler is None:
            return True
        # Handlers take (name, value, expected); the old one-argument call
        # raised TypeError for every checked attribute.
        return self.value_handler(name, value, self.value)


def in_value_handler(name, value, values):
    """Return True if ``value`` is one of ``values``; print a hint otherwise."""
    if value not in values:
        print("\nattr {}'s value is {}, but {} is expected\n".format(
            name, value, values))
        return False
    return True


def eq_value_handler(name, value, values):
    """Return True if ``value == values``; print a hint otherwise."""
    if value != values:
        print("\nattr {}'s value is {}, but == {} is expected\n".format(
            name, value, values))
        return False
    return True


def ge_value_handler(name, value, values):
    """Return True if ``value >= values``; print a hint otherwise."""
    if value < values:
        print("\nattr {}'s value is {}, but >= {} is expected\n".format(
            name, value, values))
        return False
    return True


def le_value_handler(name, value, values):
    """Return True if ``value <= values``; print a hint otherwise."""
    if value > values:
        print("\nattr {}'s value is {}, but <= {} is expected\n".format(
            name, value, values))
        return False
    return True


def register():
    """Build the attribute checkers and the list of required attributes.

    Returns:
        A ``(validations, requires)`` tuple: ``validations`` maps an
        attribute name to its ``ValueFormat``; ``requires`` lists the names
        that must be present in the yaml.
    """
    validations = {}
    # A None reference value means only the type of workspace is checked.
    validations["train.workspace"] = ValueFormat("str", None, eq_value_handler)
    validations["train.device"] = ValueFormat("str", ["cpu", "gpu"],
                                              in_value_handler)
    validations["train.epochs"] = ValueFormat("int", 1, ge_value_handler)
    validations["train.engine"] = ValueFormat(
        "str", ["single", "local_cluster", "cluster"], in_value_handler)

    # NOTE(review): "train.namespace" is required but has no checker, while
    # "train.workspace" has a checker but is not required - confirm intent.
    requires = [
        "train.namespace", "train.device", "train.epochs", "train.engine"
    ]
    return validations, requires


def yaml_validation(config):
    """Check the yaml file at ``config`` against the registered checkers.

    Args:
        config: path of the yaml file, passed to ``envs.load_yaml``.

    Returns:
        True when every required attribute is present and every attribute
        that has a checker passes it; False otherwise (a diagnostic is
        printed for the first failure).
    """
    # Imported here so the pure validators above stay importable without
    # the rest of the paddlerec package; only this function needs envs.
    from paddlerec.core.utils import envs

    all_checkers, require_checkers = register()

    _config = envs.load_yaml(config)
    # presumably yields {"train.epochs": 10, ...}; verify flatten_environs
    flattens = envs.flatten_environs(_config)

    for required in require_checkers:
        if required not in flattens:
            print("\ncan not find {} in yaml, which is required\n".format(
                required))
            return False

    for name, flatten in flattens.items():
        checker = all_checkers.get(name, None)

        if not checker:
            continue

        # Validate this attribute's own value, not the whole flattened dict.
        if not checker.is_valid(name, flatten):
            return False

    return True
import math -import sys -import yaml -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs -import math import os try: import cPickle as pickle except ImportError: import pickle -from collections import Counter -import os import paddle.fluid.incubate.data_generator as dg @@ -31,12 +24,6 @@ class TrainReader(dg.MultiSlotDataGenerator): def __init__(self, config): dg.MultiSlotDataGenerator.__init__(self) - if os.path.isfile(config): - with open(config, 'r') as rb: - _config = yaml.load(rb.read(), Loader=yaml.FullLoader) - else: - raise ValueError("reader config only support yaml") - def init(self): self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] self.cont_max_ = [ diff --git a/models/rank/deepfm/data/get_slot_data.py b/models/rank/deepfm/data/get_slot_data.py index c03b29248557de21ddb29c6a287045d9a7f1b500..0e281063b072276a00c3d7d4ca6c743e74c658cf 100755 --- a/models/rank/deepfm/data/get_slot_data.py +++ b/models/rank/deepfm/data/get_slot_data.py @@ -12,10 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import yaml, os - -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs +import os import paddle.fluid.incubate.data_generator as dg try: import cPickle as pickle @@ -27,12 +24,6 @@ class TrainReader(dg.MultiSlotDataGenerator): def __init__(self, config): dg.MultiSlotDataGenerator.__init__(self) - if os.path.isfile(config): - with open(config, 'r') as rb: - _config = yaml.load(rb.read(), Loader=yaml.FullLoader) - else: - raise ValueError("reader config only support yaml") - def init(self): self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] self.cont_max_ = [ diff --git a/models/rank/dnn/model.py b/models/rank/dnn/model.py index f4425e3d9853b7f7decbc45ff607c4173901d0cf..709a4f1f0c45033827495a733b68c823f33b8c18 100755 --- a/models/rank/dnn/model.py +++ b/models/rank/dnn/model.py @@ -32,7 +32,7 @@ class Model(ModelBase): self.sparse_feature_dim = envs.get_global_env( "hyper_parameters.sparse_feature_dim") self.learning_rate = envs.get_global_env( - "hyper_parameters.learning_rate") + "hyper_parameters.optimizer.learning_rate") def net(self, input, is_infer=False): self.sparse_inputs = self._sparse_data_var[1:] diff --git a/models/rank/wide_deep/data/get_slot_data.py b/models/rank/wide_deep/data/get_slot_data.py index ec37f9af9ebc1120294f965cc5845ce7bb3feaf0..11f1e386523f2fce9ea703a2217f6fa441572418 100755 --- a/models/rank/wide_deep/data/get_slot_data.py +++ b/models/rank/wide_deep/data/get_slot_data.py @@ -11,10 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import yaml, os - -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs +import os try: import cPickle as pickle except ImportError: @@ -26,12 +23,6 @@ class TrainReader(dg.MultiSlotDataGenerator): def __init__(self, config): dg.MultiSlotDataGenerator.__init__(self) - if os.path.isfile(config): - with open(config, 'r') as rb: - _config = yaml.load(rb.read(), Loader=yaml.FullLoader) - else: - raise ValueError("reader config only support yaml") - def init(self): pass diff --git a/models/rank/xdeepfm/data/get_slot_data.py b/models/rank/xdeepfm/data/get_slot_data.py index 0ace10b4fada14d1c6b91dfbc4c80bf91737c28d..804f05ae1ac6388bad815539825828ddac9c4ed2 100755 --- a/models/rank/xdeepfm/data/get_slot_data.py +++ b/models/rank/xdeepfm/data/get_slot_data.py @@ -12,9 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import yaml, os -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs +import os try: import cPickle as pickle except ImportError: @@ -25,11 +23,6 @@ import paddle.fluid.incubate.data_generator as dg class TrainReader(dg.MultiSlotDataGenerator): def __init__(self, config): dg.MultiSlotDataGenerator.__init__(self) - if os.path.isfile(config): - with open(config, 'r') as rb: - _config = yaml.load(rb.read(), Loader=yaml.FullLoader) - else: - raise ValueError("reader config only support yaml") def init(self): pass diff --git a/run.py b/run.py index 594801fcdd5edb1821799ef53994674aec6a934d..9ded865dbb859450efe65aae703943cd156776da 100755 --- a/run.py +++ b/run.py @@ -14,7 +14,7 @@ import os import subprocess - +import sys import argparse import tempfile import yaml @@ -22,6 +22,7 @@ import copy from paddlerec.core.factory import TrainerFactory from paddlerec.core.utils import envs from paddlerec.core.utils import util +from paddlerec.core.utils import validation engines = {} device = ["CPU", "GPU"] @@ -48,9 +49,7 @@ def engine_registry(): def 
get_inters_from_yaml(file, filters): - with open(file, 'r') as rb: - _envs = yaml.load(rb.read(), Loader=yaml.FullLoader) - + _envs = envs.load_yaml(file) flattens = envs.flatten_environs(_envs) inters = {} for k, v in flattens.items(): @@ -197,9 +196,7 @@ def cluster_engine(args): def master(): role = "MASTER" from paddlerec.core.engine.cluster.cluster import ClusterEngine - with open(args.backend, 'r') as rb: - _envs = yaml.load(rb.read(), Loader=yaml.FullLoader) - + _envs = envs.load_yaml(args.backend) flattens = envs.flatten_environs(_envs, "_") flattens["engine_role"] = role flattens["engine_run_config"] = args.model @@ -322,8 +319,9 @@ if __name__ == "__main__": model_name = args.model.split('.')[-1] args.model = get_abs_model(args.model) + if not validation.yaml_validation(args.model): + sys.exit(-1) engine_registry() - which_engine = get_engine(args) engine = which_engine(args) engine.run() diff --git a/setup.py b/setup.py index 8ad1cc742434aa39513a1c618b56649c3530686a..87f9c00a8304e0219750a6b2b8fb20a81c1afc0a 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ from setuptools import setup, find_packages import shutil import tempfile -requires = ["paddlepaddle == 1.7.2", "pyyaml >= 5.1.1"] +requires = ["paddlepaddle == 1.7.2", "PyYAML >= 5.1.1"] about = {} about["__title__"] = "paddle-rec"