From 5e5139283b03adf0b9dc092eb7b16a3076089769 Mon Sep 17 00:00:00 2001 From: xjqbest <173596896@qq.com> Date: Thu, 4 Apr 2019 01:05:23 +0800 Subject: [PATCH] fix runtime error test=develop --- paddle/fluid/framework/fleet/fleet_wrapper.cc | 1 + paddle/fluid/framework/io/shell.cc | 2 +- python/paddle/fluid/executor.py | 2 +- .../fleet/parameter_server/__init__.py | 22 ++++++++++++++----- .../fluid/tests/unittests/test_dataset.py | 20 ++++++++++------- python/paddle/fluid/trainer_desc.py | 2 +- python/paddle/fluid/trainer_factory.py | 5 +++-- 7 files changed, 35 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 8147c7746..394ff24c4 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -237,6 +237,7 @@ void FleetWrapper::PushDenseParamSync( std::vector regions; for (auto& t : var_names) { Variable* var = scope.FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; LoDTensor* tensor = var->GetMutable(); float* g = tensor->mutable_data(place); paddle::ps::Region reg(g, tensor->numel()); diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc index bcfa4f44f..ab671cb56 100644 --- a/paddle/fluid/framework/io/shell.cc +++ b/paddle/fluid/framework/io/shell.cc @@ -126,7 +126,7 @@ static int shell_popen_fork_internal(const char* real_cmd, bool do_read, } close_open_fds_internal(); - if (execl("/bin/sh", "sh", "-c", real_cmd, NULL) < 0) { + if (execl("/bin/bash", "bash", "-c", real_cmd, NULL) < 0) { return -1; } exit(127); diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index e53633950..e15197037 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -712,7 +712,7 @@ class Executor(object): if dataset == None: raise RuntimeError("dataset is needed and should be initialized") - if self.place == paddle.fluid.CUDAPlace(): + if not isinstance(self.place, core.CPUPlace): raise RuntimeError("infer_from_dataset is verified on CPUPlace" "We will open CUDAPlace in the future") diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py index 044aa33c2..531a07e02 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py @@ -123,18 +123,23 @@ class Fleet(object): print("You should run DistributedOptimizer.minimize() first") sys.exit(-1) - def init_worker(self, programs): + def init_worker(self, programs, scopes=None): """ init_worker(): will be called by user. When a user knows current process is_server(), he/she should call init_worker() to initialize global information about worker and connect - worker with pserver. + worker with pserver. You should run startup program before init_worker. Args: programs(Program|list): a Program or a list of Programs - + scopes(Scope|list): a Scope or a list of Scopes, default None. """ if not isinstance(programs, list): programs = [programs] + if scopes is None: + scopes = [fluid.global_scope()] * len(programs) + if len(scopes) != len(programs): + print("You should make sure len(scopes) == len(programs) or set scopes None") + sys.exit(-1) if self._opt_info: if "fleet_desc" in self._opt_info: self._dist_desc_str = text_format.MessageToString( @@ -160,7 +165,7 @@ class Fleet(object): self.role_maker_._barrier_worker() if self.role_maker_._is_first_worker(): tables = self._dist_desc.trainer_param.dense_table - for prog in programs: + for prog, scope in zip(programs, scopes): prog_id = str(id(prog)) prog_conf = self._opt_info['program_configs'][prog_id] prog_tables = {} @@ -174,8 +179,13 @@ class Fleet(object): continue var_name_list = [] for i in range(0, len(table.dense_variable_name)): - var_name_list.append(table.dense_variable_name[i]) - self._fleet_ptr.init_model(prog.desc, + var_name = table.dense_variable_name[i] + if scope.find_var(var_name) is None: + print("var " + var_name + " not found in scope, " + "you should run startup program first") + sys.exit(-1) + var_name_list.append(var_name) + self._fleet_ptr.init_model(scope, int(table.table_id), var_name_list) # barrier for init model done diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py index 39094323f..9c557097a 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -107,10 +107,12 @@ class TestDataset(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) for i in range(2): - try: - exe.train_from_dataset(fluid.default_main_program(), dataset) - except: - self.assertTrue(False) + #try: + exe.train_from_dataset(fluid.default_main_program(), dataset) + #except ImportError as e: + # pass + #except Exception as e: + # self.assertTrue(False) os.remove("./test_in_memory_dataset_run_a.txt") os.remove("./test_in_memory_dataset_run_b.txt") @@ -149,10 +151,12 @@ class TestDataset(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) for i in range(2): - try: - exe.train_from_dataset(fluid.default_main_program(), dataset) - except: - self.assertTrue(False) + #try: + exe.train_from_dataset(fluid.default_main_program(), dataset) + #except ImportError as e: + # pass + #except Exception as e: + # self.assertTrue(False) os.remove("./test_queue_dataset_run_a.txt") os.remove("./test_queue_dataset_run_b.txt") diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py index 380c404fb..b91f1d1f3 100644 --- a/python/paddle/fluid/trainer_desc.py +++ b/python/paddle/fluid/trainer_desc.py @@ -23,7 +23,7 @@ class TrainerDesc(object): with open(proto_file, 'r') as f: text_format.Parse(f.read(), self.proto_desc) ''' - from proto import trainer_desc_pb2 + from .proto import trainer_desc_pb2 self.proto_desc = trainer_desc_pb2.TrainerDesc() import multiprocessing as mp # set default thread num == cpu count diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py index 4e957880f..871b66366 100644 --- a/python/paddle/fluid/trainer_factory.py +++ b/python/paddle/fluid/trainer_factory.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .trainer_desc import MultiTrainer, DistMultiTrainer +from .device_worker import Hogwild, DownpourSGD + __all__ = ["TrainerFactory"] @@ -20,8 +23,6 @@ class TrainerFactory(object): pass def _create_trainer(self, opt_info=None): - from .trainer_desc import MultiTrainer, DistMultiTrainer - from .device_worker import Hogwild, DownpourSGD trainer = None device_worker = None if opt_info == None: -- GitLab