diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 8147c7746192a91bb82c2aa754c5664def4c142f..394ff24c466622956b18b3012c146f6f9ddd838e 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -237,6 +237,7 @@ void FleetWrapper::PushDenseParamSync( std::vector regions; for (auto& t : var_names) { Variable* var = scope.FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; LoDTensor* tensor = var->GetMutable(); float* g = tensor->mutable_data(place); paddle::ps::Region reg(g, tensor->numel()); diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc index bcfa4f44ff1c6561cbbd60b76f75de1c8461a88a..ab671cb5690df51c1cff141906c40cc9e74584fa 100644 --- a/paddle/fluid/framework/io/shell.cc +++ b/paddle/fluid/framework/io/shell.cc @@ -126,7 +126,7 @@ static int shell_popen_fork_internal(const char* real_cmd, bool do_read, } close_open_fds_internal(); - if (execl("/bin/sh", "sh", "-c", real_cmd, NULL) < 0) { + if (execl("/bin/bash", "bash", "-c", real_cmd, NULL) < 0) { return -1; } exit(127); diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index e536339506a77b5d9d465c4f5c77727c6440c8d8..e15197037e1d901855883919b02a1574b7bc9a29 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -712,7 +712,7 @@ class Executor(object): if dataset == None: raise RuntimeError("dataset is needed and should be initialized") - if self.place == paddle.fluid.CUDAPlace(): + if not isinstance(self.place, core.CPUPlace): raise RuntimeError("infer_from_dataset is verified on CPUPlace" "We will open CUDAPlace in the future") diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py index 044aa33c2b5b572aa40169e8c57936b105ba0121..531a07e026508811020075171ee7aa86aba0c0d5 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py @@ -123,18 +123,23 @@ class Fleet(object): print("You should run DistributedOptimizer.minimize() first") sys.exit(-1) - def init_worker(self, programs): + def init_worker(self, programs, scopes=None): """ init_worker(): will be called by user. When a user knows current process is_server(), he/she should call init_worker() to initialize global information about worker and connect - worker with pserver. + worker with pserver. You should run startup program before init_worker. Args: programs(Program|list): a Program or a list of Programs - + scopes(Scope|list): a Scope or a list of Scopes, default None. """ if not isinstance(programs, list): programs = [programs] + if scopes is None: + scopes = [fluid.global_scope()] * len(programs) + if len(scopes) != len(programs): + print("You should make sure len(scopes) == len(programs) or set scopes None") + sys.exit(-1) if self._opt_info: if "fleet_desc" in self._opt_info: self._dist_desc_str = text_format.MessageToString( @@ -160,7 +165,7 @@ class Fleet(object): self.role_maker_._barrier_worker() if self.role_maker_._is_first_worker(): tables = self._dist_desc.trainer_param.dense_table - for prog in programs: + for prog, scope in zip(programs, scopes): prog_id = str(id(prog)) prog_conf = self._opt_info['program_configs'][prog_id] prog_tables = {} @@ -174,8 +179,13 @@ class Fleet(object): continue var_name_list = [] for i in range(0, len(table.dense_variable_name)): - var_name_list.append(table.dense_variable_name[i]) - self._fleet_ptr.init_model(prog.desc, + var_name = table.dense_variable_name[i] + if scope.find_var(var_name) is None: + print("var " + var_name + " not found in scope, " + "you should run startup program first") + sys.exit(-1) + var_name_list.append(var_name) + self._fleet_ptr.init_model(scope, int(table.table_id), var_name_list) # barrier for init model done diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py index 39094323f3309ec7014e01e0bb2755cf84381e2f..9c557097a9f07a2e50a3e468584019da679c5a06 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -107,10 +107,12 @@ class TestDataset(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) for i in range(2): - try: - exe.train_from_dataset(fluid.default_main_program(), dataset) - except: - self.assertTrue(False) + #try: + exe.train_from_dataset(fluid.default_main_program(), dataset) + #except ImportError as e: + # pass + #except Exception as e: + # self.assertTrue(False) os.remove("./test_in_memory_dataset_run_a.txt") os.remove("./test_in_memory_dataset_run_b.txt") @@ -149,10 +151,12 @@ class TestDataset(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) for i in range(2): - try: - exe.train_from_dataset(fluid.default_main_program(), dataset) - except: - self.assertTrue(False) + #try: + exe.train_from_dataset(fluid.default_main_program(), dataset) + #except ImportError as e: + # pass + #except Exception as e: + # self.assertTrue(False) os.remove("./test_queue_dataset_run_a.txt") os.remove("./test_queue_dataset_run_b.txt") diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py index 380c404fb2d6a36bf3732ebbff4b6cef22f71362..b91f1d1f3cf46ed0543c05914ffff852df0d4877 100644 --- a/python/paddle/fluid/trainer_desc.py +++ b/python/paddle/fluid/trainer_desc.py @@ -23,7 +23,7 @@ class TrainerDesc(object): with open(proto_file, 'r') as f: text_format.Parse(f.read(), self.proto_desc) ''' - from proto import trainer_desc_pb2 + from .proto import trainer_desc_pb2 self.proto_desc = trainer_desc_pb2.TrainerDesc() import multiprocessing as mp # set default thread num == cpu count diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py index 4e957880f77a41d3dad9582bc7cc09af1d1a253b..871b663663e87a08ef3edaf58a4480b85caf4c4a 100644 --- a/python/paddle/fluid/trainer_factory.py +++ b/python/paddle/fluid/trainer_factory.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .trainer_desc import MultiTrainer, DistMultiTrainer +from .device_worker import Hogwild, DownpourSGD + __all__ = ["TrainerFactory"] @@ -20,8 +23,6 @@ class TrainerFactory(object): pass def _create_trainer(self, opt_info=None): - from .trainer_desc import MultiTrainer, DistMultiTrainer - from .device_worker import Hogwild, DownpourSGD trainer = None device_worker = None if opt_info == None: