From 0d17c1b816ec5672d143aad18df3e2497bcb7d29 Mon Sep 17 00:00:00 2001 From: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com> Date: Thu, 21 Nov 2019 20:03:31 +0800 Subject: [PATCH] solve pslib core in stop worker (#21263) * general table * add sparse table test=develop * no cvm test=develop * add no_cvm test=develop * add note test=develop * code style test=develop * code style test=develop * code style test=develop * code style test=develop * code style test=develop * add key of optimizer test=develop * solve pslib stop core test=develop * barrier test=develop * add notes test=develop --- paddle/fluid/framework/fleet/fleet_wrapper.cc | 7 +++++++ paddle/fluid/framework/fleet/fleet_wrapper.h | 2 ++ paddle/fluid/pybind/fleet_wrapper_py.cc | 1 + .../incubate/fleet/parameter_server/pslib/__init__.py | 4 ++++ 4 files changed, 14 insertions(+) diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index d46a7ec1fcd..c4c64881a82 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -91,6 +91,13 @@ void FleetWrapper::StopServer() { #endif } +void FleetWrapper::FinalizeWorker() { +#ifdef PADDLE_WITH_PSLIB + VLOG(3) << "Going to finalize worker"; + pslib_ptr_->finalize_worker(); +#endif +} + uint64_t FleetWrapper::RunServer() { #ifdef PADDLE_WITH_PSLIB VLOG(3) << "Going to run server"; diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index fc98cba853b..aa93e8d28bc 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -147,6 +147,8 @@ class FleetWrapper { int index); // stop server void StopServer(); + // finalize worker so that the worker can be stopped + void FinalizeWorker(); // run server uint64_t RunServer(); // gather server ip diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc index
31268f5e182..679c91e8d8d 100644 --- a/paddle/fluid/pybind/fleet_wrapper_py.cc +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -55,6 +55,7 @@ void BindFleetWrapper(py::module* m) { .def("load_model", &framework::FleetWrapper::LoadModel) .def("clear_model", &framework::FleetWrapper::ClearModel) .def("stop_server", &framework::FleetWrapper::StopServer) + .def("finalize_worker", &framework::FleetWrapper::FinalizeWorker) .def("gather_servers", &framework::FleetWrapper::GatherServers) .def("gather_clients", &framework::FleetWrapper::GatherClients) .def("get_clients_info", &framework::FleetWrapper::GetClientsInfo) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index c6d62b1d027..acebbe82516 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -182,6 +182,10 @@ class PSLib(Fleet): destroyed when stop() is called. """ self._role_maker._barrier_worker() + # all workers should be finalized first + if self._role_maker.is_worker(): + self._fleet_ptr.finalize_worker() + self._role_maker._barrier_worker() if self._role_maker.is_first_worker(): self._fleet_ptr.stop_server() self._role_maker._barrier_worker() -- GitLab