diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 7be93fa6002ae93c3e1b75c8f7fe5ca5f40b271f..74945fb4f2f745b6ca9c48adb0c8b9e6ae1e94a4 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/parallel_executor.h"
-#include "paddle/fluid/platform/profiler.h"
 #include <string>
 #include <vector>
 
@@ -24,6 +23,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -43,30 +43,40 @@ class ParallelExecutorPrivate {
 #endif
 };
 
+std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
+  return member_->local_scopes_;
+}
+
 ParallelExecutor::ParallelExecutor(
     size_t num_threads, bool use_event,
    const std::vector<platform::Place> &places,
     const std::unordered_set<std::string> &params,
-    const ProgramDesc &startup_program, const ProgramDesc &main_program,
-    const std::string &loss_var_name, Scope *scope, bool allow_op_delay)
+    const std::unordered_set<std::string> &bcast_vars,
+    const ProgramDesc &main_program, const std::string &loss_var_name,
+    Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
 
-  // Step 1. RunStartupProgram and Bcast the params to devs.
-  Executor exe(places[0]);
-  exe.Run(startup_program, scope, 0);
+  // Step 1. Bcast the params to devs.
   // Create local scopes
-  for (size_t i = 0; i < member_->places_.size(); ++i) {
-    member_->local_scopes_.push_back(&scope->NewScope());
+  if (local_scopes.empty()) {
+    for (size_t i = 0; i < member_->places_.size(); ++i) {
+      member_->local_scopes_.push_back(&scope->NewScope());
+    }
+  } else {
+    PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
+    for (size_t i = 0; i < member_->places_.size(); ++i) {
+      member_->local_scopes_.push_back(local_scopes[i]);
+    }
   }
 
   // Bcast Parameters to all GPUs
 #ifdef PADDLE_WITH_CUDA
   member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
 #endif
-  if (platform::is_gpu_place(places[0]) &&
-      member_->local_scopes_.size() != 1) {  // Is CUDA
-    BCastParamsToGPUs(startup_program);
+  if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
+      local_scopes.empty()) {  // Is CUDA
+    BCastParamsToGPUs(bcast_vars);
   }
   // Startup Program has been run. All local scopes has correct parameters.
@@ -99,48 +109,47 @@ ParallelExecutor::ParallelExecutor(
 }
 
 void ParallelExecutor::BCastParamsToGPUs(
-    const ProgramDesc &startup_program) const {
+    const std::unordered_set<std::string> &vars) const {
 #ifdef PADDLE_WITH_CUDA
   auto *main_scope = member_->local_scopes_[0];
 
-  for (auto *var_desc : startup_program.Block(0).AllVars()) {
-    size_t idx = var_desc->Name().find("@GRAD");
-    if (idx != std::string::npos) continue;
-    if (var_desc->GetType() == proto::VarType::LOD_TENSOR) {
-      auto &main_tensor =
-          main_scope->FindVar(var_desc->Name())->Get<LoDTensor>();
-
-      auto &dims = main_tensor.dims();
-
-      if (paddle::platform::is_gpu_place(main_tensor.place())) {
-        size_t numel = main_tensor.numel();
-        ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
-        platform::NCCLGroupGuard guard;
-        for (size_t i = 0; i < member_->places_.size(); ++i) {
-          auto place = member_->places_[i];
-          void *buffer;
-          if (i == 0) {
-            buffer = const_cast<void *>(main_tensor.data<void>());
-          } else {
-            auto local_scope = member_->local_scopes_[i];
-            auto *t =
-                local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
-            t->Resize(dims);
-            buffer = t->mutable_data(place, main_tensor.type());
-          }
-          auto &nccl_ctx = member_->nccl_ctxs_->at(place);
-          platform::dynload::ncclBcast(buffer, numel, data_type, 0,
-                                       nccl_ctx.comm_, nccl_ctx.stream());
-        }
-      } else {
-        platform::CPUPlace cpu;
-        for (size_t i = 1; i < member_->places_.size(); ++i) {
+  for (auto &var : vars) {
+    auto *main_var = main_scope->FindVar(var);
+    if (!main_var->IsType<LoDTensor>()) {
+      continue;
+    }
+
+    auto &main_tensor = main_var->Get<LoDTensor>();
+
+    auto &dims = main_tensor.dims();
+
+    if (paddle::platform::is_gpu_place(main_tensor.place())) {
+      size_t numel = main_tensor.numel();
+      ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
+      platform::NCCLGroupGuard guard;
+      for (size_t i = 0; i < member_->places_.size(); ++i) {
+        auto place = member_->places_[i];
+        void *buffer;
+        if (i == 0) {
+          buffer = const_cast<void *>(main_tensor.data<void>());
+        } else {
           auto local_scope = member_->local_scopes_[i];
-          auto *t = local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
+          auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
           t->Resize(dims);
-          t->mutable_data(cpu, main_tensor.type());
-          paddle::framework::TensorCopy(main_tensor, cpu, t);
+          buffer = t->mutable_data(place, main_tensor.type());
         }
+        auto &nccl_ctx = member_->nccl_ctxs_->at(place);
+        platform::dynload::ncclBcast(buffer, numel, data_type, 0,
+                                     nccl_ctx.comm_, nccl_ctx.stream());
+      }
+    } else {
+      platform::CPUPlace cpu;
+      for (size_t i = 1; i < member_->places_.size(); ++i) {
+        auto local_scope = member_->local_scopes_[i];
+        auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
+        t->Resize(dims);
+        t->mutable_data(cpu, main_tensor.type());
+        paddle::framework::TensorCopy(main_tensor, cpu, t);
       }
     }
   }
   member_->nccl_ctxs_->WaitAll();
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index c7c58b2b808383621a6d492f9188b0d36bfa6858..c048c3865f14822be4a0015e385ea1b8e05d0ced 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -36,11 +36,14 @@ class ParallelExecutor {
   explicit ParallelExecutor(size_t num_threads, bool use_event,
                             const std::vector<platform::Place>& places,
                             const std::unordered_set<std::string>& params,
-                            const ProgramDesc& startup_program,
+                            const std::unordered_set<std::string>& bcast_vars,
                             const ProgramDesc& main_program,
                             const std::string& loss_var_name, Scope* scope,
+                            const std::vector<Scope*>& local_scopes,
                             bool allow_op_delay);
 
+  std::vector<Scope*>& GetLocalScopes();
+
   void Run(const std::vector<std::string>& fetch_tensors,
            const std::string& fetched_var_name,
            const std::unordered_map<std::string, LoDTensor>& feed_tensors);
 
@@ -51,7 +54,7 @@ class ParallelExecutor {
 
   ParallelExecutorPrivate* member_;
 
-  void BCastParamsToGPUs(const ProgramDesc& startup_program) const;
+  void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 748ad75a99ea4955730327a10ae8468a107fed0a..bd8446df6650f5fb1c62e5370fd48216dbf31e17 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -544,13 +544,20 @@ All parameter, weight, gradient are variables in Paddle.
           [](ParallelExecutor &self, size_t num_threads, bool use_event,
              const std::vector<platform::Place> &places,
              const std::unordered_set<std::string> &params,
-             const ProgramDesc &startup_program,
+             const std::unordered_set<std::string> &bcast_vars,
              const ProgramDesc &main_program, const std::string &loss_var_name,
-             Scope *scope, bool allow_op_delay) {
-            new (&self) ParallelExecutor(num_threads, use_event, places,
-                                         params, startup_program, main_program,
-                                         loss_var_name, scope, allow_op_delay);
+             Scope *scope, std::vector<Scope *> &local_scopes,
+             bool allow_op_delay) {
+            new (&self)
+                ParallelExecutor(num_threads, use_event, places, params,
+                                 bcast_vars, main_program, loss_var_name,
+                                 scope, local_scopes, allow_op_delay);
           })
+      .def("local_scopes",
+           [](ParallelExecutor &self) -> std::vector<Scope *> * {
+             return &self.GetLocalScopes();
+           },
+           py::return_value_policy::reference)
       .def("run", &ParallelExecutor::Run);
 
   BindRecordIOWriter(&m);
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 1b3ba414ecb50cc4d75dcaecd1f31265334c9aec..b93f2f974ca28cfd8d03c0dbbf1d401620a15e53 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -22,10 +22,49 @@ __all__ = ['ParallelExecutor']
 
 class ParallelExecutor(object):
     def __init__(self,
-                 loss_name,
                  use_cuda,
+                 loss_name=None,
+                 main_program=None,
                  num_threads=None,
-                 allow_op_delay=False):
+                 allow_op_delay=False,
+                 share_vars_from=None):
+        """
+        ParallelExecutor can run a program in parallel.
+
+        Args:
+            use_cuda(bool): Whether to use CUDA or not.
+            loss_name(str, default None): The loss name, which must be set
+                in training.
+            main_program(Program, default None): The program that needs to
+                be run; if not provided, default_main_program will be used.
+            num_threads(int, default None): How many threads are used for
+                training.
+            allow_op_delay(bool, default False): Whether to delay and buffer
+                some operators together for scheduling or not, which may
+                improve performance in some cases, default False.
+            share_vars_from(ParallelExecutor, default None): If provided,
+                it will share variables from the specified ParallelExecutor.
+
+        Returns:
+            A ParallelExecutor object.
+
+        Raises:
+            TypeError: If share_vars_from is provided but is not a
+                ParallelExecutor object.
+
+        Examples:
+            .. code-block:: python
+              train_exe = fluid.ParallelExecutor(
+                  use_cuda=True, loss_name=loss.name)
+              test_exe = fluid.ParallelExecutor(
+                  use_cuda=True,
+                  main_program=test_program,
+                  share_vars_from=train_exe)
+
+              train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
+              test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
+        """
+
         self._places = []
         self._act_places = []
         if use_cuda:
@@ -50,10 +89,21 @@ class ParallelExecutor(object):
         else:
             min(len(self._places) * 2, multiprocessing.cpu_count())
 
-        startup = framework.default_startup_program()
-        main = framework.default_main_program()
+        main = main_program
+        main = main if main else framework.default_main_program()
         scope = executor.global_scope()
 
+        if share_vars_from and not isinstance(share_vars_from,
+                                              ParallelExecutor):
+            raise TypeError("share_vars_from must be ParallelExecutor.")
+        local_scopes = share_vars_from.executor.local_scopes(
+        ) if share_vars_from else []
+
+        persistable_vars = [
+            v.name
+            for v in filter(lambda var: var.persistable, main.list_vars())
+        ]
+
         self.executor = core.ParallelExecutor(
             num_threads,
             True if use_cuda else False,  # use_event
@@ -62,10 +112,11 @@ class ParallelExecutor(object):
                 p.name for p in main.global_block().iter_parameters()
                 if not p.stop_gradient
             ]),
-            startup.desc,
+            set(persistable_vars),
             main.desc,
-            loss_name,
+            loss_name if loss_name else '',
             scope,
+            local_scopes,
             allow_op_delay)
         self.scope = scope
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index 0f90e0e4df5da93f427b892d1be69f14625d2e29..8401716db88ef3dda68644a052d78b4476c9fdc7 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -207,7 +207,11 @@ class TestParallelExecutorBase(unittest.TestCase):
         if memory_opt:
             fluid.memory_optimize(main)
 
-        exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True)
+        place = fluid.CUDAPlace(0)
+        startup_exe = fluid.Executor(place)
+        startup_exe.run(startup)
+
+        exe = fluid.ParallelExecutor(True, loss_name=loss.name)
         if batch_size is not None:
             batch_size *= fluid.core.get_cuda_device_count()
         begin = time.time()
@@ -453,3 +457,41 @@ class TestTransformer(TestParallelExecutorBase):
     @unittest.skip("transformer is buggy in multi gpu")
     def test_main(self):
         self.check_network_convergence(transformer)
+
+
+class ParallelExecutorTestingDuringTraining(unittest.TestCase):
+    def test_parallel_testing(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = simple_fc_net(True)
+            test_program = main.clone(for_test=True)
+
+            opt = fluid.optimizer.SGD(learning_rate=0.0001)
+            opt.minimize(loss)
+
+            batch_size = 32
+            image = numpy.random.normal(size=(batch_size,
+                                              784)).astype('float32')
+            label = numpy.random.randint(0, 10, (batch_size, 1), dtype="int64")
+
+            place = fluid.CUDAPlace(0)
+            exe = fluid.Executor(place)
+            exe.run(startup)
+            feed_dict = {'image': image, 'label': label}
+
+            train_exe = fluid.ParallelExecutor(
+                use_cuda=True, loss_name=loss.name, main_program=main)
+
+            test_exe = fluid.ParallelExecutor(
+                use_cuda=True,
+                main_program=test_program,
+                share_vars_from=train_exe)
+
+            for i in xrange(5):
+                test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
+                test_loss = numpy.array(test_loss)
+
+                train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
+                train_loss = numpy.array(train_loss)
+                self.assertTrue(numpy.allclose(train_loss, test_loss))
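
Reviewer note: a minimal, self-contained sketch of the workflow this patch enables, distilled from the new docstring and the ParallelExecutorTestingDuringTraining test above. `build_model` is a hypothetical helper standing in for any network definition (the test uses `simple_fc_net`, whose inputs are named 'image' and 'label'); everything else uses only the public API touched by this diff.

    import numpy
    import paddle.fluid as fluid

    main = fluid.Program()
    startup = fluid.Program()
    with fluid.program_guard(main, startup):
        loss = build_model()                      # hypothetical model builder
        test_program = main.clone(for_test=True)  # clone before optimizer ops are added
        fluid.optimizer.SGD(learning_rate=0.0001).minimize(loss)

    # The startup program must now be run explicitly; this used to happen
    # inside ParallelExecutor's constructor and is now the caller's job.
    fluid.Executor(fluid.CUDAPlace(0)).run(startup)

    # The training executor creates the per-device local scopes and
    # broadcasts the persistable variables to every GPU once.
    train_exe = fluid.ParallelExecutor(
        use_cuda=True, loss_name=loss.name, main_program=main)

    # The test executor reuses those scopes via share_vars_from, so it sees
    # the freshly trained parameters without a second broadcast.
    test_exe = fluid.ParallelExecutor(
        use_cuda=True, main_program=test_program, share_vars_from=train_exe)

    # Feed names assume simple_fc_net-style inputs, as in the unit test.
    image = numpy.random.normal(size=(32, 784)).astype('float32')
    label = numpy.random.randint(0, 10, (32, 1), dtype='int64')
    feed_dict = {'image': image, 'label': label}

    train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
    test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)

Because both executors share the same local scopes, evaluating and training on the same batch should yield matching losses, which is exactly what the new test asserts with numpy.allclose.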