diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 002952436e58eecfcecf5c9fa40c01b795170681..10d39e779336e2001d66a55ac6d01ee768ddd4ff 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -34,7 +34,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, - const std::vector &local_scopes, + const std::vector &local_scopes, bool skip_scale_loss, platform::NCCLContextMap *nccl_ctxs) : loss_var_name_(loss_var_name), places_(places), @@ -45,7 +45,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, - const std::vector &local_scopes) + const std::vector &local_scopes, bool skip_scale_loss) : loss_var_name_(loss_var_name), places_(places), local_scopes_(local_scopes) { @@ -53,6 +53,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( for (auto &p : params) { grad_names_.insert(GradVarName(p)); } + skip_scale_loss_ = skip_scale_loss; } void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, @@ -95,7 +96,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( // always use the first device CreateSendOp(&result, *op); } else if (IsScaleLossOp(*op)) { - CreateScaleLossGradOp(&result); + if (!skip_scale_loss_) { + CreateScaleLossGradOp(&result); + } is_forwarding = false; } else { CreateComputationalOps(&result, *op); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index b5ba2dbd3c00f23fabd993d7908664db38a31941..009c31b40c279ae3e924d2d7c67933e7444ed85c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -34,12 +34,14 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::string &loss_var_name, const std::unordered_set ¶ms, const std::vector &local_scopes, + bool skip_scale_loss, platform::NCCLContextMap *nccl_ctxs); #else MultiDevSSAGraphBuilder(const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, - const std::vector &local_scopes); + const std::vector &local_scopes, + bool skip_scale_loss); #endif std::unique_ptr Build(const ProgramDesc &program) const override; @@ -57,6 +59,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { #ifdef PADDLE_WITH_CUDA platform::NCCLContextMap *nccl_ctxs_; #endif + bool skip_scale_loss_; bool IsScaleLossOp(const OpDesc &op) const; diff --git a/paddle/fluid/framework/details/send_op_handle.cc b/paddle/fluid/framework/details/send_op_handle.cc index 549b9d9abbe5bfd17df3509e0442bfa19b7ecd61..0763f92171e7813ec0ee8ca4f3aa42b76205130a 100644 --- a/paddle/fluid/framework/details/send_op_handle.cc +++ b/paddle/fluid/framework/details/send_op_handle.cc @@ -34,7 +34,10 @@ void SendOpHandle::RunImpl() { } in->generated_op_->Wait(dev_ctxes_[p]); } - this->RunAndRecordEvent([&] { op_->Run(*local_scope_, place_); }); + auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get(); + // FIXME(wuyi): can not use RunAndRecordEvent here, for it will cause dead + // lock. + op_->Run(*tmp_scope, place_); } std::string SendOpHandle::Name() const { return "send"; } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 67e02e2f119707bba376056510a8ca1034590b55..a673fa52880f3f14cdf11a39d2272880a97be19c 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -57,7 +57,8 @@ ParallelExecutor::ParallelExecutor( const std::unordered_set ¶ms, const std::unordered_set &bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, - Scope *scope, const std::vector &local_scopes, bool allow_op_delay) + Scope *scope, const std::vector &local_scopes, bool allow_op_delay, + bool customize_scale_loss) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; @@ -90,12 +91,13 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp #ifdef PADDLE_WITH_CUDA - details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, - params, member_->local_scopes_, - member_->nccl_ctxs_.get()); + details::MultiDevSSAGraphBuilder builder( + member_->places_, loss_var_name, params, member_->local_scopes_, + customize_scale_loss, member_->nccl_ctxs_.get()); #else details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, - params, member_->local_scopes_); + params, member_->local_scopes_, + customize_scale_loss); #endif auto graph = builder.Build(main_program); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index f4f283bb4b5eafc33619c98b5f30e1e8f453ece3..49da123d98181c3d3abcdd64d14c5583142eba58 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -40,7 +40,7 @@ class ParallelExecutor { const ProgramDesc& main_program, const std::string& loss_var_name, Scope* scope, const std::vector& local_scopes, - bool allow_op_delay); + bool allow_op_delay, bool customize_scale_loss); ~ParallelExecutor(); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1f21e7abe76b2a32d6c18e5c26c4f25b65daef5b..b20b514fcdd0b41fefa0933bc2d22645e7d4b6d6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -502,11 +502,11 @@ All parameter, weight, gradient are variables in Paddle. const std::unordered_set &bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope, std::vector &local_scopes, - bool allow_op_delay) { - new (&self) - ParallelExecutor(num_threads, use_event, places, params, - bcast_vars, main_program, loss_var_name, - scope, local_scopes, allow_op_delay); + bool allow_op_delay, bool customize_loss_grad) { + new (&self) ParallelExecutor(num_threads, use_event, places, + params, bcast_vars, main_program, + loss_var_name, scope, local_scopes, + allow_op_delay, customize_loss_grad); }) .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs) // NOTE: even we return a vec* to Python use reference policy. diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index fbdd6fd449625a21f91758dc12490b02070aea1a..4adbb2ea99b58c78c5c08c7ac8a556ca1de1615e 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -29,7 +29,8 @@ class ParallelExecutor(object): main_program=None, num_threads=None, allow_op_delay=False, - share_vars_from=None): + share_vars_from=None, + customize_loss_grad=False): """ ParallelExecutor can run program in parallel. @@ -78,7 +79,7 @@ class ParallelExecutor(object): else: for i in xrange(multiprocessing.cpu_count()): p = core.Place() - self._act_places.append(core.CPUPlace(i)) + self._act_places.append(core.CPUPlace()) p.set_place(self._act_places[-1]) self._places.append(p) assert self._places, "no place for execution" @@ -122,7 +123,8 @@ class ParallelExecutor(object): loss_name if loss_name else '', scope, local_scopes, - allow_op_delay) + allow_op_delay, + customize_loss_grad) self.scope = scope def run(self, fetch_list, feed=None, feed_dict=None):