diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index c2eb1c31b4f5625e662436e278a33c55b38bb004..daba9bf2dd861d6366723ed71eada66bc67c78f2 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -34,7 +34,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool skip_scale_loss,
+    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale,
     platform::NCCLContextMap *nccl_ctxs)
     : loss_var_name_(loss_var_name),
       places_(places),
@@ -45,7 +45,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool skip_scale_loss)
+    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale)
     : loss_var_name_(loss_var_name),
       places_(places),
       local_scopes_(local_scopes) {
@@ -53,7 +53,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
   for (auto &p : params) {
     grad_names_.insert(GradVarName(p));
   }
-  skip_scale_loss_ = skip_scale_loss;
+  use_default_grad_scale_ = use_default_grad_scale;
 }
 
 void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
@@ -126,8 +126,8 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     } else if (IsDistTrainOp(*op, send_op)) {
      CreateComputationalOps(&result, *op, 1);
    } else if (IsScaleLossOp(*op)) {
-      // user can customize loss@grad if skip_scale_loss_
-      if (!skip_scale_loss_) {
+      // the user can customize loss@grad if use_default_grad_scale_ is not set
+      if (use_default_grad_scale_) {
         CreateScaleLossGradOp(&result);
       }
       is_forwarding = false;
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index fa4d31bdc49da5d30340a710c950dcc8cd70180b..bad47458ef4cd1cd42e902341e8be66da5c210ed 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -41,7 +41,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                           const std::string &loss_var_name,
                           const std::unordered_set<std::string> &params,
                           const std::vector<Scope *> &local_scopes,
-                          bool skip_scale_loss);
+                          bool use_default_grad_scale);
 #endif
 
   std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
@@ -59,7 +59,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #ifdef PADDLE_WITH_CUDA
   platform::NCCLContextMap *nccl_ctxs_;
 #endif
-  bool skip_scale_loss_;
+  bool use_default_grad_scale_;
 
   bool IsScaleLossOp(const OpDesc &op) const;
 
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index de644e851999920251c762a75c050e8182e950c6..4712efeff68cf98a50b610acf78bb52d7519c520 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -58,7 +58,7 @@ ParallelExecutor::ParallelExecutor(
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
     Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay,
-    bool customize_scale_loss)
+    bool use_default_grad_scale)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
 
@@ -93,11 +93,11 @@ ParallelExecutor::ParallelExecutor(
 #ifdef PADDLE_WITH_CUDA
   details::MultiDevSSAGraphBuilder builder(
       member_->places_, loss_var_name, params, member_->local_scopes_,
-      customize_scale_loss, member_->nccl_ctxs_.get());
+      use_default_grad_scale, member_->nccl_ctxs_.get());
 #else
   details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
                                            params, member_->local_scopes_,
-                                           customize_scale_loss);
+                                           use_default_grad_scale);
 #endif
 
   auto graph = builder.Build(main_program);
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 49da123d98181c3d3abcdd64d14c5583142eba58..ecd107d81f8f5bf5d8b899d0c07797114a7ab767 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -40,7 +40,7 @@ class ParallelExecutor {
                             const ProgramDesc& main_program,
                             const std::string& loss_var_name, Scope* scope,
                             const std::vector<Scope*>& local_scopes,
-                            bool allow_op_delay, bool customize_scale_loss);
+                            bool allow_op_delay, bool use_default_grad_scale);
 
   ~ParallelExecutor();
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index b20b514fcdd0b41fefa0933bc2d22645e7d4b6d6..c925686f8382da1758fb7cdc048253290ef69513 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -502,11 +502,11 @@ All parameter, weight, gradient are variables in Paddle.
              const std::unordered_set<std::string> &bcast_vars,
              const ProgramDesc &main_program, const std::string &loss_var_name,
              Scope *scope, std::vector<Scope *> &local_scopes,
-             bool allow_op_delay, bool customize_loss_grad) {
-            new (&self) ParallelExecutor(num_threads, use_event, places,
-                                         params, bcast_vars, main_program,
-                                         loss_var_name, scope, local_scopes,
-                                         allow_op_delay, customize_loss_grad);
+             bool allow_op_delay, bool use_default_grad_scale) {
+            new (&self) ParallelExecutor(
+                num_threads, use_event, places, params, bcast_vars,
+                main_program, loss_var_name, scope, local_scopes,
+                allow_op_delay, use_default_grad_scale);
           })
       .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
       // NOTE: even we return a vec* to Python use reference policy.
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 4adbb2ea99b58c78c5c08c7ac8a556ca1de1615e..f4128dcbe935ea5caf63b8c0b377f9bed36d6705 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -30,7 +30,7 @@ class ParallelExecutor(object):
                  num_threads=None,
                  allow_op_delay=False,
                  share_vars_from=None,
-                 customize_loss_grad=False):
+                 use_default_grad_scale=True):
         """
         ParallelExecutor can run program in parallel.
 
@@ -46,6 +46,11 @@
                 improve performance in some cases, defalut False.
             share_vars_from(ParallelExecutor, default None): If provied,
                 it will share variables from the specified ParallelExecutor.
+            use_default_grad_scale(bool, default True): If set True, a default
+                scale value equal to `1./device_count` will be multiplied to
+                the gradients of each device, and the scaled gradients will be
+                aggregated. Otherwise, a customized scale value should be fed
+                to the network.
 
         Returns:
             A ParallelExecutor object.
@@ -124,7 +129,7 @@
             scope,
             local_scopes,
            allow_op_delay,
-            customize_loss_grad)
+            use_default_grad_scale)
         self.scope = scope
 
     def run(self, fetch_list, feed=None, feed_dict=None):
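
Usage sketch (illustrative only, not part of the patch above): the snippet shows the renamed flag from the Python side. Only `use_default_grad_scale` and `run(fetch_list, feed=...)` come from this diff; the leading `ParallelExecutor` arguments (`use_cuda`, `loss_name`), the layer and optimizer calls, and the tensor names are assumptions based on the fluid API of this era.

    import numpy
    import paddle.fluid as fluid

    # Build a tiny regression network; these layer calls are standard fluid
    # API of the time and are assumed here, not introduced by this patch.
    x = fluid.layers.data(name='x', shape=[4], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(
        fluid.layers.square_error_cost(input=pred, label=y))
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

    fluid.Executor(fluid.CUDAPlace(0)).run(fluid.default_startup_program())

    # Default behaviour (use_default_grad_scale=True): each device's gradients
    # are scaled by 1./device_count before they are aggregated.
    pe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
    feed = {'x': numpy.random.rand(8, 4).astype('float32'),
            'y': numpy.random.rand(8, 1).astype('float32')}
    loss_value = pe.run([loss.name], feed=feed)[0]

    # Custom scaling (previously spelled customize_loss_grad=True): disable the
    # default factor; the loss gradient must then be supplied by the user, as
    # described in the new docstring.
    pe_custom = fluid.ParallelExecutor(
        use_cuda=True, loss_name=loss.name, use_default_grad_scale=False)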