diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index ed75b48090b27a9c430afe067467a1a39d711938..61276efedeeca76a8818c15ddab73b3c53725c4b 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
       this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
     }
   }
+  // TODO(gongwb): polish them!
+  if (is_encoded) {
+    VLOG(1) << "Use dgc allreduce mode";
+  }
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
@@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() {
         paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
     auto encode_var_name = original_name + g_dgc_encoded;
     auto *in_var = local_scope->FindVar(encode_var_name);
-    PADDLE_ENFORCE_NOT_NULL(in_var);
+    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
     auto &in = in_var->Get<LoDTensor>();
     ins.emplace_back(&in);
 
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 645dd421d48b89c26c3489a7de02371736556b2d..afe5078bf80d00b595789a5f45d91a5e7a8dfce6 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -251,7 +251,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
   CreatePassesFromStrategy(false);
 
   for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
-    VLOG(3) << "apply " << pass->Type();
+    VLOG(3) << "BuildStrategy::Apply pass:" << pass->Type();
     if (IsMultiDevPass(pass->Type())) {
       pass->Erase(kPlaces);
       pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 7e6e37116fe23f26eb14dd0573dbe031aec98dd8..94bc3d0854d5b17de4811aca07c35fbaa0e382ca 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -752,7 +752,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
             force_cpu=True)
 
         for param_var, grad_var in param_and_grads:
-            var_numel = reduce(lambda x, y: x * y, param_var.shape)
+            var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
             if var_numel < 16384 or \
                     param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
                     grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 6b88e7a99fd78f6a7670ba55bc678e85d229ddf4..092cd5aea7d2f3ae7e5ba927261921fbe28f51bf 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -104,6 +104,7 @@ class ParallelExecutor(object):
         self._scope = scope if scope is not None else executor.global_scope()
 
         if main_program is not None and main_program._enable_dgc:
+            assert num_trainers > 1
             assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce
             assert num_trainers * len(
                 self._places) > 1, "dgc is not useful for single card training"
@@ -123,6 +124,11 @@ class ParallelExecutor(object):
             exec_strategy=exec_strategy,
             share_vars_from=share_vars_from._compiled_program
             if share_vars_from else None)
+
+        # FIXME(gongwb): I will move dgc from dist mode to allreduce mode in the next PR.
+        if main_program._enable_dgc:
+            self._compiled_program._build_strategy.is_distribution = True
+
         self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
         self._exe = executor.Executor(self._place)
         self._compiled_program._compile(place=self._place, scope=self._scope)
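
For context, a minimal Python sketch of how the constraints asserted above surface at the user API level. This is illustrative only: it assumes the `fluid` API of this branch (`DGCMomentumOptimizer`, `ParallelExecutor`, `BuildStrategy`); the toy network, `rampup_begin_step=0`, and the two-trainer values are hypothetical and not part of this diff.

```python
import paddle.fluid as fluid

# Hypothetical toy network; any loss variable would do.
data = fluid.layers.data(name='x', shape=[32], dtype='float32')
label = fluid.layers.data(name='y', shape=[1], dtype='int64')
fc = fluid.layers.fc(input=data, size=10, act='softmax')
loss = fluid.layers.mean(fluid.layers.cross_entropy(input=fc, label=label))

# The DGC optimizer is expected to flag the program for DGC
# (the main_program._enable_dgc check above); rampup_begin_step=0 is arbitrary.
optimizer = fluid.optimizer.DGCMomentumOptimizer(
    learning_rate=0.001, momentum=0.9, rampup_begin_step=0)
optimizer.minimize(loss)

# The asserts added in parallel_executor.py require AllReduce
# and more than one trainer/card.
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

pe = fluid.ParallelExecutor(
    use_cuda=True,
    loss_name=loss.name,
    main_program=fluid.default_main_program(),
    build_strategy=build_strategy,
    num_trainers=2,  # must be > 1 for DGC, per the new assert
    trainer_id=0)
```

The sketch omits the multi-process NCCL2 launch setup (trainer endpoints, environment variables) that distributed DGC training additionally requires.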