diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index ed75b48090b27a9c430afe067467a1a39d711938..61276efedeeca76a8818c15ddab73b3c53725c4b 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p)); } } + // TODO(gongwb) :polish them! + if (is_encoded) { + VLOG(1) << "Use dgc allreduce mode"; + } } #else AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, @@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() { paddle::framework::GradOriginalVarName(in_var_handles[i]->name()); auto encode_var_name = original_name + g_dgc_encoded; auto *in_var = local_scope->FindVar(encode_var_name); - PADDLE_ENFORCE_NOT_NULL(in_var); + PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name); auto &in = in_var->Get(); ins.emplace_back(&in); diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 7e6e37116fe23f26eb14dd0573dbe031aec98dd8..94bc3d0854d5b17de4811aca07c35fbaa0e382ca 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -752,7 +752,7 @@ class DGCMomentumOptimizer(MomentumOptimizer): force_cpu=True) for param_var, grad_var in param_and_grads: - var_numel = reduce(lambda x, y: x * y, param_var.shape) + var_numel = abs(reduce(lambda x, y: x * y, param_var.shape)) if var_numel < 16384 or \ param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \ grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \ diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 6b88e7a99fd78f6a7670ba55bc678e85d229ddf4..1a91dafb8a5048c293a3001d6043282775a4cddd 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -104,10 +104,11 @@ class ParallelExecutor(object): self._scope = scope if scope is not None else executor.global_scope() if main_program is not None and main_program._enable_dgc: + assert num_trainers > 1, "dgc is not useful for single trainer training." assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce assert num_trainers * len( - self._places) > 1, "dgc is not useful for single card training" - assert use_cuda + self._places) > 1, "dgc is not useful for single card training." + assert use_cuda, "dgc only used when cuda is used." main_program = main_program if main_program is not None \ else framework.default_main_program() @@ -123,6 +124,11 @@ class ParallelExecutor(object): exec_strategy=exec_strategy, share_vars_from=share_vars_from._compiled_program if share_vars_from else None) + + # FIXME(gongwb): I will move dgc from dist mode to allreduce mode in next pr. + if main_program._enable_dgc: + self._compiled_program._build_strategy.is_distribution = True + self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace() self._exe = executor.Executor(self._place) self._compiled_program._compile(place=self._place, scope=self._scope)