From 8b793d0efd96399d10b3b525322f924325341b7d Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Mon, 8 Apr 2019 16:10:05 +0800
Subject: [PATCH] Fix DGC bug. (#16697)

---
 paddle/fluid/framework/details/all_reduce_op_handle.cc | 6 +++++-
 paddle/fluid/framework/details/build_strategy.cc       | 2 +-
 python/paddle/fluid/optimizer.py                       | 2 +-
 python/paddle/fluid/parallel_executor.py               | 6 ++++++
 4 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index ed75b4809..61276efed 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
       this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
     }
   }
+  // TODO(gongwb) :polish them!
+  if (is_encoded) {
+    VLOG(1) << "Use dgc allreduce mode";
+  }
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
@@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() {
         paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
     auto encode_var_name = original_name + g_dgc_encoded;
     auto *in_var = local_scope->FindVar(encode_var_name);
-    PADDLE_ENFORCE_NOT_NULL(in_var);
+    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
     auto &in = in_var->Get<LoDTensor>();
     ins.emplace_back(&in);
 
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 645dd421d..afe5078bf 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -251,7 +251,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
   CreatePassesFromStrategy(false);
 
   for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
-    VLOG(3) << "apply " << pass->Type();
+    VLOG(3) << "BuildStrategy::Apply pass:" << pass->Type();
     if (IsMultiDevPass(pass->Type())) {
       pass->Erase(kPlaces);
       pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 7e6e37116..94bc3d085 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -752,7 +752,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
             force_cpu=True)
 
         for param_var, grad_var in param_and_grads:
-            var_numel = reduce(lambda x, y: x * y, param_var.shape)
+            var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
             if var_numel < 16384 or \
                 param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
                 grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 6b88e7a99..092cd5aea 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -104,6 +104,7 @@ class ParallelExecutor(object):
         self._scope = scope if scope is not None else executor.global_scope()
 
         if main_program is not None and main_program._enable_dgc:
+            assert num_trainers > 1
             assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce
             assert num_trainers * len(
                 self._places) > 1, "dgc is not useful for single card training"
@@ -123,6 +124,11 @@
                 exec_strategy=exec_strategy,
                 share_vars_from=share_vars_from._compiled_program
                 if share_vars_from else None)
+
+        # FIXME(gongwb): I will move dgc from dist mode to allreduce mode in next pr.
+        if main_program._enable_dgc:
+            self._compiled_program._build_strategy.is_distribution = True
+
         self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
         self._exe = executor.Executor(self._place)
         self._compiled_program._compile(place=self._place, scope=self._scope)
-- 
GitLab
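
Why the abs() in the optimizer.py hunk helps: DGCMomentumOptimizer skips variables whose element count is below 16384, but if param_var.shape contains a -1 (unknown) dimension, the raw product is negative, so a large variable would pass the "< 16384" test and be wrongly excluded from DGC. The following is a minimal sketch of that check in plain Python; the helper name and the example shapes are illustrative, only the abs(reduce(...)) expression comes from the patch.

    from functools import reduce

    DGC_MIN_NUMEL = 16384  # size threshold used in the patch

    def skipped_by_dgc(shape):
        # abs() keeps the comparison meaningful when a dimension is -1;
        # without it a shape like [-1, 1024, 64] yields a negative numel.
        var_numel = abs(reduce(lambda x, y: x * y, shape))
        return var_numel < DGC_MIN_NUMEL

    print(skipped_by_dgc([128, 64]))       # True: 8192 elements, too small for DGC
    print(skipped_by_dgc([-1, 1024, 64]))  # False: abs(-65536) = 65536 >= 16384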
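
The parallel_executor.py hunks tighten the preconditions for DGC: the AllReduce reduce strategy is required, there must be more than one trainer, and (for now, per the FIXME) the compiled program is forced into distribution mode. Below is a rough usage sketch that satisfies those assertions. It is written against the fluid 1.4-era API from memory, so argument names (especially for DGCMomentumOptimizer) should be treated as assumptions rather than a reference, and it assumes the script is launched as one of several trainers.

    import paddle.fluid as fluid

    # A tiny illustrative network, just so there is a loss to optimize.
    x = fluid.layers.data(name='x', shape=[32], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

    # DGC momentum optimizer; rampup_begin_step controls when compression starts.
    opt = fluid.optimizer.DGCMomentumOptimizer(
        learning_rate=0.001, momentum=0.9, rampup_begin_step=0)
    opt.minimize(loss)

    # DGC is only allowed with the AllReduce reduce strategy ...
    build_strategy = fluid.BuildStrategy()
    build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

    # ... and, after this patch, only when num_trainers > 1.
    pe = fluid.ParallelExecutor(
        use_cuda=True,
        loss_name=loss.name,
        main_program=fluid.default_main_program(),
        build_strategy=build_strategy,
        num_trainers=2,
        trainer_id=0)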