From cbdb8a17b1e7ab27fd7429c4a94db898d9a05b24 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Thu, 18 Apr 2019 13:09:58 +0800
Subject: [PATCH] Polish DGC code (#16818)

---
 CMakeLists.txt                                 |   8 +-
 cmake/inference_lib.cmake                      |   9 -
 paddle/fluid/framework/details/CMakeLists.txt  |  16 +-
 .../framework/details/all_reduce_op_handle.cc  | 185 +----------------
 .../framework/details/all_reduce_op_handle.h   |  22 +-
 .../framework/details/dgc_const_values.h       |  32 +++
 .../details/multi_devices_graph_pass.cc        |  42 +++-
 .../details/multi_devices_graph_pass.h         |   4 +-
 .../details/sparse_all_reduce_op_handle.cc     | 188 ++++++++++++++++++
 .../details/sparse_all_reduce_op_handle.h      |  52 +++++
 paddle/fluid/inference/CMakeLists.txt          |   5 -
 paddle/fluid/operators/CMakeLists.txt          |   2 +-
 paddle/fluid/platform/CMakeLists.txt           |   9 +-
 paddle/fluid/platform/init.cc                  |   4 +-
 paddle/fluid/pybind/const_value.cc             |  16 ++
 python/paddle/fluid/optimizer.py               |  12 +-
 python/paddle/fluid/parallel_executor.py       |  12 +-
 .../fluid/tests/unittests/CMakeLists.txt       |   4 +-
 18 files changed, 369 insertions(+), 253 deletions(-)
 create mode 100644 paddle/fluid/framework/details/dgc_const_values.h
 create mode 100644 paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
 create mode 100644 paddle/fluid/framework/details/sparse_all_reduce_op_handle.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 02752de762..b9d53cb50f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,6 +77,7 @@ option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF)
 option(WITH_HIGH_LEVEL_API_TEST "Test fluid python high-level api interface" OFF)
 option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
 option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON)
+option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ON)
 
 # PY_VERSION
 if(NOT PY_VERSION)
@@ -196,9 +197,14 @@ if(WITH_GPU)
   include(anakin_subgraph)
 endif()
 
-if(WITH_GPU AND NOT WIN32)
+if(WIN32 OR APPLE OR NOT WITH_GPU OR ON_INFER)
+  set(WITH_DGC OFF)
+endif()
+
+if(WITH_DGC)
   message(STATUS "add dgc lib.")
   include(external/dgc)
+  add_definitions(-DPADDLE_WITH_DGC)
 endif()
 
 if(WITH_MKL OR WITH_MKLML)

diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index b7c32f80db..a7dce4dfdb 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -131,15 +131,6 @@ elseif (NOT CBLAS_FOUND OR WIN32)
         )
 endif ()
 
-if (WITH_GPU AND NOT WIN32)
-    set(dgc_dir "${FLUID_INSTALL_DIR}/third_party/install/dgc")
-    copy(dgc_lib
-        SRCS ${DGC_INSTALL_DIR}/lib ${DGC_INSTALL_DIR}/include
-        DSTS ${dgc_dir} ${dgc_dir}
-        DEPS dgc)
-endif()
-
-
 if (WITH_MKLDNN)
     set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
     copy(mkldnn_lib
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 2c1f3ae638..51231b981b 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -24,15 +24,19 @@ if(WITH_DISTRIBUTE)
   endif()
 endif()
 
+set(all_reduce_deps all_reduce_op_handle)
 if(WITH_GPU)
-  set(dgc_deps "")
-  if(NOT WIN32)
-    set(dgc_deps dgc)
-  endif()
   nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-            dynload_cuda variable_visitor ${dgc_deps})
+            dynload_cuda variable_visitor)
   nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
             dynload_cuda variable_visitor)
+
+  if(WITH_DGC)
+    nv_library(sparse_all_reduce_op_handle SRCS sparse_all_reduce_op_handle.cc DEPS op_handle_base scope
+              lod_tensor ddim memory dynload_cuda variable_visitor dgc all_reduce_op_handle)
+    set(all_reduce_deps sparse_all_reduce_op_handle)
+  endif()
+
   if(WITH_DISTRIBUTE)
     nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
               ddim dynload_cuda selected_rows_functor sendrecvop_rpc)
@@ -80,7 +84,7 @@ cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
 cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
 
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
-           scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
+           scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle ${all_reduce_deps} reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
 
 cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 61276efede..c9f06c64e4 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -17,11 +17,6 @@
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/framework/operator.h"
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "dgc/dgc.h"
-#endif
-
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -40,23 +35,16 @@ namespace details {
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places,
-                                     const platform::NCCLContextMap *ctxs,
-                                     bool is_encoded, int nranks)
+                                     const platform::NCCLContextMap *ctxs)
     : OpHandleBase(node),
       local_scopes_(local_scopes),
       places_(places),
-      nccl_ctxs_(ctxs),
-      is_encoded_(is_encoded),
-      nranks_(nranks) {
+      nccl_ctxs_(ctxs) {
   if (nccl_ctxs_) {
     for (auto &p : places_) {
       this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
     }
   }
-  // TODO(gongwb) :polish them!
-  if (is_encoded) {
-    VLOG(1) << "Use dgc allreduce mode";
-  }
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
@@ -66,92 +54,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-void AllReduceOpHandle::RunImplEncoded() {
-  platform::RecordEvent record_event(Name());
-
-  WaitInputVarGenerated();
-
-  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
-  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), places_.size(),
-      "The NoDummyInputSize should be equal to the number of places.");
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
-
-  std::vector<const LoDTensor *> ins;
-  std::vector<LoDTensor *> outs;
-  int k = -1;
-  for (size_t i = 0; i < local_scopes_.size(); ++i) {
-    auto &local_scope =
-        local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
-    auto original_name =
-        paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
-    auto encode_var_name = original_name + g_dgc_encoded;
-    auto *in_var = local_scope->FindVar(encode_var_name);
-    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
-    auto &in = in_var->Get<LoDTensor>();
-    ins.emplace_back(&in);
-
-    auto *out = local_scope->FindVar(out_var_handles[i]->name())
-                    ->GetMutable<LoDTensor>();
-    outs.emplace_back(out);
-
-    if (k < 0) {
-      k = GetKValue(in_var_handles[i]->name());
-    }
-  }
-
-  PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place()));
-  PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place()));
-  PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
-
-  int dtype = -1;
-  size_t in_numel = 0;
-  size_t out_numel = 0;
-  PADDLE_ENFORCE(nranks_ > 1);
-  std::vector<std::function<void()>> all_reduce_calls;
-
-  for (size_t i = 0; i < local_scopes_.size(); ++i) {
-    auto &place = places_[i];
-    auto &in = *ins[i];
-    void *in_tensor_buf = const_cast<void *>(in.data<void>());
-
-    auto &out = *outs[i];
-    float *out_tensor_buf = out.data<float>();
-
-    dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
-    in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
-    PADDLE_ENFORCE(in_numel % 2 == 0);
-    PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k));
-    out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
-
-    int dev_id = boost::get<platform::CUDAPlace>(place).device;
-    auto &nccl_ctx = nccl_ctxs_->at(dev_id);
-    auto stream = nccl_ctx.stream();
-    auto comm = nccl_ctx.comm_;
-
-    auto &allocator =
-        platform::DeviceTemporaryAllocator::Instance().Get(place, stream);
-    int encode_size = 2 * k * sizeof(int);
-    // dgc use ncclAllGather to get all the encoded data
-    // so the buffer need nranks.
-    int buf_size = nranks_ * encode_size;
-    auto tmp_ious_data = allocator.Allocate(buf_size);
-    void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr());
-
-    VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel
-             << ", nranks:" << nranks_ << ", gather_buf size:" << buf_size
-             << ", k:" << k << ", place:" << place << ", dtype:" << dtype;
-
-    all_reduce_calls.emplace_back([=] {
-      PADDLE_ENFORCE(paddle::communication::dgc::sparseAllGReduce(
-          in_tensor_buf, gather_buff, k, out_tensor_buf, out_numel, comm,
-          stream));
-    });
-  }
-
+void AllReduceOpHandle::RunAllReduceFuncs(
+    const std::vector<std::function<void()>> &all_reduce_calls) {
   this->RunAndRecordEvent([&] {
     if (all_reduce_calls.size() == 1UL) {
       // Do not use NCCLGroup when manage NCCL by per thread per device
@@ -182,68 +86,9 @@ void AllReduceOpHandle::RunImplEncoded() {
     }
   }
 }
-
-int AllReduceOpHandle::GetKValue(const std::string &grad_name) {
-  auto original_name = paddle::framework::GradOriginalVarName(grad_name);
-  auto var_name = original_name + g_dgc_k;
-  PADDLE_ENFORCE(local_scopes_.size() > 0);
-
-  auto *scope = local_scopes_[0];
-  auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
-  auto var = local_scope->FindVar(var_name);
-  PADDLE_ENFORCE_NOT_NULL(var);
-  auto tensor = var->Get<LoDTensor>().data<float>();
-  return *tensor;
-}
-#endif
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-bool AllReduceOpHandle::IsEncoded() {
-  if (!is_encoded_) {
-    return false;
-  }
-  auto counter_name = g_dgc_counter_name;
-  auto step_name = g_dgc_rampup_begin_step;
-  PADDLE_ENFORCE(local_scopes_.size() > 0);
-
-  auto *scope = local_scopes_[0];
-  auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
-  auto count_var = local_scope->FindVar(counter_name);
-  auto step_var = local_scope->FindVar(step_name);
-  if (count_var == nullptr || step_var == nullptr) {
-    PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name,
-                 step_var);
-  }
-
-  float count = *count_var->Get<LoDTensor>().data<float>();
-  float step = *step_var->Get<LoDTensor>().data<float>();
-  if (static_cast<int>(count) < static_cast<int>(step)) {
-    VLOG(10) << "in all_reduce currentstep:" << count
-             << " < rampup_begin_step:" << step
-             << " so not use sparse all reduce";
-    return false;
-  }
-
-  return true;
-}
-#else
-bool AllReduceOpHandle::IsEncoded() { return false; }
 #endif
 
 void AllReduceOpHandle::RunImpl() {
-  if (!IsEncoded()) {
-    RunImplNormal();
-    return;
-  }
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  RunImplEncoded();
-#else
-  PADDLE_THROW("Not compiled with CUDA");
-#endif
-}
-
-void AllReduceOpHandle::RunImplNormal() {
   platform::RecordEvent record_event(Name());
 
   WaitInputVarGenerated();
@@ -304,27 +149,7 @@ void AllReduceOpHandle::RunImplNormal() {
           comm, stream));
     });
   }
-  this->RunAndRecordEvent([&] {
-    if (all_reduce_calls.size() == 1UL) {
-      // Do not use NCCLGroup when manage NCCL by per thread per device
-      all_reduce_calls[0]();
-    } else {
-      platform::NCCLGroupGuard guard;
-      for (auto &call : all_reduce_calls) {
-        call();
-      }
-    }
-  });
-
-  if (FLAGS_sync_nccl_allreduce) {
-    for (auto &p : places_) {
-      int dev_id = boost::get<platform::CUDAPlace>(p).device;
-      auto &nccl_ctx = nccl_ctxs_->at(dev_id);
-      auto stream = nccl_ctx.stream();
-      cudaStreamSynchronize(stream);
-    }
-  }
-
+  RunAllReduceFuncs(all_reduce_calls);
 #else
   PADDLE_THROW("Not compiled with CUDA");
 #endif

diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h
index ca75186f6c..3effd0a851 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++
b/paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -28,19 +28,12 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-constexpr char g_dgc_counter_name[] = "__g_dgc_counter__";
-constexpr char g_dgc_rampup_begin_step[] = "__g_rampup_begin_step__";
-constexpr char g_dgc_encoded[] = "__dgc_encoded__";
-constexpr char g_dgc_k[] = "__dgc_k__";
-#endif
-
-struct AllReduceOpHandle : public OpHandleBase {
+class AllReduceOpHandle : public OpHandleBase {
+ public:
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
-                    const platform::NCCLContextMap *ctxs,
-                    bool is_encoded = false, int nranks = -1);
+                    const platform::NCCLContextMap *ctxs);
 #else
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places);
@@ -54,18 +47,13 @@ struct AllReduceOpHandle : public OpHandleBase {
  protected:
   void RunImpl() override;
 
-  private:
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  void RunImplEncoded();
+  void RunAllReduceFuncs(
+      const std::vector<std::function<void()>> &all_reduce_calls);
   const platform::NCCLContextMap *nccl_ctxs_;
-  bool is_encoded_{false};
-  int nranks_{-1};
-  int GetKValue(const std::string &grad_name);
 #endif
-  void RunImplNormal();
-  bool IsEncoded();
 };
 
 }  // namespace details

diff --git a/paddle/fluid/framework/details/dgc_const_values.h b/paddle/fluid/framework/details/dgc_const_values.h
new file mode 100644
index 0000000000..fbe50dc911
--- /dev/null
+++ b/paddle/fluid/framework/details/dgc_const_values.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
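The suffixes below are appended to parameter names to locate per-parameter DGC state; the optimizer.py hunk near the end of this patch builds exactly these names through the new pybind wrappers. A minimal sketch of the naming scheme (the parameter name is hypothetical, not from the patch):

    # Illustrative only; "fc_0.w_0" is a made-up parameter name.
    param = "fc_0.w_0"
    u_name = param + "__dgc_u__"              # local accumulation state (u)
    v_name = param + "__dgc_v__"              # local accumulation state (v)
    k_name = param + "__dgc_k__"              # top-k count, stored in a tensor
    encoded_name = param + "__dgc_encoded__"  # packed (index, value) pairs
    print(u_name, v_name, k_name, encoded_name)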
+
+#pragma once
+
+#include <string>
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+constexpr char g_dgc_counter_name[] = "__g_dgc_counter__";
+constexpr char g_dgc_rampup_begin_step[] = "__g_rampup_begin_step__";
+constexpr char g_dgc_u[] = "__dgc_u__";
+constexpr char g_dgc_v[] = "__dgc_v__";
+constexpr char g_dgc_k[] = "__dgc_k__";
+constexpr char g_dgc_encoded[] = "__dgc_encoded__";
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle

diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index f213e07b55..e9aab179d2 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -34,6 +34,10 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
+#if defined(PADDLE_WITH_DGC)
+#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
+#endif
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -438,12 +442,22 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
   auto append_allreduce_op = [&](
       const std::vector<Scope *> &scopes,
      const std::vector<platform::Place> &places) -> OpHandleBase * {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_DGC)
+    if (is_encoded) {
+      result->Get<GraphOps>(kGraphOps).emplace_back(new SparseAllReduceOpHandle(
+          result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+          scopes, places, nccl_ctxs_, is_encoded,
+          static_cast<int>(strategy_.trainers_endpoints_.size()) *
+              places_.size()));
+    } else {
+      result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
+          result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+          scopes, places, nccl_ctxs_));
+    }
+#elif defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
         result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
-        scopes, places, nccl_ctxs_, is_encoded,
-        static_cast<int>(strategy_.trainers_endpoints_.size()) *
-            places_.size()));
+        scopes, places, nccl_ctxs_));
 #else
     result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
         result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
         scopes, places));
 #endif
@@ -561,7 +575,11 @@ void AllReduceSSAGraphBuilder::InsertCollectiveOp(
     CreateReduceOp(result, g_name, 0);
     CreateBroadcastOp(result, g_name, 0);
   } else {
+#if defined(PADDLE_WITH_DGC)
+    CreateAllReduceOp(result, g_name, IsEncoded(p_name));
+#else
     CreateAllReduceOp(result, g_name);
+#endif
   }
 }
 
@@ -965,8 +983,9 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
   return op_dev_id;
 }
 
-bool DistSSAGraphBuilder::IsEncoded(const std::string &p_name) const {
-  auto u_name = p_name + "__dgc_u__";
+#if defined(PADDLE_WITH_DGC)
+bool AllReduceSSAGraphBuilder::IsEncoded(const std::string &p_name) const {
+  auto u_name = p_name + g_dgc_u;
   auto it = all_vars_.find(u_name);
   if (it == all_vars_.end()) {
     VLOG(10) << "can't find u_name, so it's not encoded:" << u_name;
@@ -975,6 +994,11 @@ bool DistSSAGraphBuilder::IsEncoded(const std::string &p_name) const {
 
   return true;
 }
+#else
+bool AllReduceSSAGraphBuilder::IsEncoded(const std::string &p_name) const {
+  return false;
+}
+#endif
 
 void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
                                              const std::string &p_name,
@@ -992,11 +1016,7 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
       CreateReduceOp(result, g_name, 0);
       CreateBroadcastOp(result, g_name, 0);
     } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
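The nranks handed to SparseAllReduceOpHandle above is the total number of participating devices: trainer endpoints times local places. A minimal sketch of that arithmetic, with both counts assumed for illustration:

    # Illustrative only; these counts are assumptions, not values from the patch.
    num_trainer_endpoints = 2   # len(strategy_.trainers_endpoints_), assumed
    num_local_places = 4        # len(places_), e.g. four GPUs per trainer, assumed
    nranks = num_trainer_endpoints * num_local_places
    assert nranks == 8          # every trainer contributes every local device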
-      CreateAllReduceOp(result, g_name, IsEncoded(p_name));
-#else
-      PADDLE_ENFORCE(false, "Compiled withoud cuda!");
-#endif
+      CreateAllReduceOp(result, g_name);
     }
       break;
     default:

diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index 7cc68dd2d5..0c4b3b0b8c 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -113,6 +113,8 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
                        const std::string &g_name) const;
 
   virtual void InsertPostprocessOps(ir::Graph *result) const {}
+
+  bool IsEncoded(const std::string &p_name) const;
 };
 
 class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
@@ -203,8 +205,6 @@ class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
   mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
   mutable bool need_broadcast_var_{false};
-
-  bool IsEncoded(const std::string &p_name) const;
 };
 
 std::unordered_set<std::string> &MultiDevSSAGraphBuilder();
diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
new file mode 100644
index 0000000000..1bdd33fd53
--- /dev/null
+++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
@@ -0,0 +1,188 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
+#include <algorithm>
+#include "dgc/dgc.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DECLARE_bool(sync_nccl_allreduce);
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+SparseAllReduceOpHandle::SparseAllReduceOpHandle(
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places,
+    const platform::NCCLContextMap *ctxs, bool is_encoded, int nranks)
+    : AllReduceOpHandle(node, local_scopes, places, ctxs),
+      is_encoded_(is_encoded),
+      nranks_(nranks) {
+  // TODO(gongwb): polish this.
+  if (is_encoded) {
+    VLOG(1) << "Use dgc allreduce mode";
+  }
+}
+
+void SparseAllReduceOpHandle::RunImplEncoded() {
+  platform::RecordEvent record_event(Name());
+
+  WaitInputVarGenerated();
+
+  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), places_.size(),
+      "The NoDummyInputSize should be equal to the number of places.");
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), out_var_handles.size(),
+      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+
+  std::vector<const LoDTensor *> ins;
+  std::vector<LoDTensor *> outs;
+  int k = -1;
+  for (size_t i = 0; i < local_scopes_.size(); ++i) {
+    auto &local_scope =
+        local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
+    auto original_name =
+        paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
+    auto encode_var_name = original_name + g_dgc_encoded;
+    auto *in_var = local_scope->FindVar(encode_var_name);
+    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
+    auto &in = in_var->Get<LoDTensor>();
+    ins.emplace_back(&in);
+
+    auto *out = local_scope->FindVar(out_var_handles[i]->name())
+                    ->GetMutable<LoDTensor>();
+    outs.emplace_back(out);
+
+    if (k < 0) {
+      k = GetKValue(in_var_handles[i]->name());
+    }
+  }
+
+  PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place()));
+  PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place()));
+  PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
+
+  int dtype = -1;
+  size_t in_numel = 0;
+  size_t out_numel = 0;
+  PADDLE_ENFORCE(nranks_ > 1);
+  std::vector<std::function<void()>> all_reduce_calls;
+
+  for (size_t i = 0; i < local_scopes_.size(); ++i) {
+    auto &place = places_[i];
+    auto &in = *ins[i];
+    void *in_tensor_buf = const_cast<void *>(in.data<void>());
+
+    auto &out = *outs[i];
+    float *out_tensor_buf = out.data<float>();
+
+    dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
+    in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
+    // The encoded input packs k (index, value) pairs, hence 2 * k elements.
+    PADDLE_ENFORCE(in_numel % 2 == 0);
+    PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k));
+    out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
+
+    int dev_id = boost::get<platform::CUDAPlace>(place).device;
+    auto &nccl_ctx = nccl_ctxs_->at(dev_id);
+    auto stream = nccl_ctx.stream();
+    auto comm = nccl_ctx.comm_;
+
+    auto &allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(place, stream);
+    int encode_size = 2 * k * sizeof(int);
+    // DGC uses ncclAllGather to collect the encoded data from every rank,
+    // so the buffer needs nranks slots.
+    int buf_size = nranks_ * encode_size;
+    auto tmp_ious_data = allocator.Allocate(buf_size);
+    void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr());
+
+    VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel
+             << ", nranks:" << nranks_ << ", gather_buf size:" << buf_size
+             << ", k:" << k << ", place:" << place << ", dtype:" << dtype;
+
+    all_reduce_calls.emplace_back([=] {
+      PADDLE_ENFORCE(paddle::communication::dgc::sparseAllGReduce(
+          in_tensor_buf, gather_buff, k, out_tensor_buf, out_numel, comm,
+          stream));
+    });
+  }
+
+  RunAllReduceFuncs(all_reduce_calls);
+}
+
+int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {
+  auto original_name = paddle::framework::GradOriginalVarName(grad_name);
+  auto var_name = original_name + g_dgc_k;
+  PADDLE_ENFORCE(local_scopes_.size() > 0);
+
+  auto *scope = local_scopes_[0];
+  auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  auto var = local_scope->FindVar(var_name);
+  PADDLE_ENFORCE_NOT_NULL(var);
+  auto tensor = var->Get<LoDTensor>().data<float>();
+  return *tensor;
+}
+
+bool SparseAllReduceOpHandle::IsEncoded() {
+  if (!is_encoded_) {
+    return false;
+  }
+  auto counter_name = g_dgc_counter_name;
+  auto step_name = g_dgc_rampup_begin_step;
+  PADDLE_ENFORCE(local_scopes_.size() > 0);
+
+  auto *scope = local_scopes_[0];
+  auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  auto count_var = local_scope->FindVar(counter_name);
+  auto step_var = local_scope->FindVar(step_name);
+  if (count_var == nullptr || step_var == nullptr) {
+    PADDLE_THROW("cannot find counter var:%s or step var:%s", counter_name,
+                 step_name);
+  }
+
+  float count = *count_var->Get<LoDTensor>().data<float>();
+  float step = *step_var->Get<LoDTensor>().data<float>();
+  if (static_cast<int>(count) < static_cast<int>(step)) {
+    VLOG(10) << "in all_reduce current step:" << count
+             << " < rampup_begin_step:" << step
+             << " so not use sparse all reduce";
+    return false;
+  }
+
+  return true;
+}
+
+void SparseAllReduceOpHandle::RunImpl() {
+  if (!IsEncoded()) {
+    AllReduceOpHandle::RunImpl();
+    return;
+  }
+
+  RunImplEncoded();
+}
+
+std::string SparseAllReduceOpHandle::Name() const {
+  return "sparse_all_reduce";
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
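A note on the gather-buffer sizing in RunImplEncoded above: each rank contributes 2 * k ints (k indices plus k values), and ncclAllGather needs one such block per rank. A minimal sketch of the arithmetic, with k and nranks assumed:

    # Illustrative only; k and nranks are assumptions, not values from the patch.
    k = 16384                        # top-k elements kept per gradient, assumed
    nranks = 8                       # ranks joining the allgather, assumed
    encode_size = 2 * k * 4          # 2 * k * sizeof(int) bytes per rank
    buf_size = nranks * encode_size
    print(buf_size)                  # 1048576 bytes reserved for gather_buff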
diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
new file mode 100644
index 0000000000..ed6be65a2c
--- /dev/null
+++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/dgc_const_values.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class SparseAllReduceOpHandle : public AllReduceOpHandle {
+ public:
+  SparseAllReduceOpHandle(ir::Node *node,
+                          const std::vector<Scope *> &local_scopes,
+                          const std::vector<platform::Place> &places,
+                          const platform::NCCLContextMap *ctxs,
+                          bool is_encoded = false, int nranks = -1);
+  std::string Name() const override;
+
+ protected:
+  void RunImpl() override;
+  int GetKValue(const std::string &grad_name);
+  bool IsEncoded();
+  void RunImplEncoded();
+
+ private:
+  bool is_encoded_{false};
+  int nranks_{-1};
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle

diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index fb433ff2a2..5e0be5d445 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -49,11 +49,6 @@ set(SHARED_INFERENCE_SRCS
     ${mkldnn_quantizer_src}
     ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
 
-# FIXME(gongwb): hidden libdgc.a
-if(WITH_GPU AND NOT WIN32)
-    set(fluid_modules ${fluid_modules} dgc)
-endif()
-
 if(WIN32)
     sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
                 analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index e52e83673f..6e8d6f459c 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -72,7 +72,7 @@ endif()
 
 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 
-if (WITH_GPU AND NOT WIN32)
+if (WITH_DGC)
     op_library(dgc_op DEPS dgc)
     file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(dgc);\n")
     set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dgc)

diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index a2669ee211..5de00db55a 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -45,13 +45,12 @@ cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce)
 cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper)
 
 set(dgc_deps "")
+IF(WITH_DGC)
+    set(dgc_deps dgc)
+ENDIF()
+
 IF(WITH_GPU)
     set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
-    if(NOT WIN32)
-        set(dgc_deps dgc)
-    endif()
-ELSE()
-    set(dgc_deps)
 ENDIF()
 
 IF(WITH_MKLDNN)

diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 407d1b1299..bb22628cdf 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -31,7 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/piece.h"
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_DGC)
 #include "dgc/dgc.h"
 #endif
 
@@ -211,7 +211,7 @@ void InitGLOG(const std::string &prog_name) {
 #endif
 }
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_DGC)
 void InitDGC() {
   std::call_once(dgc_init_flag, []() {
     PADDLE_ENFORCE(paddle::communication::dgc::dynloadNcclLib());
diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc
index f8ded9f94e..71eeaf3b53 100644
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
@@ -17,6 +17,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
 
+#if defined(PADDLE_WITH_DGC)
+#include "paddle/fluid/framework/details/dgc_const_values.h"
+#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
+#endif
+
 namespace paddle {
 namespace pybind {
 
@@ -52,6 +57,17 @@ void BindConstValue(pybind11::module* m) {
   op_proto_and_checker_maker.def(
       "kOpCreationCallstackAttrName",
       framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName);
+#if defined(PADDLE_WITH_DGC)
+  auto dgc = m->def_submodule("dgc");
+  dgc.def("kDGCUName", [] { return framework::details::g_dgc_u; });
+  dgc.def("kDGCVName", [] { return framework::details::g_dgc_v; });
+  dgc.def("kDGCKName", [] { return framework::details::g_dgc_k; });
+  dgc.def("kDGCEncodedName", [] { return framework::details::g_dgc_encoded; });
+  dgc.def("kDGCCounterName",
+          [] { return framework::details::g_dgc_counter_name; });
+  dgc.def("kDGCRampUpBeginStepName",
+          [] { return framework::details::g_dgc_rampup_begin_step; });
+#endif
 }
 
 }  // namespace pybind
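The two Python hunks below consume the constants exposed above instead of hard-coding the suffix strings. For orientation, a minimal end-to-end sketch of the user-facing flow this patch touches, written against the fluid API of this era; the network, hyper-parameters, and trainer counts are assumptions, not part of the patch:

    import paddle.fluid as fluid

    # Assumed toy network; DGC only changes how gradients are exchanged.
    image = fluid.layers.data(name='image', shape=[784], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    fc = fluid.layers.fc(input=image, size=10, act='softmax')
    loss = fluid.layers.mean(fluid.layers.cross_entropy(input=fc, label=label))

    # Steps before rampup_begin_step take the dense allreduce path (see
    # IsEncoded() above); later steps take the sparse, encoded path.
    opt = fluid.optimizer.DGCMomentumOptimizer(
        learning_rate=0.001, momentum=0.9, rampup_begin_step=100)
    opt.minimize(loss)

    # The new asserts below require CUDA, AllReduce mode, and more than one rank.
    build_strategy = fluid.BuildStrategy()
    build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
    pe = fluid.ParallelExecutor(use_cuda=True,
                                loss_name=loss.name,
                                build_strategy=build_strategy,
                                num_trainers=2,  # assumed multi-trainer job
                                trainer_id=0)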
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index c3b7aee2b4..28126b72a4 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -751,14 +751,14 @@ class DGCMomentumOptimizer(MomentumOptimizer):
 
         # step counter
         self._global_step_var = self._add_auto_increment_var(
-            counter_name='__g_dgc_counter__', begin=0)
+            counter_name=core.dgc.kDGCCounterName(), begin=0)
 
         # rampup begin step var for all_reduce_op_handle
         self._rampup_begin_step_var = tensor.create_global_var(
             shape=[1],
             dtype=core.VarDesc.VarType.FP32,
             persistable=True,
-            name='__g_rampup_begin_step__',
+            name=core.dgc.kDGCRampUpBeginStepName(),
             value=self._rampup_begin_step * 1.0,
             force_cpu=True)
 
@@ -774,20 +774,20 @@ class DGCMomentumOptimizer(MomentumOptimizer):
             shape=param_var.shape,
             dtype=param_var.dtype,
             persistable=True,
-            name=param_var.name + "__dgc_u__",
+            name=param_var.name + core.dgc.kDGCUName(),
             value=0.0)
 
         v_var = tensor.create_global_var(
             shape=param_var.shape,
             dtype=param_var.dtype,
             persistable=True,
-            name=param_var.name + "__dgc_v__",
+            name=param_var.name + core.dgc.kDGCVName(),
             value=0.0)
 
         k_var = tensor.create_global_var(
             shape=[1],
             dtype=param_var.dtype,
             persistable=True,
-            name=param_var.name + "__dgc_k__",
+            name=param_var.name + core.dgc.kDGCKName(),
             value=0.0,
             force_cpu=True)
 
@@ -795,7 +795,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
             shape=[1],
             dtype=param_var.dtype,
             persistable=True,
-            name=param_var.name + "__dgc_encoded__",
+            name=param_var.name + core.dgc.kDGCEncodedName(),
             value=0.0,
             force_cpu=False)

diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 092cd5aea7..6bf91887dc 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -104,11 +104,13 @@ class ParallelExecutor(object):
         self._scope = scope if scope is not None else executor.global_scope()
 
         if main_program is not None and main_program._enable_dgc:
-            assert num_trainers > 1
-            assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce
+            assert num_trainers > 1, "dgc is not useful when num_trainers <= 1"
+            assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce, \
+                "dgc is only used in allreduce mode"
+
             assert num_trainers * len(
                 self._places) > 1, "dgc is not useful for single card training"
-            assert use_cuda
+            assert use_cuda, "dgc only supports CUDA"
 
         main_program = main_program if main_program is not None \
             else framework.default_main_program()
@@ -125,10 +127,6 @@ class ParallelExecutor(object):
             share_vars_from=share_vars_from._compiled_program
             if share_vars_from else None)
 
-        # FIXME(gongwb): I will move dgc from dist mode to allreduce mode in next pr.
-        if main_program._enable_dgc:
-            self._compiled_program._build_strategy.is_distribution = True
-
         self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
         self._exe = executor.Executor(self._place)
         self._compiled_program._compile(place=self._place, scope=self._scope)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index f95474f561..1d2db81872 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -97,11 +97,13 @@ py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext EN
 if(WITH_DISTRIBUTE)
     py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
     set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
+    if(WITH_DGC)
+        py_test_modules(test_dgc_op MODULES test_dgc_op)
+    endif()
     if(NOT APPLE)
         set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
         set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
         py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
-        py_test_modules(test_dgc_op MODULES test_dgc_op)
         set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
         py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl SERIAL)
         set_tests_properties(test_dist_se_resnext_nccl PROPERTIES TIMEOUT 1000)
--
GitLab