From 7bc4b095008058fdde05e3e9337e31845f1ce9b5 Mon Sep 17 00:00:00 2001
From: Wilber <jiweibo@baidu.com>
Date: Wed, 5 Feb 2020 14:01:10 +0800
Subject: [PATCH] add WITH_NCCL option for cmake. (#22384)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cmake选项中添加了WITH_NCCL，显示指定是否编译NCCL的部分代码，WITH_NCCL默认打开，但如果WITH_GPU为OFF，则关闭WITH_NCCL

添加了PADDLE_WITH_NCCL定义

单机单卡能够关闭NCCL编译，多卡的话需要默认打开NCCL，如果关闭NCCL，则只能使用单卡

Co-authored-by: 石晓伟 <39303645+Shixiaowei02@users.noreply.github.com>
---
 CMakeLists.txt                                | 22 +++++++++++++++++++
 .../framework/details/all_reduce_op_handle.cc |  6 ++---
 .../framework/details/all_reduce_op_handle.h  |  8 +++----
 .../framework/details/broadcast_op_handle.cc  |  2 +-
 .../framework/details/broadcast_op_handle.h   |  6 ++---
 .../details/broadcast_op_handle_test.h        | 12 +++++-----
 .../details/fused_all_reduce_op_handle.cc     |  2 +-
 .../details/fused_all_reduce_op_handle.h      |  4 ++--
 .../details/fused_broadcast_op_handle.h       |  4 ++--
 .../details/fused_broadcast_op_handle_test.cc |  4 ++--
 .../framework/details/reduce_op_handle.cc     |  2 +-
 .../framework/details/reduce_op_handle.h      |  4 ++--
 .../fluid/framework/device_worker_factory.cc  |  2 +-
 paddle/fluid/framework/fleet/CMakeLists.txt   |  4 +++-
 .../all_reduce_deps_pass.cc                   |  2 +-
 .../fuse_all_reduce_op_pass.cc                | 14 ++++++------
 .../multi_devices_graph_pass.cc               | 16 +++++++-------
 .../multi_devices_graph_pass.h                |  2 +-
 paddle/fluid/framework/parallel_executor.cc   |  6 ++---
 paddle/fluid/framework/parallel_executor.h    |  2 +-
 paddle/fluid/framework/pipeline_trainer.cc    |  2 +-
 paddle/fluid/framework/section_worker.cc      |  2 +-
 paddle/fluid/framework/trainer_factory.cc     |  2 +-
 paddle/fluid/imperative/CMakeLists.txt        |  4 +++-
 paddle/fluid/imperative/nccl_context.cc       |  2 +-
 paddle/fluid/imperative/nccl_context.h        |  4 ++--
 paddle/fluid/imperative/tests/CMakeLists.txt  |  4 +++-
 paddle/fluid/operators/CMakeLists.txt         |  6 +++--
 .../fluid/operators/collective/CMakeLists.txt |  2 +-
 paddle/fluid/operators/data_norm_op.cu        |  4 ++--
 .../operators/distributed_ops/CMakeLists.txt  |  2 +-
 paddle/fluid/operators/nccl/CMakeLists.txt    |  4 ++++
 paddle/fluid/platform/device_context.cc       |  2 +-
 paddle/fluid/platform/device_context.h        |  6 ++---
 paddle/fluid/platform/dynload/CMakeLists.txt  |  5 ++++-
 paddle/fluid/platform/enforce_test.cc         |  2 +-
 paddle/fluid/pybind/CMakeLists.txt            | 20 ++++++++++++-----
 paddle/fluid/pybind/imperative.cc             |  2 +-
 paddle/fluid/pybind/pybind.cc                 |  2 +-
 .../fluid/tests/unittests/CMakeLists.txt      |  3 +++
 40 files changed, 127 insertions(+), 77 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c7a613c8e7..87fc787b74 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,6 +89,7 @@ option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VER
 option(WITH_DGC   "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE})
 option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
 option(WITH_LITE   "Compile Paddle Fluid with Lite Engine" OFF)
+option(WITH_NCCL   "Compile PaddlePaddle with NCCL support"             ON)
 
 # PY_VERSION
 if(NOT PY_VERSION)
@@ -121,6 +122,27 @@ if(WIN32)
         set(WITH_DISTRIBUTE OFF CACHE STRING
             "Disable DISTRIBUTE when compiling for Windows" FORCE)
     endif()
+    if(WITH_NCCL)
+        MESSAGE(WARNING
+            "Disable NCCL when compiling for Windows. Force WITH_NCCL=OFF.")
+        set(WITH_NCCL OFF CACHE STRING
+            "Disable NCCL when compiling for Windows" FORCE)
+    endif()
+endif()
+
+if (NOT WITH_GPU AND WITH_NCCL)
+    MESSAGE(WARNING
+        "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.")
+    set(WITH_NCCL OFF CACHE STRING
+        "Disable NCCL when compiling without GPU" FORCE)
+endif()
+
+if(WITH_NCCL)
+     add_definitions("-DPADDLE_WITH_NCCL")
+else()
+     if(WITH_GPU)
+         MESSAGE(WARNING "If the environment is multi-card, the WITH_NCCL option needs to be turned on, otherwise only a single card can be used.")
+     endif()
 endif()
 
 if(WITH_BRPC_RDMA)
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 6c3b0923ed..3110e579d3 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -28,7 +28,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places,
@@ -121,7 +121,7 @@ void AllReduceOpHandle::AllReduceFunc(
     const std::vector<platform::Place> &places,
     const std::vector<std::string> &out_var_names) {
   if (is_gpu_place(places[0])) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
     ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
     std::vector<std::function<void()>> all_reduce_calls;
@@ -161,7 +161,7 @@ void AllReduceOpHandle::AllReduceFunc(
   VLOG(10) << Name() << " size:" << numel * SizeOfType(dtype);
 }
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 void AllReduceOpHandle::NCCLAllReduceFunc(
     const std::vector<std::function<void()>> &all_reduce_calls) {
   this->RunAndRecordEvent([&] {
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h
index c8ff151a88..36f5d3adfa 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -20,7 +20,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -29,7 +29,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 class AllReduceOpHandle : public NCCLOpHandleBase {
  public:
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
@@ -54,13 +54,13 @@ class AllReduceOpHandle : public OpHandleBase {
 
   std::vector<Scope *> local_scopes_;
 
-#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
+#ifndef PADDLE_WITH_NCCL
   // NCCLOpHandleBase already have these attributes.
   // Will polish it by class inheritance framework.
   std::vector<platform::Place> places_;
 #endif
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   void NCCLAllReduceFunc(
       const std::vector<std::function<void()>> &all_reduce_calls);
 
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 3637625f7e..b3c79b63ce 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -73,7 +73,7 @@ void BroadcastOpHandle::BroadcastOneVar(
       });
     }
   } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     VarHandle *out_handle = nullptr;
     int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
     std::vector<std::function<void()>> broadcast_calls;
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index 45ccbb41e0..588248d645 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -24,7 +24,7 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
@@ -34,7 +34,7 @@ namespace details {
 
 struct BroadcastOpHandle : public OpHandleBase {
  public:
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
                     const platform::NCCLContextMap *nccl_ctxs)
@@ -70,7 +70,7 @@ struct BroadcastOpHandle : public OpHandleBase {
 
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   const platform::NCCLContextMap *nccl_ctxs_;
 #endif
 
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h
index abc3f39e68..6d14c7e4e7 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -44,7 +44,7 @@ struct TestBroadcastOpHandle {
   std::vector<std::unique_ptr<ir::Node>> nodes_;
   std::vector<p::Place> place_list_;
   bool use_gpu_;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
 
@@ -52,7 +52,7 @@ struct TestBroadcastOpHandle {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
       ctxs_[j]->Wait();
     }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     if (nccl_ctxs_) {
       nccl_ctxs_->WaitAll();
     }
@@ -62,7 +62,7 @@ struct TestBroadcastOpHandle {
   void InitCtxOnGpu(bool use_gpu) {
     use_gpu_ = use_gpu;
     if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       int count = p::GetCUDADeviceCount();
       if (count <= 1) {
         LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -86,7 +86,7 @@ struct TestBroadcastOpHandle {
         place_list_.push_back(p);
         ctxs_.emplace_back(new p::CPUDeviceContext(p));
       }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       nccl_ctxs_.reset(nullptr);
 #endif
     }
@@ -107,14 +107,14 @@ struct TestBroadcastOpHandle {
     nodes_.emplace_back(
         ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
     if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_, nccl_ctxs_.get());
 #else
       PADDLE_THROW("CUDA is not support.");
 #endif
     } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_, nccl_ctxs_.get());
 #else
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
index ddd6d10e5e..c67e21d5c4 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -28,7 +28,7 @@ namespace details {
 typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
     GradientAndLoDTensor;
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 FusedAllReduceOpHandle::FusedAllReduceOpHandle(
     ir::Node *node, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
index f6a11c4e50..16c13ac1c0 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
@@ -21,7 +21,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -30,7 +30,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 struct FusedAllReduceOpHandle : public AllReduceOpHandle {
   FusedAllReduceOpHandle(ir::Node *node,
                          const std::vector<Scope *> &local_scopes,
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
index e43d545c9c..8a59d2bfa9 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
@@ -25,7 +25,7 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
@@ -35,7 +35,7 @@ namespace details {
 
 struct FusedBroadcastOpHandle : public BroadcastOpHandle {
  public:
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   FusedBroadcastOpHandle(ir::Node *node,
                          const std::vector<Scope *> local_scopes,
                          const std::vector<platform::Place> &places,
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
index 49404509a6..cbded074f2 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -45,14 +45,14 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
     nodes_.emplace_back(
         ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
     if (use_gpu_) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_ = new FusedBroadcastOpHandle(
           nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
       PADDLE_THROW("CUDA is not supported.");
 #endif
     } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       op_handle_ = new FusedBroadcastOpHandle(
           nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index f524571367..80c0cda505 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -264,7 +264,7 @@ void ReduceOpHandle::RunImpl() {
         }
       });
     } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       auto pre_in = pre_in_var->Get<framework::LoDTensor>();
       VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
       VariableVisitor::GetMutableTensor(out_var).mutable_data(
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index 15064a108e..8b92bdef47 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -24,7 +24,7 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
@@ -62,7 +62,7 @@ struct ReduceOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   const platform::NCCLContextMap *nccl_ctxs_;
   ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                  const std::vector<platform::Place> &places,
diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc
index dc85941f57..e163b601d9 100644
--- a/paddle/fluid/framework/device_worker_factory.cc
+++ b/paddle/fluid/framework/device_worker_factory.cc
@@ -61,7 +61,7 @@ std::shared_ptr<DeviceWorker> DeviceWorkerFactory::CreateDeviceWorker(
 
 REGISTER_DEVICE_WORKER_CLASS(HogwildWorker);
 REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
 }  // namespace framework
diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt
index 1b031ab315..78e9cb10d5 100644
--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@@ -4,7 +4,9 @@ else()
     cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope)
 endif(WITH_PSLIB)
 
-cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
+if(WITH_NCCL)
+    cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
+endif()
 if(WITH_BOX_PS)
     cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor box_ps)
 else()
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
index 4c00599680..8923dfc323 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
@@ -39,7 +39,7 @@ class AllReduceDepsPass : public ir::Pass {
     std::vector<details::OpHandleBase*> all_reduce_op_handles =
         GetSortedAllReduceOps(*graph);
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto use_hierarchical_allreduce =
         Get<bool>(details::kUseHierarchicalAllReduce);
     for (size_t i = 0; i < all_reduce_op_handles.size(); ++i) {
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
index 9b7bd8cc82..86fbbaf772 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
@@ -38,7 +38,7 @@ class FuseAllReduceOpPass : public ir::Pass {
     auto &places = Get<const std::vector<platform::Place>>(details::kPlaces);
     auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto *multi_nccl_ctxs =
         &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
 #endif
@@ -85,7 +85,7 @@ class FuseAllReduceOpPass : public ir::Pass {
       for (auto &p_g : group_p_g) {
         group_all_reduce_ops.emplace_back(all_reduce_ops.at(p_g.second));
       }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       InsertFusedAllReduce(places, local_scopes, group_size,
                            group_all_reduce_ops, multi_nccl_ctxs, &result);
 #else
@@ -134,7 +134,7 @@ class FuseAllReduceOpPass : public ir::Pass {
                             const std::vector<Scope *> &local_scopes,
                             const size_t num_of_all_reduce,
                             const std::vector<ir::Node *> &all_reduce_ops,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
                             const platform::NCCLCommunicator *multi_nccl_ctxs,
 #endif
                             ir::Graph *result) const {
@@ -161,7 +161,7 @@ class FuseAllReduceOpPass : public ir::Pass {
       result->RemoveNode(op_handle.Node());
     }
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
                            local_scopes, multi_nccl_ctxs, result);
 #else
@@ -177,11 +177,11 @@ class FuseAllReduceOpPass : public ir::Pass {
       const size_t num_of_all_reduce,
       const std::vector<platform::Place> &places,
       const std::vector<Scope *> &local_scopes,
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       const platform::NCCLCommunicator *multi_nccl_ctxs,
 #endif
       ir::Graph *result) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto *op_handle = new details::FusedAllReduceOpHandle(
         result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
         local_scopes, places, num_of_all_reduce, multi_nccl_ctxs);
@@ -199,7 +199,7 @@ class FuseAllReduceOpPass : public ir::Pass {
       op_handle->AddOutput(out);
     }
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     if (!multi_nccl_ctxs) {
       SetCommunicationContext(places, op_handle);
     }
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
index 7a04b44910..79b5099355 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
@@ -156,7 +156,7 @@ void MultiDevSSAGraphBuilderBase::Init() const {
   places_ = Get<const std::vector<platform::Place>>(details::kPlaces);
   local_scopes_ = Get<const std::vector<Scope *>>(details::kLocalScopes);
   strategy_ = Get<const details::BuildStrategy>(kStrategy);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   multi_nccl_ctxs_ = &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
   nccl_ctxs_ = nullptr;
   if (multi_nccl_ctxs_) {
@@ -298,7 +298,7 @@ std::vector<ir::Node *> MultiDevSSAGraphBuilderBase::SortOperations(
 
 bool MultiDevSSAGraphBuilderBase::UseGPU() const {
   bool use_gpu = false;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   use_gpu = nccl_ctxs_ != nullptr;
 #endif
   return use_gpu;
@@ -348,7 +348,7 @@ void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result,
 
 void MultiDevSSAGraphBuilderBase::SetCommunicationContext(
     details::OpHandleBase *op_handle, const platform::Place &p) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   if (nccl_ctxs_ == nullptr) {
     op_handle->SetDeviceContext(p,
                                 platform::DeviceContextPool::Instance().Get(p));
@@ -362,7 +362,7 @@ void MultiDevSSAGraphBuilderBase::SetCommunicationContext(
 void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result,
                                                     const std::string &p_name,
                                                     size_t src_dev_id) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   auto *op_handle = new details::BroadcastOpHandle(
       result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_);
@@ -395,7 +395,7 @@ void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result,
 void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp(
     ir::Graph *result,
     const std::vector<std::unordered_set<std::string>> &bcast_varnames) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   auto *op_handle = new details::FusedBroadcastOpHandle(
       result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_);
@@ -451,7 +451,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
   auto append_allreduce_op = [&](
       const std::vector<Scope *> &scopes,
       const std::vector<platform::Place> &places) -> details::OpHandleBase * {
-#if defined(PADDLE_WITH_DGC)
+#if defined(PADDLE_WITH_DGC) && defined(PADDLE_WITH_NCCL)
     if (is_encoded) {
       result->Get<GraphOps>(kGraphOps).emplace_back(
           new details::SparseAllReduceOpHandle(
@@ -464,7 +464,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
               result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
               scopes, places, multi_nccl_ctxs_));
     }
-#elif defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#elif defined(PADDLE_WITH_NCCL)
     result->Get<GraphOps>(kGraphOps).emplace_back(
         new details::AllReduceOpHandle(
             result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
@@ -539,7 +539,7 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOps(
 
 details::VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(
     ir::Graph *result, const std::string &og, size_t dst_dev_id) const {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   result->Get<GraphOps>(kGraphOps).emplace_back(new details::ReduceOpHandle(
       result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_));
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
index ea0455b6a8..719df016d9 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
@@ -94,7 +94,7 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
   void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
                          size_t device_id) const;
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   mutable platform::NCCLContextMap *nccl_ctxs_{nullptr};
   mutable platform::NCCLCommunicator *multi_nccl_ctxs_{nullptr};
 #endif
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 7dc13bae4b..7958d3f99c 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -109,7 +109,7 @@ class ParallelExecutorPrivate {
     }
   }
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) {
     VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_
             << ", num_trainers:" << bst.num_trainers_
@@ -473,7 +473,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
   }
 
   if (member_->use_cuda_ && member_->nranks_ > 1) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_);
 
     // Initialize device context's nccl comm, will be used by normal
@@ -652,7 +652,7 @@ void ParallelExecutor::BCastParamsToDevices(
     }
     auto &dims = main_tensor.dims();
     if (paddle::platform::is_gpu_place(main_tensor.place())) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       std::vector<void *> buffers;
       buffers.reserve(member_->places_.size());
       size_t numel = main_tensor.numel();
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 612a58de07..56aeb21531 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -32,7 +32,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc
index 3a851f0920..88c1d83ff8 100644
--- a/paddle/fluid/framework/pipeline_trainer.cc
+++ b/paddle/fluid/framework/pipeline_trainer.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/trainer.h"
diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index c1a404c1cb..cd5204b490 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "google/protobuf/io/zero_copy_stream_impl.h"
 #include "google/protobuf/message.h"
 #include "google/protobuf/text_format.h"
diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc
index ce0eb5ec30..23cfa11d4c 100644
--- a/paddle/fluid/framework/trainer_factory.cc
+++ b/paddle/fluid/framework/trainer_factory.cc
@@ -63,7 +63,7 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
 
 REGISTER_TRAINER_CLASS(MultiTrainer);
 REGISTER_TRAINER_CLASS(DistMultiTrainer);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 REGISTER_TRAINER_CLASS(PipelineTrainer);
 #endif
 }  // namespace framework
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index 4250ba20ea..3b09c8402b 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -9,7 +9,9 @@ cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer)
 cc_library(engine SRCS engine.cc DEPS layer gradient_accumulator)
 cc_library(imperative_profiler SRCS profiler.cc)
 if(NOT WIN32)
-    cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
+    if(WITH_NCCL)
+        cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
+    endif()
     cc_library(data_loader SRCS data_loader.cc DEPS enforce)
 endif(NOT WIN32)
 
diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc
index ab612b2f15..3c3634b0bd 100644
--- a/paddle/fluid/imperative/nccl_context.cc
+++ b/paddle/fluid/imperative/nccl_context.cc
@@ -16,7 +16,7 @@
 
 namespace paddle {
 namespace imperative {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 void NCCLParallelContext::RecvNCCLID(const std::string &ep,
                                      ncclUniqueId *nccl_id) {
   auto addr = paddle::string::Split(ep, ':');
diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h
index b4f44e5640..ac36ed77b4 100644
--- a/paddle/fluid/imperative/nccl_context.h
+++ b/paddle/fluid/imperative/nccl_context.h
@@ -26,7 +26,7 @@
 
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/device_context.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
 #include "paddle/fluid/platform/place.h"
@@ -57,7 +57,7 @@ class ParallelContext {
   platform::Place place_;
 };
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 class NCCLParallelContext : ParallelContext {
  public:
   explicit NCCLParallelContext(const ParallelStrategy& strategy,
diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt
index a128a774d0..e3c82474e0 100644
--- a/paddle/fluid/imperative/tests/CMakeLists.txt
+++ b/paddle/fluid/imperative/tests/CMakeLists.txt
@@ -1,7 +1,9 @@
 if(WIN32)
     cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS device_context)
 else()
-    cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
+    if (WITH_NCCL)
+        cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
+    endif()
 endif(WIN32)
 
 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 675466bfb8..9ad9227610 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -69,8 +69,10 @@ if (WITH_GPU)
         op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
     endif()
     if (NOT WIN32)
-        op_library(sync_batch_norm_op)
-        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
+        if (WITH_NCCL)
+            op_library(sync_batch_norm_op)
+            file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
+        endif()
     endif()
 else()
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt
index c3b3f60eaf..3f9423ae5c 100644
--- a/paddle/fluid/operators/collective/CMakeLists.txt
+++ b/paddle/fluid/operators/collective/CMakeLists.txt
@@ -30,7 +30,7 @@ endforeach()
 
 register_operators(EXCLUDES c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
 
-if(WITH_GPU AND NOT WIN32)
+if(WITH_NCCL)
     set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
     op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} nccl_common)
 endif()
diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu
index 643e7bced5..9afb5b9908 100644
--- a/paddle/fluid/operators/data_norm_op.cu
+++ b/paddle/fluid/operators/data_norm_op.cu
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/operators/data_norm_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -178,7 +178,7 @@ class DataNormGradKernel<platform::CUDADeviceContext, T>
         d_batch_sum, d_batch_square_sum);
 
     if (need_sync_stats) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
       auto comm = platform::NCCLCommContext::Instance().Get(0, ctx.GetPlace());
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
           reinterpret_cast<const void *>(d_batch_size),
diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt
index d1a9c65c9c..79f14d75d2 100644
--- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt
@@ -30,7 +30,7 @@ endforeach()
 
 register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS})
 
-if(WITH_GPU AND NOT WIN32)
+if(WITH_NCCL)
     set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common)
     op_library(gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS} nccl_common)
 endif()
diff --git a/paddle/fluid/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt
index 9b26e19cc7..4f1fe372f5 100644
--- a/paddle/fluid/operators/nccl/CMakeLists.txt
+++ b/paddle/fluid/operators/nccl/CMakeLists.txt
@@ -1,3 +1,7 @@
+if (NOT WITH_NCCL)
+  return()
+endif()
+
 if(WITH_GPU AND NOT WIN32)
   nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
 endif()
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 62c83e0c4e..322a327967 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -305,7 +305,7 @@ CUDADeviceContext::~CUDADeviceContext() {
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_),
                                 "Failed to destory Cudnn handle");
   }
-#if !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   if (nccl_comm_) {
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
   }
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 50e4538987..d6b8cda94f 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/cuda_helper.h"
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
-#if !defined(__APPLE__) && !defined(_WIN32)
+#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
 #include "paddle/fluid/platform/gpu_info.h"
@@ -144,7 +144,7 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Return cuda stream in the device context. */
   cudaStream_t stream() const;
 
-#if !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   /*! \brief  Return nccl communicators. */
   ncclComm_t nccl_comm() const { return nccl_comm_; }
 
@@ -180,7 +180,7 @@ class CUDADeviceContext : public DeviceContext {
   std::unique_ptr<CublasHandleHolder> cublas_handle_;
   std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
 
-#if !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   // NCCL communicator (single process version) for NCCL collective operations.
   // NCCL collective operations provides fast collectives over multiple GPUs
   // both within and across nodes.
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 81312111ae..04597830a8 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -5,7 +5,10 @@ list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc)
 # There is no macOS version of NCCL.
 # Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux.
 if (NOT APPLE AND NOT WIN32)
-  list(APPEND CUDA_SRCS nccl.cc nvrtc.cc cuda_driver.cc)
+  list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
+  if (WITH_NCCL)
+    list(APPEND CUDA_SRCS nccl.cc)
+  endif()
 endif()
 
 if (TENSORRT_FOUND)
diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc
index 3e0f046cef..f1f24bb920 100644
--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@@ -292,7 +292,7 @@ TEST(enforce, cuda_success) {
   EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS));
   EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED));
   EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE));
-#if !defined(__APPLE__) && !defined(_WIN32)
+#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
   EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
   EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError));
   EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError));
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 171d7752a7..320951debb 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,10 +1,17 @@
-set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper nccl_wrapper prune
+set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
   feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool
   analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
   gloo_wrapper)
 
+if (WITH_NCCL)
+  set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
+endif()
+
 if(NOT WIN32)
-  set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context data_loader)
+  set(PYBIND_DEPS ${PYBIND_DEPS} data_loader)
+  if (WITH_NCCL)
+    set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context)
+  endif()
 endif(NOT WIN32)
 
 if(WITH_PYTHON)
@@ -25,7 +32,6 @@ set(PYBIND_SRCS
   fleet_wrapper_py.cc
   gloo_wrapper_py.cc
   box_helper_py.cc
-  nccl_wrapper_py.cc
   data_set_py.cc
   imperative.cc
   ir.cc
@@ -35,15 +41,19 @@ if (WITH_DISTRIBUTE)
   list(APPEND PYBIND_SRCS communicator_py.cc)
 endif()
 
+if (WITH_NCCL)
+  list(APPEND PYBIND_SRCS nccl_wrapper_py.cc)
+endif()
+
 if(WITH_PYTHON)
   # generate op pybind functions automatically for dygraph.
   set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag)
   list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB})
   list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS})
 
-  if(NOT WIN32)
+  if(WITH_NCCL)
     list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context)
-  endif(NOT WIN32)
+  endif(WITH_NCCL)
 
   add_executable(op_function_generator op_function_generator.cc)
   target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS})
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 84b6ff267e..dc1a104183 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -611,7 +611,7 @@ void BindImperative(py::module *m_ptr) {
                     },
                     [](imperative::ParallelStrategy &self,
                        const std::string &ep) { self.current_endpoint_ = ep; });
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
   py::class_<imperative::NCCLParallelContext> nccl_ctx(m,
                                                        "NCCLParallelContext");
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 90482d36cf..837088b4f8 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -2209,7 +2209,7 @@ All parameter, weight, gradient are variables in Paddle.
   BindFleetWrapper(&m);
   BindGlooWrapper(&m);
   BindBoxHelper(&m);
-#ifndef _WIN32
+#ifdef PADDLE_WITH_NCCL
   BindNCCLWrapper(&m);
 #endif
   BindGraph(&m);
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index e7ddc7a7b8..ed3b2469b4 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -4,6 +4,9 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FL
 set(dist_ENVS http_proxy="" https_proxy="")
 
 file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py")
+if(NOT WITH_NCCL)
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl")
+endif()
 string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}")
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
-- 
GitLab