From 041bc72c55e3f0e9a60a1aee2cfc84ae20f8a342 Mon Sep 17 00:00:00 2001
From: chengduo
Date: Wed, 19 Jun 2019 09:50:21 +0800
Subject: [PATCH] [Cherry Pick] Not init nccl when rank is 1 (#18170)

* remove nccl dep when the number of GPU is 1

test=develop

* use multi card run syncBN

test=release/1.5
---
 paddle/fluid/framework/parallel_executor.cc                  | 6 +++---
 paddle/fluid/platform/device_context.cc                      | 4 +++-
 python/paddle/fluid/tests/unittests/CMakeLists.txt           | 2 +-
 .../paddle/fluid/tests/unittests/test_sync_batch_norm_op.py  | 1 +
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 15f83aa1fe9..6e2168a017a 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -369,8 +369,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
            "Execution which can get better performance,"
         << "you can force it off by env FLAGS_enable_parallel_graph=0";

-  if (member_->use_cuda_) {
-// Bcast Parameters to all GPUs
+  if (member_->use_cuda_ && member_->nranks_ > 1) {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     member_->InitOrGetNCCLCommunicator(scope, build_strategy);

@@ -405,10 +404,11 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     }
     return false;
   };
-
+  // Bcast Parameters to all GPUs
   if (need_broadcast()) {
     BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
   }
+
   // Startup Program has been run. All local scopes has correct parameters.

   // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index caaf0e2c50c..4f048d44685 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -316,7 +316,9 @@ CUDADeviceContext::~CUDADeviceContext() {
   eigen_device_.reset();
   PADDLE_ENFORCE(cudaStreamDestroy(stream_));
 #if !defined(_WIN32)
-  PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_));
+  if (nccl_comm_) {
+    PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_));
+  }
 #endif
 }
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 2051207d326..cdb8a95b9de 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -222,5 +222,5 @@ if(WITH_DISTRIBUTE)
 endif()

 set_tests_properties(test_recordio_reader test_parallel_executor_test_while_train test_parallel_executor_mnist
-        test_parallel_executor_seresnext test_parallel_executor_crf
+        test_parallel_executor_seresnext test_parallel_executor_crf test_sync_batch_norm_op
         PROPERTIES LABELS "RUN_TYPE=DIST")
diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
index f6a658cb1b7..b8a2515e716 100644
--- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
@@ -98,6 +98,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):

         #####################################################################
         # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
+        assert core.get_cuda_device_count() > 1
         main, startup, outs = self.build_program(place, layout, seed, True,
                                                  only_forward)
         exe = fluid.Executor(place)
--
GitLab
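
What the change means for users, with a minimal sketch (assuming the paddle.fluid 1.5 Python API; the toy network and the run_single_gpu helper below are illustrative only, not part of this patch): after this commit, ParallelExecutor calls InitOrGetNCCLCommunicator only when member_->nranks_ > 1, so a data-parallel program compiled for a single visible GPU runs without initializing NCCL at all.

    # Minimal sketch, assuming the paddle.fluid 1.5 API. The toy network and
    # the run_single_gpu helper are illustrative and not taken from the patch.
    import numpy as np
    import paddle.fluid as fluid

    def run_single_gpu():
        main_prog = fluid.Program()
        startup_prog = fluid.Program()
        with fluid.program_guard(main_prog, startup_prog):
            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
            pred = fluid.layers.fc(input=x, size=1)
            loss = fluid.layers.mean(fluid.layers.square_error_cost(pred, y))
            fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

        place = fluid.CUDAPlace(0)
        exe = fluid.Executor(place)
        exe.run(startup_prog)

        # Data-parallel compilation. With a single visible GPU (e.g.
        # CUDA_VISIBLE_DEVICES=0) there is only one rank, so after this patch
        # the NCCL communicator is never created.
        compiled = fluid.CompiledProgram(main_prog).with_data_parallel(
            loss_name=loss.name)

        feed = {
            'x': np.random.rand(4, 13).astype('float32'),
            'y': np.random.rand(4, 1).astype('float32'),
        }
        loss_val, = exe.run(compiled, feed=feed, fetch_list=[loss.name])
        return loss_val

The guard added to ~CUDADeviceContext mirrors this on teardown: in the single-rank case nccl_comm_ stays null, so ncclCommDestroy is skipped instead of failing on an uninitialized communicator.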