Unverified · Commit 041bc72c authored by chengduo, committed by GitHub

[Cherry Pick] Not init nccl when rank is 1 (#18170)

* Remove the NCCL dependency when the number of GPUs is 1
test=develop

* Use multiple cards to run the syncBN test
test=release/1.5
Parent 39002b08
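In substance: with a single rank there is no cross-device communication, so creating an NCCL communicator only drags in an unneeded NCCL dependency. Below is a minimal C++ sketch of the guard pattern, with hypothetical class and member names; it illustrates the idea, it is not the actual Paddle code.

#include <cstdio>

// Sketch of the init-side guard this commit adds. With one rank, NCCL setup
// is skipped entirely, so the communicator handle is never created.
class ExecutorState {
 public:
  void InitCommunicators(int nranks) {
    if (nranks > 1) {
      // The real code calls member_->InitOrGetNCCLCommunicator(...) here.
      comm_initialized_ = true;
      std::printf("init NCCL for %d ranks\n", nranks);
    } else {
      std::printf("single rank: no NCCL needed\n");
    }
  }
  bool comm_initialized_ = false;
};

int main() {
  ExecutorState s;
  s.InitCommunicators(1);  // skips NCCL entirely
  s.InitCommunicators(4);  // would initialize the communicator
  return 0;
}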
@@ -369,8 +369,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
         "Execution which can get better performance,"
      << "you can force it off by env FLAGS_enable_parallel_graph=0";
-  if (member_->use_cuda_) {
-    // Bcast Parameters to all GPUs
+  if (member_->use_cuda_ && member_->nranks_ > 1) {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     member_->InitOrGetNCCLCommunicator(scope, build_strategy);
@@ -405,10 +404,11 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
       }
       return false;
     };
+    // Bcast Parameters to all GPUs
     if (need_broadcast()) {
       BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
     }

   // Startup Program has been run. All local scopes has correct parameters.
   // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
...
@@ -316,7 +316,9 @@ CUDADeviceContext::~CUDADeviceContext() {
   eigen_device_.reset();
   PADDLE_ENFORCE(cudaStreamDestroy(stream_));
 #if !defined(_WIN32)
-  PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_));
+  if (nccl_comm_) {
+    PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_));
+  }
 #endif
 }
...
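Because of the guard above, nccl_comm_ may legitimately still be null when the device context is destroyed, so the destructor now checks before destroying. A hedged sketch of the same pattern follows; it compiles against NCCL's nccl.h, with the PADDLE_ENFORCE error handling omitted for brevity.

#include <nccl.h>  // ncclComm_t, ncclCommDestroy

// Sketch only: a context whose communicator may never have been created.
struct CudaContext {
  ncclComm_t nccl_comm_ = nullptr;  // stays null in single-rank runs
  ~CudaContext() {
    if (nccl_comm_) {  // the fix: destroy only what was actually created
      ncclCommDestroy(nccl_comm_);
    }
  }
};

int main() {
  CudaContext ctx;  // nccl_comm_ is null, so the destructor safely does nothing
  return 0;
}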
@@ -222,5 +222,5 @@ if(WITH_DISTRIBUTE)
 endif()
 set_tests_properties(test_recordio_reader test_parallel_executor_test_while_train test_parallel_executor_mnist
-        test_parallel_executor_seresnext test_parallel_executor_crf
+        test_parallel_executor_seresnext test_parallel_executor_crf test_sync_batch_norm_op
         PROPERTIES LABELS "RUN_TYPE=DIST")
@@ -98,6 +98,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
         #####################################################################
         # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
+        assert core.get_cuda_device_count() > 1
         main, startup, outs = self.build_program(place, layout, seed, True,
                                                  only_forward)
         exe = fluid.Executor(place)
...