Remove NCCL dependency when the number of GPUs is 1 (#18158)

* Remove NCCL dependency when the number of GPUs is 1 test=develop

Remove NCCL dependency when the number of GPUs is 1 (#18158)
* Remove NCCL dependency when the number of GPUs is 1 test=develop
4978db2c · chengduo · GitHub · 25ab23be · 4978db2c · 4978db2c
4 changed files
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -369,8 +369,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
               "Execution which can get better performance,"
            << "you can force it off by env FLAGS_enable_parallel_graph=0";

-  if (member_->use_cuda_) {
-// Bcast Parameters to all GPUs
+  if (member_->use_cuda_ && member_->nranks_ > 1) {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
    member_->InitOrGetNCCLCommunicator(scope, build_strategy);

@@ -405,10 +404,11 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
    }
    return false;
  };
-
+  // Bcast Parameters to all GPUs
  if (need_broadcast()) {
    BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
  }
+
  // Startup Program has been run. All local scopes has correct parameters.

  // Step 2. Convert main_program to SSA form and dependency graph. Also, insert

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -316,7 +316,9 @@ CUDADeviceContext::~CUDADeviceContext() {
  eigen_device_.reset();
  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
 #if !defined(_WIN32)
-  PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_));
+  if (nccl_comm_) {
+    PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_));
+  }
 #endif
 }


--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -223,5 +223,5 @@ if(WITH_DISTRIBUTE)
 endif()

 set_tests_properties(test_recordio_reader test_parallel_executor_test_while_train test_parallel_executor_mnist
-        test_parallel_executor_seresnext test_parallel_executor_crf
+        test_parallel_executor_seresnext test_parallel_executor_crf test_sync_batch_norm_op
        PROPERTIES LABELS "RUN_TYPE=DIST")
--- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
@@ -98,6 +98,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):

        #####################################################################
        # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
+        assert core.get_cuda_device_count() > 1
        main, startup, outs = self.build_program(place, layout, seed, True,
                                                 only_forward)
        exe = fluid.Executor(place)