From 041bc72c55e3f0e9a60a1aee2cfc84ae20f8a342 Mon Sep 17 00:00:00 2001
From: chengduo
Date: Wed, 19 Jun 2019 09:50:21 +0800
Subject: [PATCH] [Cherry Pick] Not init nccl when rank is 1 (#18170)

* remove nccl dep when the number of GPU is 1

test=develop

* use multi card run syncBN

test=release/1.5
---
 paddle/fluid/framework/parallel_executor.cc                  | 6 +++---
 paddle/fluid/platform/device_context.cc                      | 4 +++-
 python/paddle/fluid/tests/unittests/CMakeLists.txt           | 2 +-
 .../paddle/fluid/tests/unittests/test_sync_batch_norm_op.py  | 1 +
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 15f83aa1fe9..6e2168a017a 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -369,8 +369,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
            "Execution which can get better performance,"
         << "you can force it off by env FLAGS_enable_parallel_graph=0";

-  if (member_->use_cuda_) {
-// Bcast Parameters to all GPUs
+  if (member_->use_cuda_ && member_->nranks_ > 1) {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     member_->InitOrGetNCCLCommunicator(scope, build_strategy);

@@ -405,10 +404,11 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     }
     return false;
   };
-
+  // Bcast Parameters to all GPUs
   if (need_broadcast()) {
     BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
   }
+
   // Startup Program has been run. All local scopes has correct parameters.

   // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index caaf0e2c50c..4f048d44685 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -316,7 +316,9 @@ CUDADeviceContext::~CUDADeviceContext() {
   eigen_device_.reset();
   PADDLE_ENFORCE(cudaStreamDestroy(stream_));
 #if !defined(_WIN32)
-  PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_));
+  if (nccl_comm_) {
+    PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_));
+  }
 #endif
 }
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 2051207d326..cdb8a95b9de 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -222,5 +222,5 @@ if(WITH_DISTRIBUTE)
 endif()

 set_tests_properties(test_recordio_reader test_parallel_executor_test_while_train test_parallel_executor_mnist
-        test_parallel_executor_seresnext test_parallel_executor_crf
+        test_parallel_executor_seresnext test_parallel_executor_crf test_sync_batch_norm_op
         PROPERTIES LABELS "RUN_TYPE=DIST")
diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
index f6a658cb1b7..b8a2515e716 100644
--- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
@@ -98,6 +98,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):

         #####################################################################
         # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
+        assert core.get_cuda_device_count() > 1
         main, startup, outs = self.build_program(place, layout, seed, True,
                                                  only_forward)
         exe = fluid.Executor(place)
--
GitLab
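
What the change means for users, with a minimal sketch (assuming the paddle.fluid 1.5 Python API; the toy network and the run_single_gpu helper below are illustrative only, not part of this patch): after this commit, ParallelExecutor calls InitOrGetNCCLCommunicator only when member_->nranks_ > 1, so a data-parallel program compiled for a single visible GPU runs without initializing NCCL at all.

    # Minimal sketch, assuming the paddle.fluid 1.5 API. The toy network and
    # the run_single_gpu helper are illustrative and not taken from the patch.
    import numpy as np
    import paddle.fluid as fluid

    def run_single_gpu():
        main_prog = fluid.Program()
        startup_prog = fluid.Program()
        with fluid.program_guard(main_prog, startup_prog):
            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
            pred = fluid.layers.fc(input=x, size=1)
            loss = fluid.layers.mean(fluid.layers.square_error_cost(pred, y))
            fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

        place = fluid.CUDAPlace(0)
        exe = fluid.Executor(place)
        exe.run(startup_prog)

        # Data-parallel compilation. With a single visible GPU (e.g.
        # CUDA_VISIBLE_DEVICES=0) there is only one rank, so after this patch
        # the NCCL communicator is never created.
        compiled = fluid.CompiledProgram(main_prog).with_data_parallel(
            loss_name=loss.name)

        feed = {
            'x': np.random.rand(4, 13).astype('float32'),
            'y': np.random.rand(4, 1).astype('float32'),
        }
        loss_val, = exe.run(compiled, feed=feed, fetch_list=[loss.name])
        return loss_val

The guard added to ~CUDADeviceContext mirrors this on teardown: in the single-rank case nccl_comm_ stays null, so ncclCommDestroy is skipped instead of failing on an uninitialized communicator.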