diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 15f83aa1fe92767f8eb96a7d3df808f3879c1856..6e2168a017a56c8541a99c116b251cc34092d48d 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -369,8 +369,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, "Execution which can get better performance," << "you can force it off by env FLAGS_enable_parallel_graph=0"; - if (member_->use_cuda_) { -// Bcast Parameters to all GPUs + if (member_->use_cuda_ && member_->nranks_ > 1) { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) member_->InitOrGetNCCLCommunicator(scope, build_strategy); @@ -405,10 +404,11 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } return false; }; - + // Bcast Parameters to all GPUs if (need_broadcast()) { BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_); } + // Startup Program has been run. All local scopes has correct parameters. // Step 2. Convert main_program to SSA form and dependency graph. 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index caaf0e2c50c3ed3410ea8d886f1eaeea06f19446..4f048d44685a88c3342de48dc6f364c950605be9 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -316,7 +316,9 @@ CUDADeviceContext::~CUDADeviceContext() { eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); #if !defined(_WIN32) - PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_)); + if (nccl_comm_) { + PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_)); + } #endif } diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 2051207d326a595ea0503510276601c6862fdc07..cdb8a95b9de89c2d5e3d0d3069742df61ffd60fd 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -222,5 +222,5 @@ if(WITH_DISTRIBUTE) endif() set_tests_properties(test_recordio_reader test_parallel_executor_test_while_train test_parallel_executor_mnist - test_parallel_executor_seresnext test_parallel_executor_crf + test_parallel_executor_seresnext test_parallel_executor_crf test_sync_batch_norm_op PROPERTIES LABELS "RUN_TYPE=DIST") diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index f6a658cb1b753de93f11f45d0477f450ef0bdfaf..b8a2515e716bb2732eb61732480152ee1ce8e4b9 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -98,6 +98,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase): ##################################################################### # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU + assert core.get_cuda_device_count() > 1 main, startup, outs = self.build_program(place, layout, seed, True, only_forward) exe = fluid.Executor(place)