From cee7a3dbe8638ab1dc619bacd2ef679de45d3470 Mon Sep 17 00:00:00 2001
From: ShenLiang <1422485404@qq.com>
Date: Mon, 5 Dec 2022 13:17:24 +0800
Subject: [PATCH] fix bug of reducer in best_fit (#48668)

---
 .../fluid/distributed/collective/reducer.cc   | 33 ++++++++++++-------
 paddle/fluid/distributed/collective/reducer.h |  2 +-
 .../test_parallel_dygraph_dataparallel.py     | 17 +++++++++-
 3 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc
index cd8c8ed2e0..379bc57d55 100644
--- a/paddle/fluid/distributed/collective/reducer.cc
+++ b/paddle/fluid/distributed/collective/reducer.cc
@@ -17,10 +17,16 @@
 #include "paddle/phi/backends/device_manager.h"
 
 DECLARE_bool(use_stream_safe_cuda_allocator);
+DECLARE_string(allocator_strategy);
 
 namespace paddle {
 namespace distributed {
 
+static bool IsStreamSafeAllocator() {
+  return FLAGS_allocator_strategy == "auto_growth" &&
+         FLAGS_use_stream_safe_cuda_allocator;
+}
+
 static Backend TransToBackend(platform::Place place) {
   static const std::map<phi::AllocationType, Backend> type_backend = {
       {phi::AllocationType::GPU, Backend::GPU},
@@ -399,14 +405,14 @@ void EagerGroup::ConcatTensors(const platform::Place &place) {
   }
 }
 
-void EagerGroup::SplitTensorsDev(const platform::DeviceContext &context) {
+void EagerGroup::SplitTensors(const platform::DeviceContext &context) {
   auto place = context.GetPlace();
   if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     auto &gpu_context = static_cast<const phi::GPUContext &>(context);
     SplitTensorsWithType(
         gpu_context, &dense_contents_, &dense_tensors_, dtype_);
-    if (FLAGS_use_stream_safe_cuda_allocator) {
+    if (IsStreamSafeAllocator()) {
       auto dense_tensor =
           std::dynamic_pointer_cast<phi::DenseTensor>(dense_contents_.impl());
       VLOG(3) << "Free dense_contents_ " << dense_contents_.numel();
@@ -1011,12 +1017,11 @@ void EagerReducer::FinalizeBackward() {
   for (auto &group : groups_) {
     if (!group.is_sparse_) {
       group.task->Synchronize();
-    }
-  }
-
-  for (auto &group : groups_) {
-    if (!group.is_sparse_) {
-      group.dense_contents_.reset();
+      if (!IsStreamSafeAllocator()) {
+        auto *default_ctx =
+            platform::DeviceContextPool::Instance().Get(inner_place_);
+        group.SplitTensors(*default_ctx);
+      }
     }
   }
 
@@ -1054,9 +1059,15 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
   group->task = process_group_->AllReduce(in_out, in_out, opts);
 
   auto *context = process_group_->GetDeviceContext(inner_place_);
-  group->SplitTensorsDev(*context);
-  group->task->UpdateWaitChain(*context);
-  // split in FinalizeBackward()
+
+  if (IsStreamSafeAllocator()) {
+    // NOTE(shenliang03): The best_fit allocator strategy is not multi-stream
+    // safe. The Split kernel allocates additional memory for its computation,
+    // so running it asynchronously here may trigger an illegal memory
+    // access.
+    group->SplitTensors(*context);
+    group->task->UpdateWaitChain(*context);
+  }
 }
 
 void EagerReducer::AllReduceSparse(EagerGroup *group,
diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h
index 5d27086fdb..5be2d60a6a 100644
--- a/paddle/fluid/distributed/collective/reducer.h
+++ b/paddle/fluid/distributed/collective/reducer.h
@@ -75,7 +75,7 @@ class EagerGroup {
 
   // context is used to select the stream for split
-  void SplitTensorsDev(const platform::DeviceContext &);
+  void SplitTensors(const platform::DeviceContext &);
 
   friend std::ostream &operator<<(std::ostream &, const EagerGroup &);
 };
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
index 9e4be19dac..5fd7f3beb1 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
@@ -103,6 +103,7 @@ def start_local_trainers(
     training_script,
     training_script_args,
     eager_mode=True,
+    allocator_strategy="auto_growth",
     log_dir=None,
 ):
     current_env = copy.copy(os.environ.copy())
@@ -126,6 +127,10 @@ def start_local_trainers(
         if not eager_mode:
             proc_env["FLAGS_enable_eager_mode"] = "%d" % 0
 
+        proc_env["FLAGS_allocator_strategy"] = allocator_strategy
+        if allocator_strategy == "auto_growth":
+            proc_env["FLAGS_fraction_of_gpu_memory_to_use"] = "0.1"
+
         current_env.update(proc_env)
 
         print("trainer proc env:{}".format(current_env))
@@ -153,7 +158,12 @@ def start_local_trainers(
 
 
 class TestMultipleGpus(unittest.TestCase):
-    def run_mnist_2gpu(self, target_file_name, eager_mode=True):
+    def run_mnist_2gpu(
+        self,
+        target_file_name,
+        eager_mode=True,
+        allocator_strategy="auto_growth",
+    ):
         if (
             not fluid.core.is_compiled_with_cuda()
             or fluid.core.get_cuda_device_count() == 0
@@ -170,6 +180,7 @@ class TestMultipleGpus(unittest.TestCase):
             cluster,
             pod,
             eager_mode=eager_mode,
+            allocator_strategy=allocator_strategy,
             training_script=target_file_name,
             training_script_args=[],
         )
@@ -218,6 +229,10 @@ class TestDataParallelWithPyLayer(TestMultipleGpus):
         self.run_mnist_2gpu(
             'parallel_dygraph_dataparallel_with_pylayer.py', eager_mode=False
         )
+        self.run_mnist_2gpu(
+            'parallel_dygraph_dataparallel_with_pylayer.py',
+            allocator_strategy="naive_best_fit",
+        )
 
 
 class TestGradientCheckInEagerMode(TestMultipleGpus):
--
GitLab
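
For illustration only, not part of the patch: the sketch below shows one way to
exercise both allocator code paths that the new test covers, mirroring what
run_mnist_2gpu() and start_local_trainers() do above. It assumes a CUDA build of
Paddle, a machine with two GPUs, and that it is run from the unittests directory
containing parallel_dygraph_dataparallel_with_pylayer.py; the helper name
launch_with_strategy is hypothetical.

# Launch the data-parallel PyLayer script once per allocator strategy so that
# both the stream-safe (auto_growth) reducer path and the synchronous
# naive_best_fit path introduced by this patch are executed.
import os
import subprocess

SCRIPT = "parallel_dygraph_dataparallel_with_pylayer.py"


def launch_with_strategy(allocator_strategy):
    env = dict(os.environ)
    env["FLAGS_allocator_strategy"] = allocator_strategy
    if allocator_strategy == "auto_growth":
        # Same as the test: keep the GPU memory pool small for auto_growth.
        env["FLAGS_fraction_of_gpu_memory_to_use"] = "0.1"
    # paddle.distributed.launch spawns one trainer process per listed GPU.
    subprocess.check_call(
        ["python", "-m", "paddle.distributed.launch", "--gpus", "0,1", SCRIPT],
        env=env,
    )


if __name__ == "__main__":
    for strategy in ("auto_growth", "naive_best_fit"):
        launch_with_strategy(strategy)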