Unverified commit cee7a3db, authored by ShenLiang, committed by GitHub

fix bug of reducer in best_fit (#48668)

Parent 89f024e3
@@ -17,10 +17,16 @@
 #include "paddle/phi/backends/device_manager.h"
 DECLARE_bool(use_stream_safe_cuda_allocator);
+DECLARE_string(allocator_strategy);
 namespace paddle {
 namespace distributed {
+static bool IsStreamSafeAllocator() {
+  return FLAGS_allocator_strategy == "auto_growth" &&
+         FLAGS_use_stream_safe_cuda_allocator;
+}
 static Backend TransToBackend(platform::Place place) {
   static const std::map<phi::AllocationType, Backend> type_backend = {
       {phi::AllocationType::GPU, Backend::GPU},
@@ -399,14 +405,14 @@ void EagerGroup::ConcatTensors(const platform::Place &place) {
   }
 }
-void EagerGroup::SplitTensorsDev(const platform::DeviceContext &context) {
+void EagerGroup::SplitTensors(const platform::DeviceContext &context) {
   auto place = context.GetPlace();
   if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     auto &gpu_context = static_cast<const phi::GPUContext &>(context);
     SplitTensorsWithType(
         gpu_context, &dense_contents_, &dense_tensors_, dtype_);
-    if (FLAGS_use_stream_safe_cuda_allocator) {
+    if (IsStreamSafeAllocator()) {
       auto dense_tensor =
           std::dynamic_pointer_cast<phi::DenseTensor>(dense_contents_.impl());
       VLOG(3) << "Free dense_contents_ " << dense_contents_.numel();
@@ -1011,12 +1017,11 @@ void EagerReducer::FinalizeBackward() {
   for (auto &group : groups_) {
     if (!group.is_sparse_) {
       group.task->Synchronize();
-    }
-  }
-
-  for (auto &group : groups_) {
-    if (!group.is_sparse_) {
-      group.dense_contents_.reset();
+      if (!IsStreamSafeAllocator()) {
+        auto *default_ctx =
+            platform::DeviceContextPool::Instance().Get(inner_place_);
+        group.SplitTensors(*default_ctx);
+      }
     }
   }
@@ -1054,9 +1059,15 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
   group->task = process_group_->AllReduce(in_out, in_out, opts);
   auto *context = process_group_->GetDeviceContext(inner_place_);
-  group->SplitTensorsDev(*context);
-  group->task->UpdateWaitChain(*context);
-  // split in FinalizeBackward()
+  if (IsStreamSafeAllocator()) {
+    // NOTE(shenliang03): The best_fit allocator strategy is multi-stream
+    // insecure. In the Split operator, additional memory will be applied for
+    // calculation, and if it is asynchronous, an illegal memory access may be
+    // encountered.
+    group->SplitTensors(*context);
+    group->task->UpdateWaitChain(*context);
+  }
 }

 void EagerReducer::AllReduceSparse(EagerGroup *group,
......
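To make the control flow above easier to follow: under the stream-safe auto_growth allocator the split is issued right after the all-reduce on the communication stream, while under naive_best_fit it is deferred to FinalizeBackward and runs on the default stream only after the task has been synchronized. Below is a minimal plain-Python sketch of that scheduling decision, not the implementation; only the two FLAGS_* names and the strategy strings come from the diff, everything else is illustrative.

```python
# Toy model (no Paddle dependency) of the scheduling decision in this commit.

def is_stream_safe_allocator(flags):
    # Mirrors the new IsStreamSafeAllocator() helper in the C++ diff above.
    return (flags.get("FLAGS_allocator_strategy") == "auto_growth"
            and flags.get("FLAGS_use_stream_safe_cuda_allocator", False))


def fused_all_reduce_schedule(flags, group):
    group["allreduce_issued"] = True  # stands in for process_group_->AllReduce
    if is_stream_safe_allocator(flags):
        # Stream-safe path: split immediately on the communication stream; the
        # stream-safe auto_growth allocator tracks cross-stream usage.
        group["split"] = "async on comm stream"


def finalize_backward(flags, group):
    assert group["allreduce_issued"]  # stands in for group.task->Synchronize()
    if not is_stream_safe_allocator(flags):
        # best_fit path: split only after synchronization, on the default
        # stream, so the Split kernel's temporary allocations cannot race
        # with the communication stream.
        group["split"] = "after sync on default stream"


if __name__ == "__main__":
    for strategy in ("auto_growth", "naive_best_fit"):
        flags = {"FLAGS_allocator_strategy": strategy,
                 "FLAGS_use_stream_safe_cuda_allocator": True}
        group = {"allreduce_issued": False, "split": None}
        fused_all_reduce_schedule(flags, group)
        finalize_backward(flags, group)
        print(f"{strategy}: split happens {group['split']}")
```

Either way the split happens exactly once per group; the commit only changes where it is scheduled depending on the allocator strategy.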
@@ -75,7 +75,7 @@ class EagerGroup {
   // context is used to select the stream for split
-  void SplitTensorsDev(const platform::DeviceContext &);
+  void SplitTensors(const platform::DeviceContext &);
   friend std::ostream &operator<<(std::ostream &, const EagerGroup &);
 };
......
@@ -103,6 +103,7 @@ def start_local_trainers(
     training_script,
     training_script_args,
     eager_mode=True,
+    allocator_strategy="auto_growth",
     log_dir=None,
 ):
     current_env = copy.copy(os.environ.copy())
@@ -126,6 +127,10 @@ def start_local_trainers(
         if not eager_mode:
             proc_env["FLAGS_enable_eager_mode"] = "%d" % 0
+        proc_env["FLAGS_allocator_strategy"] = allocator_strategy
+        if allocator_strategy == "auto_growth":
+            proc_env["FLAGS_fraction_of_gpu_memory_to_use"] = "0.1"
         current_env.update(proc_env)
         print("trainer proc env:{}".format(current_env))
@@ -153,7 +158,12 @@ def start_local_trainers(
 class TestMultipleGpus(unittest.TestCase):
-    def run_mnist_2gpu(self, target_file_name, eager_mode=True):
+    def run_mnist_2gpu(
+        self,
+        target_file_name,
+        eager_mode=True,
+        allocator_strategy="auto_growth",
+    ):
         if (
             not fluid.core.is_compiled_with_cuda()
             or fluid.core.get_cuda_device_count() == 0
@@ -170,6 +180,7 @@ class TestMultipleGpus(unittest.TestCase):
             cluster,
             pod,
             eager_mode=eager_mode,
+            allocator_strategy=allocator_strategy,
             training_script=target_file_name,
             training_script_args=[],
         )
@@ -218,6 +229,10 @@ class TestDataParallelWithPyLayer(TestMultipleGpus):
         self.run_mnist_2gpu(
             'parallel_dygraph_dataparallel_with_pylayer.py', eager_mode=False
         )
+        self.run_mnist_2gpu(
+            'parallel_dygraph_dataparallel_with_pylayer.py',
+            allocator_strategy="naive_best_fit",
+        )

 class TestGradientCheckInEagerMode(TestMultipleGpus):
......
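The test hooks above pass the allocator strategy to the spawned trainer processes through environment variables. As a usage sketch of the same idea outside the test harness: the FLAGS_* variable names and the 0.1 fraction mirror the diff, while the launch_trainer helper and the subprocess call are illustrative only.

```python
import copy
import os
import subprocess


def launch_trainer(script, allocator_strategy="auto_growth"):
    """Toy launcher mirroring the env handling added to start_local_trainers."""
    env = copy.copy(os.environ.copy())
    env["FLAGS_allocator_strategy"] = allocator_strategy
    if allocator_strategy == "auto_growth":
        # Same knob the test sets to keep the auto_growth memory pool small.
        env["FLAGS_fraction_of_gpu_memory_to_use"] = "0.1"
    return subprocess.Popen(["python", script], env=env)


# Example: exercise the naive_best_fit path that this commit fixes.
# launch_trainer("parallel_dygraph_dataparallel_with_pylayer.py",
#                allocator_strategy="naive_best_fit")
```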