From 196b01876a3f1e8b4c5a4599421d708fc5d34fcf Mon Sep 17 00:00:00 2001
From: sneaxiy <32832641+sneaxiy@users.noreply.github.com>
Date: Thu, 1 Sep 2022 17:29:21 +0800
Subject: [PATCH] Lazy initialize dense_contents_ in reducer (#45631)

* make dense_contents_ lazy init

* update legacy dygraph

* fix legacy dygraph bug
---
 paddle/fluid/distributed/collective/reducer.cc |  6 ++++--
 paddle/fluid/imperative/reducer.cc             | 13 ++++++++++---
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc
index a6cd305040..75a16bac37 100644
--- a/paddle/fluid/distributed/collective/reducer.cc
+++ b/paddle/fluid/distributed/collective/reducer.cc
@@ -290,6 +290,9 @@ static void SplitTensorsWithType(const DeviceContext &context,
 }
 
 void EagerGroup::ConcatTensors(const platform::Place &place) {
+  dense_contents_ =
+      paddle::experimental::empty(IntArray({all_length_}), dtype_, place);
+
   if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     auto *default_ctx = static_cast<phi::GPUContext *>(
@@ -452,8 +455,6 @@ void EagerReducer::InitializeGroups(
     } else {
       // process the dense gradient.
       InitializeDenseGroups(tensor_indices_, &group);
-      group.dense_contents_ = paddle::experimental::empty(
-          IntArray({group.all_length_}), group.dtype_, inner_place_);
     }
 
     // map tensors to this group by VariableLocator
@@ -908,6 +909,7 @@ void EagerReducer::FinalizeBackward() {
   for (auto &group : groups_) {
     if (!group.is_sparse_) {
       group.SplitTensors(inner_place_);
+      group.dense_contents_.reset();
     }
   }
 
diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index 623a44ed75..24181eec59 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -543,9 +543,6 @@ void Reducer::InitializeGroups(
     } else {
       // process the dense gradient.
       InitializeDenseGroups(variable_indices_, &group);
-      auto tensor = group.dense_contents_.GetMutable<framework::LoDTensor>();
-      tensor->Resize(phi::make_ddim({group.all_length_}))
-          .mutable_data(place_, framework::TransToPhiDataType(group.dtype_));
     }
 
     // map variables to this group by VariableLocator
@@ -954,6 +951,10 @@ void Reducer::MarkGroupReady(size_t group_index) {
   UNUSED auto &group = groups_[next_group_];
   UNUSED const int run_order = next_group_ % nrings_;
 
+  auto *tensor = group.dense_contents_.GetMutable<framework::LoDTensor>();
+  tensor->Resize(phi::make_ddim({group.all_length_}))
+      .mutable_data(place_, framework::TransToPhiDataType(group.dtype_));
+
   // For CUDA or XPU, compute_stream --> comm_stream.
   // For CPU, do nothing.
   // NOTE. Because concat uses the comm_stream,
@@ -1116,6 +1117,12 @@ void Reducer::FinalizeBackward() {
     parallel_ctx_->WaitComm(i);
   }
 
+  for (auto &group : groups_) {
+    if (!group.is_sparse_) {
+      group.dense_contents_.Clear();
+    }
+  }
+
   if (NeedRebuildGroup()) {
     VLOG(3) << "Start rebuilding the groups";
     auto rebuild_group_indices = RebuildGruops();
--
GitLab
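
Editor's note: the sketch below is a minimal standalone C++ illustration of the allocate-on-demand pattern this patch applies; it is not Paddle's actual classes or API. The Group struct, all_length field, and float buffer are assumptions made for the example. The idea matches the diff: the fused gradient buffer is created just before communication (ConcatTensors / MarkGroupReady) and released as soon as results are scattered back (FinalizeBackward), rather than being allocated once in InitializeGroups and held for the reducer's whole lifetime.

    // Lazy fused-buffer sketch (assumed names, not Paddle's API).
    #include <cstdint>
    #include <iostream>
    #include <optional>
    #include <vector>

    struct Group {
      int64_t all_length = 0;  // total number of gradient elements in the group
      // Fused buffer; std::nullopt whenever no communication is in flight.
      std::optional<std::vector<float>> dense_contents;

      // Called once every gradient in the group is ready: allocate lazily,
      // then (elided here) copy each member gradient into the fused buffer.
      void ConcatTensors() {
        dense_contents.emplace(static_cast<size_t>(all_length));
      }

      // Called after allreduce: (elided here) scatter the fused buffer back
      // into the member gradients, then release the memory immediately.
      void SplitTensors() { dense_contents.reset(); }
    };

    int main() {
      Group g;
      g.all_length = 1 << 20;
      g.ConcatTensors();  // buffer exists only while the group communicates
      std::cout << "fused elements: " << g.dense_contents->size() << "\n";
      g.SplitTensors();   // memory is returned between backward passes
      std::cout << "buffer held after backward: " << std::boolalpha
                << g.dense_contents.has_value() << "\n";
      return 0;
    }

The trade-off is the same one the patch makes: allocation now happens once per backward pass instead of once per reducer, in exchange for keeping the large fused buffer out of memory between passes.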