add details. test=develop

6d6ddcfe · dzhwinter · 11afbe0f · 6d6ddcfe · 6d6ddcfe · 6d6ddcfe
4 changed file
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -50,7 +50,12 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
-cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper)
+if(WITH_GPU)
+cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info)
+else()
+nv_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
+endif()
 cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
 cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)

--- a/paddle/fluid/framework/details/memory_optimize_helper.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper.cc
@@ -13,13 +13,19 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/memory_optimize_helper.h"
+#include <algorithm>
 #include <deque>
 #include <functional>
-#include <iostream>
+#include <iterator>
 #include <numeric>
 #include <sstream>
 #include <string>
 #include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/cpu_info.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/gpu_info.h"
+#endif  // PADDLE_WITH_CUDA
 namespace paddle {
 namespace framework {
@@ -230,6 +236,27 @@ ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const {
  return found_node;
 }
+ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const {
+  ir::Node* found_node = nullptr;
+  NodeComparator functor;
+  auto it =
+      std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) {
+        if (v.front() == prev)
+          return true;
+        else
+          return false;
+      });
+  PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!");
+  for (it = std::next(it); it != nodes_.end(); ++it) {
+    auto& candidate = it->front();
+    if (functor(var, candidate)) {
+      found_node = candidate;
+      break;
+    }
+  }
+  return found_node;
+}
 bool OrderedSet::Has(ir::Node* var) const {
  if (mark_table_.count(var->Name())) {
    auto& node_in_samename = mark_table_.at(var->Name());
@@ -274,14 +301,35 @@ bool NodeCanReused(ir::Node* node) {
  return flag;
 }
+int MinChunkSize() {
+  int size{0};
+#ifdef PADDLE_WITH_CUDA
+  size = platform::GpuMinChunkSize();
+#else
+  size = platform::CpuMinChunkSize();
+#endif  // PADDLE_WITH_CUDA
+  return size;
+}
 bool NodeCanReused(const VarDesc& node) {
  auto type = node.GetType();
+  // only these types holds bulk of gpu memory
  if (!(type == proto::VarType::LOD_TENSOR ||
        type == proto::VarType::SELECTED_ROWS ||
        type == proto::VarType::LOD_TENSOR_ARRAY)) {
    return false;
  }
-  if (node.Persistable() || node.GetShape().empty()) {
+  // persistable variable is parameter
+  if (node.Persistable()) {
+    return false;
+  }
+  // shape < min_chunk_size is meaningless.
+  // further more, fetched loss always has size = 1
+  // which should not be reused.
+  auto shape = node.GetShape();
+  int size = std::abs(
+      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
+  if (shape.empty() || size < MinChunkSize()) {
    return false;
  }
  // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad

--- a/paddle/fluid/framework/details/memory_optimize_helper.h
+++ b/paddle/fluid/framework/details/memory_optimize_helper.h
@@ -62,6 +62,7 @@ class OrderedSet {
  }
  // find the bestfit shape node block with var.
  ir::Node* FindBestFitNode(ir::Node* var) const;
+  ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const;
  // map store non-const iterator, can not promise const
  int GetNodeIndexInPool(ir::Node* var);
  // pool all node to string

--- a/paddle/fluid/framework/details/memory_optimize_pass.cc
+++ b/paddle/fluid/framework/details/memory_optimize_pass.cc
@@ -76,6 +76,13 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
      }
      if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
        ir::Node* cache = pool_.FindBestFitNode(var);
+        while (cache != nullptr && var->Name() == cache->Name()) {
+          VLOG(3) << "The same cache variable is cascade reused." << var->Name()
+                  << " is re-filled to the pool after"
+                  << "the reused op is finished. Current op can not "
+                  << "replace it again. Skip this candidate.";
+          cache = pool_.FindNextBestFitNode(var, cache);
+        }
        if (var->Name() == FLAGS_memory_optimize_debug) {
          VLOG(3) << "start match var " << DebugString(var) << " of op "
                  << op->Name();
@@ -85,14 +92,6 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
        }
        if (cache != nullptr) {
-          if (var->Name() == cache->Name()) {
-            VLOG(3) << "The same cache variable is cascade reused."
-                    << var->Name() << " is re-filled to the pool after"
-                    << "the reused op is finished. Current op can not "
-                    << "replace it again. Skip this candidate.";
-            continue;
-          }
          int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
          VLOG(3) << string::Sprintf(
              "!!! %s,  %s => %s, cache idx %d, pool size %d",