Unverified · Commit 488719ba authored by dzhwinter, committed by GitHub

Enhance/memory optimize (#15634)

* add skip send.recv test=develop

* enhanced print message. test=develop

* rerun ci. test=develop
@@ -403,18 +403,20 @@ void GraphView::Build(ir::Graph* g) {
   // 2. track the nodes which used by parameter server.
   // these node can not be inplaced, otherwise trainer
   // pserver can not find each other name.
-  for (auto& node : g->Nodes()) {
-    if (!node->IsOp()) continue;
-    if (node->Name() == "send") {
-      for (auto& in : node->inputs) {
-        dup_nodes_.emplace(in->Name());
-      }
-    }
-    if (node->Name() == "recv") {
-      for (auto& out : node->outputs) {
-        dup_nodes_.emplace(out->Name());
-      }
-    }
+  auto update_skip_set = [&](ir::Node* node) {
+    for (auto& in : node->inputs) {
+      if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
+    }
+    for (auto& out : node->outputs) {
+      if (out->IsVar() && out->Var() != nullptr)
+        dup_nodes_.emplace(out->Name());
+    }
+  };
+  for (auto& node : g->Nodes()) {
+    if (!node->IsOp()) continue;
+    if (node->Name() == "send") update_skip_set(node);
+    if (node->Name() == "recv") update_skip_set(node);
+    if (node->Name() == "prefetch") update_skip_set(node);
   }
 }
......
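The lambda above collects every input/output variable of send/recv/prefetch ops into dup_nodes_, so those names are never renamed and trainer and pserver can still find each other. As a rough illustration of how such a skip set is consumed (a sketch only; the helper name and surrounding structure are hypothetical, not code from this commit):

// Sketch: how an inplace pass could consult a skip set like dup_nodes_
// before reusing an output variable's memory for an input.
#include <string>
#include <unordered_set>

bool CanInplace(const std::unordered_set<std::string>& skip_set,
                const std::string& in_name, const std::string& out_name) {
  // Variables touched by send/recv/prefetch keep their original names.
  if (skip_set.count(in_name) || skip_set.count(out_name)) return false;
  return true;  // other liveness/shape checks omitted in this sketch
}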
@@ -51,8 +51,7 @@ static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
 std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto nodes = graph->Nodes();
-  auto subblock_vars = GetSubBlockVars(nodes);
-  skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
+  CollectSkipVarsSet(nodes);
   cfg_.reset(new details::ControlFlowGraph(*graph));
   cfg_->LiveVariableAnalysis();
@@ -224,20 +223,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
   }
 }
 
-std::unordered_set<std::string> MemoryOptimizePass::GetSubBlockVars(
+void MemoryOptimizePass::CollectSkipVarsSet(
     const std::unordered_set<ir::Node*>& nodes) const {
-  std::unordered_set<std::string> vars;
+  auto update_skip_set = [&](OpDesc* op_desc) {
+    auto inputs = op_desc->InputArgumentNames();
+    auto outputs = op_desc->OutputArgumentNames();
+    skip_set_.insert(inputs.begin(), inputs.end());
+    skip_set_.insert(outputs.begin(), outputs.end());
+  };
   for (auto& op : nodes) {
     if (!op->IsOp() || op->Op() == nullptr) continue;
     auto* op_desc = op->Op();
-    if (OpHasSubBlock(op_desc)) {
-      auto inputs = op_desc->InputArgumentNames();
-      auto outputs = op_desc->OutputArgumentNames();
-      vars.insert(inputs.begin(), inputs.end());
-      vars.insert(outputs.begin(), outputs.end());
-    }
+    // NOTE(dzhwinter):
+    // the current block can not reuse variables from a nested sub-block.
+    if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
+    // NOTE(dzhwinter):
+    // distributed ops' input/output names need to
+    // stay the same between trainer and pserver.
+    if (op_desc->Type() == "send") update_skip_set(op_desc);
+    if (op_desc->Type() == "recv") update_skip_set(op_desc);
+    if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
   }
-  return vars;
 }
 
 void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
......
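Both hunks replace a return-a-set helper with a lambda that captures the skip set by reference and inserts into it directly, so sub-block ops and the send/recv/prefetch ops share one code path. A minimal standalone sketch of that capture-by-reference pattern (illustrative names, not repository code):

// Sketch: a lambda capturing a set by reference, so every call inserts
// directly into the shared set instead of returning a temporary one.
#include <string>
#include <unordered_set>
#include <vector>

int main() {
  std::unordered_set<std::string> skip_set;
  auto update_skip_set = [&](const std::vector<std::string>& names) {
    skip_set.insert(names.begin(), names.end());
  };
  update_skip_set({"send_in", "send_out"});
  update_skip_set({"recv_out"});
  return skip_set.size() == 3 ? 0 : 1;
}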
@@ -55,9 +55,10 @@ class MemoryOptimizePass : public ir::Pass {
                          ir::Graph* graph) const;
   void SubGraphOptimize(OpDesc* op_desc) const;
-  // scan subblock and collect the output/input variables.
-  std::unordered_set<std::string> GetSubBlockVars(
-      const std::unordered_set<ir::Node*>&) const;
+  // 1. scan ops with a sub-block (while, while_grad, conditional_block)
+  //    and collect their output/input vars.
+  // 2. scan distributed ops and collect their output/input vars.
+  void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;
 
  private:
   // Reuse Node Pool, Owned.
......
@@ -276,6 +276,7 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
   EXPECT_EQ(in_to_outs.size(), 3ul);
   std::unordered_map<std::string, std::string> expects = {
       {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
......
@@ -22,7 +22,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"
 
-DECLARE_bool(benchmark);
+DEFINE_bool(benchmark, false,
+            "Doing memory benchmark. It will make deleting scope synchronized, "
+            "and add some memory usage logs."
+            "Default cuda is asynchronous device, set to True will"
+            "force op run in synchronous mode.");
 
 DEFINE_bool(
     eager_delete_scope, true,
......
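This hunk turns the former DECLARE_bool(benchmark) in this file into the flag's single DEFINE_bool; a later hunk removes the old definition from the file that used to own it. As a reminder of the gflags convention this relies on (a sketch collapsed into one file; names and usage here are illustrative, not repository code):

// gflags convention: exactly one translation unit defines a flag; every
// other file that reads it only declares it with DECLARE_bool(benchmark);
#include "gflags/gflags.h"

// In the file that owns the flag:
DEFINE_bool(benchmark, false, "Enable extra memory benchmark logging.");

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  return FLAGS_benchmark ? 0 : 1;  // e.g. run with --benchmark=true
}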
@@ -35,7 +35,6 @@ DEFINE_bool(init_allocated_mem, false,
             "To find this error in time, we use init_allocated_mem to indicate "
             "that initializing the allocated memory with a small value "
             "during unit testing.");
-DECLARE_bool(benchmark);
 DECLARE_double(fraction_of_gpu_memory_to_use);
 
 namespace paddle {
@@ -188,21 +187,20 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
     platform::SetDeviceId(place.device);
     size_t avail, total;
     platform::GpuMemoryUsage(&avail, &total);
-    LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size)
-                 << " in GPU " << place.device << ", available "
-                 << string::HumanReadableSize(avail);
-    LOG(WARNING) << "total " << total;
-    LOG(WARNING) << "GpuMinChunkSize "
-                 << string::HumanReadableSize(
-                        buddy_allocator->GetMinChunkSize());
-    LOG(WARNING) << "GpuMaxChunkSize "
-                 << string::HumanReadableSize(
-                        buddy_allocator->GetMaxChunkSize());
-    LOG(WARNING) << "GPU memory used: "
-                 << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
+    LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size)
+               << " in GPU " << place.device << ", available "
+               << string::HumanReadableSize(avail) << "total " << total
+               << "GpuMinChunkSize "
+               << string::HumanReadableSize(buddy_allocator->GetMinChunkSize())
+               << "GpuMaxChunkSize "
+               << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize())
+               << "GPU memory used: "
+               << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
     platform::SetDeviceId(cur_dev);
   } else {
-    if (FLAGS_benchmark) allocation::GPUMemMonitor.Add(place.device, size);
+    if (VLOG_IS_ON(3)) {
+      allocation::GPUMemMonitor.Add(place.device, size);
+    }
     if (FLAGS_init_allocated_mem) {
       cudaMemset(ptr, 0xEF, size);
     }
@@ -218,7 +216,9 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
                                size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
-  if (FLAGS_benchmark) allocation::GPUMemMonitor.Minus(place.device, size);
+  if (VLOG_IS_ON(3)) {
+    allocation::GPUMemMonitor.Minus(place.device, size);
+  }
 #else
   PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
......
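The GPU memory monitor is now gated on glog verbosity instead of FLAGS_benchmark: VLOG_IS_ON(3) is true whenever the effective verbosity level is 3 or higher, e.g. when the binary runs with GLOG_v=3 in the environment. A minimal standalone sketch of the same gating pattern (not repository code; names are illustrative):

// Sketch: gating optional bookkeeping on glog verbosity, as the hunks above do.
#include <cstddef>
#include <glog/logging.h>

void RecordAllocation(std::size_t size) {
  if (VLOG_IS_ON(3)) {  // true when effective verbosity >= 3
    VLOG(3) << "allocated " << size << " bytes";
  }
}

int main(int argc, char* argv[]) {
  (void)argc;
  google::InitGoogleLogging(argv[0]);
  RecordAllocation(1024);  // logs only if run with GLOG_v=3 (or higher)
}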
@@ -14,12 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/place.h"
 
-DEFINE_bool(benchmark, false,
-            "Doing memory benchmark. It will make deleting scope synchronized, "
-            "and add some memory usage logs."
-            "Default cuda is asynchronous device, set to True will"
-            "force op run in synchronous mode.");
-
 namespace paddle {
 namespace platform {
......