diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc
index 64368a5e8737b2484bda9b7dd52451b4d4f760ff..78c5d5b50e606daa963e728355dc1bce83cd5484 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.cc
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -403,18 +403,20 @@ void GraphView::Build(ir::Graph* g) {
   // 2. track the nodes which used by parameter server.
   // these node can not be inplaced, otherwise trainer
   // pserver can not find each other name.
-  for (auto& node : g->Nodes()) {
-    if (!node->IsOp()) continue;
-    if (node->Name() == "send") {
-      for (auto& in : node->inputs) {
-        dup_nodes_.emplace(in->Name());
-      }
+  auto update_skip_set = [&](ir::Node* node) {
+    for (auto& in : node->inputs) {
+      if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
     }
-    if (node->Name() == "recv") {
-      for (auto& out : node->outputs) {
+    for (auto& out : node->outputs) {
+      if (out->IsVar() && out->Var() != nullptr)
         dup_nodes_.emplace(out->Name());
-      }
     }
+  };
+  for (auto& node : g->Nodes()) {
+    if (!node->IsOp()) continue;
+    if (node->Name() == "send") update_skip_set(node);
+    if (node->Name() == "recv") update_skip_set(node);
+    if (node->Name() == "prefetch") update_skip_set(node);
   }
 }
 
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc
index 33ca45668e86bdbe615b91366db7e286258dd7d6..85de14a60a8fe6958794f0ac25768b9da1943f9d 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.cc
+++ b/paddle/fluid/framework/details/memory_optimize_pass.cc
@@ -51,8 +51,7 @@ static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
 std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto nodes = graph->Nodes();
-  auto subblock_vars = GetSubBlockVars(nodes);
-  skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
+  CollectSkipVarsSet(nodes);
 
   cfg_.reset(new details::ControlFlowGraph(*graph));
   cfg_->LiveVariableAnalysis();
@@ -224,20 +223,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
   }
 }
 
-std::unordered_set<std::string> MemoryOptimizePass::GetSubBlockVars(
+void MemoryOptimizePass::CollectSkipVarsSet(
     const std::unordered_set<ir::Node*>& nodes) const {
-  std::unordered_set<std::string> vars;
+  auto update_skip_set = [&](OpDesc* op_desc) {
+    auto inputs = op_desc->InputArgumentNames();
+    auto outputs = op_desc->OutputArgumentNames();
+    skip_set_.insert(inputs.begin(), inputs.end());
+    skip_set_.insert(outputs.begin(), outputs.end());
+  };
   for (auto& op : nodes) {
     if (!op->IsOp() || op->Op() == nullptr) continue;
     auto* op_desc = op->Op();
-    if (OpHasSubBlock(op_desc)) {
-      auto inputs = op_desc->InputArgumentNames();
-      auto outputs = op_desc->OutputArgumentNames();
-      vars.insert(inputs.begin(), inputs.end());
-      vars.insert(outputs.begin(), outputs.end());
-    }
+    // NOTE(dzhwinter):
+    // current block can not reuse next level block vars.
+    if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
+    // NOTE(dzhwinter):
+    // distributed ops' input/output names need to
+    // stay the same between trainer and pserver
+    if (op_desc->Type() == "send") update_skip_set(op_desc);
+    if (op_desc->Type() == "recv") update_skip_set(op_desc);
+    if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
   }
-  return vars;
 }
 
 void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.h b/paddle/fluid/framework/details/memory_optimize_pass.h
index b3e026e0bc1e222e82a22b343c86ddc87a967e8f..3d6b1897f3b5106054b8f647f9cf613ebd1d65ff 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.h
+++ b/paddle/fluid/framework/details/memory_optimize_pass.h
@@ -55,9 +55,10 @@ class MemoryOptimizePass : public ir::Pass {
                             ir::Graph* graph) const;
 
   void SubGraphOptimize(OpDesc* op_desc) const;
-  // scan subblock and collect the output/input variables.
-  std::unordered_set<std::string> GetSubBlockVars(
-      const std::unordered_set<ir::Node*>&) const;
+  // 1. scan ops with a sub-block and collect their input/output vars:
+  //    while, while_grad, conditional_block
+  // 2. scan distributed ops and collect their input/output vars
+  void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;
 
  private:
   // Reuse Node Pool, Owned.
diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc
index 121f648a5f04ae65560ae8d04042e40df61aad50..3e4d715c6f089496d1b1f7906e3f10147a073622 100644
--- a/paddle/fluid/framework/inplace_op_inference_test.cc
+++ b/paddle/fluid/framework/inplace_op_inference_test.cc
@@ -276,6 +276,7 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
 
   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
   auto in_to_outs = infer_inplace(*op, op->Block());
+  EXPECT_EQ(in_to_outs.size(), 3ul);
   std::unordered_map<std::string, std::string> expects = {
       {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
   };
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 87f0f307d30bc90a43a698c3766b16c975f0635e..953618560913229cd1e47659ad61e621efc10ed1 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -22,7 +22,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"
 
-DECLARE_bool(benchmark);
+DEFINE_bool(benchmark, false,
+            "Doing memory benchmark. It will make deleting scope synchronized, "
+            "and add some memory usage logs. "
+ "Default cuda is asynchronous device, set to True will" + "force op run in synchronous mode."); DEFINE_bool( eager_delete_scope, true, diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index ef62f758e37f28ab826faac84fd1276b14de7980..327adcc4aac1c50b51942c557d66dae6770e24f2 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -35,7 +35,6 @@ DEFINE_bool(init_allocated_mem, false, "To find this error in time, we use init_allocated_mem to indicate " "that initializing the allocated memory with a small value " "during unit testing."); -DECLARE_bool(benchmark); DECLARE_double(fraction_of_gpu_memory_to_use); namespace paddle { @@ -188,21 +187,20 @@ void *Alloc(const platform::CUDAPlace &place, platform::SetDeviceId(place.device); size_t avail, total; platform::GpuMemoryUsage(&avail, &total); - LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size) - << " in GPU " << place.device << ", available " - << string::HumanReadableSize(avail); - LOG(WARNING) << "total " << total; - LOG(WARNING) << "GpuMinChunkSize " - << string::HumanReadableSize( - buddy_allocator->GetMinChunkSize()); - LOG(WARNING) << "GpuMaxChunkSize " - << string::HumanReadableSize( - buddy_allocator->GetMaxChunkSize()); - LOG(WARNING) << "GPU memory used: " - << string::HumanReadableSize(Used(place)); + LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size) + << " in GPU " << place.device << ", available " + << string::HumanReadableSize(avail) << "total " << total + << "GpuMinChunkSize " + << string::HumanReadableSize(buddy_allocator->GetMinChunkSize()) + << "GpuMaxChunkSize " + << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()) + << "GPU memory used: " + << string::HumanReadableSize(Used(place)); platform::SetDeviceId(cur_dev); } else { - if (FLAGS_benchmark) allocation::GPUMemMonitor.Add(place.device, size); + if (VLOG_IS_ON(3)) { + allocation::GPUMemMonitor.Add(place.device, size); + } if (FLAGS_init_allocated_mem) { cudaMemset(ptr, 0xEF, size); } @@ -218,7 +216,9 @@ void Free(const platform::CUDAPlace &place, void *p, size_t size) { #ifdef PADDLE_WITH_CUDA GetGPUBuddyAllocator(place.device)->Free(p); - if (FLAGS_benchmark) allocation::GPUMemMonitor.Minus(place.device, size); + if (VLOG_IS_ON(3)) { + allocation::GPUMemMonitor.Minus(place.device, size); + } #else PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); #endif diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 60b2d83f15746eab0a4d29c7965c064690b6d46d..655ce8485d4584aa0955315b045da6bf541f7fe2 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -14,12 +14,6 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" -DEFINE_bool(benchmark, false, - "Doing memory benchmark. It will make deleting scope synchronized, " - "and add some memory usage logs." - "Default cuda is asynchronous device, set to True will" - "force op run in synchronous mode."); - namespace paddle { namespace platform {