Unverified commit c4de1277, authored by Ruibiao Chen, committed by GitHub

Add ExecutionConfig and fix last-live-op bug for standalone executor (#46405)

* Add ExecutionConfig and fix last-live-op bug for standalone executor

* Improve code design
Parent 1d04e021
......@@ -14,6 +14,7 @@ set(STANDALONE_EXECUTOR_SRCS
set(STANDALONE_EXECUTOR_DEPS
dependency_builder
device_context
execution_config
op_registry
scope
framework_proto
......
......@@ -2,3 +2,8 @@ cc_library(
dependency_builder
SRCS dependency_builder.cc
DEPS operator)
cc_library(
execution_config
SRCS execution_config.cc
DEPS phi_backends)
......@@ -16,16 +16,6 @@
#include <queue>
// The difference between "sequential_run" and "serial_run":
// "sequential_run" dispatches OPs one by one according to the sequence in the
// Program, while "serial_run" ensures that all Ops are scheduled in a single
// thread. In standalone executor, "sequential_run" is also "serial_run", while
// "serial_run" is not necessarily "sequential_run".
PADDLE_DEFINE_EXPORTED_bool(new_executor_sequential_run,
false,
"Enable sequential execution for standalone "
"executor, only applied to GPU OPs.");
namespace paddle {
namespace framework {
namespace interpreter {
......@@ -67,7 +57,7 @@ const std::string StringizeDownstreamMap(
}
const std::map<int, std::set<int>>& DependencyBuilder::Build(
const std::vector<Instruction>& instructions) {
const std::vector<Instruction>& instructions, bool is_sequential_run) {
PADDLE_ENFORCE_EQ(
is_build_,
false,
......@@ -85,7 +75,7 @@ const std::map<int, std::set<int>>& DependencyBuilder::Build(
AddDependencyForRandomOp();
AddDependencyForReadOp();
if (FLAGS_new_executor_sequential_run) {
if (is_sequential_run) {
AddDependencyForSequentialRun();
}
......@@ -505,7 +495,7 @@ void DependencyBuilder::BuildOpHappensBefore() {
next,
op_idx));
op_happens_before_[op_idx][next] = true;
VLOG(8) << "happens before: " << op_idx << " " << next;
VLOG(10) << "happens before: " << op_idx << " " << next;
q.push(next);
}
}
......
......@@ -36,7 +36,7 @@ class DependencyBuilder {
// build op dependencies and return the mapping from op to its downstream-op
// set
const std::map<int, std::set<int>>& Build(
const std::vector<Instruction>& instructions);
const std::vector<Instruction>& instructions, bool is_sequential_run);
bool OpHappensBefore(int prior_op_idx, int posterior_op_idx);
......
......@@ -12,12 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/new_executor/interpreter/execution_config.h"
#include <set>
#include <thread>
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/device_manager.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/xpu/xpu_info.h"
......@@ -48,7 +49,7 @@ static constexpr size_t kMinOpNumForAsyncPrepare = 1000;
// Note that the purpose of the config is to limit the total 'possible'
// threads introduced by interpretercore to avoid hurting performance.
inline std::tuple<int, int, int> GetThreadPoolConfig(const phi::Place place,
inline std::tuple<int, int, int> GetThreadPoolConfig(const phi::Place& place,
size_t op_num) {
int num_device_threads = kDeviceNumThreads,
num_host_threads = kHostNumThreads,
......@@ -131,6 +132,24 @@ inline std::tuple<int, int, int> GetThreadPoolConfig(const phi::Place place,
num_host_threads, num_device_threads, num_prepare_threads);
}
ExecutionConfig::ExecutionConfig(const phi::Place& place, size_t op_num) {
std::tie(host_num_threads, deivce_num_threads, prepare_num_threads) =
GetThreadPoolConfig(place, op_num);
}
void ExecutionConfig::Log(int log_level) {
VLOG(log_level) << "ExecutionConfig:";
VLOG(log_level) << "used_for_jit = " << used_for_jit;
VLOG(log_level) << "create_local_scope = " << create_local_scope;
VLOG(log_level) << "host_num_threads = " << host_num_threads;
VLOG(log_level) << "deivce_num_threads = " << deivce_num_threads;
VLOG(log_level) << "prepare_num_threads = " << prepare_num_threads;
VLOG(log_level) << "skip_gc_vars = ";
for (const std::string& var : skip_gc_vars) {
VLOG(log_level) << var;
}
}
} // namespace interpreter
} // namespace framework
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <set>
#include <string>
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
namespace interpreter {
struct ExecutionConfig {
bool used_for_jit{false};
bool create_local_scope{true};
size_t host_num_threads;
size_t deivce_num_threads;
size_t prepare_num_threads;
std::set<std::string> skip_gc_vars;
ExecutionConfig(const phi::Place& place, size_t op_num);
void Log(int log_level);
};
} // namespace interpreter
} // namespace framework
} // namespace paddle
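
The constructor above fills the three thread-pool sizes via GetThreadPoolConfig; the remaining fields are left for the caller to set. A minimal usage sketch, mirroring the InterpreterCore constructor change further down (the surrounding variables such as place, block, used_for_jit and skip_gc_vars are placeholders, not part of this patch):

    interpreter::ExecutionConfig config(place, block.OpSize());
    config.used_for_jit = used_for_jit;
    config.create_local_scope = !used_for_jit && FLAGS_new_executor_use_local_scope;
    config.skip_gc_vars = skip_gc_vars;
    config.Log(/*log_level=*/8);  // dumps every field at VLOG level 8
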
......@@ -19,7 +19,6 @@
#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
#include "paddle/fluid/framework/new_executor/interpretercore_util.h"
#include "paddle/fluid/framework/new_executor/threadpool_config.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/os_info.h"
......@@ -32,6 +31,15 @@
#endif
#include "paddle/phi/backends/device_manager.h"
// The difference between "sequential_run" and "serial_run":
// "sequential_run" dispatches OPs one by one according to the sequence in the
// Program, while "serial_run" ensures that all Ops are scheduled in a single
// thread. In standalone executor, "sequential_run" is also "serial_run", while
// "serial_run" is not necessarily "sequential_run".
PADDLE_DEFINE_EXPORTED_bool(new_executor_sequential_run,
false,
"Enable sequential execution for standalone "
"executor, only applied to GPU OPs.");
PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
false,
"Use inplace in new executor");
......@@ -49,6 +57,8 @@ constexpr const char* kTaskCompletion = "TaskCompletion";
namespace paddle {
namespace framework {
// TODO(Ruibia): Pass skip_gc_vars, used_for_jit, and other config messages by
// constructing an interpreter::ExecutionConfig
InterpreterCore::InterpreterCore(const platform::Place& place,
const BlockDesc& block,
const std::set<std::string>& skip_gc_vars,
......@@ -56,10 +66,9 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
bool used_for_jit)
: place_(place),
block_(block),
skip_gc_vars_(skip_gc_vars),
execution_config_(place, block.OpSize()),
var_scope_(scope),
stream_analyzer_(place),
used_for_jit_(used_for_jit) {
stream_analyzer_(place) {
VLOG(4) << "InterpreterCore(): " << this << " on " << place_;
is_build_ = false;
......@@ -67,14 +76,13 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught);
completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion);
create_local_scope_ = FLAGS_new_executor_use_local_scope;
if (used_for_jit_) {
create_local_scope_ = false;
}
VLOG(4) << "create_local_scope_ is " << create_local_scope_;
execution_config_.used_for_jit = used_for_jit;
execution_config_.create_local_scope =
!used_for_jit && FLAGS_new_executor_use_local_scope;
execution_config_.skip_gc_vars = skip_gc_vars;
execution_config_.Log(/*log_level=*/8);
if (create_local_scope_) {
if (execution_config_.create_local_scope) {
auto local_scope = &var_scope_.GetMutableScope()->NewScope();
local_scope_ = local_scope;
}
......@@ -127,7 +135,7 @@ interpreter::CostInfo InterpreterCore::DryRun(
platform::DeviceContextPool::Instance().Get(place_)->Wait();
}
if (create_local_scope_) {
if (execution_config_.create_local_scope) {
ClearLoDTensorArrayInLocalScope();
}
......@@ -173,7 +181,7 @@ paddle::framework::FetchList InterpreterCore::Run(
}
#endif
}
if (create_local_scope_) {
if (execution_config_.create_local_scope) {
ClearLoDTensorArrayInLocalScope();
}
......@@ -201,16 +209,17 @@ paddle::framework::FetchList InterpreterCore::Run(
if (!is_build_) {
LOG_FIRST_N(INFO, 1) << "New Executor is Running.";
paddle::framework::interpreter::build_variable_scope(
block_, &var_scope_, create_local_scope_);
block_, &var_scope_, execution_config_.create_local_scope);
std::vector<paddle::framework::OpFuncNode> op_func_nodes;
paddle::framework::interpreter::build_op_func_list(place_,
block_,
skip_gc_vars_,
&op_func_nodes,
&var_scope_,
create_local_scope_,
used_for_jit_);
paddle::framework::interpreter::build_op_func_list(
place_,
block_,
execution_config_.skip_gc_vars,
&op_func_nodes,
&var_scope_,
execution_config_.create_local_scope,
execution_config_.used_for_jit);
is_build_ = true;
SetFeedVarsInplaceSkip(feed_names);
// convert vec func_list to graph
......@@ -239,12 +248,13 @@ paddle::framework::FetchList InterpreterCore::Run(
#endif
}
if (create_local_scope_) {
if (execution_config_.create_local_scope) {
ClearLoDTensorArrayInLocalScope();
}
// return Fetch Tensors
Scope* inner_scope =
create_local_scope_ ? local_scope_ : var_scope_.GetMutableScope();
Scope* inner_scope = execution_config_.create_local_scope
? local_scope_
: var_scope_.GetMutableScope();
auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName);
if (fetch_var) {
return std::move(*fetch_var->GetMutable<framework::FetchList>());
......@@ -259,12 +269,13 @@ void InterpreterCore::SetCopyProgram(std::shared_ptr<ProgramDesc> prog) {
void InterpreterCore::SetSkipGcVars(const std::set<std::string>& skip_gc_vars) {
PADDLE_ENFORCE_EQ(
skip_gc_vars_.empty(),
execution_config_.skip_gc_vars.empty(),
true,
platform::errors::PreconditionNotMet(
"Skip_gc_vars_ can only be initialized once, now skip_gc_vars_ is "
"execution_config_.skip_gc_vars can only be initialized once, now "
"execution_config_.skip_gc_vars is "
"not empty, do not call SetSkipGcVars method repeatedly."));
skip_gc_vars_ = skip_gc_vars;
execution_config_.skip_gc_vars = skip_gc_vars;
}
const VariableScope* InterpreterCore::GetVariableScope() const {
......@@ -288,12 +299,13 @@ void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr<InterpreterCore> src) {
<< ") to InterpreterCore(" << this << ")";
}
bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(
const std::vector<std::vector<size_t>>& input_var2op, size_t var_index) {
if (!var_scope_.VarDesc(var_index)) {
return input_var2op_info_.at(var_index).size() == 1;
return input_var2op.at(var_index).size() == 1;
} else {
int is_input_cnt = 0;
for (auto inst_id : input_var2op_info_.at(var_index)) {
for (auto inst_id : input_var2op.at(var_index)) {
OpInOutInfo info;
info.Build(vec_instruction_.at(inst_id).OpBase());
if (info.IsInArgBufferNeeded(var_scope_.VarDesc(var_index)->Name())) {
......@@ -306,21 +318,19 @@ bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
std::shared_ptr<interpreter::AsyncWorkQueue> InterpreterCore::GetWorkQueue() {
if (async_work_queue_ == nullptr) {
int host_num_threads = 1, deivce_num_threads = 1, prepare_num_threads = 1;
std::tie(host_num_threads, deivce_num_threads, prepare_num_threads) =
interpreter::GetThreadPoolConfig(place_, vec_instruction_.size());
async_work_queue_ =
std::make_shared<interpreter::AsyncWorkQueue>(host_num_threads,
deivce_num_threads,
prepare_num_threads,
&main_thread_blocker_);
async_work_queue_ = std::make_shared<interpreter::AsyncWorkQueue>(
execution_config_.host_num_threads,
execution_config_.deivce_num_threads,
execution_config_.prepare_num_threads,
&main_thread_blocker_);
}
return async_work_queue_;
}
void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) {
Scope* inner_scope =
create_local_scope_ ? local_scope_ : var_scope_.GetMutableScope();
Scope* inner_scope = execution_config_.create_local_scope
? local_scope_
: var_scope_.GetMutableScope();
VariableValueMap ins_map;
for (auto& var_name_item : instr_node->Inputs()) {
std::vector<Variable*> input_vars;
......@@ -346,8 +356,9 @@ void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) {
// set runtime_ctx and infershape_ctx_
if (instr_node->OpBase()->Type() == "cinn_launch") { // OP use scope in
// kernel
Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
: var_scope_.GetMutableScope();
Scope* local_scope = execution_config_.create_local_scope
? var_scope_.GetMutableLocalScope()
: var_scope_.GetMutableScope();
instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope);
} else {
instr_node->ResetContext(ins_map, outs_map);
......@@ -377,8 +388,19 @@ void InterpreterCore::BuildInplace() {
}
}
Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
: var_scope_.GetMutableScope();
Scope* local_scope = execution_config_.create_local_scope
? var_scope_.GetMutableLocalScope()
: var_scope_.GetMutableScope();
std::vector<std::vector<size_t>> input_var2op(var_scope_.VarSize());
for (Instruction& instr : vec_instruction_) {
for (auto& item : instr.Inputs()) {
for (int var_id : item.second) {
if (var_id != kEmptyVarIndex) {
input_var2op.at(var_id).push_back(instr.Id());
}
}
}
}
for (size_t i = 0; i < vec_instruction_.size(); ++i) {
auto& instr = vec_instruction_[i];
......@@ -402,7 +424,7 @@ void InterpreterCore::BuildInplace() {
if (var_scope_.GetVarSikpInplace(iter->second[0])) {
continue;
}
if (BuildInplaceCheckVarIsOnlyInput(iter->second[0])) {
if (BuildInplaceCheckVarIsOnlyInput(input_var2op, iter->second[0])) {
auto iterout = outputs.find(pair.second);
if (iterout != outputs.end() && !iterout->second.empty()) {
const std::string& invar_name =
......@@ -432,7 +454,9 @@ void InterpreterCore::BuildOperatorDependences() {
// Schedule
auto op_nums = vec_instruction_.size();
dependecy_count_.resize(op_nums);
auto op2downstream = dependency_builder_.Build(vec_instruction_);
auto op2downstream = dependency_builder_.Build(
vec_instruction_,
/*is_sequential_run=*/FLAGS_new_executor_sequential_run);
for (size_t op = 0; op < vec_instruction_.size(); ++op) {
auto op_list = op2downstream[op];
std::vector<size_t> downsteam_vector(op_list.begin(), op_list.end());
......@@ -460,10 +484,7 @@ void InterpreterCore::ClearLoDTensorArrayInLocalScope() {
void InterpreterCore::Convert(
std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
auto& vec_meta_info = var_scope_.MutableVecMetaInfo();
auto var_nums = var_scope_.VarSize();
input_var2op_info_.resize(var_nums);
auto nodes = *op_func_nodes;
auto op_nums = nodes.size();
vec_instruction_.reserve(op_nums);
for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
......@@ -471,35 +492,42 @@ void InterpreterCore::Convert(
auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node);
vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_);
}
BuildOperatorDependences();
// calculate last_live_ops_
for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
auto& instr = vec_instruction_[op_idx];
Instruction& instr = vec_instruction_[op_idx];
OpInOutInfo info;
std::set<size_t> gc_check_inputs;
for (auto& item : instr.Inputs()) {
info.Build(instr.OpBase());
std::set<size_t> gc_check_vars;
const std::map<std::string, std::vector<int>>& ins = instr.Inputs();
const std::map<std::string, std::vector<int>>& outs = instr.Outputs();
std::multimap<std::string, std::vector<int>> ins_and_outs{ins.begin(),
ins.end()};
ins_and_outs.insert(outs.begin(), outs.end());
for (auto& item : ins_and_outs) {
for (auto id : item.second) {
if (id == kEmptyVarIndex) {
continue;
}
input_var2op_info_.at(id).push_back(op_idx);
// var can be gc-ed
if (!info.IsBuilt()) {
info.Build(instr.OpBase());
}
auto* var_desc = var_scope_.VarDesc(id);
if (var_desc) {
if (info.IsInArgBufferNeeded(var_desc->Name())) {
gc_check_inputs.insert(id);
}
} else {
gc_check_inputs.insert(id);
// skip no_need_buffer input vars
if (var_desc && ins.count(item.first) &&
!info.IsInArgBufferNeeded(var_desc->Name())) {
continue;
}
gc_check_vars.insert(id);
}
}
for (auto var_id : gc_check_inputs) {
Scope* inner_scope =
create_local_scope_ ? local_scope_ : var_scope_.GetMutableScope();
for (auto var_id : gc_check_vars) {
Scope* inner_scope = execution_config_.create_local_scope
? local_scope_
: var_scope_.GetMutableScope();
paddle::framework::Variable* var =
inner_scope->FindVar(var_scope_.GetNameById(var_id));
if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>() ||
......@@ -512,18 +540,9 @@ void InterpreterCore::Convert(
}
}
}
for (size_t i = 0; i < vec_instruction_.size(); ++i) {
// checkout output
for (auto& item : vec_instruction_[i].Outputs()) {
for (auto var_id : item.second) {
if (input_var2op_info_.at(var_id).size() == 0) {
last_live_ops_[var_id].insert(i);
}
}
}
}
// clear the last_live_ops list for all vars in skip_gc_vars
for (const std::string& skip_gc_var : skip_gc_vars_) {
for (const std::string& skip_gc_var : execution_config_.skip_gc_vars) {
int var_id = var_scope_.GetIdByName(skip_gc_var);
if (var_id != -1) {
last_live_ops_[var_id].clear();
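
The rewritten loop above merges each instruction's inputs and outputs into a single gc_check_vars pass, so an op is registered for the variables it writes as well as the ones it reads; the old second pass (which only caught outputs with no consumer at all) and the input_var2op_info_ member are dropped, which appears to be the last-live-op fix named in the commit title. A small self-contained sketch of the std::multimap merge used there, with made-up parameter names:

    #include <map>
    #include <string>
    #include <vector>

    std::map<std::string, std::vector<int>> ins = {{"X", {0, 1}}};
    std::map<std::string, std::vector<int>> outs = {{"Out", {2}}, {"X", {3}}};
    std::multimap<std::string, std::vector<int>> ins_and_outs{ins.begin(), ins.end()};
    ins_and_outs.insert(outs.begin(), outs.end());
    // ins_and_outs now holds {"Out", {2}}, {"X", {0, 1}} and {"X", {3}}, so input
    // and output slots sharing a parameter name are both visited in one loop.
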
......@@ -662,8 +681,9 @@ inline void SetDeviceId(const platform::Place& place) {
void InterpreterCore::RunInstruction(const Instruction& instr_node) {
auto* op = instr_node.OpBase();
auto place = instr_node.DeviceContext().GetPlace();
Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
: var_scope_.GetMutableScope();
Scope* local_scope = execution_config_.create_local_scope
? var_scope_.GetMutableLocalScope()
: var_scope_.GetMutableScope();
VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope_);
SetDeviceId(place);
......@@ -947,6 +967,8 @@ void InterpreterCore::RunInstructionAsync(
CheckGC(instr_node, atomic_var_ref);
interpreter::LogDeviceMemoryStats(place_);
interpreter::RecordEvent(instr_node, place_);
} catch (platform::EnforceNotMet& ex) {
framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex);
......@@ -1135,16 +1157,17 @@ void InterpreterCore::Prepare(
if (!is_build_) {
paddle::framework::interpreter::build_variable_scope(
block_, &var_scope_, create_local_scope_);
block_, &var_scope_, execution_config_.create_local_scope);
FeedInput();
std::vector<paddle::framework::OpFuncNode> op_func_nodes;
paddle::framework::interpreter::build_op_func_list(place_,
block_,
skip_gc_vars_,
&op_func_nodes,
&var_scope_,
create_local_scope_,
used_for_jit_);
paddle::framework::interpreter::build_op_func_list(
place_,
block_,
execution_config_.skip_gc_vars,
&op_func_nodes,
&var_scope_,
execution_config_.create_local_scope,
execution_config_.used_for_jit);
is_build_ = true;
SetFeedVarsInplaceSkip(feed_names);
// convert vec func_list to graph
......
......@@ -23,6 +23,7 @@
#include "paddle/fluid/framework/new_executor/event_manager.h"
#include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h"
#include "paddle/fluid/framework/new_executor/interpreter/dependency_builder.h"
#include "paddle/fluid/framework/new_executor/interpreter/execution_config.h"
#include "paddle/fluid/framework/new_executor/interpretercore_util.h"
#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/framework/new_executor/profiler.h"
......@@ -67,7 +68,8 @@ class InterpreterCore {
void reset_scope(Scope* new_scope);
private:
bool BuildInplaceCheckVarIsOnlyInput(size_t var_index);
bool BuildInplaceCheckVarIsOnlyInput(
const std::vector<std::vector<size_t>>& input_var2op, size_t var_index);
std::shared_ptr<interpreter::AsyncWorkQueue> GetWorkQueue();
......@@ -112,9 +114,9 @@ class InterpreterCore {
platform::Place place_;
const BlockDesc& block_; // not owned
std::set<std::string> skip_gc_vars_;
interpreter::DependencyBuilder dependency_builder_;
interpreter::ExecutionConfig execution_config_;
// NOTE(zhiqiu): when add fetch ops in GetInterpreterCore, we will
// copy a new program and block, the copy_program_ here is used to
......@@ -134,10 +136,7 @@ class InterpreterCore {
std::vector<size_t> dependecy_count_;
std::atomic<size_t> unfinished_op_numer_{0};
std::vector<std::vector<size_t>> input_var2op_info_;
VariableScope var_scope_;
bool create_local_scope_{true};
Scope* local_scope_{nullptr}; // not owned
StreamAnalyzer stream_analyzer_;
......@@ -151,8 +150,6 @@ class InterpreterCore {
std::future<std::unique_ptr<AtomicVectorSizeT>> atomic_deps_;
std::future<std::unique_ptr<AtomicVectorSizeT>> atomic_var_ref_;
bool used_for_jit_{false};
};
std::shared_ptr<InterpreterCore> CreateInterpreterCore(
......
......@@ -11,6 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/new_executor/interpretercore_util.h"
#include <algorithm>
......@@ -18,6 +19,7 @@
#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/new_executor/data_transfer.h"
#include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
......@@ -34,6 +36,11 @@ PADDLE_DEFINE_EXPORTED_bool(
false,
"Enable serial execution for standalone executor, used for debug.");
PADDLE_DEFINE_EXPORTED_bool(
new_executor_log_memory_stats,
false,
"Log memory stats after each op runs, just used for debug.");
DECLARE_bool(use_mkldnn);
DECLARE_bool(check_nan_inf);
......@@ -135,6 +142,21 @@ std::unique_ptr<AtomicVectorSizeT> PrepareAtomicVarRef(
return var_ref;
}
void LogDeviceMemoryStats(const platform::Place& place) {
if (FLAGS_new_executor_log_memory_stats && platform::is_gpu_place(place)) {
VLOG(0) << "memory_allocated: "
<< static_cast<double>(memory::DeviceMemoryStatCurrentValue(
"Allocated", place.device)) /
1024 / 1024
<< " MB";
VLOG(0) << "max_memory_allocated: "
<< static_cast<double>(memory::DeviceMemoryStatPeakValue(
"Allocated", place.device)) /
1024 / 1024
<< " MB";
}
}
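
LogDeviceMemoryStats is a no-op unless the new flag is on and the place is a GPU place. A hedged usage sketch (the flag is an exported gflag, so it can be flipped at runtime; from another translation unit it would need DECLARE_bool(new_executor_log_memory_stats), and the CUDAPlace index is illustrative):

    FLAGS_new_executor_log_memory_stats = true;  // e.g. during a debugging session
    interpreter::LogDeviceMemoryStats(platform::CUDAPlace(0));
    // With the flag off, or on a non-GPU place, the call returns immediately.
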
bool var_can_be_deleted(const std::string& name, const BlockDesc& block) {
auto* var_desc = block.FindVar(name);
if (var_desc == nullptr || var_desc->Persistable()) {
......@@ -629,18 +651,14 @@ void build_op_func_list(const platform::Place& place,
// step 5. run kernel
if (run_phi_kernel) {
VLOG(1) << "start run phi kernel. ";
phi::KernelContext phi_kernel_context;
op_with_kernel->BuildPhiKernelContext(
runtime_context, dev_ctx, &phi_kernel_context);
(*op_func_node.phi_kernel_)(&phi_kernel_context);
VLOG(1) << "end run phi kernel. ";
} else {
VLOG(4) << "start run kernel. ";
// the place of exec_ctx maybe has changed.
op_func_node.kernel_func_(ExecutionContext(
*op_with_kernel, *runtime_scope, *dev_ctx, runtime_context));
VLOG(4) << "end run kernel. ";
}
// post-process grad_op.outputs if need cast complex grad into real
......@@ -702,6 +720,7 @@ void build_op_func_list(const platform::Place& place,
// gc---------------------------------------------------------------------------
auto iter = unused_var_map.find(op);
if (iter == unused_var_map.end()) {
interpreter::LogDeviceMemoryStats(place);
continue;
}
......@@ -722,6 +741,8 @@ void build_op_func_list(const platform::Place& place,
}
}
delete garbages; // free mem
interpreter::LogDeviceMemoryStats(place);
}
}
......
......@@ -44,7 +44,6 @@ constexpr size_t kPrepareWorkQueueIdx = 2;
namespace paddle {
namespace framework {
namespace interpreter {
class AsyncWorkQueue {
public:
AsyncWorkQueue(size_t host_num_threads,
......@@ -77,6 +76,8 @@ std::unique_ptr<AtomicVectorSizeT> PrepareAtomicDeps(
std::unique_ptr<AtomicVectorSizeT> PrepareAtomicVarRef(
const std::vector<VariableMetaInfo>& vec_meta_info);
void LogDeviceMemoryStats(const platform::Place& place);
void build_variable_scope(const framework::BlockDesc& block,
VariableScope* var_scope,
bool use_local_scope = true);
......