From f38e126e56d8e35f09ad32043c18383a76b806e2 Mon Sep 17 00:00:00 2001
From: Aurelius84
Date: Mon, 19 Jun 2023 17:02:03 +0800
Subject: [PATCH] [NewExe]Polish InterpreterCore with PImpl and Derived
 ProgramInterpreter and NewIRInterpreter (#54651)

* [NewExe]Polish InterpreterCore with PImpl

fix code style

add std::move

* fix conflict

* fix typo

* fix typo
---
 .../framework/new_executor/CMakeLists.txt     |    5 +-
 .../new_executor/interpreter_base_impl.h      |  129 ++
 .../framework/new_executor/interpretercore.cc | 1561 +----------------
 .../framework/new_executor/interpretercore.h  |  166 +-
 .../new_executor/new_ir_interpreter.cc        | 1480 ++++++++++++++++
 .../new_executor/new_ir_interpreter.h         |  182 ++
 .../new_executor/program_interpreter.cc       | 1461 +++++++++++++++
 .../new_executor/program_interpreter.h        |  181 ++
 .../new_executor/standalone_executor.cc       |    8 +-
 .../standalone_executor_new_ir_test.cc        |    6 +-
 10 files changed, 3493 insertions(+), 1686 deletions(-)
 create mode 100644 paddle/fluid/framework/new_executor/interpreter_base_impl.h
 create mode 100644 paddle/fluid/framework/new_executor/new_ir_interpreter.cc
 create mode 100644 paddle/fluid/framework/new_executor/new_ir_interpreter.h
 create mode 100644 paddle/fluid/framework/new_executor/program_interpreter.cc
 create mode 100644 paddle/fluid/framework/new_executor/program_interpreter.h

diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt
index eedafa96d23..54902b86c3c 100644
--- a/paddle/fluid/framework/new_executor/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/CMakeLists.txt
@@ -2,8 +2,9 @@ add_subdirectory(garbage_collector)
 add_subdirectory(interpreter)
 add_subdirectory(workqueue)
 
-set(STANDALONE_EXECUTOR_SRCS feed_fetch_utils.cc interpretercore.cc
-    new_executor_defs.cc standalone_executor.cc)
+set(STANDALONE_EXECUTOR_SRCS
+    feed_fetch_utils.cc interpretercore.cc new_executor_defs.cc
+    standalone_executor.cc program_interpreter.cc new_ir_interpreter.cc)
 
 set(STANDALONE_EXECUTOR_DEPS
     interpreter
diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h
new file mode 100644
index 00000000000..8fc0a137d03
--- /dev/null
+++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h
@@ -0,0 +1,129 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
+
+#include "paddle/fluid/framework/details/exception_holder.h"
+#include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h"
+#include "paddle/fluid/framework/new_executor/interpreter/dependency_builder.h"
+#include "paddle/fluid/framework/new_executor/interpreter/execution_config.h"
+#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
+#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
+#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
+#include "paddle/fluid/framework/new_executor/profiler.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/memory/allocation/spin_lock.h"
+#include "paddle/fluid/platform/device_event.h"
+#include "paddle/phi/backends/device_manager.h"
+
+DECLARE_bool(new_executor_serial_run);
+DECLARE_bool(new_executor_static_build);
+DECLARE_bool(new_executor_use_inplace);
+DECLARE_bool(new_executor_use_local_scope);
+
+PHI_DECLARE_bool(check_nan_inf);
+DECLARE_bool(benchmark);
+DECLARE_uint64(executor_log_deps_every_microseconds);
+PHI_DECLARE_bool(new_executor_use_cuda_graph);
+PHI_DECLARE_bool(enable_new_ir_in_executor);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PHI_DECLARE_bool(sync_nccl_allreduce);
+#endif
+
+constexpr const char* kExceptionCaught = "ExceptionCaught";
+constexpr const char* kTaskCompletion = "TaskCompletion";
+
+namespace paddle {
+namespace framework {
+using HookFunc = std::function<void(OperatorBase*, Scope*)>;
+
+/// @brief InterpreterBaseImpl is an abstract base class that defines the
+/// interface (pure virtual methods) each derived interpreter must implement.
+/// TODO(Aurelius84): Clean unnecessary interface to keep cohesion.
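For orientation, this is how the pieces introduced by this patch fit together: InterpreterCore keeps only a std::unique_ptr<InterpreterBaseImpl> and forwards every public call to it, while ProgramInterpreter (legacy ProgramDesc path) and NewIRInterpreter (new IR path) derive from this interface. A minimal sketch of that wiring follows; the class and member names come from this diff, but the bodies are elided and simplified, so treat it as illustrative rather than the literal code:

    class ProgramInterpreter : public InterpreterBaseImpl { /* legacy ProgramDesc executor */ };
    class NewIRInterpreter : public InterpreterBaseImpl { /* new IR executor */ };

    class InterpreterCore {
     public:
      InterpreterCore(const platform::Place& place,
                      const BlockDesc& block,
                      Scope* scope,
                      const interpreter::ExecutionConfig& config = {})
          // The ProgramDesc constructor builds a ProgramInterpreter; the
          // ::ir::Program constructor builds a NewIRInterpreter instead.
          : impl_(std::make_unique<ProgramInterpreter>(place, block, scope, config)) {}

      FetchList Run(const std::vector<std::string>& feed_names,
                    bool need_fetch = true) {
        return impl_->Run(feed_names, need_fetch);  // pure delegation
      }

     private:
      std::unique_ptr<InterpreterBaseImpl> impl_;  // ProgramInterpreter or NewIRInterpreter
    };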
+class InterpreterBaseImpl {
+ public:
+  virtual ~InterpreterBaseImpl() = default;
+  virtual paddle::framework::FetchList Run(
+      const std::vector<std::string>& feed_names,
+      const std::vector<phi::DenseTensor>& feed_tensors) = 0;
+
+  virtual paddle::framework::FetchList Run(
+      const std::vector<std::string>& feed_names, bool need_fetch = true) = 0;
+
+  virtual void ShareWorkQueueFrom(InterpreterBaseImpl* src) = 0;
+
+  virtual void SetCopyProgram(std::shared_ptr<ProgramDesc> prog) = 0;
+
+  virtual void SetSkipGcVars(const std::set<std::string>& skip_gc_vars) = 0;
+
+  virtual const std::set<std::string>& JitInputVars() const = 0;
+
+  virtual void SetJitInputVars(const std::set<std::string>& jit_input_vars) = 0;
+
+  virtual const VariableScope* GetVariableScope() const = 0;
+
+  virtual void reset_scope(Scope* new_scope) = 0;
+
+  virtual const platform::Place& GetPlace() const = 0;
+
+  virtual void SetOutputHooks(const std::vector<HookFunc>& hookfuncs) = 0;
+};
+
+inline void SetDeviceId(const platform::Place& place) {
+  // TODO(zhiqiu): reduce the cost
+  if (platform::is_gpu_place(place)) {
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Cannot run operator on place %s, please recompile paddle or "
+        "reinstall Paddle with CUDA support.",
+        place));
+#else
+    auto dev_id = place.device;
+    platform::SetDeviceId(dev_id);
+#endif
+  } else if (platform::is_xpu_place(place)) {
+#ifndef PADDLE_WITH_XPU
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Cannot run operator on place %s, please recompile paddle or "
+        "reinstall Paddle with XPU support.",
+        place));
+#else
+    auto dev_id = place.device;
+    platform::SetXPUDeviceId(dev_id);
+#endif
+  } else if (platform::is_custom_place(place)) {
+#ifndef PADDLE_WITH_CUSTOM_DEVICE
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Cannot run operator on place %s, please recompile paddle or "
+        "reinstall Paddle with CustomDevice support.",
+        place));
+#else
+    phi::DeviceManager::SetDevice(place);
+#endif
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index f864d090d74..6a5e92b1108 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -14,28 +14,8 @@
 #include "paddle/fluid/framework/new_executor/interpretercore.h"
 
-#include
-
-#include "gflags/gflags.h"
-
-#include "paddle/fluid/framework/details/nan_inf_utils.h"
-#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
-#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
-#include "paddle/fluid/framework/new_executor/interpreter/static_build.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/os_info.h"
-#include "paddle/fluid/platform/profiler/event_tracing.h"
-#include "paddle/fluid/platform/profiler/supplement_tracing.h"
-#include "paddle/phi/common/place.h"
-#include "paddle/phi/core/kernel_context.h"
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h"
-#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
-#include "paddle/fluid/platform/flags.h"
-#include "paddle/phi/backends/device_manager.h"
+#include "paddle/fluid/framework/new_executor/new_ir_interpreter.h"
+#include "paddle/fluid/framework/new_executor/program_interpreter.h"
 
 PADDLE_DEFINE_EXPORTED_bool(
     new_executor_serial_run,
@@ -53,1551 +33,78 @@
PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, "Use local_scope in new executor(especially used " "in UT), can turn off for better performance"); -PHI_DECLARE_bool(check_nan_inf); -DECLARE_bool(benchmark); -DECLARE_uint64(executor_log_deps_every_microseconds); -PHI_DECLARE_bool(new_executor_use_cuda_graph); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PHI_DECLARE_bool(sync_nccl_allreduce); -#endif - -PHI_DECLARE_bool(enable_new_ir_in_executor); - -constexpr const char* kExceptionCaught = "ExceptionCaught"; -constexpr const char* kTaskCompletion = "TaskCompletion"; - namespace paddle { namespace framework { -inline void SetDeviceId(const platform::Place& place) { - // TODO(zhiqiu): reduce the cost - if (platform::is_gpu_place(place)) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - PADDLE_THROW(platform::errors::Unavailable( - "Cannot run operator on place %s, please recompile paddle or " - "reinstall Paddle with CUDA support.", - place)); -#else - auto dev_id = place.device; - platform::SetDeviceId(dev_id); -#endif - } else if (platform::is_xpu_place(place)) { -#ifndef PADDLE_WITH_XPU - PADDLE_THROW(platform::errors::Unavailable( - "Cannot run operator on place %s, please recompile paddle or " - "reinstall Paddle with XPU support.", - place)); -#else - auto dev_id = place.device; - platform::SetXPUDeviceId(dev_id); -#endif - } else if (platform::is_custom_place(place)) { -#ifndef PADDLE_WITH_CUSTOM_DEVICE - PADDLE_THROW(platform::errors::Unavailable( - "Cannot run operator on place %s, please recompile paddle or " - "reinstall Paddle with CustomDevice support.", - place)); -#else - phi::DeviceManager::SetDevice(place); -#endif - } -} - InterpreterCore::InterpreterCore(const platform::Place& place, const BlockDesc& block, framework::Scope* scope, - const ExecutionConfig& execution_config) - : place_(place), - block_(block), - stream_analyzer_(place), - execution_config_(execution_config), - var_scope_(scope) { - VLOG(4) << "InterpreterCore(): " << this << " on " << place_; - - static_build_ = FLAGS_new_executor_static_build && - !FLAGS_new_executor_use_cuda_graph && - !execution_config.used_for_control_flow_op && - interpreter::BlockCanBeStaticBuilt(block); - - exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught); - completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion); - - if (!FLAGS_new_executor_use_local_scope) { - execution_config_.create_local_scope = false; - } - execution_config_.AnalyzeThreadPoolConfig(place, block.OpSize()); - execution_config_.Log(/*log_level=*/8); - - if (execution_config_.create_local_scope) { - auto local_scope = &var_scope_.GetMutableScope()->NewScope(); - local_scope_ = local_scope; - } - var_scope_.SetLocalScope(local_scope_); - - instruction_scheduling_priority_less = [this](size_t lhs, size_t rhs) { - SchedulingPriority lhs_scheduling_priority = - vec_instruction_[lhs].GetSchedulingPriority(); - SchedulingPriority rhs_scheduling_priority = - vec_instruction_[rhs].GetSchedulingPriority(); - if (lhs_scheduling_priority == rhs_scheduling_priority) { - return lhs < rhs; - } - return lhs_scheduling_priority > rhs_scheduling_priority; - }; - - PrepareForCUDAGraphCapture(); + const ExecutionConfig& execution_config) { + VLOG(4) << "InterpreterCore(): " << this << " on " << place; + impl_ = std::make_unique( + place, block, scope, execution_config); } InterpreterCore::InterpreterCore(const platform::Place& place, - const BlockDesc& block, - framework::Scope* scope, 
std::unique_ptr<::ir::Program> ir_prog, - const ExecutionConfig& execution_config) - : place_(place), - block_(block), - stream_analyzer_(place), - execution_config_(execution_config), - var_scope_(scope), - ir_program_(std::move(ir_prog)) { - VLOG(4) << "InterpreterCore(): " << this << " on " << place_; - - static_build_ = FLAGS_new_executor_static_build && - !FLAGS_new_executor_use_cuda_graph && - !execution_config.used_for_control_flow_op && - interpreter::BlockCanBeStaticBuilt(block); - - exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught); - completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion); - - if (!FLAGS_new_executor_use_local_scope) { - execution_config_.create_local_scope = false; - } - execution_config_.AnalyzeThreadPoolConfig(place, block.OpSize()); - execution_config_.Log(/*log_level=*/8); - - if (execution_config_.create_local_scope) { - auto local_scope = &var_scope_.GetMutableScope()->NewScope(); - local_scope_ = local_scope; - } - - // force use outer scope for now - local_scope_ = scope; - static_build_ = true; - - var_scope_.SetLocalScope(local_scope_); - - instruction_scheduling_priority_less = [this](size_t lhs, size_t rhs) { - SchedulingPriority lhs_scheduling_priority = - vec_instruction_[lhs].GetSchedulingPriority(); - SchedulingPriority rhs_scheduling_priority = - vec_instruction_[rhs].GetSchedulingPriority(); - if (lhs_scheduling_priority == rhs_scheduling_priority) { - return lhs < rhs; - } - return lhs_scheduling_priority > rhs_scheduling_priority; - }; - - PrepareForCUDAGraphCapture(); + framework::Scope* scope, + const ExecutionConfig& execution_config) { + VLOG(4) << "InterpreterCore(): " << this << " on " << place; + impl_ = std::make_unique( + place, std::move(ir_prog), scope, execution_config); } InterpreterCore::~InterpreterCore() { - // cancle gc's thread - gc_.reset(nullptr); - async_work_queue_.reset(); - VLOG(4) << "~InterpreterCore(): " << this << " on " << place_; - -#ifdef PADDLE_WITH_MKLDNN - // Clear mkl-dnn cache, - // this is needed to have mkl-dnn unit tests working - platform::ClearMKLDNNCache(place_, this); -#endif + VLOG(4) << "~InterpreterCore(): " << this; + impl_.reset(nullptr); } -void InterpreterCore::RunImpl() { - // lazy initialization of gc, do not create gc is the program only run once - if (!gc_) { - gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_); - } - - interpreter::ResetAtomicGuard guard(&deps_, &refs_); - - if (FLAGS_enable_new_ir_in_executor || - ((execution_config_.used_for_jit || execution_config_.used_for_cinn) && - (sync_op_num_ == 0))) { - VLOG(4) << "Tracing Instruction List"; - TraceInstructionList(vec_instruction_); - } else { - VLOG(4) << "Non-tracing"; - // For the program that only run once, it is no need to - // create work_queue, so the async_work_queue_ is created - // until the second step run. 
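The lazy construction mentioned in the comment above looks roughly like the GetWorkQueue() helper that appears further down in this diff. A sketch of it, assuming the member type is interpreter::AsyncWorkQueue (the template argument is not visible in this excerpt):

    std::shared_ptr<interpreter::AsyncWorkQueue> InterpreterCore::GetWorkQueue() {
      if (async_work_queue_ == nullptr) {
        // Built on first use, so a program that is executed only once never
        // pays the cost of starting the host/device thread pools.
        async_work_queue_ = std::make_shared<interpreter::AsyncWorkQueue>(
            execution_config_.host_num_threads,
            execution_config_.device_num_threads,
            nullptr);
      }
      return async_work_queue_;
    }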
- async_work_queue_ = GetWorkQueue(); - ExecuteInstructionList(vec_instruction_); - } -#ifdef PADDLE_WITH_CUSTOM_DEVICE - if (platform::is_custom_place(place_)) { - platform::DeviceContextPool::Instance().Get(place_)->Wait(); - } -#endif -} - -paddle::framework::FetchList InterpreterCore::Run( +FetchList InterpreterCore::Run( const std::vector& feed_names, const std::vector& feed_tensors) { - SetDeviceId(place_); - CheckCUDAGraphBeforeRun(feed_names); - -#ifdef PADDLE_WITH_MKLDNN - platform::AttachPointerHashToMKLDNNKey(this, place_); -#endif - - bool is_build = is_build_; - - Prepare(feed_names, feed_tensors, is_build); - - if (is_build) { - RunImpl(); - } - - if (HasLocalScope()) { - ClearLoDTensorArrayInLocalScope(); - } - - // return Fetch Tensors - auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName); - if (fetch_var) { - auto fetch_list = std::move(*fetch_var->GetMutable()); -#ifdef PADDLE_WITH_CUDA - if (platform::IsCUDAGraphCapturing()) { - PADDLE_ENFORCE_EQ(fetch_list.empty(), - true, - platform::errors::InvalidArgument( - "Cannot fetch data when using CUDA Graph.")); - } -#endif - return fetch_list; - } else { - return {}; - } + return impl_->Run(feed_names, feed_tensors); } -paddle::framework::FetchList InterpreterCore::Run( - const std::vector& feed_names, bool need_fetch) { - SetDeviceId(place_); - CheckCUDAGraphBeforeRun(feed_names); - -#ifdef PADDLE_WITH_MKLDNN - platform::AttachPointerHashToMKLDNNKey(this, place_); -#endif - - if (!is_build_) { - LOG_FIRST_N(INFO, 1) << "New Executor is Running."; - if (FLAGS_enable_new_ir_in_executor) { - VLOG(6) << "begin to build scope"; - ::ir::BuildScope( - ir_program_->block(), local_scope_, &value_2_var_name_map_); - VLOG(6) << "build ccope finshed"; - } else { - interpreter::BuildVariableScope(block_, execution_config_, &var_scope_); - } - - std::vector op_func_nodes; - if (FLAGS_enable_new_ir_in_executor) { - interpreter::BuildOpFuncList(place_, - ir_program_->block(), - &op_func_nodes, - local_scope_, - value_2_var_name_map_, - execution_config_); - } else { - interpreter::BuildOpFuncList(place_, - block_, - execution_config_.skip_gc_vars, - &op_func_nodes, - &var_scope_, - execution_config_, - HasLocalScope(), - static_build_); - } - SetFeedVarsInplaceSkip(feed_names); - // convert vec func_list to graph - Convert(&op_func_nodes); - UpdateSyncOpNum(); - if (static_build_) { - VLOG(4) << "RUN impl"; - RunImpl(); - } - is_build_ = true; - } else { - RunImpl(); - } - - if (HasLocalScope()) { - ClearLoDTensorArrayInLocalScope(); - } +FetchList InterpreterCore::Run(const std::vector& feed_names, + bool need_fetch) { + return impl_->Run(feed_names, need_fetch); +} - // return Fetch Tensors - Scope* inner_scope = - HasLocalScope() ? 
local_scope_ : var_scope_.GetMutableScope(); - auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName); - if (fetch_var && need_fetch) { - auto fetch_list = std::move(*fetch_var->GetMutable()); -#ifdef PADDLE_WITH_CUDA - if (platform::IsCUDAGraphCapturing()) { - PADDLE_ENFORCE_EQ(fetch_list.empty(), - true, - platform::errors::InvalidArgument( - "Cannot fetch data when using CUDA Graph.")); - } -#endif - return fetch_list; - } else { - return {}; - } +void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr src) { + impl_->ShareWorkQueueFrom(const_cast(src->Impl())); } void InterpreterCore::SetCopyProgram(std::shared_ptr prog) { - copy_program_ = prog; + impl_->SetCopyProgram(prog); } void InterpreterCore::SetSkipGcVars(const std::set& skip_gc_vars) { - PADDLE_ENFORCE_EQ( - execution_config_.skip_gc_vars.empty(), - true, - platform::errors::PreconditionNotMet( - "execution_config_.skip_gc_vars can only be initialized once, now " - "execution_config_.skip_gc_vars is " - "not empty, do not call SetSkipGcVars method repeatedly.")); - execution_config_.skip_gc_vars = skip_gc_vars; + impl_->SetSkipGcVars(skip_gc_vars); } -void InterpreterCore::SetJitInputVars( - const std::set& jit_input_vars) { - PADDLE_ENFORCE_EQ( - execution_config_.jit_input_vars.empty(), - true, - platform::errors::PreconditionNotMet( - "execution_config_.jit_input_vars can only be initialized once, now " - "execution_config_.jit_input_vars is " - "not empty, do not call SetJitInputVars method repeatedly.")); - execution_config_.jit_input_vars = jit_input_vars; +const std::set& InterpreterCore::JitInputVars() const { + return impl_->JitInputVars(); } -const std::set& InterpreterCore::JitInputVars() const { - return execution_config_.jit_input_vars; +void InterpreterCore::SetJitInputVars( + const std::set& jit_input_vars) { + impl_->SetJitInputVars(jit_input_vars); } const VariableScope* InterpreterCore::GetVariableScope() const { - return &var_scope_; + return impl_->GetVariableScope(); } void InterpreterCore::reset_scope(Scope* new_scope) { - var_scope_.SetScope(new_scope); - auto& var_list = var_scope_.MutableVarList(); - for (size_t i = 0; i < var_list.size(); i++) { - const auto& var_name = var_scope_.GetNameById(i); - var_list[i] = new_scope->FindVar(var_name); - } - // The index should be assured valid, cause the InterpreterCore may not be - // fully built, but was still cached and used. For example, see unit test - // `test_assert.py`, it may exit before `InterpreterCore::Convert`, but still - // was cached and used by later tests. 
- for (size_t i = 0; i < std::min(refs_.size(), var_list.size()); i++) { - refs_[i]->ResetVariable(var_list[i]); - } - - for (size_t i = 0; i < vec_instruction_.size(); i++) { - BuildAndCacheInstructionCtx(&vec_instruction_[i]); - } -} - -void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr src) { - async_work_queue_ = src->GetWorkQueue(); - VLOG(8) << "Share AsyncWorkQueue from InterpreterCore(" << src.get() - << ") to InterpreterCore(" << this << ")"; -} - -bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput( - const std::vector>& input_var2op, size_t var_index) { - if (!var_scope_.VarDesc(var_index)) { - return input_var2op.at(var_index).size() == 1; - } else { - int is_input_cnt = 0; - for (auto inst_id : input_var2op.at(var_index)) { - OpInOutInfo info; - info.Build(vec_instruction_.at(inst_id).OpBase()); - if (info.IsInArgBufferNeeded(var_scope_.VarDesc(var_index)->Name())) { - is_input_cnt++; - } - } - return is_input_cnt == 1; - } -} - -std::shared_ptr InterpreterCore::GetWorkQueue() { - if (async_work_queue_ == nullptr) { - async_work_queue_ = std::make_shared( - execution_config_.host_num_threads, - execution_config_.device_num_threads, - nullptr); - } - return async_work_queue_; -} - -void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) { - Scope* inner_scope = - HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope(); - VariableValueMap ins_map; - for (auto& var_name_item : instr_node->Inputs()) { - std::vector input_vars; - - input_vars.reserve(var_name_item.second.size()); - for (auto& id : var_name_item.second) { - input_vars.emplace_back(inner_scope->FindVar(var_scope_.GetNameById(id))); - } - ins_map.emplace(var_name_item.first, std::move(input_vars)); - } - - VariableValueMap outs_map; - for (auto& var_name_item : instr_node->Outputs()) { - std::vector out_vars; - - out_vars.reserve(var_name_item.second.size()); - for (auto& id : var_name_item.second) { - out_vars.emplace_back(inner_scope->FindVar(var_scope_.GetNameById(id))); - } - outs_map.emplace(var_name_item.first, std::move(out_vars)); - } - - // set runtime_ctx and infershape_ctx_ - if (instr_node->OpBase()->Type() == "cinn_launch" || - instr_node->OpBase()->Type() == "cinn_instruction_run") { // OP use scope - // in kernel - Scope* local_scope = HasLocalScope() ? var_scope_.GetMutableLocalScope() - : var_scope_.GetMutableScope(); - instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope); - } else { - instr_node->ResetContext(ins_map, outs_map); - } -} - -void InterpreterCore::BuildInplace() { - // NOTE(Ruibiao): coalesce_tensor_op outputs a FusedOutput phi::DenseTensor - // and a list of Output Tensors which are sliced from the FusedOutput. These - // outputs sholud not be the outvar of the in-place var-pair since memory - // reuse between FusedOutput and Output Tensors is assumed. For the following - // example: - // fused_var, var1, var2, var3 = coalesce_tensor(var1, var2, var3) - // var1 = sum(var4, var5) - // ... - // - // After running coalesce_tensor_op, var1 is assumed to share the buffer - // slices from fused_var. However, if sum_op is in-place, then var1 would - // re-share the buffer with var4 instead of fused_var. - std::set skip_inplace_outvars; - for (Instruction& instr : vec_instruction_) { - OperatorBase* op = instr.OpBase(); - if (op->Type() == kCoalesceTensor) { - const std::vector& outputs = - op->OutputVars(/*has_intermediate=*/false); - skip_inplace_outvars.insert(outputs.begin(), outputs.end()); - } - } - - Scope* local_scope = HasLocalScope() ? 
var_scope_.GetMutableLocalScope() - : var_scope_.GetMutableScope(); - std::vector> input_var2op(var_scope_.VarSize()); - for (Instruction& instr : vec_instruction_) { - for (auto& item : instr.Inputs()) { - for (int var_id : item.second) { - if (var_id != kEmptyVarIndex) { - input_var2op.at(var_id).push_back(instr.Id()); - } - } - } - } - - for (size_t i = 0; i < vec_instruction_.size(); ++i) { - auto& instr = vec_instruction_[i]; - auto* op_base = instr.OpBase(); - if (!op_base->Info().infer_inplace_) { - continue; - } - - auto in_to_outs = op_base->Info().infer_inplace_( - platform::is_gpu_place(instr.DeviceContext().GetPlace())); - - auto& inputs = instr.Inputs(); - auto& outputs = instr.Outputs(); - for (auto& pair : in_to_outs) { - auto iter = inputs.find(pair.first); - if (iter != inputs.end() && !iter->second.empty()) { - auto in_var_desc = var_scope_.VarDesc(iter->second[0]); - if (in_var_desc && in_var_desc->Persistable()) { - continue; - } - if (var_scope_.GetVarSikpInplace(iter->second[0])) { - continue; - } - if (BuildInplaceCheckVarIsOnlyInput(input_var2op, iter->second[0])) { - auto iterout = outputs.find(pair.second); - if (iterout != outputs.end() && !iterout->second.empty()) { - const std::string& invar_name = - var_scope_.GetNameById(iter->second[0]); - const std::string& outvar_name = - var_scope_.GetNameById(iterout->second[0]); - auto invar = local_scope->FindVar(invar_name); - auto outvar = local_scope->FindVar(outvar_name); - - if (invar && outvar && invar->IsType() && - outvar->IsType() && - skip_inplace_outvars.find(outvar_name) == - skip_inplace_outvars.end()) { - instr.AddInplace(invar, outvar); - VLOG(3) << "inplace " << op_base->Type() << " " << invar_name - << " -> " << outvar_name; - } - } - } - } - } - } -} - -void InterpreterCore::PrepareForCUDAGraphCapture() { - if (!FLAGS_new_executor_use_cuda_graph) return; -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_EQ( - platform::IsCUDAGraphCapturing(), - false, - platform::errors::PermissionDenied("CUDA Graph is not allowed to capture " - "before prepare.")); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(place_), - true, - platform::errors::InvalidArgument( - "CUDA Graph is only supported on NVIDIA GPU device.")); - // If set true, will call `cudaStreamSynchronize(nccl_stream)`after allreduce. - // which may cause error in cuda graph. This behavior is consistent with PE. - PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, - false, - platform::errors::InvalidArgument( - "FLAGS_sync_nccl_allreduce must be False to support " - "CUDA Graph capturing.")); - - // All output vars of coalesce_tensor op should be persistable. - // If fused output var of coalesce_tensor is gc, it will cause accuracy - // problem. The specific reasons need to be analyzed. - for (auto& op_desc : block_.AllOps()) { - if (op_desc->Type() == kCoalesceTensor) { - for (auto& out_var_name : op_desc->OutputArgumentNames()) { - // The fused var needs to be set to persistable, not just added to - // skip_gc_vars. - // In the case where the feed fetch var is changed, StandaloneExecutor - // will be newly constructed. If the fused var is not persistable, - // these vars will be recreated and initialized, resulting in - // precision problems. 
- auto* out_var = op_desc->Block()->FindVarRecursive(out_var_name); - if (out_var) { - out_var->SetPersistable(true); - VLOG(4) << "Mark Var(" << out_var_name << ") as Persistable."; - } - } - } - } -#else - PADDLE_THROW(platform::errors::Unimplemented( - "CUDA Graph is only supported on NVIDIA GPU device.")); -#endif -} - -void InterpreterCore::CheckCUDAGraphBeforeRun( - const std::vector& feed_names) { -#ifdef PADDLE_WITH_CUDA - if (platform::IsCUDAGraphCapturing()) { - PADDLE_ENFORCE_EQ( - feed_names.empty(), - true, - platform::errors::InvalidArgument( - "Feeding data is not permitted when capturing CUDA Graph.")); - PADDLE_ENFORCE_EQ( - FLAGS_new_executor_use_cuda_graph, - true, - platform::errors::InvalidArgument( - "You must turn on FLAGS_new_executor_use_cuda_graph to True " - "to enable CUDA Graph capturing.")); - PADDLE_ENFORCE_EQ( - place_, - platform::CUDAGraphCapturingPlace(), - platform::errors::InvalidArgument("The place to capture CUDAGraph is " - "not the same as the place to run.")); - } -#endif -} - -void InterpreterCore::BuildOperatorDependences() { - // analysis the dependences between ops, add next_instr_list to each instr, - // and set the dependecy_count_ - size_t instr_num = vec_instruction_.size(); - dependecy_count_ = std::vector(instr_num, 0); - auto downstream_map = dependency_builder_.Build(vec_instruction_); - - for (size_t instr_id = 0; instr_id < instr_num; ++instr_id) { - Instruction& cur_instr = vec_instruction_[instr_id]; - const std::set& next_instr_ids = downstream_map[instr_id]; - - if (FLAGS_new_executor_serial_run) { - for (size_t next_instr_id : next_instr_ids) { - cur_instr.AddNextInstrInSameThread(next_instr_id); - } - } else { - if (cur_instr.KernelType() == OpFuncType::kGpuAsync) { - for (size_t next_instr_id : next_instr_ids) { - if (vec_instruction_[next_instr_id].KernelType() == - OpFuncType::kGpuAsync) { - cur_instr.AddNextInstrInSameThread(next_instr_id); - } else { - cur_instr.AddNextInstrInDifferentThread(next_instr_id); - } - } - } else { - bool has_instr_in_same_thread = false; - for (size_t next_instr_id : next_instr_ids) { - if (!has_instr_in_same_thread && - vec_instruction_[next_instr_id].KernelType() != - OpFuncType::kGpuAsync) { - cur_instr.AddNextInstrInSameThread(next_instr_id); - has_instr_in_same_thread = true; - } else { - cur_instr.AddNextInstrInDifferentThread(next_instr_id); - } - } - } - } - - for (size_t next_instr_id : next_instr_ids) { - ++dependecy_count_[next_instr_id]; - } - } -} - -// At the end of each step, the holder of phi::DenseTensor in LoDTensorArray is -// null. 
Clear these Tensors and leave LoDTensorArray empty, otherwise an -// exception will occur in the next step -void InterpreterCore::ClearLoDTensorArrayInLocalScope() { - auto vars = local_scope_->LocalVars(); - for (auto var : vars) { - if (var->IsType()) { - auto* lod_tensor_arr = var->GetMutable(); - lod_tensor_arr->clear(); - } - } -} - -void InterpreterCore::Convert( - std::vector* op_func_nodes) { - auto& vec_meta_info = var_scope_.MutableVecMetaInfo(); - auto nodes = *op_func_nodes; - auto op_nums = nodes.size(); - vec_instruction_.clear(); - vec_instruction_.reserve(op_nums); - for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { - auto& op_func_node = nodes[op_idx]; - auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node); - vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); -#ifdef PADDLE_WITH_CUDA - if (FLAGS_new_executor_use_cuda_graph) { - auto& op = op_func_node.operator_base_; - auto& op_type = op->Type(); - if (op_type == interpreter::kMemcpyD2H || - op_type == interpreter::kMemcpyH2D) { - PADDLE_THROW(paddle::platform::errors::Fatal( - "Cuda memory copy d2h/h2d is not allowed while using cuda graph.")); - } - PADDLE_ENFORCE_EQ(typeid(*dev_ctx_) == typeid(phi::GPUContext), - true, - platform::errors::InvalidArgument( - "Device context of op %s must be [%s] while using " - "cuda graph, but got [%s].", - op_type, - typeid(phi::GPUContext).name(), - typeid(*dev_ctx_).name())); - // cuda graph needs to record all stream - phi::backends::gpu::CUDAGraphContextManager::Instance() - .RecordCapturingDeviceContext(dev_ctx_); - } -#endif - } - BuildOperatorDependences(); - - // NOTE(Ruibiao): For cross-step stream synchronization, an event may be - // recorded in the first step and waited in the second step. So, in the first - // step, the WaitEvent may be called without RecordEvent. Considering that - // before the first call to RecordEvent, an Event represents an empty set of - // work and WaitEvent always return succeed immediately, we omit the - // prelude-record for the first step here. - stream_analyzer_.ConstructEvents(&vec_instruction_); - // add event for the input var of jit program, since there are async copied - // from gpu_pinned place to gpu place on compute stream. 
- for (size_t i = 0; i < dependecy_count_.size(); ++i) { - if (dependecy_count_[i] == 0) { - auto& inst = vec_instruction_[i]; - if (inst.OpBaseValid() && - inst.OpBase()->Type() == interpreter::kMemcpyD2H && - platform::is_gpu_place(place_)) { - for (auto& item : inst.Inputs()) { - for (auto var_id : item.second) { - auto name = var_scope_.GetNameById(var_id); - if (JitInputVars().count(name)) { - auto device_event = std::make_shared( - place_, platform::GenerateDeviceEventFlag()); - VLOG(4) << "Add input event for input: " << name << " of " - << inst.OpBase()->Type(); - inst.AddEventToWait( - i, device_event, stream_analyzer_.GetWaiterType(inst)); - } - } - } - } - } - } - - // calculate last_live_ops_ - if (!FLAGS_enable_new_ir_in_executor) { - for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { - Instruction& instr = vec_instruction_[op_idx]; - OpInOutInfo info; - if (instr.OpBaseValid()) { - info.Build(instr.OpBase()); - } - - std::set gc_check_vars; - - const std::map>& ins = instr.Inputs(); - const std::map>& outs = instr.Outputs(); - std::multimap> ins_and_outs{ins.begin(), - ins.end()}; - ins_and_outs.insert(outs.begin(), outs.end()); - - for (auto& item : ins_and_outs) { - for (auto id : item.second) { - if (id == kEmptyVarIndex) { - continue; - } - auto* var_desc = var_scope_.VarDesc(id); - // skip no_need_buffer input vars - if (var_desc && ins.count(item.first) && - !info.IsInArgBufferNeeded(var_desc->Name())) { - continue; - } - // skip when this var is not in block and not a data_transferred var, - // which means this var is managed by other block - const auto& var_name = var_scope_.GetNameById(id); - bool not_owned = !block_.HasVar(var_name); - const auto& transferred_vars = var_scope_.DataTransferAddedVars(); - bool not_transferred = - std::all_of(transferred_vars.begin(), - transferred_vars.end(), - [&](const std::pair& elem) { - return elem.first != var_name; - }); - if (not_owned && not_transferred) { - VLOG(10) << "[gc_check_inputs] skip gc: " << var_name; - continue; - } - gc_check_vars.insert(id); - } - } - - for (auto var_id : gc_check_vars) { - Scope* inner_scope = - HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope(); - paddle::framework::Variable* var = - inner_scope->FindVar(var_scope_.GetNameById(var_id)); - if (var->IsType() || - var->IsType() || var->IsType()) { - last_live_ops_[var_id].insert(op_idx); - } else { - VLOG(4) << "not clear " << var_scope_.GetNameById(var_id) << " after " - << instr.OpBase()->Type() << " because its type is " - << framework::ToTypeName(var->Type()); - } - } - } - } - - // clear the last_live_ops list for all vars in skip_gc_vars - for (const std::string& skip_gc_var : execution_config_.skip_gc_vars) { - int var_id = var_scope_.GetIdByName(skip_gc_var); - if (var_id != -1) { - last_live_ops_[var_id].clear(); - VLOG(8) << "Skip gc for var: " << skip_gc_var; - } - } - - // shrink, find the downstream op that has no other op in the - // downstream list happens before it - // For example, - // b = op1(a) - // c = op2(a, b) - // in this case, a is the input of op1 and op2, we only need to check - // a after op2, because op2 always uses a after op1. 
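(Concretely: last_live_ops_ for `a` starts as {op1, op2}; because op1 happens-before op2, the filtering loop below keeps only op2, so only op2's completion triggers the GC check for `a`.)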
- for (size_t i = 0; i < last_live_ops_.size(); ++i) { - std::set minumum_last_live_ops; - for (size_t item : last_live_ops_[i]) { - bool not_before_any = true; - // find the op that is not executed before any - for (size_t other_item : last_live_ops_[i]) { - if (dependency_builder_.OpHappensBefore(item, other_item)) { - VLOG(8) << "happens_before: " << item << "->" << other_item - << ", so skip " << item; - not_before_any = false; - break; - } - } - if (not_before_any) { - VLOG(8) << "last live op of var " << i << " " - << var_scope_.GetNameById(i) << " : " << item << " " - << vec_instruction_[item].OpBase()->Type(); - minumum_last_live_ops.insert(item); - vec_instruction_[item].AddGCCheckVar(i); - } - } - last_live_ops_[i] = minumum_last_live_ops; - vec_meta_info[i].var_ref_count_ = last_live_ops_[i].size(); - } - if (!FLAGS_enable_new_ir_in_executor) { - for (size_t i = 0; i < vec_instruction_.size(); ++i) { - BuildAndCacheInstructionCtx(&vec_instruction_[i]); - } - } - bool inplaced = false; - for (const Instruction& inst : vec_instruction_) { - if (inst.OpBaseValid() && (inst.OpBase()->Type() == "share_buffer" || - inst.OpBase()->Type() == "share_data")) { - VLOG(4) << "Already inplaced, skip inplace now."; - inplaced = true; - } - } - - if (!FLAGS_enable_new_ir_in_executor && FLAGS_new_executor_use_inplace && - !inplaced) { - BuildInplace(); - } - - for (auto& dep : dependecy_count_) { - deps_.emplace_back(std::make_shared(dep)); - } - for (size_t i = 0; i < vec_meta_info.size(); ++i) { - refs_.emplace_back(std::make_shared( - vec_meta_info[i].var_ref_count_, var_scope_.VarRef(i))); - } - - AnalyseExecuteOrderForTrace(); -} - -void InterpreterCore::BuildSkipShareLoDInfo() { - for (size_t i = 0; i < vec_instruction_.size(); ++i) { - bool can_skip_lod = true; - for (auto& input : vec_instruction_[i].InnerRuntimeContext()->inputs) { - for (auto& var : input.second) { - if (var->IsType()) { - if (var->Get().lod().size() != 0) { - can_skip_lod = false; - break; - } - } else { - can_skip_lod = false; - break; - } - } - } - if (can_skip_lod) { - VLOG(8) << "skip share lod for: " << vec_instruction_[i].OpBase()->Type() - << " (" << i << ")"; - } - vec_instruction_[i].InnerInferShapeContext()->SetSkipLoD(can_skip_lod); - } -} - -void InterpreterCore::RunOperator(const Instruction& instr_node) { - auto* op = instr_node.OpBase(); - auto place = instr_node.DeviceContext().GetPlace(); - Scope* local_scope = HasLocalScope() ? var_scope_.GetMutableLocalScope() - : var_scope_.GetMutableScope(); - VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope); - - auto op_with_kernel = dynamic_cast(op); - { - // If it is OperatorBase, InferShape do nothing. - if (op_with_kernel != nullptr) { - platform::RecordEvent infershape_event( - "infer_shape", - platform::TracerEventType::OperatorInner, - 1, - platform::EventRole::kInnerOp); - - // see OperatorWithKernel::RunImpl in operator.cc for why - if (!(op_with_kernel->HasAttr(kAllKernelsMustComputeRuntimeShape) && - op_with_kernel->Attr(kAllKernelsMustComputeRuntimeShape))) { - op_with_kernel->Info().infer_shape_( - instr_node.InnerInferShapeContext().get()); - } - infershape_event.End(); - platform::RecordOpInfoSupplement(op->Type(), - op->Attrs(), - *(instr_node.InnerInferShapeContext()), - *(instr_node.InnerRuntimeContext()), - op->Id()); - } - } - if (op_with_kernel != nullptr && FLAGS_new_executor_use_inplace) { - // TODO(xiongkun03) Does operator base support inplace ? 
- for (auto& pair : instr_node.InplaceInfo()) { - const auto& in = paddle::framework::details::GetTensorFromVar(pair.first); - auto* out = - paddle::framework::details::GetMutableTensorFromVar(pair.second); - if (in.dims() == out->dims()) { - out->ShareBufferWith(in); - } - } - } - - { - platform::RecordEvent compute_event( - "compute", - platform::TracerEventType::OperatorInner, - 1, - platform::EventRole::kInnerOp); - if (op_with_kernel == nullptr) { // operator base - instr_node.OpBase()->Run(*local_scope, place_); - } else { - phi::Kernel* kernel = instr_node.PhiKernel(); - if (kernel && kernel->IsValid()) { // phi kernel - if (kernel->GetKernelRegisteredType() == - phi::KernelRegisteredType::FUNCTION) { - VLOG(4) << "Run function kernel: " << op->Type(); - VLOG(4) << instr_node.InnerRuntimeContext().get() << " " - << &instr_node.DeviceContext(); - phi::KernelContext phi_kernel_context; - op_with_kernel->BuildPhiKernelContext( - *instr_node.InnerRuntimeContext().get(), - const_cast(&instr_node.DeviceContext()), - &phi_kernel_context); - - (*kernel)(&phi_kernel_context); - } else { - VLOG(4) << "Run structure kernel: " << op->Type(); - (*kernel)(instr_node.InnerExecutionContext().get()); - } - } else { // fluid kernel - instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get()); - } - } - } - - VLOG(4) << "End run " << place << " " << op->DebugStringEx(local_scope); - - if (!instr_node.InplaceBackMap().empty()) { - platform::RecordEvent inplaceback_event( - "InplaceVarsBack", platform::TracerEventType::UserDefined, 10); - auto& m = instr_node.InplaceBackMap(); - // NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc - for (auto& p : m) { - auto* transformed_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar( - var_scope_.VarRef(p.first)); - auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar( - var_scope_.VarRef(p.second)); - original_tensor->ShareDataWith(*transformed_tensor); - VLOG(4) << "Transfer inplace variable back form " - << var_scope_.GetNameById(p.first) << " to " - << var_scope_.GetNameById(p.second); - } - } - - /*For profiling/benchmark only*/ - if (FLAGS_benchmark) { - instr_node.DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); - VLOG(4) << "Operator(" << op->Type() - << "): context wait and get last error"; -#endif - } - - for (auto& hook : hookfuncs_) { - hook(op, local_scope); - } - - // for debug nan/inf - if (op_with_kernel != nullptr && FLAGS_check_nan_inf) { - VLOG(4) << "Check nan/inf"; - try { - framework::details::CheckOpHasNanOrInf( - *op, - *local_scope, - place); // TODO(xiongkun03) change it to inner scope. - } catch (...) 
{ - const std::vector* callstack = nullptr; - auto attrs = op->Attrs(); - auto iter = - attrs.find(OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); - if (iter != attrs.end()) { - callstack = &PADDLE_GET_CONST(std::vector, iter->second); - if (callstack->empty()) callstack = nullptr; - } - std::ostringstream sout; - if (callstack) { - if (FLAGS_call_stack_level > 1) { - sout << "\n\n Compile Traceback (most recent call last):"; - } else { - sout << "In user code:\n"; - } - for (auto& line : *callstack) { - sout << "\n " << line; - } - } - std::cout << sout.str() << std::endl; - std::rethrow_exception(std::current_exception()); - } - } -} - -void InterpreterCore::RunInstruction(const Instruction& instr_node) { - VLOG(5) << __func__ << " OP id:" << instr_node.Id() - << " name:" << instr_node.OpBase()->Type() << " type:" - << (instr_node.KernelType() == OpFuncType::kCpuSync - ? "kCpuSync" - : (instr_node.KernelType() == OpFuncType::kGpuSync - ? "kGpuSync" - : "kGpuAsync")) - << " runs on " << platform::GetCurrentThreadName(); - OperatorBase* op = nullptr; - if (instr_node.OpBaseValid()) { - op = instr_node.OpBase(); - platform::RecordEvent instruction_event( - op->Type(), platform::TracerEventType::Operator, 1); - } - - SetDeviceId(instr_node.DeviceContext().GetPlace()); - - try { - instr_node.WaitEvent(place_); - - if (instr_node.PreDefineContext()) { - VLOG(5) << "run new ir selected kernel"; - auto op_func_node = const_cast((instr_node.OpFunc())); - op_func_node->infer_shape_interface_->infer_shape_( - &(op_func_node->infer_meta_context_)); - (*(op_func_node->phi_kernel_))(&(op_func_node->kernel_context_)); - - } else if (!instr_node.IsArtificial()) { - RunOperator(instr_node); - CheckGC(instr_node); - interpreter::LogDeviceMemoryStats(place_); - } - instr_node.RecordEvent(place_); - } catch (platform::EnforceNotMet& ex) { - framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex); - exception_holder_.Catch(std::make_exception_ptr(std::move(ex))); - } catch (platform::EOFException&) { - exception_holder_.Catch(std::current_exception()); - } catch (std::exception& ex) { - LOG(WARNING) << op->Type() << " raises an exception " - << platform::demangle(typeid(ex).name()) << ", " << ex.what(); - exception_holder_.Catch(std::current_exception()); - } catch (...) { - LOG(WARNING) << op->Type() << " raises an unknown exception"; - exception_holder_.Catch(std::current_exception()); - } -} - -std::string InterpreterCore::GetDepsString() const { - std::stringstream ss; - auto downstream_map = dependency_builder_.OpDownstreamMap(); - ss << "Note: when static_dep is 1, it is ok that the dynamic_dep will not " - "be decreased to 0." 
- << std::endl; - ss << "unfinished_op_number_:" << unfinished_op_number_ << std::endl; - for (size_t i = 0; i < deps_.size(); ++i) { - ss << "op:" << i << ", type: " << vec_instruction_[i].OpBase()->Type() - << ", static_dep:" << deps_[i]->StaticDep() - << ", dynamic_dep:" << deps_[i]->DynamicDep() << ", downstream op: "; - for (auto id : downstream_map[i]) { - ss << id << ", "; - } - ss << std::endl; - } - return ss.str(); -} - -void InterpreterCore::ExecuteInstructionList( - const std::vector& vec_instr) { - unfinished_op_number_ = vec_instr.size(); - if (unfinished_op_number_ == 0) { - VLOG(4) << "No op to run, return"; - return; - } - - exception_holder_.Clear(); - - for (size_t i = 0; i < dependecy_count_.size(); ++i) { - if (dependecy_count_[i] == 0) { - // NOTE(zhiqiu): hot fix for jit input var - RecordMemcpyD2H(vec_instr.at(i)); - if (FLAGS_new_executor_serial_run) { - RunInstructionAsync(i); - } else { - async_work_queue_->AddTask(vec_instr.at(i).KernelType(), - [this, i] { RunInstructionAsync(i); }); - } - } - } - - // For debug hang in main_thread_blocker_.WaitEvent(), - // launch async task to log deps every - // FLAGS_executor_log_deps_every_microseconds, then cancel the std::async when - // main_thread_blocker_.WaitEvent() executed. Why not use std::async instead - // of workqueue? To make sure that the logging thread itself will not affect - // the workqueue - // used in interpretercore. - - std::future logged_times; - std::atomic_bool cancel_log = ATOMIC_VAR_INIT(false); - if (FLAGS_executor_log_deps_every_microseconds) { - logged_times = std::async( - std::launch::async, - [this](const std::atomic_bool& cancel) { - int times = 0; - while (!cancel) { - std::this_thread::sleep_for(std::chrono::microseconds( - FLAGS_executor_log_deps_every_microseconds)); - // check again, since cancel may be changed during sleep - if (cancel) { - break; - } - VLOG(0) << "deps:\n" << GetDepsString(); - times++; - } - return times; - }, - std::ref(cancel_log)); - } - - auto event_name = main_thread_blocker_.WaitEvent(); - VLOG(1) << "main_thread_blocker_(" << &main_thread_blocker_ - << ") got event_name: " << event_name; - - cancel_log = true; - if (logged_times.valid()) { - VLOG(1) << "Logged deps for " << logged_times.get() << " times"; - } - - if (UNLIKELY(exception_holder_.IsCaught())) { - VLOG(1) << "Exception caught " << exception_holder_.Type(); - // Graceful exit when the executor encountered a fatal error. - // EOF is not a fatal error. 
- if (exception_holder_.Type() != "EOF") { - async_work_queue_->Cancel(); - async_work_queue_.reset(); - } - VLOG(4) << "Cancel ok"; - PADDLE_ENFORCE_EQ( - main_thread_blocker_.Clear(), - 0, - platform::errors::PreconditionNotMet( - "main_thread_blocker_.Clear() return -1, clear failed")); - VLOG(4) << "clear ok"; - exception_holder_.ReThrow(); - } -} - -void InterpreterCore::RunNextInstructions(const Instruction& instr, - SchedulingQueue* reserved_next_ops) { - platform::RecordEvent record( - "RunNextInstructions", platform::TracerEventType::UserDefined, 10); - - auto IsReady = [this](size_t next_id) { - VLOG(4) << "op_id: " << next_id - << ", remain deps: " << deps_[next_id]->DynamicDep(); - return deps_[next_id]->CheckAndDecrease(); - }; - - for (size_t next_instr_id : instr.NextInstrsInDifferenceThread()) { - if (IsReady(next_instr_id)) { - async_work_queue_->AddTask( - vec_instruction_[next_instr_id].KernelType(), - [this, next_instr_id]() { RunInstructionAsync(next_instr_id); }); - } - } - - for (size_t next_instr_id : instr.NextInstrsInSameThread()) { - if (IsReady(next_instr_id)) { - reserved_next_ops->push(next_instr_id); - } - } -} - -void InterpreterCore::RunInstructionAsync(size_t instr_id) { - // NOTE(Ruibiao): Due to the uncertain order in multi-threading asynchronous - // scheduling, the priority order involved cross-thread scheduling is not - // guaranteed. Only Ops scheduled by the same AddTask call have the guarantee - // of priority order. - SchedulingQueue ready_ops(instruction_scheduling_priority_less); - ready_ops.push(instr_id); - while (!ready_ops.empty()) { - instr_id = ready_ops.top(); - ready_ops.pop(); - auto& instr_node = vec_instruction_.at(instr_id); - - RunInstruction(instr_node); - - if (UNLIKELY(exception_holder_.IsCaught())) { - VLOG(4) << "Exception caught"; - if (exception_notifier_ != nullptr) { - exception_notifier_->NotifyEvent(); - } - return; - } - - VLOG(4) << "unfinished_op_number_: " << unfinished_op_number_; - if (UNLIKELY(unfinished_op_number_.fetch_sub( - 1, std::memory_order_relaxed) == 1)) { - if (completion_notifier_ != nullptr) { - completion_notifier_->NotifyEvent(); - } - } - - RunNextInstructions(instr_node, &ready_ops); - } -} - -void InterpreterCore::RecordStreamForGC(const Instruction& instr) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - PADDLE_THROW(platform::errors::Unimplemented( - "RecordStreamForGC is only implemented when compiled with GPU.")); -#else - if (!IsInterpretercoreFastGCEnabled() || - instr.KernelType() != OpFuncType::kGpuAsync) { - return; - } - if (instr.DeviceContext().GetPlace() == phi::CustomPlace()) { - return; - } - platform::RecordEvent record( - "RecordStreamForGC", platform::TracerEventType::UserDefined, 10); - - gpuStream_t stream = - reinterpret_cast(instr.DeviceContext()).stream(); - auto TensorRecordStream = [&stream](phi::DenseTensor& tensor) { - auto allocation = tensor.Holder(); - if (allocation == nullptr) { - return; - } - - const platform::Place& place = allocation->place(); - if (platform::is_gpu_place(place)) { - memory::RecordStream(allocation, stream); - } else if (platform::is_cuda_pinned_place(place)) { - // TODO(Ruibiao): Here should do something to make sure that the tensor - // is not freed until the H2D copies done. However, simplely launch a - // CUDA runtime callback to the H2D stream may lead a high performance - // overhead. As all the cases we meet in H2D are copies from CPUPlace at - // present, we just log a WARNING here. A better design is required. 
- LOG(WARNING) << "Copy data from a CUDAPinned tensor in an asynchronous " - "manner may lead a data inconsistent"; - } else { - // memory copies involve CPUPlace are always synchronous, so just do - // nothing here - } - }; - - /* NOTE(Ruibiao):Cross-stream tensor synchronization is required only when - * all the following conditions are satisfied: - * 1. The tensor will be GC after running the instruction, i.e., in - * instr.GCCheckVars. - * 2. The stream which initializes this tensor is different from the stream - * which the instruction run in. - * 3. The tensor is the instruction's input, cause we assume that - * instruction will initialize all output tensors with its running stream. - * 4. In the OP function of this instruction, the tensor is an input of a - * async CUDA kernel. - * - * Here we only process the first condition, because: - * 1. Since the RecordStream function will directly return when the recored - * stream is equal to the owning stream, recording a stream same as which - * initialized this tensor has less time overhead. Conversely, it may take - * more time if we try to extract those cross-stream input vars from - * instr.GCCheckVars. - * 2. Now the instruction has no idea of which vars involving async running - * in OP function, and thus we can not recognize condition 4. It should be - * supported later. - */ - for (int var_id : instr.GCCheckVars()) { - VLOG(4) << "GC sync " << var_scope_.GetNameById(var_id) << " " - << var_scope_.VarDesc(var_id); - - // persistable var will be ignore while GC - if (var_scope_.VarDesc(var_id) && - var_scope_.VarDesc(var_id)->Persistable()) { - continue; - } - - paddle::framework::Variable* var = var_scope_.VarRef(var_id); - if (var == nullptr) { - continue; - } - - if (var->IsType()) { - TensorRecordStream(*(var->GetMutable())); - } else if (var->IsType< - operators::reader:: - OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { - // do nothing - } else if (var->IsType()) { - TensorRecordStream( - *(var->GetMutable()->mutable_value())); - } else if (var->IsType()) { - auto* tensor_arr = var->GetMutable(); - for (auto& tensor : *tensor_arr) { - TensorRecordStream(tensor); - } - } else if (var->IsType>()) { - // do nothing - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "The variable(%s) is not supported in eager deletion.", - framework::ToTypeName(var->Type()))); - } - } -#endif + impl_->reset_scope(new_scope); } -void InterpreterCore::CheckGC(const Instruction& instr) { - platform::RecordEvent record( - "CheckGC", platform::TracerEventType::UserDefined, 10); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - RecordStreamForGC(instr); -#endif - auto& var_scope = var_scope_; - - for (auto var_id : instr.GCCheckVars()) { - VLOG(4) << "GC:" << var_scope_.GetNameById(var_id) << ", id:" << var_id - << ", ref:" << refs_[var_id]->DynamicRef(); - bool is_ready = refs_[var_id]->CheckAndDecrease(); - // ignore all persistable var while GC - if (var_scope.VarDesc(var_id) && var_scope.VarDesc(var_id)->Persistable()) { - continue; - } - if (is_ready) { - VLOG(6) << "Async delete variable with name : " - << var_scope.GetNameById(var_id); - gc_->Add(refs_[var_id]->Var(), instr); - } - } +const platform::Place& InterpreterCore::GetPlace() const { + return impl_->GetPlace(); } -void InterpreterCore::Prepare(const std::vector& feed_names, - const std::vector& feed_tensors, - bool prepare_feed) { - PADDLE_ENFORCE_EQ(feed_names.size(), - feed_tensors.size(), - platform::errors::PreconditionNotMet( - "Required feed_names.size() == 
feed_tensors.size(), " - "but received %d != %d", - feed_names.size(), - feed_tensors.size())); - auto FeedInput = [&] { - VLOG(4) << "Feed inputs"; - for (size_t i = 0; i < feed_names.size(); ++i) { - auto* feed_var = local_scope_->FindVar(feed_names[i]); - PADDLE_ENFORCE_NOT_NULL( - feed_var, - platform::errors::NotFound("Variable %s should not be nullptr.", - feed_names[i])); - - auto feed_tensor = feed_var->GetMutable(); - feed_tensor->ShareDataWith(feed_tensors[i]); - feed_tensor->set_lod(feed_tensors[i].lod()); - } - }; - - if (!is_build_) { - paddle::framework::interpreter::BuildVariableScope( - block_, execution_config_, &var_scope_); - FeedInput(); - std::vector op_func_nodes; - paddle::framework::interpreter::BuildOpFuncList( - place_, - block_, - execution_config_.skip_gc_vars, - &op_func_nodes, - &var_scope_, - execution_config_, - HasLocalScope(), - static_build_); - SetFeedVarsInplaceSkip(feed_names); - // convert vec func_list to graph - Convert(&op_func_nodes); - UpdateSyncOpNum(); - if (static_build_) { - VLOG(4) << "RUN impl"; - RunImpl(); - } - BuildSkipShareLoDInfo(); - is_build_ = true; - } - // NOTE: Because feed_tensor will be GC after - // paddle::framework::BuildOpFuncList, so we should - // call FeedInput again. - if (prepare_feed) { - FeedInput(); - } +void InterpreterCore::SetOutputHooks(const std::vector& hookfuncs) { + impl_->SetOutputHooks(hookfuncs); } - -void InterpreterCore::SetFeedVarsInplaceSkip( - const std::vector& feed_names) { - for (auto& feed_name : feed_names) { - var_scope_.SetVarSikpInplace(feed_name, true); - } -} - -bool InterpreterCore::HasLocalScope() const { return local_scope_ != nullptr; } - -// Note(zhangbo): -// (1) What is "Trace"? -// The OP execute scheduling rule adopted by Interpretercore by default is a -// multi-threaded scheduling mode(see ExecuteInstructionList). By maintaining a -// high-performance thread pool, the OP's execute scheduling is distributed to -// the sub threads maintained by the thread pool, but the main thread does not -// have any tasks. In Trace mode, the executor will execute directly in the main -// thread according to the pre provided OP sequence(trace_execute_order_), -// instead of being distributed to the thread pool. -// (2) When we use "Trace"? -// In dygraph to static, This scheduling causes that the execution of the -// forward and backward OPs and the execution of the dygraph optimizer cannot be -// executed in the same thread. Executing thread switch may cause cpu cache -// miss. When a model is all KQueueAsync type OPs, all OPs will be distributed -// to the DeviceThread for execution, and the multithreading scheduling will not -// have any benefits. Therefore, in the dynamic to static, when the number of -// KQueueAsync Ops is 0, we choose Trace mode. 
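The two scheduling paths contrasted in the note above can be condensed as follows. This is a sketch, not code from the diff: `can_trace` stands for the FLAGS_enable_new_ir_in_executor / used_for_jit / used_for_cinn / sync_op_num_ == 0 check performed in RunImpl() earlier in this patch.

    if (can_trace) {
      // Trace mode: the main thread walks the pre-computed order directly,
      // with no hand-off to the thread pool.
      for (size_t instr_id : trace_execute_order_) {
        RunInstruction(vec_instruction_.at(instr_id));
      }
    } else {
      // Multi-threaded mode: source instructions are handed to the
      // AsyncWorkQueue, and downstream instructions are scheduled as their
      // dependency counts reach zero.
      for (size_t i = 0; i < dependecy_count_.size(); ++i) {
        if (dependecy_count_[i] == 0) {
          async_work_queue_->AddTask(vec_instruction_.at(i).KernelType(),
                                     [this, i] { RunInstructionAsync(i); });
        }
      }
    }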
-void InterpreterCore::TraceInstructionList( - const std::vector& vec_instr) { - unfinished_op_number_ = vec_instr.size(); - if (unfinished_op_number_ == 0) { - VLOG(4) << "No op to run, return"; - return; - } - - exception_holder_.Clear(); - - for (size_t i = 0; i < dependecy_count_.size(); ++i) { - if (dependecy_count_[i] == 0) { - // NOTE(zhiqiu): hot fix for jit input var - RecordMemcpyD2H(vec_instr.at(i)); - } - } - - for (size_t idx = 0; idx < trace_execute_order_.size(); idx++) { - auto instr_id = trace_execute_order_[idx]; - auto& instr_node = vec_instruction_.at(instr_id); - - RunInstruction(instr_node); - - if (UNLIKELY(exception_holder_.IsCaught())) { - VLOG(4) << "Exception caught"; - break; - } - } - - if (UNLIKELY(exception_holder_.IsCaught())) { - VLOG(1) << "Exception caught " << exception_holder_.Type(); - PADDLE_ENFORCE_EQ( - main_thread_blocker_.Clear(), - 0, - platform::errors::PreconditionNotMet( - "main_thread_blocker_.Clear() return -1, clear failed")); - VLOG(4) << "clear ok"; - exception_holder_.ReThrow(); - } -} - -void InterpreterCore::RecordMemcpyD2H(const Instruction& instr_node) { - // NOTE(zhiqiu): hot fix for jit input var - if (instr_node.OpBaseValid() && - instr_node.OpBase()->Type() == interpreter::kMemcpyD2H) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* default_dev_ctx = pool.Get(place_); - for (auto& event : instr_node.EventsToWait()) { - platform::RecordEvent record( - "RecordStreamEvent", platform::TracerEventType::UserDefined, 10); - VLOG(3) << "Record event on default stream in jit_input_var at op: " - << instr_node.OpBase()->Type(); - event.event_->Record(default_dev_ctx); - } - } -} - -void InterpreterCore::UpdateSyncOpNum() { - int64_t sync_op_num = 0; - for (size_t i = 0; i < vec_instruction_.size(); ++i) { - if (vec_instruction_[i].KernelType() == OpFuncType::kCpuSync || - vec_instruction_[i].KernelType() == OpFuncType::kGpuSync) { - sync_op_num = sync_op_num + 1; - } - } - sync_op_num_ = sync_op_num; - VLOG(4) << "Update sync op num, sync op num is: " << sync_op_num_; -} - -// Note(zhangbo): -// When there is a KQueueSync type OP in the model, breadth traversal is better -// than depth traversal. For example: OP(O) ->(direct_run)-> OP(A) -// ->(sync_run)-> OP(B) OP(O) ->(direct_run)-> OP(C) ->(direct_run)-> OP(D) If B -// is run before C, B may always block to wait for A to finish executing, but in -// fact, C can be executed first during this time. 
-void InterpreterCore::AnalyseExecuteOrderForTrace() { - VLOG(4) << "Analyze the execution order of Trace scheduling mode."; - interpreter::ResetAtomicGuard guard(&deps_, &refs_); - - auto op_downstream_map = dependency_builder_.OpDownstreamMap(); - - auto IsReady = [this](size_t next_id) { - VLOG(4) << "op_id: " << next_id - << ", remain deps: " << deps_[next_id]->DynamicDep(); - return deps_[next_id]->CheckAndDecrease(); - }; - - std::vector trace_order; - SchedulingQueue ready_ops(instruction_scheduling_priority_less); - - for (size_t instr_id = 0; instr_id < dependecy_count_.size(); ++instr_id) { - if (dependecy_count_[instr_id] == 0) { - ready_ops.push(instr_id); - } - } - - while (!ready_ops.empty()) { - size_t now_id = ready_ops.top(); - ready_ops.pop(); - trace_order.push_back(now_id); - - auto next_op_set = op_downstream_map[now_id]; - - for (size_t next_op_id : next_op_set) { - if (IsReady(next_op_id)) { - ready_ops.push(next_op_id); - } - } - } - - PADDLE_ENFORCE_EQ( - trace_order.size(), - dependecy_count_.size(), - platform::errors::PreconditionNotMet( - "trace_order size should be equal to dependecy_count_.")); - - trace_execute_order_ = trace_order; -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index e676c0ea709..af12b0904f3 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -12,56 +12,41 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once - -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/exception_holder.h" -#include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h" -#include "paddle/fluid/framework/new_executor/interpreter/dependency_builder.h" -#include "paddle/fluid/framework/new_executor/interpreter/execution_config.h" -#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" -#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" -#include "paddle/fluid/framework/new_executor/new_executor_defs.h" -#include "paddle/fluid/framework/new_executor/profiler.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/memory/allocation/spin_lock.h" -#include "paddle/fluid/platform/device_event.h" -#include "paddle/ir/core/program.h" -#include "paddle/ir/core/value.h" +#include "paddle/fluid/framework/new_executor/interpreter_base_impl.h" DECLARE_bool(new_executor_use_local_scope); +namespace ir { +class Program; +} // namespace ir + +namespace ir { +class Program; +} // namespace ir + namespace paddle { namespace framework { +class InterpreterBaseImpl; + class InterpreterCore { using ExecutionConfig = interpreter::ExecutionConfig; - using InstructionSchedulingPriorityLess = std::function; - using SchedulingQueue = - std::priority_queue, - InstructionSchedulingPriorityLess>; + using HookFunc = std::function; public: InterpreterCore(const platform::Place& place, const BlockDesc& block, Scope* scope, const ExecutionConfig& execution_config = ExecutionConfig()); - + // This constructor is for New IR. 
InterpreterCore(const platform::Place& place, - const BlockDesc& block, - Scope* scope, std::unique_ptr<::ir::Program> ir_prog, + Scope* scope, const ExecutionConfig& execution_config = ExecutionConfig()); - ~InterpreterCore(); + const InterpreterBaseImpl* Impl() const { return impl_.get(); } + paddle::framework::FetchList Run( const std::vector& feed_names, const std::vector& feed_tensors); @@ -83,127 +68,14 @@ class InterpreterCore { void reset_scope(Scope* new_scope); - const platform::Place& GetPlace() const { return place_; } + const platform::Place& GetPlace() const; - using HookFunc = std::function; - void SetOutputHooks(const std::vector& hookfuncs) { - hookfuncs_ = hookfuncs; - } + void SetOutputHooks(const std::vector& hookfuncs); private: DISABLE_COPY_AND_ASSIGN(InterpreterCore); - // build graph - void Convert(std::vector* op_func_nodes); - void BuildOperatorDependences(); - void BuildAndCacheInstructionCtx(Instruction* instr_node); - void BuildSkipShareLoDInfo(); - void UpdateSyncOpNum(); - void AnalyseExecuteOrderForTrace(); - - // inplace - void BuildInplace(); - bool BuildInplaceCheckVarIsOnlyInput( - const std::vector>& input_var2op, size_t var_index); - void SetFeedVarsInplaceSkip(const std::vector& feed_names); - - // cuda graph - void CheckCUDAGraphBeforeRun(const std::vector& feed_names); - void PrepareForCUDAGraphCapture(); - - // execution - void RunImpl(); - void ExecuteInstructionList(const std::vector& vec_instr); - void RunInstructionAsync(size_t instr_id); - void RunInstruction(const Instruction& instr_node); - void RunNextInstructions(const Instruction& instr_id, - SchedulingQueue* reserved_next_ops); - void RunOperator(const Instruction& instr_node); - // Trace - void TraceInstructionList(const std::vector& vec_instr); - - // only used when program contains no feed op - void Prepare(const std::vector& feed_names, - const std::vector& feed_tensors, - bool prepare_feed); - - void RecordMemcpyD2H(const Instruction& instr_node); - - // gc - void RecordStreamForGC(const Instruction& instr); - void CheckGC(const Instruction& instr); - void ClearLoDTensorArrayInLocalScope(); - - // workqueue - std::shared_ptr GetWorkQueue(); - - // scope - bool HasLocalScope() const; - - // For log and debug - std::string GetDepsString() const; - - private: - bool is_build_{false}; - bool static_build_{false}; - - const platform::Place place_; - const BlockDesc& block_; // not owned - - interpreter::DependencyBuilder dependency_builder_; - interpreter::StreamAnalyzer stream_analyzer_; - - // NOTE(zhiqiu): when add fetch ops in GetInterpreterCore, we will - // copy a new program and block, the copy_program_ here is used to - // hold the program, otherwise block_ maybe not valid after the - // new program is deleted. 
- std::shared_ptr copy_program_{nullptr}; - - // from variable scope - std::vector var_list_; - std::map name2id_; - std::vector vec_meta_info_; - - std::vector vec_instruction_; // deconstruct before OpFuncNode - - std::atomic unfinished_op_number_{0}; - - ExecutionConfig execution_config_; - - VariableScope var_scope_; - Scope* local_scope_{nullptr}; // not owned - - EventsWaiter main_thread_blocker_; - std::shared_ptr async_work_queue_; - - details::ExceptionHolder exception_holder_; - std::shared_ptr exception_notifier_{nullptr}; - std::shared_ptr completion_notifier_{nullptr}; - - std::unique_ptr gc_; - - // last_live_ops_[i] contains the id of operators that last access the i-th - // var - std::map> last_live_ops_; - - // dependecy_count_[i] contains the number of dependencies that the i-th op - // need to wait - std::vector dependecy_count_; - - std::vector> deps_; - std::vector> refs_; - - // used for Trace - int64_t sync_op_num_{-1}; - std::vector trace_execute_order_; - - InstructionSchedulingPriorityLess instruction_scheduling_priority_less; - - std::vector hookfuncs_; - - // The next only for new IR - std::unique_ptr<::ir::Program> ir_program_{nullptr}; - std::unordered_map<::ir::Value, std::string> value_2_var_name_map_; + std::unique_ptr impl_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc new file mode 100644 index 00000000000..d7b2c9f0b2c --- /dev/null +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -0,0 +1,1480 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
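Before the new implementation file, a minimal standalone sketch of the PImpl layout this patch introduces: the public InterpreterCore keeps only a unique_ptr to the abstract impl and forwards every call, so ProgramInterpreter and NewIRInterpreter stay swappable behind one facade. The types below are simplified stand-ins, not Paddle's:

#include <iostream>
#include <memory>

// Simplified analogue of InterpreterBaseImpl and its two derived interpreters.
struct BaseImpl {
  virtual ~BaseImpl() = default;
  virtual void Run() = 0;
};

struct ProgramImpl : BaseImpl {
  void Run() override { std::cout << "run legacy program path\n"; }
};

struct NewIRImpl : BaseImpl {
  void Run() override { std::cout << "run new IR path\n"; }
};

// The facade owns the impl and only forwards; callers never see the impl type.
class Core {
 public:
  explicit Core(bool use_new_ir) {
    if (use_new_ir) {
      impl_ = std::make_unique<NewIRImpl>();
    } else {
      impl_ = std::make_unique<ProgramImpl>();
    }
  }
  void Run() { impl_->Run(); }  // every public method delegates like this

 private:
  std::unique_ptr<BaseImpl> impl_;
};

int main() {
  Core(false).Run();
  Core(true).Run();
}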
+ +#include "paddle/fluid/framework/new_executor/new_ir_interpreter.h" + +#include + +#include "gflags/gflags.h" + +#include "paddle/fluid/framework/details/nan_inf_utils.h" +#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" +#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" +#include "paddle/fluid/framework/new_executor/interpreter/static_build.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/os_info.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_context.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/platform/flags.h" +#include "paddle/phi/backends/device_manager.h" + +#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" + +namespace paddle { +namespace framework { + +NewIRInterpreter::NewIRInterpreter(const platform::Place& place, + std::unique_ptr<::ir::Program> ir_prog, + framework::Scope* scope, + const ExecutionConfig& execution_config) + : place_(place), + stream_analyzer_(place), + execution_config_(execution_config), + var_scope_(scope), + ir_program_(std::move(ir_prog)) { + VLOG(4) << "NewIRInterpreter(): " << this << " on " << place_; + static_build_ = FLAGS_new_executor_static_build && + !FLAGS_new_executor_use_cuda_graph && + !execution_config.used_for_control_flow_op; + // &&interpreter::BlockCanBeStaticBuilt(block); + + exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught); + completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion); + + if (!FLAGS_new_executor_use_local_scope) { + execution_config_.create_local_scope = false; + } + execution_config_.AnalyzeThreadPoolConfig(place, + ir_program_->block()->size()); + execution_config_.Log(/*log_level=*/8); + + if (execution_config_.create_local_scope) { + auto local_scope = &var_scope_.GetMutableScope()->NewScope(); + local_scope_ = local_scope; + } + + // force use outer scope for now + local_scope_ = scope; + static_build_ = true; + + var_scope_.SetLocalScope(local_scope_); + + instruction_scheduling_priority_less = [this](size_t lhs, size_t rhs) { + SchedulingPriority lhs_scheduling_priority = + vec_instruction_[lhs].GetSchedulingPriority(); + SchedulingPriority rhs_scheduling_priority = + vec_instruction_[rhs].GetSchedulingPriority(); + if (lhs_scheduling_priority == rhs_scheduling_priority) { + return lhs < rhs; + } + return lhs_scheduling_priority > rhs_scheduling_priority; + }; + + PrepareForCUDAGraphCapture(); +} + +NewIRInterpreter::~NewIRInterpreter() { + // cancle gc's thread + gc_.reset(nullptr); + async_work_queue_.reset(); + VLOG(4) << "~NewIRInterpreter(): " << this << " on " << place_; + +#ifdef PADDLE_WITH_MKLDNN + // Clear mkl-dnn cache, + // this is needed to have mkl-dnn unit tests working + platform::ClearMKLDNNCache(place_, this); +#endif +} + +void NewIRInterpreter::RunImpl() { + // lazy initialization of gc, do not create gc is the program only run once + if (!gc_) { + gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_); + } + + interpreter::ResetAtomicGuard guard(&deps_, &refs_); + + // if ((execution_config_.used_for_jit || execution_config_.used_for_cinn) + // && + // (sync_op_num_ == 0)) { + VLOG(4) << "Tracing Instruction 
List"; + TraceInstructionList(vec_instruction_); + // } else { + // VLOG(4) << "Non-tracing"; + // // For the program that only run once, it is no need to + // // create work_queue, so the async_work_queue_ is created + // // until the second step run. + // async_work_queue_ = GetWorkQueue(); + // ExecuteInstructionList(vec_instruction_); + // } + // #ifdef PADDLE_WITH_CUSTOM_DEVICE + // if (platform::is_custom_place(place_)) { + // platform::DeviceContextPool::Instance().Get(place_)->Wait(); + // } + // #endif +} + +FetchList NewIRInterpreter::Run( + const std::vector& feed_names, + const std::vector& feed_tensors) { + SetDeviceId(place_); + CheckCUDAGraphBeforeRun(feed_names); + +#ifdef PADDLE_WITH_MKLDNN + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif + + bool is_build = is_build_; + Prepare(feed_names, feed_tensors, is_build); + + if (is_build) { + RunImpl(); + } + + if (HasLocalScope()) { + ClearLoDTensorArrayInLocalScope(); + } + + // return Fetch Tensors + auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName); + if (fetch_var) { + auto fetch_list = std::move(*fetch_var->GetMutable()); +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ(fetch_list.empty(), + true, + platform::errors::InvalidArgument( + "Cannot fetch data when using CUDA Graph.")); + } +#endif + return fetch_list; + } else { + return {}; + } +} + +FetchList NewIRInterpreter::Run(const std::vector& feed_names, + bool need_fetch) { + SetDeviceId(place_); + CheckCUDAGraphBeforeRun(feed_names); + +#ifdef PADDLE_WITH_MKLDNN + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif + + if (!is_build_) { + LOG_FIRST_N(INFO, 1) << "New Executor is Running."; + ::ir::BuildScope( + ir_program_->block(), local_scope_, &value_2_var_name_map_); + + std::vector op_func_nodes; + interpreter::BuildOpFuncList(place_, + ir_program_->block(), + &op_func_nodes, + local_scope_, + value_2_var_name_map_, + execution_config_); + SetFeedVarsInplaceSkip(feed_names); + // convert vec func_list to graph + Convert(&op_func_nodes); + UpdateSyncOpNum(); + if (static_build_) { + VLOG(4) << "RUN impl"; + RunImpl(); + } + is_build_ = true; + } else { + RunImpl(); + } + + if (HasLocalScope()) { + ClearLoDTensorArrayInLocalScope(); + } + + // return Fetch Tensors + Scope* inner_scope = + HasLocalScope() ? 
local_scope_ : var_scope_.GetMutableScope(); + auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName); + if (fetch_var && need_fetch) { + auto fetch_list = std::move(*fetch_var->GetMutable()); +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ(fetch_list.empty(), + true, + platform::errors::InvalidArgument( + "Cannot fetch data when using CUDA Graph.")); + } +#endif + return fetch_list; + } else { + return {}; + } +} + +void NewIRInterpreter::SetCopyProgram(std::shared_ptr prog) { + copy_program_ = prog; +} + +void NewIRInterpreter::SetSkipGcVars( + const std::set& skip_gc_vars) { + PADDLE_ENFORCE_EQ( + execution_config_.skip_gc_vars.empty(), + true, + platform::errors::PreconditionNotMet( + "execution_config_.skip_gc_vars can only be initialized once, now " + "execution_config_.skip_gc_vars is " + "not empty, do not call SetSkipGcVars method repeatedly.")); + execution_config_.skip_gc_vars = skip_gc_vars; +} + +void NewIRInterpreter::SetJitInputVars( + const std::set& jit_input_vars) { + PADDLE_ENFORCE_EQ( + execution_config_.jit_input_vars.empty(), + true, + platform::errors::PreconditionNotMet( + "execution_config_.jit_input_vars can only be initialized once, now " + "execution_config_.jit_input_vars is " + "not empty, do not call SetJitInputVars method repeatedly.")); + execution_config_.jit_input_vars = jit_input_vars; +} + +const std::set& NewIRInterpreter::JitInputVars() const { + return execution_config_.jit_input_vars; +} + +const VariableScope* NewIRInterpreter::GetVariableScope() const { + return &var_scope_; +} + +void NewIRInterpreter::reset_scope(Scope* new_scope) { + var_scope_.SetScope(new_scope); + auto& var_list = var_scope_.MutableVarList(); + for (size_t i = 0; i < var_list.size(); i++) { + const auto& var_name = var_scope_.GetNameById(i); + var_list[i] = new_scope->FindVar(var_name); + } + // The index should be assured valid, cause the InterpreterCore may not be + // fully built, but was still cached and used. For example, see unit test + // `test_assert.py`, it may exit before `NewIRInterpreter::Convert`, + // but still was cached and used by later tests. + for (size_t i = 0; i < std::min(refs_.size(), var_list.size()); i++) { + refs_[i]->ResetVariable(var_list[i]); + } + + for (size_t i = 0; i < vec_instruction_.size(); i++) { + BuildAndCacheInstructionCtx(&vec_instruction_[i]); + } +} + +void NewIRInterpreter::ShareWorkQueueFrom(InterpreterBaseImpl* src) { + async_work_queue_ = reinterpret_cast(src)->GetWorkQueue(); + VLOG(8) << "Share AsyncWorkQueue from InterpreterCore(" << src + << ") to InterpreterCore(" << this << ")"; +} + +bool NewIRInterpreter::BuildInplaceCheckVarIsOnlyInput( + const std::vector>& input_var2op, size_t var_index) { + if (!var_scope_.VarDesc(var_index)) { + return input_var2op.at(var_index).size() == 1; + } else { + int is_input_cnt = 0; + for (auto inst_id : input_var2op.at(var_index)) { + OpInOutInfo info; + info.Build(vec_instruction_.at(inst_id).OpBase()); + if (info.IsInArgBufferNeeded(var_scope_.VarDesc(var_index)->Name())) { + is_input_cnt++; + } + } + return is_input_cnt == 1; + } +} + +std::shared_ptr NewIRInterpreter::GetWorkQueue() { + if (async_work_queue_ == nullptr) { + async_work_queue_ = std::make_shared( + execution_config_.host_num_threads, + execution_config_.device_num_threads, + nullptr); + } + return async_work_queue_; +} + +void NewIRInterpreter::BuildAndCacheInstructionCtx(Instruction* instr_node) { + Scope* inner_scope = + HasLocalScope() ? 
local_scope_ : var_scope_.GetMutableScope(); + VariableValueMap ins_map; + for (auto& var_name_item : instr_node->Inputs()) { + std::vector input_vars; + + input_vars.reserve(var_name_item.second.size()); + for (auto& id : var_name_item.second) { + input_vars.emplace_back(inner_scope->FindVar(var_scope_.GetNameById(id))); + } + ins_map.emplace(var_name_item.first, std::move(input_vars)); + } + + VariableValueMap outs_map; + for (auto& var_name_item : instr_node->Outputs()) { + std::vector out_vars; + + out_vars.reserve(var_name_item.second.size()); + for (auto& id : var_name_item.second) { + out_vars.emplace_back(inner_scope->FindVar(var_scope_.GetNameById(id))); + } + outs_map.emplace(var_name_item.first, std::move(out_vars)); + } + + // set runtime_ctx and infershape_ctx_ + if (instr_node->OpBase()->Type() == "cinn_launch" || + instr_node->OpBase()->Type() == "cinn_instruction_run") { // OP use scope + // in kernel + Scope* local_scope = HasLocalScope() ? var_scope_.GetMutableLocalScope() + : var_scope_.GetMutableScope(); + instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope); + } else { + instr_node->ResetContext(ins_map, outs_map); + } +} + +void NewIRInterpreter::BuildInplace() { + // NOTE(Ruibiao): coalesce_tensor_op outputs a FusedOutput phi::DenseTensor + // and a list of Output Tensors which are sliced from the FusedOutput. These + // outputs sholud not be the outvar of the in-place var-pair since memory + // reuse between FusedOutput and Output Tensors is assumed. For the following + // example: + // fused_var, var1, var2, var3 = coalesce_tensor(var1, var2, var3) + // var1 = sum(var4, var5) + // ... + // + // After running coalesce_tensor_op, var1 is assumed to share the buffer + // slices from fused_var. However, if sum_op is in-place, then var1 would + // re-share the buffer with var4 instead of fused_var. + std::set skip_inplace_outvars; + for (Instruction& instr : vec_instruction_) { + OperatorBase* op = instr.OpBase(); + if (op->Type() == kCoalesceTensor) { + const std::vector& outputs = + op->OutputVars(/*has_intermediate=*/false); + skip_inplace_outvars.insert(outputs.begin(), outputs.end()); + } + } + + Scope* local_scope = HasLocalScope() ? 
var_scope_.GetMutableLocalScope() + : var_scope_.GetMutableScope(); + std::vector> input_var2op(var_scope_.VarSize()); + for (Instruction& instr : vec_instruction_) { + for (auto& item : instr.Inputs()) { + for (int var_id : item.second) { + if (var_id != kEmptyVarIndex) { + input_var2op.at(var_id).push_back(instr.Id()); + } + } + } + } + + for (size_t i = 0; i < vec_instruction_.size(); ++i) { + auto& instr = vec_instruction_[i]; + auto* op_base = instr.OpBase(); + if (!op_base->Info().infer_inplace_) { + continue; + } + + auto in_to_outs = op_base->Info().infer_inplace_( + platform::is_gpu_place(instr.DeviceContext().GetPlace())); + + auto& inputs = instr.Inputs(); + auto& outputs = instr.Outputs(); + for (auto& pair : in_to_outs) { + auto iter = inputs.find(pair.first); + if (iter != inputs.end() && !iter->second.empty()) { + auto in_var_desc = var_scope_.VarDesc(iter->second[0]); + if (in_var_desc && in_var_desc->Persistable()) { + continue; + } + if (var_scope_.GetVarSikpInplace(iter->second[0])) { + continue; + } + if (BuildInplaceCheckVarIsOnlyInput(input_var2op, iter->second[0])) { + auto iterout = outputs.find(pair.second); + if (iterout != outputs.end() && !iterout->second.empty()) { + const std::string& invar_name = + var_scope_.GetNameById(iter->second[0]); + const std::string& outvar_name = + var_scope_.GetNameById(iterout->second[0]); + auto invar = local_scope->FindVar(invar_name); + auto outvar = local_scope->FindVar(outvar_name); + + if (invar && outvar && invar->IsType() && + outvar->IsType() && + skip_inplace_outvars.find(outvar_name) == + skip_inplace_outvars.end()) { + instr.AddInplace(invar, outvar); + VLOG(3) << "inplace " << op_base->Type() << " " << invar_name + << " -> " << outvar_name; + } + } + } + } + } + } +} + +void NewIRInterpreter::PrepareForCUDAGraphCapture() { + if (!FLAGS_new_executor_use_cuda_graph) return; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_EQ( + platform::IsCUDAGraphCapturing(), + false, + platform::errors::PermissionDenied("CUDA Graph is not allowed to capture " + "before prepare.")); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(place_), + true, + platform::errors::InvalidArgument( + "CUDA Graph is only supported on NVIDIA GPU device.")); + // If set true, will call `cudaStreamSynchronize(nccl_stream)`after allreduce. + // which may cause error in cuda graph. This behavior is consistent with PE. + PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, + false, + platform::errors::InvalidArgument( + "FLAGS_sync_nccl_allreduce must be False to support " + "CUDA Graph capturing.")); + + // All output vars of coalesce_tensor op should be persistable. + // If fused output var of coalesce_tensor is gc, it will cause accuracy + // problem. The specific reasons need to be analyzed. +// for (auto& op_desc : block_.AllOps()) { +// if (op_desc->Type() == kCoalesceTensor) { +// for (auto& out_var_name : op_desc->OutputArgumentNames()) { +// // The fused var needs to be set to persistable, not just added to +// // skip_gc_vars. +// // In the case where the feed fetch var is changed, +// StandaloneExecutor +// // will be newly constructed. If the fused var is not persistable, +// // these vars will be recreated and initialized, resulting in +// // precision problems. 
+// auto* out_var = op_desc->Block()->FindVarRecursive(out_var_name); +// if (out_var) { +// out_var->SetPersistable(true); +// VLOG(4) << "Mark Var(" << out_var_name << ") as Persistable."; +// } +// } +// } +// } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CUDA Graph is only supported on NVIDIA GPU device.")); +#endif +} + +void NewIRInterpreter::CheckCUDAGraphBeforeRun( + const std::vector& feed_names) { +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ( + feed_names.empty(), + true, + platform::errors::InvalidArgument( + "Feeding data is not permitted when capturing CUDA Graph.")); + PADDLE_ENFORCE_EQ( + FLAGS_new_executor_use_cuda_graph, + true, + platform::errors::InvalidArgument( + "You must turn on FLAGS_new_executor_use_cuda_graph to True " + "to enable CUDA Graph capturing.")); + PADDLE_ENFORCE_EQ( + place_, + platform::CUDAGraphCapturingPlace(), + platform::errors::InvalidArgument("The place to capture CUDAGraph is " + "not the same as the place to run.")); + } +#endif +} + +void NewIRInterpreter::BuildOperatorDependences() { + // analysis the dependences between ops, add next_instr_list to each instr, + // and set the dependecy_count_ + size_t instr_num = vec_instruction_.size(); + dependecy_count_ = std::vector(instr_num, 0); + auto downstream_map = dependency_builder_.Build(vec_instruction_); + + for (size_t instr_id = 0; instr_id < instr_num; ++instr_id) { + Instruction& cur_instr = vec_instruction_[instr_id]; + const std::set& next_instr_ids = downstream_map[instr_id]; + + if (FLAGS_new_executor_serial_run) { + for (size_t next_instr_id : next_instr_ids) { + cur_instr.AddNextInstrInSameThread(next_instr_id); + } + } else { + if (cur_instr.KernelType() == OpFuncType::kGpuAsync) { + for (size_t next_instr_id : next_instr_ids) { + if (vec_instruction_[next_instr_id].KernelType() == + OpFuncType::kGpuAsync) { + cur_instr.AddNextInstrInSameThread(next_instr_id); + } else { + cur_instr.AddNextInstrInDifferentThread(next_instr_id); + } + } + } else { + bool has_instr_in_same_thread = false; + for (size_t next_instr_id : next_instr_ids) { + if (!has_instr_in_same_thread && + vec_instruction_[next_instr_id].KernelType() != + OpFuncType::kGpuAsync) { + cur_instr.AddNextInstrInSameThread(next_instr_id); + has_instr_in_same_thread = true; + } else { + cur_instr.AddNextInstrInDifferentThread(next_instr_id); + } + } + } + } + + for (size_t next_instr_id : next_instr_ids) { + ++dependecy_count_[next_instr_id]; + } + } +} + +// At the end of each step, the holder of phi::DenseTensor in LoDTensorArray is +// null. 
Clear these Tensors and leave LoDTensorArray empty, otherwise an +// exception will occur in the next step +void NewIRInterpreter::ClearLoDTensorArrayInLocalScope() { + auto vars = local_scope_->LocalVars(); + for (auto var : vars) { + if (var->IsType()) { + auto* lod_tensor_arr = var->GetMutable(); + lod_tensor_arr->clear(); + } + } +} + +void NewIRInterpreter::Convert( + std::vector* op_func_nodes) { + auto& vec_meta_info = var_scope_.MutableVecMetaInfo(); + auto nodes = *op_func_nodes; + auto op_nums = nodes.size(); + vec_instruction_.clear(); + vec_instruction_.reserve(op_nums); + for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { + auto& op_func_node = nodes[op_idx]; + auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node); + vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); +#ifdef PADDLE_WITH_CUDA + if (FLAGS_new_executor_use_cuda_graph) { + auto& op = op_func_node.operator_base_; + auto& op_type = op->Type(); + if (op_type == interpreter::kMemcpyD2H || + op_type == interpreter::kMemcpyH2D) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Cuda memory copy d2h/h2d is not allowed while using cuda graph.")); + } + PADDLE_ENFORCE_EQ(typeid(*dev_ctx_) == typeid(phi::GPUContext), + true, + platform::errors::InvalidArgument( + "Device context of op %s must be [%s] while using " + "cuda graph, but got [%s].", + op_type, + typeid(phi::GPUContext).name(), + typeid(*dev_ctx_).name())); + // cuda graph needs to record all stream + phi::backends::gpu::CUDAGraphContextManager::Instance() + .RecordCapturingDeviceContext(dev_ctx_); + } +#endif + } + + BuildOperatorDependences(); + + // NOTE(Ruibiao): For cross-step stream synchronization, an event may be + // recorded in the first step and waited in the second step. So, in the first + // step, the WaitEvent may be called without RecordEvent. Considering that + // before the first call to RecordEvent, an Event represents an empty set of + // work and WaitEvent always return succeed immediately, we omit the + // prelude-record for the first step here. + stream_analyzer_.ConstructEvents(&vec_instruction_); + + // add event for the input var of jit program, since there are async copied + // from gpu_pinned place to gpu place on compute stream. 
+ for (size_t i = 0; i < dependecy_count_.size(); ++i) { + if (dependecy_count_[i] == 0) { + auto& inst = vec_instruction_[i]; + if (inst.OpBaseValid() && + inst.OpBase()->Type() == interpreter::kMemcpyD2H && + platform::is_gpu_place(place_)) { + for (auto& item : inst.Inputs()) { + for (auto var_id : item.second) { + auto name = var_scope_.GetNameById(var_id); + if (JitInputVars().count(name)) { + auto device_event = std::make_shared( + place_, platform::GenerateDeviceEventFlag()); + VLOG(4) << "Add input event for input: " << name << " of " + << inst.OpBase()->Type(); + inst.AddEventToWait( + i, device_event, stream_analyzer_.GetWaiterType(inst)); + } + } + } + } + } + } + + // calculate last_live_ops_ + // for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { + // Instruction& instr = vec_instruction_[op_idx]; + // OpInOutInfo info; + // info.Build(instr.OpBase()); + + // std::set gc_check_vars; + + // const std::map>& ins = instr.Inputs(); + // const std::map>& outs = instr.Outputs(); + // std::multimap> ins_and_outs{ins.begin(), + // ins.end()}; + // ins_and_outs.insert(outs.begin(), outs.end()); + + // for (auto& item : ins_and_outs) { + // for (auto id : item.second) { + // if (id == kEmptyVarIndex) { + // continue; + // } + // auto* var_desc = var_scope_.VarDesc(id); + // // skip no_need_buffer input vars + // if (var_desc && ins.count(item.first) && + // !info.IsInArgBufferNeeded(var_desc->Name())) { + // continue; + // } + // // skip when this var is not in block and not a data_transferred + // var, + // // which means this var is managed by other block + // const auto& var_name = var_scope_.GetNameById(id); + // bool not_owned = !block_.HasVar(var_name); + // const auto& transferred_vars = var_scope_.DataTransferAddedVars(); + // bool not_transferred = + // std::all_of(transferred_vars.begin(), + // transferred_vars.end(), + // [&](const std::pair& elem) { + // return elem.first != var_name; + // }); + // if (not_owned && not_transferred) { + // VLOG(10) << "[gc_check_inputs] skip gc: " << var_name; + // continue; + // } + // gc_check_vars.insert(id); + // } + // } + + // for (auto var_id : gc_check_vars) { + // Scope* inner_scope = + // HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope(); + // paddle::framework::Variable* var = + // inner_scope->FindVar(var_scope_.GetNameById(var_id)); + // if (var->IsType() || + // var->IsType() || + // var->IsType()) { + // last_live_ops_[var_id].insert(op_idx); + // } else { + // VLOG(4) << "not clear " << var_scope_.GetNameById(var_id) << " + // after " + // << instr.OpBase()->Type() << " because its type is " + // << framework::ToTypeName(var->Type()); + // } + // } + // } + + // clear the last_live_ops list for all vars in skip_gc_vars + for (const std::string& skip_gc_var : execution_config_.skip_gc_vars) { + int var_id = var_scope_.GetIdByName(skip_gc_var); + if (var_id != -1) { + last_live_ops_[var_id].clear(); + VLOG(8) << "Skip gc for var: " << skip_gc_var; + } + } + + // shrink, find the downstream op that has no other op in the + // downstream list happens before it + // For example, + // b = op1(a) + // c = op2(a, b) + // in this case, a is the input of op1 and op2, we only need to check + // a after op2, because op2 always uses a after op1. 
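The loop that follows implements the shrink described above. As a toy, std-only illustration of the same pruning, with a made-up happens-before relation in which op1 runs before op2 and both read var a:

#include <cstdio>
#include <set>

int main() {
  std::set<int> last_live = {1, 2};          // ops that touch var `a`
  auto happens_before = [](int x, int y) {   // hypothetical relation: op1 -> op2
    return x == 1 && y == 2;
  };

  // Keep only ops that do not happen before any other op in the set.
  std::set<int> minimum;
  for (int op : last_live) {
    bool before_other = false;
    for (int other : last_live) {
      if (happens_before(op, other)) { before_other = true; break; }
    }
    if (!before_other) minimum.insert(op);
  }
  for (int op : minimum) std::printf("check var after op %d\n", op);  // -> op 2
}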
+ for (size_t i = 0; i < last_live_ops_.size(); ++i) { + std::set minumum_last_live_ops; + for (size_t item : last_live_ops_[i]) { + bool not_before_any = true; + // find the op that is not executed before any + for (size_t other_item : last_live_ops_[i]) { + if (dependency_builder_.OpHappensBefore(item, other_item)) { + VLOG(8) << "happens_before: " << item << "->" << other_item + << ", so skip " << item; + not_before_any = false; + break; + } + } + if (not_before_any) { + VLOG(8) << "last live op of var " << i << " " + << var_scope_.GetNameById(i) << " : " << item << " " + << vec_instruction_[item].OpBase()->Type(); + minumum_last_live_ops.insert(item); + vec_instruction_[item].AddGCCheckVar(i); + } + } + last_live_ops_[i] = minumum_last_live_ops; + vec_meta_info[i].var_ref_count_ = last_live_ops_[i].size(); + } + + // for (size_t i = 0; i < vec_instruction_.size(); ++i) { + // BuildAndCacheInstructionCtx(&vec_instruction_[i]); + // } + + // bool inplaced = false; + // for (const Instruction& inst : vec_instruction_) { + // if (inst.OpBase()->Type() == "share_buffer" || + // inst.OpBase()->Type() == "share_data") { + // VLOG(4) << "Already inplaced, skip inplace now."; + // inplaced = true; + // } + // } + + // if (FLAGS_new_executor_use_inplace && !inplaced) { + // BuildInplace(); + // } + + for (auto& dep : dependecy_count_) { + deps_.emplace_back(std::make_shared(dep)); + } + for (size_t i = 0; i < vec_meta_info.size(); ++i) { + refs_.emplace_back(std::make_shared( + vec_meta_info[i].var_ref_count_, var_scope_.VarRef(i))); + } + + AnalyseExecuteOrderForTrace(); +} + +void NewIRInterpreter::BuildSkipShareLoDInfo() { + for (size_t i = 0; i < vec_instruction_.size(); ++i) { + bool can_skip_lod = true; + for (auto& input : vec_instruction_[i].InnerRuntimeContext()->inputs) { + for (auto& var : input.second) { + if (var->IsType()) { + if (var->Get().lod().size() != 0) { + can_skip_lod = false; + break; + } + } else { + can_skip_lod = false; + break; + } + } + } + if (can_skip_lod) { + VLOG(8) << "skip share lod for: " << vec_instruction_[i].OpBase()->Type() + << " (" << i << ")"; + } + vec_instruction_[i].InnerInferShapeContext()->SetSkipLoD(can_skip_lod); + } +} + +void NewIRInterpreter::RunOperator(const Instruction& instr_node) { + auto* op = instr_node.OpBase(); + auto place = instr_node.DeviceContext().GetPlace(); + Scope* local_scope = HasLocalScope() ? var_scope_.GetMutableLocalScope() + : var_scope_.GetMutableScope(); + VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope); + + auto op_with_kernel = dynamic_cast(op); + { + // If it is OperatorBase, InferShape do nothing. + if (op_with_kernel != nullptr) { + platform::RecordEvent infershape_event( + "infer_shape", + platform::TracerEventType::OperatorInner, + 1, + platform::EventRole::kInnerOp); + + // see OperatorWithKernel::RunImpl in operator.cc for why + if (!(op_with_kernel->HasAttr(kAllKernelsMustComputeRuntimeShape) && + op_with_kernel->Attr(kAllKernelsMustComputeRuntimeShape))) { + op_with_kernel->Info().infer_shape_( + instr_node.InnerInferShapeContext().get()); + } + infershape_event.End(); + platform::RecordOpInfoSupplement(op->Type(), + op->Attrs(), + *(instr_node.InnerInferShapeContext()), + *(instr_node.InnerRuntimeContext()), + op->Id()); + } + } + if (op_with_kernel != nullptr && FLAGS_new_executor_use_inplace) { + // TODO(xiongkun03) Does operator base support inplace ? 
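The in-place loop that follows only shares buffers for pairs whose shapes match. A minimal sketch of that dims-equal guard, using a toy Tensor rather than phi::DenseTensor:

#include <cstdio>
#include <memory>
#include <vector>

// Toy tensor: shape plus shared storage, enough to show the guard.
struct Tensor {
  std::vector<int> dims;
  std::shared_ptr<std::vector<float>> buf;
};

int main() {
  Tensor in{{2, 3}, std::make_shared<std::vector<float>>(6, 1.f)};
  Tensor out{{2, 3}, nullptr};
  if (in.dims == out.dims) {
    out.buf = in.buf;  // analogue of ShareBufferWith: alias, no copy
  }
  std::printf("shared: %d\n", out.buf == in.buf);
}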
+ for (auto& pair : instr_node.InplaceInfo()) { + const auto& in = paddle::framework::details::GetTensorFromVar(pair.first); + auto* out = + paddle::framework::details::GetMutableTensorFromVar(pair.second); + if (in.dims() == out->dims()) { + out->ShareBufferWith(in); + } + } + } + + { + platform::RecordEvent compute_event( + "compute", + platform::TracerEventType::OperatorInner, + 1, + platform::EventRole::kInnerOp); + if (op_with_kernel == nullptr) { // operator base + instr_node.OpBase()->Run(*local_scope, place_); + } else { + phi::Kernel* kernel = instr_node.PhiKernel(); + if (kernel && kernel->IsValid()) { // phi kernel + if (kernel->GetKernelRegisteredType() == + phi::KernelRegisteredType::FUNCTION) { + VLOG(4) << "Run function kernel: " << op->Type(); + VLOG(4) << instr_node.InnerRuntimeContext().get() << " " + << &instr_node.DeviceContext(); + phi::KernelContext phi_kernel_context; + op_with_kernel->BuildPhiKernelContext( + *instr_node.InnerRuntimeContext().get(), + const_cast(&instr_node.DeviceContext()), + &phi_kernel_context); + + (*kernel)(&phi_kernel_context); + } else { + VLOG(4) << "Run structure kernel: " << op->Type(); + (*kernel)(instr_node.InnerExecutionContext().get()); + } + } else { // fluid kernel + instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get()); + } + } + } + + VLOG(4) << "End run " << place << " " << op->DebugStringEx(local_scope); + + if (!instr_node.InplaceBackMap().empty()) { + platform::RecordEvent inplaceback_event( + "InplaceVarsBack", platform::TracerEventType::UserDefined, 10); + auto& m = instr_node.InplaceBackMap(); + // NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc + for (auto& p : m) { + auto* transformed_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar( + var_scope_.VarRef(p.first)); + auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar( + var_scope_.VarRef(p.second)); + original_tensor->ShareDataWith(*transformed_tensor); + VLOG(4) << "Transfer inplace variable back form " + << var_scope_.GetNameById(p.first) << " to " + << var_scope_.GetNameById(p.second); + } + } + + /*For profiling/benchmark only*/ + if (FLAGS_benchmark) { + instr_node.DeviceContext().Wait(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); + VLOG(4) << "Operator(" << op->Type() + << "): context wait and get last error"; +#endif + } + + for (auto& hook : hookfuncs_) { + hook(op, local_scope); + } + + // for debug nan/inf + if (op_with_kernel != nullptr && FLAGS_check_nan_inf) { + VLOG(4) << "Check nan/inf"; + try { + framework::details::CheckOpHasNanOrInf( + *op, + *local_scope, + place); // TODO(xiongkun03) change it to inner scope. + } catch (...) 
{ + const std::vector* callstack = nullptr; + auto attrs = op->Attrs(); + auto iter = + attrs.find(OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + if (iter != attrs.end()) { + callstack = &PADDLE_GET_CONST(std::vector, iter->second); + if (callstack->empty()) callstack = nullptr; + } + std::ostringstream sout; + if (callstack) { + if (FLAGS_call_stack_level > 1) { + sout << "\n\n Compile Traceback (most recent call last):"; + } else { + sout << "In user code:\n"; + } + for (auto& line : *callstack) { + sout << "\n " << line; + } + } + std::cout << sout.str() << std::endl; + std::rethrow_exception(std::current_exception()); + } + } +} + +void NewIRInterpreter::RunInstruction(const Instruction& instr_node) { + VLOG(5) << __func__ << " OP id:" << instr_node.Id() + << " name:" << instr_node.OpBase()->Type() << " type:" + << (instr_node.KernelType() == OpFuncType::kCpuSync + ? "kCpuSync" + : (instr_node.KernelType() == OpFuncType::kGpuSync + ? "kGpuSync" + : "kGpuAsync")) + << " runs on " << platform::GetCurrentThreadName(); + + OperatorBase* op = nullptr; + if (instr_node.OpBaseValid()) { + op = instr_node.OpBase(); + platform::RecordEvent instruction_event( + op->Type(), platform::TracerEventType::Operator, 1); + } + + SetDeviceId(instr_node.DeviceContext().GetPlace()); + + try { + instr_node.WaitEvent(place_); + + if (instr_node.PreDefineContext()) { + VLOG(5) << "run new ir selected kernel"; + auto op_func_node = const_cast((instr_node.OpFunc())); + op_func_node->infer_shape_interface_->infer_shape_( + &(op_func_node->infer_meta_context_)); + (*(op_func_node->phi_kernel_))(&(op_func_node->kernel_context_)); + } else if (!instr_node.IsArtificial()) { + RunOperator(instr_node); + CheckGC(instr_node); + interpreter::LogDeviceMemoryStats(place_); + } + + instr_node.RecordEvent(place_); + } catch (platform::EnforceNotMet& ex) { + framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex); + exception_holder_.Catch(std::make_exception_ptr(std::move(ex))); + } catch (platform::EOFException&) { + exception_holder_.Catch(std::current_exception()); + } catch (std::exception& ex) { + LOG(WARNING) << op->Type() << " raises an exception " + << platform::demangle(typeid(ex).name()) << ", " << ex.what(); + exception_holder_.Catch(std::current_exception()); + } catch (...) { + LOG(WARNING) << op->Type() << " raises an unknown exception"; + exception_holder_.Catch(std::current_exception()); + } +} + +std::string NewIRInterpreter::GetDepsString() const { + std::stringstream ss; + auto downstream_map = dependency_builder_.OpDownstreamMap(); + ss << "Note: when static_dep is 1, it is ok that the dynamic_dep will not " + "be decreased to 0." 
+ << std::endl; + ss << "unfinished_op_number_:" << unfinished_op_number_ << std::endl; + for (size_t i = 0; i < deps_.size(); ++i) { + ss << "op:" << i << ", type: " << vec_instruction_[i].OpBase()->Type() + << ", static_dep:" << deps_[i]->StaticDep() + << ", dynamic_dep:" << deps_[i]->DynamicDep() << ", downstream op: "; + for (auto id : downstream_map[i]) { + ss << id << ", "; + } + ss << std::endl; + } + return ss.str(); +} + +void NewIRInterpreter::ExecuteInstructionList( + const std::vector& vec_instr) { + unfinished_op_number_ = vec_instr.size(); + if (unfinished_op_number_ == 0) { + VLOG(4) << "No op to run, return"; + return; + } + + exception_holder_.Clear(); + + for (size_t i = 0; i < dependecy_count_.size(); ++i) { + if (dependecy_count_[i] == 0) { + // NOTE(zhiqiu): hot fix for jit input var + RecordMemcpyD2H(vec_instr.at(i)); + if (FLAGS_new_executor_serial_run) { + RunInstructionAsync(i); + } else { + async_work_queue_->AddTask(vec_instr.at(i).KernelType(), + [this, i] { RunInstructionAsync(i); }); + } + } + } + + // For debug hang in main_thread_blocker_.WaitEvent(), + // launch async task to log deps every + // FLAGS_executor_log_deps_every_microseconds, then cancel the std::async when + // main_thread_blocker_.WaitEvent() executed. Why not use std::async instead + // of workqueue? To make sure that the logging thread itself will not affect + // the workqueue + // used in interpretercore. + + std::future logged_times; + std::atomic_bool cancel_log = ATOMIC_VAR_INIT(false); + if (FLAGS_executor_log_deps_every_microseconds) { + logged_times = std::async( + std::launch::async, + [this](const std::atomic_bool& cancel) { + int times = 0; + while (!cancel) { + std::this_thread::sleep_for(std::chrono::microseconds( + FLAGS_executor_log_deps_every_microseconds)); + // check again, since cancel may be changed during sleep + if (cancel) { + break; + } + VLOG(0) << "deps:\n" << GetDepsString(); + times++; + } + return times; + }, + std::ref(cancel_log)); + } + + auto event_name = main_thread_blocker_.WaitEvent(); + VLOG(1) << "main_thread_blocker_(" << &main_thread_blocker_ + << ") got event_name: " << event_name; + + cancel_log = true; + if (logged_times.valid()) { + VLOG(1) << "Logged deps for " << logged_times.get() << " times"; + } + + if (UNLIKELY(exception_holder_.IsCaught())) { + VLOG(1) << "Exception caught " << exception_holder_.Type(); + // Graceful exit when the executor encountered a fatal error. + // EOF is not a fatal error. 
+ if (exception_holder_.Type() != "EOF") { + async_work_queue_->Cancel(); + async_work_queue_.reset(); + } + VLOG(4) << "Cancel ok"; + PADDLE_ENFORCE_EQ( + main_thread_blocker_.Clear(), + 0, + platform::errors::PreconditionNotMet( + "main_thread_blocker_.Clear() return -1, clear failed")); + VLOG(4) << "clear ok"; + exception_holder_.ReThrow(); + } +} + +void NewIRInterpreter::RunNextInstructions(const Instruction& instr, + SchedulingQueue* reserved_next_ops) { + platform::RecordEvent record( + "RunNextInstructions", platform::TracerEventType::UserDefined, 10); + + auto IsReady = [this](size_t next_id) { + VLOG(4) << "op_id: " << next_id + << ", remain deps: " << deps_[next_id]->DynamicDep(); + return deps_[next_id]->CheckAndDecrease(); + }; + + for (size_t next_instr_id : instr.NextInstrsInDifferenceThread()) { + if (IsReady(next_instr_id)) { + async_work_queue_->AddTask( + vec_instruction_[next_instr_id].KernelType(), + [this, next_instr_id]() { RunInstructionAsync(next_instr_id); }); + } + } + + for (size_t next_instr_id : instr.NextInstrsInSameThread()) { + if (IsReady(next_instr_id)) { + reserved_next_ops->push(next_instr_id); + } + } +} + +void NewIRInterpreter::RunInstructionAsync(size_t instr_id) { + // NOTE(Ruibiao): Due to the uncertain order in multi-threading asynchronous + // scheduling, the priority order involved cross-thread scheduling is not + // guaranteed. Only Ops scheduled by the same AddTask call have the guarantee + // of priority order. + SchedulingQueue ready_ops(instruction_scheduling_priority_less); + ready_ops.push(instr_id); + while (!ready_ops.empty()) { + instr_id = ready_ops.top(); + ready_ops.pop(); + auto& instr_node = vec_instruction_.at(instr_id); + + RunInstruction(instr_node); + + if (UNLIKELY(exception_holder_.IsCaught())) { + VLOG(4) << "Exception caught"; + if (exception_notifier_ != nullptr) { + exception_notifier_->NotifyEvent(); + } + return; + } + + VLOG(4) << "unfinished_op_number_: " << unfinished_op_number_; + if (UNLIKELY(unfinished_op_number_.fetch_sub( + 1, std::memory_order_relaxed) == 1)) { + if (completion_notifier_ != nullptr) { + completion_notifier_->NotifyEvent(); + } + } + + RunNextInstructions(instr_node, &ready_ops); + } +} + +void NewIRInterpreter::RecordStreamForGC(const Instruction& instr) { +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) + PADDLE_THROW(platform::errors::Unimplemented( + "RecordStreamForGC is only implemented when compiled with GPU.")); +#else + if (!IsInterpretercoreFastGCEnabled() || + instr.KernelType() != OpFuncType::kGpuAsync) { + return; + } + if (instr.DeviceContext().GetPlace() == phi::CustomPlace()) { + return; + } + platform::RecordEvent record( + "RecordStreamForGC", platform::TracerEventType::UserDefined, 10); + + gpuStream_t stream = + reinterpret_cast(instr.DeviceContext()).stream(); + auto TensorRecordStream = [&stream](phi::DenseTensor& tensor) { + auto allocation = tensor.Holder(); + if (allocation == nullptr) { + return; + } + + const platform::Place& place = allocation->place(); + if (platform::is_gpu_place(place)) { + memory::RecordStream(allocation, stream); + } else if (platform::is_cuda_pinned_place(place)) { + // TODO(Ruibiao): Here should do something to make sure that the tensor + // is not freed until the H2D copies done. However, simplely launch a + // CUDA runtime callback to the H2D stream may lead a high performance + // overhead. As all the cases we meet in H2D are copies from CPUPlace at + // present, we just log a WARNING here. A better design is required. 
+ LOG(WARNING) << "Copy data from a CUDAPinned tensor in an asynchronous " + "manner may lead a data inconsistent"; + } else { + // memory copies involve CPUPlace are always synchronous, so just do + // nothing here + } + }; + + /* NOTE(Ruibiao):Cross-stream tensor synchronization is required only when + * all the following conditions are satisfied: + * 1. The tensor will be GC after running the instruction, i.e., in + * instr.GCCheckVars. + * 2. The stream which initializes this tensor is different from the stream + * which the instruction run in. + * 3. The tensor is the instruction's input, cause we assume that + * instruction will initialize all output tensors with its running stream. + * 4. In the OP function of this instruction, the tensor is an input of a + * async CUDA kernel. + * + * Here we only process the first condition, because: + * 1. Since the RecordStream function will directly return when the recored + * stream is equal to the owning stream, recording a stream same as which + * initialized this tensor has less time overhead. Conversely, it may take + * more time if we try to extract those cross-stream input vars from + * instr.GCCheckVars. + * 2. Now the instruction has no idea of which vars involving async running + * in OP function, and thus we can not recognize condition 4. It should be + * supported later. + */ + for (int var_id : instr.GCCheckVars()) { + VLOG(4) << "GC sync " << var_scope_.GetNameById(var_id) << " " + << var_scope_.VarDesc(var_id); + + // persistable var will be ignore while GC + if (var_scope_.VarDesc(var_id) && + var_scope_.VarDesc(var_id)->Persistable()) { + continue; + } + + paddle::framework::Variable* var = var_scope_.VarRef(var_id); + if (var == nullptr) { + continue; + } + + if (var->IsType()) { + TensorRecordStream(*(var->GetMutable())); + } else if (var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + // do nothing + } else if (var->IsType()) { + TensorRecordStream( + *(var->GetMutable()->mutable_value())); + } else if (var->IsType()) { + auto* tensor_arr = var->GetMutable(); + for (auto& tensor : *tensor_arr) { + TensorRecordStream(tensor); + } + } else if (var->IsType>()) { + // do nothing + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "The variable(%s) is not supported in eager deletion.", + framework::ToTypeName(var->Type()))); + } + } +#endif +} + +void NewIRInterpreter::CheckGC(const Instruction& instr) { + platform::RecordEvent record( + "CheckGC", platform::TracerEventType::UserDefined, 10); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + RecordStreamForGC(instr); +#endif + auto& var_scope = var_scope_; + + for (auto var_id : instr.GCCheckVars()) { + VLOG(4) << "GC:" << var_scope_.GetNameById(var_id) << ", id:" << var_id + << ", ref:" << refs_[var_id]->DynamicRef(); + bool is_ready = refs_[var_id]->CheckAndDecrease(); + // ignore all persistable var while GC + if (var_scope.VarDesc(var_id) && var_scope.VarDesc(var_id)->Persistable()) { + continue; + } + if (is_ready) { + VLOG(6) << "Async delete variable with name : " + << var_scope.GetNameById(var_id); + gc_->Add(refs_[var_id]->Var(), instr); + } + } +} + +void NewIRInterpreter::Prepare( + const std::vector& feed_names, + const std::vector& feed_tensors, + bool prepare_feed) { + PADDLE_ENFORCE_EQ(feed_names.size(), + feed_tensors.size(), + platform::errors::PreconditionNotMet( + "Required feed_names.size() == feed_tensors.size(), " + "but received %d != %d", + feed_names.size(), + feed_tensors.size())); + auto 
FeedInput = [&] { + VLOG(4) << "Feed inputs"; + for (size_t i = 0; i < feed_names.size(); ++i) { + auto* feed_var = local_scope_->FindVar(feed_names[i]); + PADDLE_ENFORCE_NOT_NULL( + feed_var, + platform::errors::NotFound("Variable %s should not be nullptr.", + feed_names[i])); + + auto feed_tensor = feed_var->GetMutable(); + feed_tensor->ShareDataWith(feed_tensors[i]); + feed_tensor->set_lod(feed_tensors[i].lod()); + } + }; + // TODO(dev): Support this + // if (!is_build_) { + // paddle::framework::interpreter::BuildVariableScope( + // block_, execution_config_, &var_scope_); + // FeedInput(); + // std::vector op_func_nodes; + // paddle::framework::interpreter::BuildOpFuncList( + // place_, + // block_, + // execution_config_.skip_gc_vars, + // &op_func_nodes, + // &var_scope_, + // execution_config_, + // HasLocalScope(), + // static_build_); + // SetFeedVarsInplaceSkip(feed_names); + // // convert vec func_list to graph + // Convert(&op_func_nodes); + // UpdateSyncOpNum(); + // if (static_build_) { + // VLOG(4) << "RUN impl"; + // RunImpl(); + // } + // BuildSkipShareLoDInfo(); + // is_build_ = true; + // } + // NOTE: Because feed_tensor will be GC after + // paddle::framework::BuildOpFuncList, so we should + // call FeedInput again. + if (prepare_feed) { + FeedInput(); + } +} + +void NewIRInterpreter::SetFeedVarsInplaceSkip( + const std::vector& feed_names) { + for (auto& feed_name : feed_names) { + var_scope_.SetVarSikpInplace(feed_name, true); + } +} + +bool NewIRInterpreter::HasLocalScope() const { return local_scope_ != nullptr; } + +// Note(zhangbo): +// (1) What is "Trace"? +// The OP execute scheduling rule adopted by Interpretercore by default is a +// multi-threaded scheduling mode(see ExecuteInstructionList). By maintaining a +// high-performance thread pool, the OP's execute scheduling is distributed to +// the sub threads maintained by the thread pool, but the main thread does not +// have any tasks. In Trace mode, the executor will execute directly in the main +// thread according to the pre provided OP sequence(trace_execute_order_), +// instead of being distributed to the thread pool. +// (2) When we use "Trace"? +// In dygraph to static, This scheduling causes that the execution of the +// forward and backward OPs and the execution of the dygraph optimizer cannot be +// executed in the same thread. Executing thread switch may cause cpu cache +// miss. When a model is all KQueueAsync type OPs, all OPs will be distributed +// to the DeviceThread for execution, and the multithreading scheduling will not +// have any benefits. Therefore, in the dynamic to static, when the number of +// KQueueAsync Ops is 0, we choose Trace mode. 
+void NewIRInterpreter::TraceInstructionList( + const std::vector& vec_instr) { + unfinished_op_number_ = vec_instr.size(); + if (unfinished_op_number_ == 0) { + VLOG(4) << "No op to run, return"; + return; + } + + exception_holder_.Clear(); + + for (size_t i = 0; i < dependecy_count_.size(); ++i) { + if (dependecy_count_[i] == 0) { + // NOTE(zhiqiu): hot fix for jit input var + RecordMemcpyD2H(vec_instr.at(i)); + } + } + + for (size_t idx = 0; idx < trace_execute_order_.size(); idx++) { + auto instr_id = trace_execute_order_[idx]; + auto& instr_node = vec_instruction_.at(instr_id); + + RunInstruction(instr_node); + + if (UNLIKELY(exception_holder_.IsCaught())) { + VLOG(4) << "Exception caught"; + break; + } + } + + if (UNLIKELY(exception_holder_.IsCaught())) { + VLOG(1) << "Exception caught " << exception_holder_.Type(); + PADDLE_ENFORCE_EQ( + main_thread_blocker_.Clear(), + 0, + platform::errors::PreconditionNotMet( + "main_thread_blocker_.Clear() return -1, clear failed")); + VLOG(4) << "clear ok"; + exception_holder_.ReThrow(); + } +} + +void NewIRInterpreter::RecordMemcpyD2H(const Instruction& instr_node) { + // NOTE(zhiqiu): hot fix for jit input var + if (instr_node.OpBaseValid() && + instr_node.OpBase()->Type() == interpreter::kMemcpyD2H) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* default_dev_ctx = pool.Get(place_); + for (auto& event : instr_node.EventsToWait()) { + platform::RecordEvent record( + "RecordStreamEvent", platform::TracerEventType::UserDefined, 10); + VLOG(3) << "Record event on default stream in jit_input_var at op: " + << instr_node.OpBase()->Type(); + event.event_->Record(default_dev_ctx); + } + } +} + +void NewIRInterpreter::UpdateSyncOpNum() { + int64_t sync_op_num = 0; + for (size_t i = 0; i < vec_instruction_.size(); ++i) { + if (vec_instruction_[i].KernelType() == OpFuncType::kCpuSync || + vec_instruction_[i].KernelType() == OpFuncType::kGpuSync) { + sync_op_num = sync_op_num + 1; + } + } + sync_op_num_ = sync_op_num; + VLOG(4) << "Update sync op num, sync op num is: " << sync_op_num_; +} + +// Note(zhangbo): +// When there is a KQueueSync type OP in the model, breadth traversal is better +// than depth traversal. For example: OP(O) ->(direct_run)-> OP(A) +// ->(sync_run)-> OP(B) OP(O) ->(direct_run)-> OP(C) ->(direct_run)-> OP(D) If B +// is run before C, B may always block to wait for A to finish executing, but in +// fact, C can be executed first during this time. 
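AnalyseExecuteOrderForTrace below builds exactly this kind of breadth-style order from the downstream map. A toy, std-only version of the O/A/B/C/D example in the note above (O=0, A=1, B=2, C=3, D=4):

#include <cstdio>
#include <map>
#include <queue>
#include <set>
#include <vector>

int main() {
  // O -> A -> B (sync-like chain) and O -> C -> D.
  std::map<int, std::set<int>> downstream = {{0, {1, 3}}, {1, {2}}, {3, {4}}};
  std::vector<int> deps(5, 0);
  for (auto& kv : downstream) {
    for (int d : kv.second) ++deps[d];
  }

  std::queue<int> ready;  // plain FIFO is enough to show breadth order
  for (int i = 0; i < 5; ++i) {
    if (deps[i] == 0) ready.push(i);
  }

  while (!ready.empty()) {
    int op = ready.front();
    ready.pop();
    std::printf("run op %d\n", op);  // prints 0, 1, 3, 2, 4
    for (int next : downstream[op]) {
      if (--deps[next] == 0) ready.push(next);
    }
  }
}

With this ordering C (op 3) is scheduled before B (op 2), so it is not stuck behind the slow A->B edge, which is the point the note makes.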
+void NewIRInterpreter::AnalyseExecuteOrderForTrace() { + VLOG(4) << "Analyze the execution order of Trace scheduling mode."; + interpreter::ResetAtomicGuard guard(&deps_, &refs_); + + auto op_downstream_map = dependency_builder_.OpDownstreamMap(); + + auto IsReady = [this](size_t next_id) { + VLOG(4) << "op_id: " << next_id + << ", remain deps: " << deps_[next_id]->DynamicDep(); + return deps_[next_id]->CheckAndDecrease(); + }; + + std::vector trace_order; + SchedulingQueue ready_ops(instruction_scheduling_priority_less); + + for (size_t instr_id = 0; instr_id < dependecy_count_.size(); ++instr_id) { + if (dependecy_count_[instr_id] == 0) { + ready_ops.push(instr_id); + } + } + + while (!ready_ops.empty()) { + size_t now_id = ready_ops.top(); + ready_ops.pop(); + trace_order.push_back(now_id); + + auto next_op_set = op_downstream_map[now_id]; + + for (size_t next_op_id : next_op_set) { + if (IsReady(next_op_id)) { + ready_ops.push(next_op_id); + } + } + } + + PADDLE_ENFORCE_EQ( + trace_order.size(), + dependecy_count_.size(), + platform::errors::PreconditionNotMet( + "trace_order size should be equal to dependecy_count_.")); + + trace_execute_order_ = trace_order; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h new file mode 100644 index 00000000000..59eb155c123 --- /dev/null +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -0,0 +1,182 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
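The header that follows aliases SchedulingQueue to a std::priority_queue ordered by the InstructionSchedulingPriorityLess comparator (the lambda set up in the constructor earlier in this file). A standalone illustration of that comparator-driven queue, with made-up priorities:

#include <cstdio>
#include <functional>
#include <queue>
#include <vector>

int main() {
  // Hypothetical per-op scheduling priorities; with these values the pop
  // order is 0, 3, 2, 1 (smallest priority value first; the 1-vs-2 tie is
  // broken by comparing ids, so 2 pops before 1).
  std::vector<int> priorities = {0, 2, 2, 1};
  std::function<bool(size_t, size_t)> priority_less =
      [&priorities](size_t lhs, size_t rhs) {
        if (priorities[lhs] == priorities[rhs]) return lhs < rhs;
        return priorities[lhs] > priorities[rhs];
      };
  std::priority_queue<size_t, std::vector<size_t>, decltype(priority_less)>
      ready_ops(priority_less);
  for (size_t id = 0; id < priorities.size(); ++id) ready_ops.push(id);
  while (!ready_ops.empty()) {
    std::printf("pop op %zu\n", ready_ops.top());
    ready_ops.pop();
  }
}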
+ +#pragma once +#include +#include "paddle/fluid/framework/new_executor/interpreter_base_impl.h" + +namespace ir { +class Program; +} // namespace ir + +namespace paddle { +namespace framework { + +class NewIRInterpreter : public InterpreterBaseImpl { + using ExecutionConfig = interpreter::ExecutionConfig; + using InstructionSchedulingPriorityLess = std::function; + using SchedulingQueue = + std::priority_queue, + InstructionSchedulingPriorityLess>; + + public: + NewIRInterpreter(const platform::Place& place, + std::unique_ptr<::ir::Program> ir_prog, + Scope* scope, + const ExecutionConfig& execution_config = ExecutionConfig()); + + ~NewIRInterpreter(); + + paddle::framework::FetchList Run( + const std::vector& feed_names, + const std::vector& feed_tensors) override; + + paddle::framework::FetchList Run(const std::vector& feed_names, + bool need_fetch = true) override; + + void ShareWorkQueueFrom(InterpreterBaseImpl* src) override; + + void SetCopyProgram(std::shared_ptr prog) override; + + void SetSkipGcVars(const std::set& skip_gc_vars) override; + + const std::set& JitInputVars() const override; + + void SetJitInputVars(const std::set& jit_input_vars) override; + + const VariableScope* GetVariableScope() const override; + + void reset_scope(Scope* new_scope) override; + + const platform::Place& GetPlace() const override { return place_; } + + void SetOutputHooks(const std::vector& hookfuncs) override { + hookfuncs_ = hookfuncs; + } + + private: + // build graph + void Convert(std::vector* op_func_nodes); + void BuildOperatorDependences(); + void BuildAndCacheInstructionCtx(Instruction* instr_node); + void BuildSkipShareLoDInfo(); + void UpdateSyncOpNum(); + void AnalyseExecuteOrderForTrace(); + + // inplace + void BuildInplace(); + bool BuildInplaceCheckVarIsOnlyInput( + const std::vector>& input_var2op, size_t var_index); + void SetFeedVarsInplaceSkip(const std::vector& feed_names); + + // cuda graph + void CheckCUDAGraphBeforeRun(const std::vector& feed_names); + void PrepareForCUDAGraphCapture(); + + // execution + void RunImpl(); + void ExecuteInstructionList(const std::vector& vec_instr); + void RunInstructionAsync(size_t instr_id); + void RunInstruction(const Instruction& instr_node); + void RunNextInstructions(const Instruction& instr_id, + SchedulingQueue* reserved_next_ops); + void RunOperator(const Instruction& instr_node); + // Trace + void TraceInstructionList(const std::vector& vec_instr); + + // only used when program contains no feed op + void Prepare(const std::vector& feed_names, + const std::vector& feed_tensors, + bool prepare_feed); + + void RecordMemcpyD2H(const Instruction& instr_node); + + // gc + void RecordStreamForGC(const Instruction& instr); + void CheckGC(const Instruction& instr); + void ClearLoDTensorArrayInLocalScope(); + + // workqueue + std::shared_ptr GetWorkQueue(); + + // scope + bool HasLocalScope() const; + + // For log and debug + std::string GetDepsString() const; + + bool is_build_{false}; + bool static_build_{false}; + + const platform::Place place_; + + interpreter::DependencyBuilder dependency_builder_; + interpreter::StreamAnalyzer stream_analyzer_; + + // NOTE(zhiqiu): when add fetch ops in GetInterpreterCore, we will + // copy a new program and block, the copy_program_ here is used to + // hold the program, otherwise block_ maybe not valid after the + // new program is deleted. 
+ std::shared_ptr copy_program_{nullptr}; + + // from variable scope + std::vector var_list_; + std::map name2id_; + std::vector vec_meta_info_; + + std::vector vec_instruction_; // deconstruct before OpFuncNode + + std::atomic unfinished_op_number_{0}; + + ExecutionConfig execution_config_; + + VariableScope var_scope_; + Scope* local_scope_{nullptr}; // not owned + + EventsWaiter main_thread_blocker_; + std::shared_ptr async_work_queue_; + + details::ExceptionHolder exception_holder_; + std::shared_ptr exception_notifier_{nullptr}; + std::shared_ptr completion_notifier_{nullptr}; + + std::unique_ptr gc_; + + // last_live_ops_[i] contains the id of operators that last access the i-th + // var + std::map> last_live_ops_; + + // dependecy_count_[i] contains the number of dependencies that the i-th op + // need to wait + std::vector dependecy_count_; + + std::vector> deps_; + std::vector> refs_; + + // used for Trace + int64_t sync_op_num_{-1}; + std::vector trace_execute_order_; + + InstructionSchedulingPriorityLess instruction_scheduling_priority_less; + + std::vector hookfuncs_; + + std::unique_ptr<::ir::Program> ir_program_{nullptr}; + + std::unordered_map<::ir::Value, std::string> value_2_var_name_map_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc new file mode 100644 index 00000000000..3156575ebd3 --- /dev/null +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -0,0 +1,1461 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/new_executor/program_interpreter.h" + +#include "paddle/fluid/framework/details/nan_inf_utils.h" +#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" +#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" +#include "paddle/fluid/framework/new_executor/interpreter/static_build.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/os_info.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_context.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/phi/backends/device_manager.h" + +namespace paddle { +namespace framework { + +ProgramInterpreter::ProgramInterpreter(const platform::Place& place, + const BlockDesc& block, + framework::Scope* scope, + const ExecutionConfig& execution_config) + : place_(place), + block_(block), + stream_analyzer_(place), + execution_config_(execution_config), + var_scope_(scope) { + VLOG(4) << "ProgramInterpreter(): " << this << " on " << place_; + + static_build_ = FLAGS_new_executor_static_build && + !FLAGS_new_executor_use_cuda_graph && + !execution_config.used_for_control_flow_op && + interpreter::BlockCanBeStaticBuilt(block); + + exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught); + completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion); + + if (!FLAGS_new_executor_use_local_scope) { + execution_config_.create_local_scope = false; + } + execution_config_.AnalyzeThreadPoolConfig(place, block.OpSize()); + execution_config_.Log(/*log_level=*/8); + + if (execution_config_.create_local_scope) { + auto local_scope = &var_scope_.GetMutableScope()->NewScope(); + local_scope_ = local_scope; + } + var_scope_.SetLocalScope(local_scope_); + + instruction_scheduling_priority_less = [this](size_t lhs, size_t rhs) { + SchedulingPriority lhs_scheduling_priority = + vec_instruction_[lhs].GetSchedulingPriority(); + SchedulingPriority rhs_scheduling_priority = + vec_instruction_[rhs].GetSchedulingPriority(); + if (lhs_scheduling_priority == rhs_scheduling_priority) { + return lhs < rhs; + } + return lhs_scheduling_priority > rhs_scheduling_priority; + }; + + PrepareForCUDAGraphCapture(); +} + +ProgramInterpreter::~ProgramInterpreter() { + // cancle gc's thread + gc_.reset(nullptr); + async_work_queue_.reset(); + VLOG(4) << "~ProgramInterpreter(): " << this << " on " << place_; + +#ifdef PADDLE_WITH_MKLDNN + // Clear mkl-dnn cache, + // this is needed to have mkl-dnn unit tests working + platform::ClearMKLDNNCache(place_, this); +#endif +} + +void ProgramInterpreter::RunImpl() { + // lazy initialization of gc, do not create gc is the program only run once + if (!gc_) { + gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_); + } + + interpreter::ResetAtomicGuard guard(&deps_, &refs_); + + if ((execution_config_.used_for_jit || execution_config_.used_for_cinn) && + (sync_op_num_ == 0)) { + VLOG(4) << "Tracing Instruction List"; + TraceInstructionList(vec_instruction_); + } else { + VLOG(4) << "Non-tracing"; + // For the program that only run once, it is no need to + // create work_queue, so the async_work_queue_ is created + // until the second step run. 
+ async_work_queue_ = GetWorkQueue(); + ExecuteInstructionList(vec_instruction_); + } +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::is_custom_place(place_)) { + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + } +#endif +} + +FetchList ProgramInterpreter::Run( + const std::vector& feed_names, + const std::vector& feed_tensors) { + SetDeviceId(place_); + CheckCUDAGraphBeforeRun(feed_names); + +#ifdef PADDLE_WITH_MKLDNN + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif + + bool is_build = is_build_; + Prepare(feed_names, feed_tensors, is_build); + + if (is_build) { + RunImpl(); + } + + if (HasLocalScope()) { + ClearLoDTensorArrayInLocalScope(); + } + + // return Fetch Tensors + auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName); + if (fetch_var) { + auto fetch_list = std::move(*fetch_var->GetMutable()); +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ(fetch_list.empty(), + true, + platform::errors::InvalidArgument( + "Cannot fetch data when using CUDA Graph.")); + } +#endif + return fetch_list; + } else { + return {}; + } +} + +FetchList ProgramInterpreter::Run(const std::vector& feed_names, + bool need_fetch) { + SetDeviceId(place_); + CheckCUDAGraphBeforeRun(feed_names); + +#ifdef PADDLE_WITH_MKLDNN + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif + + if (!is_build_) { + LOG_FIRST_N(INFO, 1) << "New Executor is Running."; + paddle::framework::interpreter::BuildVariableScope( + block_, execution_config_, &var_scope_); + + std::vector op_func_nodes; + paddle::framework::interpreter::BuildOpFuncList( + place_, + block_, + execution_config_.skip_gc_vars, + &op_func_nodes, + &var_scope_, + execution_config_, + HasLocalScope(), + static_build_); + SetFeedVarsInplaceSkip(feed_names); + // convert vec func_list to graph + Convert(&op_func_nodes); + UpdateSyncOpNum(); + if (static_build_) { + VLOG(4) << "RUN impl"; + RunImpl(); + } + is_build_ = true; + } else { + RunImpl(); + } + + if (HasLocalScope()) { + ClearLoDTensorArrayInLocalScope(); + } + + // return Fetch Tensors + Scope* inner_scope = + HasLocalScope() ? 
local_scope_ : var_scope_.GetMutableScope(); + auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName); + if (fetch_var && need_fetch) { + auto fetch_list = std::move(*fetch_var->GetMutable()); +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ(fetch_list.empty(), + true, + platform::errors::InvalidArgument( + "Cannot fetch data when using CUDA Graph.")); + } +#endif + return fetch_list; + } else { + return {}; + } +} + +void ProgramInterpreter::SetCopyProgram(std::shared_ptr prog) { + copy_program_ = prog; +} + +void ProgramInterpreter::SetSkipGcVars( + const std::set& skip_gc_vars) { + PADDLE_ENFORCE_EQ( + execution_config_.skip_gc_vars.empty(), + true, + platform::errors::PreconditionNotMet( + "execution_config_.skip_gc_vars can only be initialized once, now " + "execution_config_.skip_gc_vars is " + "not empty, do not call SetSkipGcVars method repeatedly.")); + execution_config_.skip_gc_vars = skip_gc_vars; +} + +void ProgramInterpreter::SetJitInputVars( + const std::set& jit_input_vars) { + PADDLE_ENFORCE_EQ( + execution_config_.jit_input_vars.empty(), + true, + platform::errors::PreconditionNotMet( + "execution_config_.jit_input_vars can only be initialized once, now " + "execution_config_.jit_input_vars is " + "not empty, do not call SetJitInputVars method repeatedly.")); + execution_config_.jit_input_vars = jit_input_vars; +} + +const std::set& ProgramInterpreter::JitInputVars() const { + return execution_config_.jit_input_vars; +} + +const VariableScope* ProgramInterpreter::GetVariableScope() const { + return &var_scope_; +} + +void ProgramInterpreter::reset_scope(Scope* new_scope) { + var_scope_.SetScope(new_scope); + auto& var_list = var_scope_.MutableVarList(); + for (size_t i = 0; i < var_list.size(); i++) { + const auto& var_name = var_scope_.GetNameById(i); + var_list[i] = new_scope->FindVar(var_name); + } + // The index should be assured valid, cause the InterpreterCore may not be + // fully built, but was still cached and used. For example, see unit test + // `test_assert.py`, it may exit before `ProgramInterpreter::Convert`, + // but still was cached and used by later tests. + for (size_t i = 0; i < std::min(refs_.size(), var_list.size()); i++) { + refs_[i]->ResetVariable(var_list[i]); + } + + for (size_t i = 0; i < vec_instruction_.size(); i++) { + BuildAndCacheInstructionCtx(&vec_instruction_[i]); + } +} + +void ProgramInterpreter::ShareWorkQueueFrom(InterpreterBaseImpl* src) { + async_work_queue_ = + reinterpret_cast(src)->GetWorkQueue(); + VLOG(8) << "Share AsyncWorkQueue from InterpreterCore(" << src + << ") to InterpreterCore(" << this << ")"; +} + +bool ProgramInterpreter::BuildInplaceCheckVarIsOnlyInput( + const std::vector>& input_var2op, size_t var_index) { + if (!var_scope_.VarDesc(var_index)) { + return input_var2op.at(var_index).size() == 1; + } else { + int is_input_cnt = 0; + for (auto inst_id : input_var2op.at(var_index)) { + OpInOutInfo info; + info.Build(vec_instruction_.at(inst_id).OpBase()); + if (info.IsInArgBufferNeeded(var_scope_.VarDesc(var_index)->Name())) { + is_input_cnt++; + } + } + return is_input_cnt == 1; + } +} + +std::shared_ptr +ProgramInterpreter::GetWorkQueue() { + if (async_work_queue_ == nullptr) { + async_work_queue_ = std::make_shared( + execution_config_.host_num_threads, + execution_config_.device_num_threads, + nullptr); + } + return async_work_queue_; +} + +void ProgramInterpreter::BuildAndCacheInstructionCtx(Instruction* instr_node) { + Scope* inner_scope = + HasLocalScope() ? 
local_scope_ : var_scope_.GetMutableScope(); + VariableValueMap ins_map; + for (auto& var_name_item : instr_node->Inputs()) { + std::vector input_vars; + + input_vars.reserve(var_name_item.second.size()); + for (auto& id : var_name_item.second) { + input_vars.emplace_back(inner_scope->FindVar(var_scope_.GetNameById(id))); + } + ins_map.emplace(var_name_item.first, std::move(input_vars)); + } + + VariableValueMap outs_map; + for (auto& var_name_item : instr_node->Outputs()) { + std::vector out_vars; + + out_vars.reserve(var_name_item.second.size()); + for (auto& id : var_name_item.second) { + out_vars.emplace_back(inner_scope->FindVar(var_scope_.GetNameById(id))); + } + outs_map.emplace(var_name_item.first, std::move(out_vars)); + } + + // set runtime_ctx and infershape_ctx_ + if (instr_node->OpBase()->Type() == "cinn_launch" || + instr_node->OpBase()->Type() == "cinn_instruction_run") { // OP use scope + // in kernel + Scope* local_scope = HasLocalScope() ? var_scope_.GetMutableLocalScope() + : var_scope_.GetMutableScope(); + instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope); + } else { + instr_node->ResetContext(ins_map, outs_map); + } +} + +void ProgramInterpreter::BuildInplace() { + // NOTE(Ruibiao): coalesce_tensor_op outputs a FusedOutput phi::DenseTensor + // and a list of Output Tensors which are sliced from the FusedOutput. These + // outputs sholud not be the outvar of the in-place var-pair since memory + // reuse between FusedOutput and Output Tensors is assumed. For the following + // example: + // fused_var, var1, var2, var3 = coalesce_tensor(var1, var2, var3) + // var1 = sum(var4, var5) + // ... + // + // After running coalesce_tensor_op, var1 is assumed to share the buffer + // slices from fused_var. However, if sum_op is in-place, then var1 would + // re-share the buffer with var4 instead of fused_var. + std::set skip_inplace_outvars; + for (Instruction& instr : vec_instruction_) { + OperatorBase* op = instr.OpBase(); + if (op->Type() == kCoalesceTensor) { + const std::vector& outputs = + op->OutputVars(/*has_intermediate=*/false); + skip_inplace_outvars.insert(outputs.begin(), outputs.end()); + } + } + + Scope* local_scope = HasLocalScope() ? 
var_scope_.GetMutableLocalScope() + : var_scope_.GetMutableScope(); + std::vector> input_var2op(var_scope_.VarSize()); + for (Instruction& instr : vec_instruction_) { + for (auto& item : instr.Inputs()) { + for (int var_id : item.second) { + if (var_id != kEmptyVarIndex) { + input_var2op.at(var_id).push_back(instr.Id()); + } + } + } + } + + for (size_t i = 0; i < vec_instruction_.size(); ++i) { + auto& instr = vec_instruction_[i]; + auto* op_base = instr.OpBase(); + if (!op_base->Info().infer_inplace_) { + continue; + } + + auto in_to_outs = op_base->Info().infer_inplace_( + platform::is_gpu_place(instr.DeviceContext().GetPlace())); + + auto& inputs = instr.Inputs(); + auto& outputs = instr.Outputs(); + for (auto& pair : in_to_outs) { + auto iter = inputs.find(pair.first); + if (iter != inputs.end() && !iter->second.empty()) { + auto in_var_desc = var_scope_.VarDesc(iter->second[0]); + if (in_var_desc && in_var_desc->Persistable()) { + continue; + } + if (var_scope_.GetVarSikpInplace(iter->second[0])) { + continue; + } + if (BuildInplaceCheckVarIsOnlyInput(input_var2op, iter->second[0])) { + auto iterout = outputs.find(pair.second); + if (iterout != outputs.end() && !iterout->second.empty()) { + const std::string& invar_name = + var_scope_.GetNameById(iter->second[0]); + const std::string& outvar_name = + var_scope_.GetNameById(iterout->second[0]); + auto invar = local_scope->FindVar(invar_name); + auto outvar = local_scope->FindVar(outvar_name); + + if (invar && outvar && invar->IsType() && + outvar->IsType() && + skip_inplace_outvars.find(outvar_name) == + skip_inplace_outvars.end()) { + instr.AddInplace(invar, outvar); + VLOG(3) << "inplace " << op_base->Type() << " " << invar_name + << " -> " << outvar_name; + } + } + } + } + } + } +} + +void ProgramInterpreter::PrepareForCUDAGraphCapture() { + if (!FLAGS_new_executor_use_cuda_graph) return; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_EQ( + platform::IsCUDAGraphCapturing(), + false, + platform::errors::PermissionDenied("CUDA Graph is not allowed to capture " + "before prepare.")); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(place_), + true, + platform::errors::InvalidArgument( + "CUDA Graph is only supported on NVIDIA GPU device.")); + // If set true, will call `cudaStreamSynchronize(nccl_stream)`after allreduce. + // which may cause error in cuda graph. This behavior is consistent with PE. + PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, + false, + platform::errors::InvalidArgument( + "FLAGS_sync_nccl_allreduce must be False to support " + "CUDA Graph capturing.")); + + // All output vars of coalesce_tensor op should be persistable. + // If fused output var of coalesce_tensor is gc, it will cause accuracy + // problem. The specific reasons need to be analyzed. + for (auto& op_desc : block_.AllOps()) { + if (op_desc->Type() == kCoalesceTensor) { + for (auto& out_var_name : op_desc->OutputArgumentNames()) { + // The fused var needs to be set to persistable, not just added to + // skip_gc_vars. + // In the case where the feed fetch var is changed, StandaloneExecutor + // will be newly constructed. If the fused var is not persistable, + // these vars will be recreated and initialized, resulting in + // precision problems. 
+ auto* out_var = op_desc->Block()->FindVarRecursive(out_var_name); + if (out_var) { + out_var->SetPersistable(true); + VLOG(4) << "Mark Var(" << out_var_name << ") as Persistable."; + } + } + } + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CUDA Graph is only supported on NVIDIA GPU device.")); +#endif +} + +void ProgramInterpreter::CheckCUDAGraphBeforeRun( + const std::vector& feed_names) { +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ( + feed_names.empty(), + true, + platform::errors::InvalidArgument( + "Feeding data is not permitted when capturing CUDA Graph.")); + PADDLE_ENFORCE_EQ( + FLAGS_new_executor_use_cuda_graph, + true, + platform::errors::InvalidArgument( + "You must turn on FLAGS_new_executor_use_cuda_graph to True " + "to enable CUDA Graph capturing.")); + PADDLE_ENFORCE_EQ( + place_, + platform::CUDAGraphCapturingPlace(), + platform::errors::InvalidArgument("The place to capture CUDAGraph is " + "not the same as the place to run.")); + } +#endif +} + +void ProgramInterpreter::BuildOperatorDependences() { + // analysis the dependences between ops, add next_instr_list to each instr, + // and set the dependecy_count_ + size_t instr_num = vec_instruction_.size(); + dependecy_count_ = std::vector(instr_num, 0); + auto downstream_map = dependency_builder_.Build(vec_instruction_); + + for (size_t instr_id = 0; instr_id < instr_num; ++instr_id) { + Instruction& cur_instr = vec_instruction_[instr_id]; + const std::set& next_instr_ids = downstream_map[instr_id]; + + if (FLAGS_new_executor_serial_run) { + for (size_t next_instr_id : next_instr_ids) { + cur_instr.AddNextInstrInSameThread(next_instr_id); + } + } else { + if (cur_instr.KernelType() == OpFuncType::kGpuAsync) { + for (size_t next_instr_id : next_instr_ids) { + if (vec_instruction_[next_instr_id].KernelType() == + OpFuncType::kGpuAsync) { + cur_instr.AddNextInstrInSameThread(next_instr_id); + } else { + cur_instr.AddNextInstrInDifferentThread(next_instr_id); + } + } + } else { + bool has_instr_in_same_thread = false; + for (size_t next_instr_id : next_instr_ids) { + if (!has_instr_in_same_thread && + vec_instruction_[next_instr_id].KernelType() != + OpFuncType::kGpuAsync) { + cur_instr.AddNextInstrInSameThread(next_instr_id); + has_instr_in_same_thread = true; + } else { + cur_instr.AddNextInstrInDifferentThread(next_instr_id); + } + } + } + } + + for (size_t next_instr_id : next_instr_ids) { + ++dependecy_count_[next_instr_id]; + } + } +} + +// At the end of each step, the holder of phi::DenseTensor in LoDTensorArray is +// null. 
Clear these Tensors and leave LoDTensorArray empty, otherwise an +// exception will occur in the next step +void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { + auto vars = local_scope_->LocalVars(); + for (auto var : vars) { + if (var->IsType()) { + auto* lod_tensor_arr = var->GetMutable(); + lod_tensor_arr->clear(); + } + } +} + +void ProgramInterpreter::Convert( + std::vector* op_func_nodes) { + auto& vec_meta_info = var_scope_.MutableVecMetaInfo(); + auto nodes = *op_func_nodes; + auto op_nums = nodes.size(); + vec_instruction_.clear(); + vec_instruction_.reserve(op_nums); + for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { + auto& op_func_node = nodes[op_idx]; + auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node); + vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); +#ifdef PADDLE_WITH_CUDA + if (FLAGS_new_executor_use_cuda_graph) { + auto& op = op_func_node.operator_base_; + auto& op_type = op->Type(); + if (op_type == interpreter::kMemcpyD2H || + op_type == interpreter::kMemcpyH2D) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Cuda memory copy d2h/h2d is not allowed while using cuda graph.")); + } + PADDLE_ENFORCE_EQ(typeid(*dev_ctx_) == typeid(phi::GPUContext), + true, + platform::errors::InvalidArgument( + "Device context of op %s must be [%s] while using " + "cuda graph, but got [%s].", + op_type, + typeid(phi::GPUContext).name(), + typeid(*dev_ctx_).name())); + // cuda graph needs to record all stream + phi::backends::gpu::CUDAGraphContextManager::Instance() + .RecordCapturingDeviceContext(dev_ctx_); + } +#endif + } + + BuildOperatorDependences(); + + // NOTE(Ruibiao): For cross-step stream synchronization, an event may be + // recorded in the first step and waited in the second step. So, in the first + // step, the WaitEvent may be called without RecordEvent. Considering that + // before the first call to RecordEvent, an Event represents an empty set of + // work and WaitEvent always return succeed immediately, we omit the + // prelude-record for the first step here. + stream_analyzer_.ConstructEvents(&vec_instruction_); + + // add event for the input var of jit program, since there are async copied + // from gpu_pinned place to gpu place on compute stream. 
+ for (size_t i = 0; i < dependecy_count_.size(); ++i) { + if (dependecy_count_[i] == 0) { + auto& inst = vec_instruction_[i]; + if (inst.OpBase()->Type() == interpreter::kMemcpyD2H && + platform::is_gpu_place(place_)) { + for (auto& item : inst.Inputs()) { + for (auto var_id : item.second) { + auto name = var_scope_.GetNameById(var_id); + if (JitInputVars().count(name)) { + auto device_event = std::make_shared( + place_, platform::GenerateDeviceEventFlag()); + VLOG(4) << "Add input event for input: " << name << " of " + << inst.OpBase()->Type(); + inst.AddEventToWait( + i, device_event, stream_analyzer_.GetWaiterType(inst)); + } + } + } + } + } + } + + // calculate last_live_ops_ + for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { + Instruction& instr = vec_instruction_[op_idx]; + OpInOutInfo info; + info.Build(instr.OpBase()); + + std::set gc_check_vars; + + const std::map>& ins = instr.Inputs(); + const std::map>& outs = instr.Outputs(); + std::multimap> ins_and_outs{ins.begin(), + ins.end()}; + ins_and_outs.insert(outs.begin(), outs.end()); + + for (auto& item : ins_and_outs) { + for (auto id : item.second) { + if (id == kEmptyVarIndex) { + continue; + } + auto* var_desc = var_scope_.VarDesc(id); + // skip no_need_buffer input vars + if (var_desc && ins.count(item.first) && + !info.IsInArgBufferNeeded(var_desc->Name())) { + continue; + } + // skip when this var is not in block and not a data_transferred var, + // which means this var is managed by other block + const auto& var_name = var_scope_.GetNameById(id); + bool not_owned = !block_.HasVar(var_name); + const auto& transferred_vars = var_scope_.DataTransferAddedVars(); + bool not_transferred = + std::all_of(transferred_vars.begin(), + transferred_vars.end(), + [&](const std::pair& elem) { + return elem.first != var_name; + }); + if (not_owned && not_transferred) { + VLOG(10) << "[gc_check_inputs] skip gc: " << var_name; + continue; + } + gc_check_vars.insert(id); + } + } + + for (auto var_id : gc_check_vars) { + Scope* inner_scope = + HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope(); + paddle::framework::Variable* var = + inner_scope->FindVar(var_scope_.GetNameById(var_id)); + if (var->IsType() || var->IsType() || + var->IsType()) { + last_live_ops_[var_id].insert(op_idx); + } else { + VLOG(4) << "not clear " << var_scope_.GetNameById(var_id) << " after " + << instr.OpBase()->Type() << " because its type is " + << framework::ToTypeName(var->Type()); + } + } + } + + // clear the last_live_ops list for all vars in skip_gc_vars + for (const std::string& skip_gc_var : execution_config_.skip_gc_vars) { + int var_id = var_scope_.GetIdByName(skip_gc_var); + if (var_id != -1) { + last_live_ops_[var_id].clear(); + VLOG(8) << "Skip gc for var: " << skip_gc_var; + } + } + + // shrink, find the downstream op that has no other op in the + // downstream list happens before it + // For example, + // b = op1(a) + // c = op2(a, b) + // in this case, a is the input of op1 and op2, we only need to check + // a after op2, because op2 always uses a after op1. 
+ for (size_t i = 0; i < last_live_ops_.size(); ++i) { + std::set minumum_last_live_ops; + for (size_t item : last_live_ops_[i]) { + bool not_before_any = true; + // find the op that is not executed before any + for (size_t other_item : last_live_ops_[i]) { + if (dependency_builder_.OpHappensBefore(item, other_item)) { + VLOG(8) << "happens_before: " << item << "->" << other_item + << ", so skip " << item; + not_before_any = false; + break; + } + } + if (not_before_any) { + VLOG(8) << "last live op of var " << i << " " + << var_scope_.GetNameById(i) << " : " << item << " " + << vec_instruction_[item].OpBase()->Type(); + minumum_last_live_ops.insert(item); + vec_instruction_[item].AddGCCheckVar(i); + } + } + last_live_ops_[i] = minumum_last_live_ops; + vec_meta_info[i].var_ref_count_ = last_live_ops_[i].size(); + } + + for (size_t i = 0; i < vec_instruction_.size(); ++i) { + BuildAndCacheInstructionCtx(&vec_instruction_[i]); + } + + bool inplaced = false; + for (const Instruction& inst : vec_instruction_) { + if (inst.OpBase()->Type() == "share_buffer" || + inst.OpBase()->Type() == "share_data") { + VLOG(4) << "Already inplaced, skip inplace now."; + inplaced = true; + } + } + + if (FLAGS_new_executor_use_inplace && !inplaced) { + BuildInplace(); + } + + for (auto& dep : dependecy_count_) { + deps_.emplace_back(std::make_shared(dep)); + } + for (size_t i = 0; i < vec_meta_info.size(); ++i) { + refs_.emplace_back(std::make_shared( + vec_meta_info[i].var_ref_count_, var_scope_.VarRef(i))); + } + + AnalyseExecuteOrderForTrace(); +} + +void ProgramInterpreter::BuildSkipShareLoDInfo() { + for (size_t i = 0; i < vec_instruction_.size(); ++i) { + bool can_skip_lod = true; + for (auto& input : vec_instruction_[i].InnerRuntimeContext()->inputs) { + for (auto& var : input.second) { + if (var->IsType()) { + if (var->Get().lod().size() != 0) { + can_skip_lod = false; + break; + } + } else { + can_skip_lod = false; + break; + } + } + } + if (can_skip_lod) { + VLOG(8) << "skip share lod for: " << vec_instruction_[i].OpBase()->Type() + << " (" << i << ")"; + } + vec_instruction_[i].InnerInferShapeContext()->SetSkipLoD(can_skip_lod); + } +} + +void ProgramInterpreter::RunOperator(const Instruction& instr_node) { + auto* op = instr_node.OpBase(); + auto place = instr_node.DeviceContext().GetPlace(); + Scope* local_scope = HasLocalScope() ? var_scope_.GetMutableLocalScope() + : var_scope_.GetMutableScope(); + VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope); + + auto op_with_kernel = dynamic_cast(op); + { + // If it is OperatorBase, InferShape do nothing. + if (op_with_kernel != nullptr) { + platform::RecordEvent infershape_event( + "infer_shape", + platform::TracerEventType::OperatorInner, + 1, + platform::EventRole::kInnerOp); + + // see OperatorWithKernel::RunImpl in operator.cc for why + if (!(op_with_kernel->HasAttr(kAllKernelsMustComputeRuntimeShape) && + op_with_kernel->Attr(kAllKernelsMustComputeRuntimeShape))) { + op_with_kernel->Info().infer_shape_( + instr_node.InnerInferShapeContext().get()); + } + infershape_event.End(); + platform::RecordOpInfoSupplement(op->Type(), + op->Attrs(), + *(instr_node.InnerInferShapeContext()), + *(instr_node.InnerRuntimeContext()), + op->Id()); + } + } + if (op_with_kernel != nullptr && FLAGS_new_executor_use_inplace) { + // TODO(xiongkun03) Does operator base support inplace ? 
+ for (auto& pair : instr_node.InplaceInfo()) { + const auto& in = paddle::framework::details::GetTensorFromVar(pair.first); + auto* out = + paddle::framework::details::GetMutableTensorFromVar(pair.second); + if (in.dims() == out->dims()) { + out->ShareBufferWith(in); + } + } + } + + { + platform::RecordEvent compute_event( + "compute", + platform::TracerEventType::OperatorInner, + 1, + platform::EventRole::kInnerOp); + if (op_with_kernel == nullptr) { // operator base + instr_node.OpBase()->Run(*local_scope, place_); + } else { + phi::Kernel* kernel = instr_node.PhiKernel(); + if (kernel && kernel->IsValid()) { // phi kernel + if (kernel->GetKernelRegisteredType() == + phi::KernelRegisteredType::FUNCTION) { + VLOG(4) << "Run function kernel: " << op->Type(); + VLOG(4) << instr_node.InnerRuntimeContext().get() << " " + << &instr_node.DeviceContext(); + phi::KernelContext phi_kernel_context; + op_with_kernel->BuildPhiKernelContext( + *instr_node.InnerRuntimeContext().get(), + const_cast(&instr_node.DeviceContext()), + &phi_kernel_context); + + (*kernel)(&phi_kernel_context); + } else { + VLOG(4) << "Run structure kernel: " << op->Type(); + (*kernel)(instr_node.InnerExecutionContext().get()); + } + } else { // fluid kernel + instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get()); + } + } + } + + VLOG(4) << "End run " << place << " " << op->DebugStringEx(local_scope); + + if (!instr_node.InplaceBackMap().empty()) { + platform::RecordEvent inplaceback_event( + "InplaceVarsBack", platform::TracerEventType::UserDefined, 10); + auto& m = instr_node.InplaceBackMap(); + // NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc + for (auto& p : m) { + auto* transformed_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar( + var_scope_.VarRef(p.first)); + auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar( + var_scope_.VarRef(p.second)); + original_tensor->ShareDataWith(*transformed_tensor); + VLOG(4) << "Transfer inplace variable back form " + << var_scope_.GetNameById(p.first) << " to " + << var_scope_.GetNameById(p.second); + } + } + + /*For profiling/benchmark only*/ + if (FLAGS_benchmark) { + instr_node.DeviceContext().Wait(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); + VLOG(4) << "Operator(" << op->Type() + << "): context wait and get last error"; +#endif + } + + for (auto& hook : hookfuncs_) { + hook(op, local_scope); + } + + // for debug nan/inf + if (op_with_kernel != nullptr && FLAGS_check_nan_inf) { + VLOG(4) << "Check nan/inf"; + try { + framework::details::CheckOpHasNanOrInf( + *op, + *local_scope, + place); // TODO(xiongkun03) change it to inner scope. + } catch (...) 
{ + const std::vector* callstack = nullptr; + auto attrs = op->Attrs(); + auto iter = + attrs.find(OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + if (iter != attrs.end()) { + callstack = &PADDLE_GET_CONST(std::vector, iter->second); + if (callstack->empty()) callstack = nullptr; + } + std::ostringstream sout; + if (callstack) { + if (FLAGS_call_stack_level > 1) { + sout << "\n\n Compile Traceback (most recent call last):"; + } else { + sout << "In user code:\n"; + } + for (auto& line : *callstack) { + sout << "\n " << line; + } + } + std::cout << sout.str() << std::endl; + std::rethrow_exception(std::current_exception()); + } + } +} + +void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { + VLOG(5) << __func__ << " OP id:" << instr_node.Id() + << " name:" << instr_node.OpBase()->Type() << " type:" + << (instr_node.KernelType() == OpFuncType::kCpuSync + ? "kCpuSync" + : (instr_node.KernelType() == OpFuncType::kGpuSync + ? "kGpuSync" + : "kGpuAsync")) + << " runs on " << platform::GetCurrentThreadName(); + + auto* op = instr_node.OpBase(); + platform::RecordEvent instruction_event( + op->Type(), platform::TracerEventType::Operator, 1); + + SetDeviceId(instr_node.DeviceContext().GetPlace()); + + try { + instr_node.WaitEvent(place_); + + if (!instr_node.IsArtificial()) { + RunOperator(instr_node); + CheckGC(instr_node); + interpreter::LogDeviceMemoryStats(place_); + } + + instr_node.RecordEvent(place_); + } catch (platform::EnforceNotMet& ex) { + framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex); + exception_holder_.Catch(std::make_exception_ptr(std::move(ex))); + } catch (platform::EOFException&) { + exception_holder_.Catch(std::current_exception()); + } catch (std::exception& ex) { + LOG(WARNING) << op->Type() << " raises an exception " + << platform::demangle(typeid(ex).name()) << ", " << ex.what(); + exception_holder_.Catch(std::current_exception()); + } catch (...) { + LOG(WARNING) << op->Type() << " raises an unknown exception"; + exception_holder_.Catch(std::current_exception()); + } +} + +std::string ProgramInterpreter::GetDepsString() const { + std::stringstream ss; + auto downstream_map = dependency_builder_.OpDownstreamMap(); + ss << "Note: when static_dep is 1, it is ok that the dynamic_dep will not " + "be decreased to 0." 
+ << std::endl; + ss << "unfinished_op_number_:" << unfinished_op_number_ << std::endl; + for (size_t i = 0; i < deps_.size(); ++i) { + ss << "op:" << i << ", type: " << vec_instruction_[i].OpBase()->Type() + << ", static_dep:" << deps_[i]->StaticDep() + << ", dynamic_dep:" << deps_[i]->DynamicDep() << ", downstream op: "; + for (auto id : downstream_map[i]) { + ss << id << ", "; + } + ss << std::endl; + } + return ss.str(); +} + +void ProgramInterpreter::ExecuteInstructionList( + const std::vector& vec_instr) { + unfinished_op_number_ = vec_instr.size(); + if (unfinished_op_number_ == 0) { + VLOG(4) << "No op to run, return"; + return; + } + + exception_holder_.Clear(); + + for (size_t i = 0; i < dependecy_count_.size(); ++i) { + if (dependecy_count_[i] == 0) { + // NOTE(zhiqiu): hot fix for jit input var + RecordMemcpyD2H(vec_instr.at(i)); + if (FLAGS_new_executor_serial_run) { + RunInstructionAsync(i); + } else { + async_work_queue_->AddTask(vec_instr.at(i).KernelType(), + [this, i] { RunInstructionAsync(i); }); + } + } + } + + // For debug hang in main_thread_blocker_.WaitEvent(), + // launch async task to log deps every + // FLAGS_executor_log_deps_every_microseconds, then cancel the std::async when + // main_thread_blocker_.WaitEvent() executed. Why not use std::async instead + // of workqueue? To make sure that the logging thread itself will not affect + // the workqueue + // used in interpretercore. + + std::future logged_times; + std::atomic_bool cancel_log = ATOMIC_VAR_INIT(false); + if (FLAGS_executor_log_deps_every_microseconds) { + logged_times = std::async( + std::launch::async, + [this](const std::atomic_bool& cancel) { + int times = 0; + while (!cancel) { + std::this_thread::sleep_for(std::chrono::microseconds( + FLAGS_executor_log_deps_every_microseconds)); + // check again, since cancel may be changed during sleep + if (cancel) { + break; + } + VLOG(0) << "deps:\n" << GetDepsString(); + times++; + } + return times; + }, + std::ref(cancel_log)); + } + + auto event_name = main_thread_blocker_.WaitEvent(); + VLOG(1) << "main_thread_blocker_(" << &main_thread_blocker_ + << ") got event_name: " << event_name; + + cancel_log = true; + if (logged_times.valid()) { + VLOG(1) << "Logged deps for " << logged_times.get() << " times"; + } + + if (UNLIKELY(exception_holder_.IsCaught())) { + VLOG(1) << "Exception caught " << exception_holder_.Type(); + // Graceful exit when the executor encountered a fatal error. + // EOF is not a fatal error. 
+ if (exception_holder_.Type() != "EOF") { + async_work_queue_->Cancel(); + async_work_queue_.reset(); + } + VLOG(4) << "Cancel ok"; + PADDLE_ENFORCE_EQ( + main_thread_blocker_.Clear(), + 0, + platform::errors::PreconditionNotMet( + "main_thread_blocker_.Clear() return -1, clear failed")); + VLOG(4) << "clear ok"; + exception_holder_.ReThrow(); + } +} + +void ProgramInterpreter::RunNextInstructions( + const Instruction& instr, SchedulingQueue* reserved_next_ops) { + platform::RecordEvent record( + "RunNextInstructions", platform::TracerEventType::UserDefined, 10); + + auto IsReady = [this](size_t next_id) { + VLOG(4) << "op_id: " << next_id + << ", remain deps: " << deps_[next_id]->DynamicDep(); + return deps_[next_id]->CheckAndDecrease(); + }; + + for (size_t next_instr_id : instr.NextInstrsInDifferenceThread()) { + if (IsReady(next_instr_id)) { + async_work_queue_->AddTask( + vec_instruction_[next_instr_id].KernelType(), + [this, next_instr_id]() { RunInstructionAsync(next_instr_id); }); + } + } + + for (size_t next_instr_id : instr.NextInstrsInSameThread()) { + if (IsReady(next_instr_id)) { + reserved_next_ops->push(next_instr_id); + } + } +} + +void ProgramInterpreter::RunInstructionAsync(size_t instr_id) { + // NOTE(Ruibiao): Due to the uncertain order in multi-threading asynchronous + // scheduling, the priority order involved cross-thread scheduling is not + // guaranteed. Only Ops scheduled by the same AddTask call have the guarantee + // of priority order. + SchedulingQueue ready_ops(instruction_scheduling_priority_less); + ready_ops.push(instr_id); + while (!ready_ops.empty()) { + instr_id = ready_ops.top(); + ready_ops.pop(); + auto& instr_node = vec_instruction_.at(instr_id); + + RunInstruction(instr_node); + + if (UNLIKELY(exception_holder_.IsCaught())) { + VLOG(4) << "Exception caught"; + if (exception_notifier_ != nullptr) { + exception_notifier_->NotifyEvent(); + } + return; + } + + VLOG(4) << "unfinished_op_number_: " << unfinished_op_number_; + if (UNLIKELY(unfinished_op_number_.fetch_sub( + 1, std::memory_order_relaxed) == 1)) { + if (completion_notifier_ != nullptr) { + completion_notifier_->NotifyEvent(); + } + } + + RunNextInstructions(instr_node, &ready_ops); + } +} + +void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) + PADDLE_THROW(platform::errors::Unimplemented( + "RecordStreamForGC is only implemented when compiled with GPU.")); +#else + if (!IsInterpretercoreFastGCEnabled() || + instr.KernelType() != OpFuncType::kGpuAsync) { + return; + } + + if (instr.DeviceContext().GetPlace() == phi::CustomPlace()) { + return; + } + + platform::RecordEvent record( + "RecordStreamForGC", platform::TracerEventType::UserDefined, 10); + + gpuStream_t stream = + reinterpret_cast(instr.DeviceContext()).stream(); + auto TensorRecordStream = [&stream](phi::DenseTensor& tensor) { + auto allocation = tensor.Holder(); + if (allocation == nullptr) { + return; + } + + const platform::Place& place = allocation->place(); + if (platform::is_gpu_place(place)) { + memory::RecordStream(allocation, stream); + } else if (platform::is_cuda_pinned_place(place)) { + // TODO(Ruibiao): Here should do something to make sure that the tensor + // is not freed until the H2D copies done. However, simplely launch a + // CUDA runtime callback to the H2D stream may lead a high performance + // overhead. As all the cases we meet in H2D are copies from CPUPlace at + // present, we just log a WARNING here. 
A better design is required. + LOG(WARNING) << "Copy data from a CUDAPinned tensor in an asynchronous " + "manner may lead a data inconsistent"; + } else { + // memory copies involve CPUPlace are always synchronous, so just do + // nothing here + } + }; + + /* NOTE(Ruibiao):Cross-stream tensor synchronization is required only when + * all the following conditions are satisfied: + * 1. The tensor will be GC after running the instruction, i.e., in + * instr.GCCheckVars. + * 2. The stream which initializes this tensor is different from the stream + * which the instruction run in. + * 3. The tensor is the instruction's input, cause we assume that + * instruction will initialize all output tensors with its running stream. + * 4. In the OP function of this instruction, the tensor is an input of a + * async CUDA kernel. + * + * Here we only process the first condition, because: + * 1. Since the RecordStream function will directly return when the recored + * stream is equal to the owning stream, recording a stream same as which + * initialized this tensor has less time overhead. Conversely, it may take + * more time if we try to extract those cross-stream input vars from + * instr.GCCheckVars. + * 2. Now the instruction has no idea of which vars involving async running + * in OP function, and thus we can not recognize condition 4. It should be + * supported later. + */ + for (int var_id : instr.GCCheckVars()) { + VLOG(4) << "GC sync " << var_scope_.GetNameById(var_id) << " " + << var_scope_.VarDesc(var_id); + + // persistable var will be ignore while GC + if (var_scope_.VarDesc(var_id) && + var_scope_.VarDesc(var_id)->Persistable()) { + continue; + } + + paddle::framework::Variable* var = var_scope_.VarRef(var_id); + if (var == nullptr) { + continue; + } + + if (var->IsType()) { + TensorRecordStream(*(var->GetMutable())); + } else if (var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + // do nothing + } else if (var->IsType()) { + TensorRecordStream( + *(var->GetMutable()->mutable_value())); + } else if (var->IsType()) { + auto* tensor_arr = var->GetMutable(); + for (auto& tensor : *tensor_arr) { + TensorRecordStream(tensor); + } + } else if (var->IsType>()) { + // do nothing + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "The variable(%s) is not supported in eager deletion.", + framework::ToTypeName(var->Type()))); + } + } +#endif +} + +void ProgramInterpreter::CheckGC(const Instruction& instr) { + platform::RecordEvent record( + "CheckGC", platform::TracerEventType::UserDefined, 10); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + RecordStreamForGC(instr); +#endif + auto& var_scope = var_scope_; + + for (auto var_id : instr.GCCheckVars()) { + VLOG(4) << "GC:" << var_scope_.GetNameById(var_id) << ", id:" << var_id + << ", ref:" << refs_[var_id]->DynamicRef(); + bool is_ready = refs_[var_id]->CheckAndDecrease(); + // ignore all persistable var while GC + if (var_scope.VarDesc(var_id) && var_scope.VarDesc(var_id)->Persistable()) { + continue; + } + if (is_ready) { + VLOG(6) << "Async delete variable with name : " + << var_scope.GetNameById(var_id); + gc_->Add(refs_[var_id]->Var(), instr); + } + } +} + +void ProgramInterpreter::Prepare( + const std::vector& feed_names, + const std::vector& feed_tensors, + bool prepare_feed) { + PADDLE_ENFORCE_EQ(feed_names.size(), + feed_tensors.size(), + platform::errors::PreconditionNotMet( + "Required feed_names.size() == feed_tensors.size(), " + "but received %d != %d", + feed_names.size(), + 
feed_tensors.size())); + auto FeedInput = [&] { + VLOG(4) << "Feed inputs"; + for (size_t i = 0; i < feed_names.size(); ++i) { + auto* feed_var = local_scope_->FindVar(feed_names[i]); + PADDLE_ENFORCE_NOT_NULL( + feed_var, + platform::errors::NotFound("Variable %s should not be nullptr.", + feed_names[i])); + + auto feed_tensor = feed_var->GetMutable(); + feed_tensor->ShareDataWith(feed_tensors[i]); + feed_tensor->set_lod(feed_tensors[i].lod()); + } + }; + + if (!is_build_) { + paddle::framework::interpreter::BuildVariableScope( + block_, execution_config_, &var_scope_); + FeedInput(); + std::vector op_func_nodes; + paddle::framework::interpreter::BuildOpFuncList( + place_, + block_, + execution_config_.skip_gc_vars, + &op_func_nodes, + &var_scope_, + execution_config_, + HasLocalScope(), + static_build_); + SetFeedVarsInplaceSkip(feed_names); + // convert vec func_list to graph + Convert(&op_func_nodes); + UpdateSyncOpNum(); + if (static_build_) { + VLOG(4) << "RUN impl"; + RunImpl(); + } + BuildSkipShareLoDInfo(); + is_build_ = true; + } + // NOTE: Because feed_tensor will be GC after + // paddle::framework::BuildOpFuncList, so we should + // call FeedInput again. + if (prepare_feed) { + FeedInput(); + } +} + +void ProgramInterpreter::SetFeedVarsInplaceSkip( + const std::vector& feed_names) { + for (auto& feed_name : feed_names) { + var_scope_.SetVarSikpInplace(feed_name, true); + } +} + +bool ProgramInterpreter::HasLocalScope() const { + return local_scope_ != nullptr; +} + +// Note(zhangbo): +// (1) What is "Trace"? +// The OP execute scheduling rule adopted by Interpretercore by default is a +// multi-threaded scheduling mode(see ExecuteInstructionList). By maintaining a +// high-performance thread pool, the OP's execute scheduling is distributed to +// the sub threads maintained by the thread pool, but the main thread does not +// have any tasks. In Trace mode, the executor will execute directly in the main +// thread according to the pre provided OP sequence(trace_execute_order_), +// instead of being distributed to the thread pool. +// (2) When we use "Trace"? +// In dygraph to static, This scheduling causes that the execution of the +// forward and backward OPs and the execution of the dygraph optimizer cannot be +// executed in the same thread. Executing thread switch may cause cpu cache +// miss. When a model is all KQueueAsync type OPs, all OPs will be distributed +// to the DeviceThread for execution, and the multithreading scheduling will not +// have any benefits. Therefore, in the dynamic to static, when the number of +// KQueueAsync Ops is 0, we choose Trace mode. 
+void ProgramInterpreter::TraceInstructionList( + const std::vector& vec_instr) { + unfinished_op_number_ = vec_instr.size(); + if (unfinished_op_number_ == 0) { + VLOG(4) << "No op to run, return"; + return; + } + + exception_holder_.Clear(); + + for (size_t i = 0; i < dependecy_count_.size(); ++i) { + if (dependecy_count_[i] == 0) { + // NOTE(zhiqiu): hot fix for jit input var + RecordMemcpyD2H(vec_instr.at(i)); + } + } + + for (size_t idx = 0; idx < trace_execute_order_.size(); idx++) { + auto instr_id = trace_execute_order_[idx]; + auto& instr_node = vec_instruction_.at(instr_id); + + RunInstruction(instr_node); + + if (UNLIKELY(exception_holder_.IsCaught())) { + VLOG(4) << "Exception caught"; + break; + } + } + + if (UNLIKELY(exception_holder_.IsCaught())) { + VLOG(1) << "Exception caught " << exception_holder_.Type(); + PADDLE_ENFORCE_EQ( + main_thread_blocker_.Clear(), + 0, + platform::errors::PreconditionNotMet( + "main_thread_blocker_.Clear() return -1, clear failed")); + VLOG(4) << "clear ok"; + exception_holder_.ReThrow(); + } +} + +void ProgramInterpreter::RecordMemcpyD2H(const Instruction& instr_node) { + // NOTE(zhiqiu): hot fix for jit input var + if (instr_node.OpBase()->Type() == interpreter::kMemcpyD2H) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* default_dev_ctx = pool.Get(place_); + for (auto& event : instr_node.EventsToWait()) { + platform::RecordEvent record( + "RecordStreamEvent", platform::TracerEventType::UserDefined, 10); + VLOG(3) << "Record event on default stream in jit_input_var at op: " + << instr_node.OpBase()->Type(); + event.event_->Record(default_dev_ctx); + } + } +} + +void ProgramInterpreter::UpdateSyncOpNum() { + int64_t sync_op_num = 0; + for (size_t i = 0; i < vec_instruction_.size(); ++i) { + if (vec_instruction_[i].KernelType() == OpFuncType::kCpuSync || + vec_instruction_[i].KernelType() == OpFuncType::kGpuSync) { + sync_op_num = sync_op_num + 1; + } + } + sync_op_num_ = sync_op_num; + VLOG(4) << "Update sync op num, sync op num is: " << sync_op_num_; +} + +// Note(zhangbo): +// When there is a KQueueSync type OP in the model, breadth traversal is better +// than depth traversal. For example: OP(O) ->(direct_run)-> OP(A) +// ->(sync_run)-> OP(B) OP(O) ->(direct_run)-> OP(C) ->(direct_run)-> OP(D) If B +// is run before C, B may always block to wait for A to finish executing, but in +// fact, C can be executed first during this time. 
+void ProgramInterpreter::AnalyseExecuteOrderForTrace() { + VLOG(4) << "Analyze the execution order of Trace scheduling mode."; + interpreter::ResetAtomicGuard guard(&deps_, &refs_); + + auto op_downstream_map = dependency_builder_.OpDownstreamMap(); + + auto IsReady = [this](size_t next_id) { + VLOG(4) << "op_id: " << next_id + << ", remain deps: " << deps_[next_id]->DynamicDep(); + return deps_[next_id]->CheckAndDecrease(); + }; + + std::vector trace_order; + SchedulingQueue ready_ops(instruction_scheduling_priority_less); + + for (size_t instr_id = 0; instr_id < dependecy_count_.size(); ++instr_id) { + if (dependecy_count_[instr_id] == 0) { + ready_ops.push(instr_id); + } + } + + while (!ready_ops.empty()) { + size_t now_id = ready_ops.top(); + ready_ops.pop(); + trace_order.push_back(now_id); + + auto next_op_set = op_downstream_map[now_id]; + + for (size_t next_op_id : next_op_set) { + if (IsReady(next_op_id)) { + ready_ops.push(next_op_id); + } + } + } + + PADDLE_ENFORCE_EQ( + trace_order.size(), + dependecy_count_.size(), + platform::errors::PreconditionNotMet( + "trace_order size should be equal to dependecy_count_.")); + + trace_execute_order_ = trace_order; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h new file mode 100644 index 00000000000..ca66691608d --- /dev/null +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -0,0 +1,181 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/new_executor/interpreter_base_impl.h" + +namespace paddle { +namespace framework { + +/// +/// \brief Derived Class to interpret the instructions transformed +/// from legacy ProgramDesc. 
+/// + +class ProgramInterpreter : public InterpreterBaseImpl { + using ExecutionConfig = interpreter::ExecutionConfig; + using InstructionSchedulingPriorityLess = std::function; + using SchedulingQueue = + std::priority_queue, + InstructionSchedulingPriorityLess>; + + public: + ProgramInterpreter( + const platform::Place& place, + const BlockDesc& block, + Scope* scope, + const ExecutionConfig& execution_config = ExecutionConfig()); + + ~ProgramInterpreter(); + + paddle::framework::FetchList Run( + const std::vector& feed_names, + const std::vector& feed_tensors) override; + + paddle::framework::FetchList Run(const std::vector& feed_names, + bool need_fetch = true) override; + + void ShareWorkQueueFrom(InterpreterBaseImpl* src) override; + + void SetCopyProgram(std::shared_ptr prog) override; + + void SetSkipGcVars(const std::set& skip_gc_vars) override; + + const std::set& JitInputVars() const override; + + void SetJitInputVars(const std::set& jit_input_vars) override; + + const VariableScope* GetVariableScope() const override; + + void reset_scope(Scope* new_scope) override; + + const platform::Place& GetPlace() const override { return place_; } + + void SetOutputHooks(const std::vector& hookfuncs) override { + hookfuncs_ = hookfuncs; + } + + private: + // build graph + void Convert(std::vector* op_func_nodes); + void BuildOperatorDependences(); + void BuildAndCacheInstructionCtx(Instruction* instr_node); + void BuildSkipShareLoDInfo(); + void UpdateSyncOpNum(); + void AnalyseExecuteOrderForTrace(); + + // inplace + void BuildInplace(); + bool BuildInplaceCheckVarIsOnlyInput( + const std::vector>& input_var2op, size_t var_index); + void SetFeedVarsInplaceSkip(const std::vector& feed_names); + + // cuda graph + void CheckCUDAGraphBeforeRun(const std::vector& feed_names); + void PrepareForCUDAGraphCapture(); + + // execution + void RunImpl(); + void ExecuteInstructionList(const std::vector& vec_instr); + void RunInstructionAsync(size_t instr_id); + void RunInstruction(const Instruction& instr_node); + void RunNextInstructions(const Instruction& instr_id, + SchedulingQueue* reserved_next_ops); + void RunOperator(const Instruction& instr_node); + // Trace + void TraceInstructionList(const std::vector& vec_instr); + + // only used when program contains no feed op + void Prepare(const std::vector& feed_names, + const std::vector& feed_tensors, + bool prepare_feed); + + void RecordMemcpyD2H(const Instruction& instr_node); + + // gc + void RecordStreamForGC(const Instruction& instr); + void CheckGC(const Instruction& instr); + void ClearLoDTensorArrayInLocalScope(); + + // workqueue + std::shared_ptr GetWorkQueue(); + + // scope + bool HasLocalScope() const; + + // For log and debug + std::string GetDepsString() const; + + bool is_build_{false}; + bool static_build_{false}; + + const platform::Place place_; + const BlockDesc& block_; // not owned + + interpreter::DependencyBuilder dependency_builder_; + interpreter::StreamAnalyzer stream_analyzer_; + + // NOTE(zhiqiu): when add fetch ops in GetInterpreterCore, we will + // copy a new program and block, the copy_program_ here is used to + // hold the program, otherwise block_ maybe not valid after the + // new program is deleted. 
+ std::shared_ptr copy_program_{nullptr}; + + // from variable scope + std::vector var_list_; + std::map name2id_; + std::vector vec_meta_info_; + + std::vector vec_instruction_; // deconstruct before OpFuncNode + + std::atomic unfinished_op_number_{0}; + + ExecutionConfig execution_config_; + + VariableScope var_scope_; + Scope* local_scope_{nullptr}; // not owned + + EventsWaiter main_thread_blocker_; + std::shared_ptr async_work_queue_; + + details::ExceptionHolder exception_holder_; + std::shared_ptr exception_notifier_{nullptr}; + std::shared_ptr completion_notifier_{nullptr}; + + std::unique_ptr gc_; + + // last_live_ops_[i] contains the id of operators that last access the i-th + // var + std::map> last_live_ops_; + + // dependecy_count_[i] contains the number of dependencies that the i-th op + // need to wait + std::vector dependecy_count_; + + std::vector> deps_; + std::vector> refs_; + + // used for Trace + int64_t sync_op_num_{-1}; + std::vector trace_execute_order_; + + InstructionSchedulingPriorityLess instruction_scheduling_priority_less; + + std::vector hookfuncs_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 73bb7568435..2bd6045b401 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -71,12 +71,8 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, auto base_progrm = paddle::TranslateLegacyProgramToProgram(*program); auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(base_progrm.get()); - interpretercores_.emplace_back( - std::make_unique(place_, - program->Block(0), - scope_, - std::move(kernel_program), - execution_config)); + interpretercores_.emplace_back(std::make_unique( + place_, std::move(kernel_program), scope_, execution_config)); } else { interpretercores_.emplace_back( std::make_unique(place_, diff --git a/test/cpp/new_executor/standalone_executor_new_ir_test.cc b/test/cpp/new_executor/standalone_executor_new_ir_test.cc index ae6d87370a7..d9547fc757d 100644 --- a/test/cpp/new_executor/standalone_executor_new_ir_test.cc +++ b/test/cpp/new_executor/standalone_executor_new_ir_test.cc @@ -69,8 +69,7 @@ TEST(StandaloneExecutor, run) { Scope scope; ProgramDesc prog_desc; - InterpreterCore test_core( - place, prog_desc.Block(0), &scope, std::move(kernel_program)); + InterpreterCore test_core(place, std::move(kernel_program), &scope); test_core.Run({}); @@ -139,8 +138,7 @@ TEST(StandaloneExecutor, run_2) { ProgramDesc prog_desc; - InterpreterCore test_core( - place, prog_desc.Block(0), &scope, std::move(kernel_program)); + InterpreterCore test_core(place, std::move(kernel_program), &scope); test_core.Run({}); -- GitLab
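Note for reviewers: the AnalyseExecuteOrderForTrace routine that appears in both new_ir_interpreter.cc and program_interpreter.cc is, at its core, a priority-driven topological sort over dependecy_count_ and the op downstream map. The standalone sketch below illustrates the same idea; every name in it is invented for illustration (it is not Paddle code), and the comparator is simplified to "smaller op id first" rather than the scheduling-priority comparator used in the patch.

#include <cstddef>
#include <iostream>
#include <queue>
#include <vector>

// Toy priority-driven topological sort over an op dependency graph.
std::vector<size_t> AnalyseTraceOrder(
    const std::vector<std::vector<size_t>>& downstream,  // op id -> downstream op ids
    std::vector<size_t> dep_count) {                      // op id -> unmet dependency count
  // Simplified tie-breaking: smaller op id runs first. The patch orders by
  // scheduling priority instead; this only keeps the toy output deterministic.
  auto cmp = [](size_t lhs, size_t rhs) { return lhs > rhs; };
  std::priority_queue<size_t, std::vector<size_t>, decltype(cmp)> ready(cmp);

  for (size_t i = 0; i < dep_count.size(); ++i) {
    if (dep_count[i] == 0) ready.push(i);  // ops with no pending inputs
  }

  std::vector<size_t> trace_order;
  while (!ready.empty()) {
    size_t now = ready.top();
    ready.pop();
    trace_order.push_back(now);
    for (size_t next : downstream[now]) {
      if (--dep_count[next] == 0) ready.push(next);  // all inputs finished
    }
  }
  // If trace_order.size() != dep_count.size(), the graph contained a cycle.
  return trace_order;
}

int main() {
  // op0 -> {op1, op2}, op1 -> {op3}, op2 -> {op3}
  std::vector<std::vector<size_t>> downstream = {{1, 2}, {3}, {3}, {}};
  std::vector<size_t> dep_count = {0, 1, 1, 2};
  for (size_t id : AnalyseTraceOrder(downstream, dep_count)) std::cout << id << ' ';
  std::cout << '\n';  // prints: 0 1 2 3
  return 0;
}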
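Similarly, both new headers derive from the shared InterpreterBaseImpl interface, and the StandaloneExecutor and test hunks at the end now pick the concrete interpreter (ProgramInterpreter or NewIRInterpreter) at construction time. A toy sketch of that facade-over-virtual-implementation arrangement, again with invented names only and not reflecting real Paddle APIs:

#include <iostream>
#include <memory>
#include <string>

// Invented names; this mirrors only the shape of the change, not Paddle code.
class BaseImpl {
 public:
  virtual ~BaseImpl() = default;
  virtual std::string Run() = 0;
};

class LegacyProgramImpl : public BaseImpl {
 public:
  std::string Run() override { return "executed legacy ProgramDesc"; }
};

class NewIrImpl : public BaseImpl {
 public:
  std::string Run() override { return "executed lowered IR program"; }
};

// Facade with a stable public surface; the backend is chosen once, up front.
class Core {
 public:
  explicit Core(bool use_new_ir) {
    if (use_new_ir) {
      impl_ = std::make_unique<NewIrImpl>();
    } else {
      impl_ = std::make_unique<LegacyProgramImpl>();
    }
  }
  std::string Run() { return impl_->Run(); }

 private:
  std::unique_ptr<BaseImpl> impl_;  // owns exactly one concrete interpreter
};

int main() {
  std::cout << Core(false).Run() << '\n';
  std::cout << Core(true).Run() << '\n';
  return 0;
}

The indirection lets callers keep a single stable type while the executing backend is swapped per program kind, which is what the updated StandaloneExecutor construction and the simplified InterpreterCore test constructors above rely on.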