diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index 8c64d65ff4be660150519ea28f48c24144fc2e27..6e5578a2d12b4c29445c1ee4597431a647a13c9a 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -31,6 +31,36 @@ namespace paddle { namespace framework { +void OpInOutInfo::Build(const OperatorBase *op) { + is_built_ = true; + auto &inferer = op->Info().NoNeedBufferVarsInferer(); + if (inferer) { + no_need_buffer_ins_ = inferer(op->Inputs(), op->Outputs(), op->Attrs()); + + if (no_need_buffer_ins_.empty()) return; + + for (auto &in_name_pair : op->Inputs()) { + if (no_need_buffer_ins_.count(in_name_pair.first) != 0) { + continue; + } + + for (auto &in_arg_name : in_name_pair.second) { + other_args_set_.insert(in_arg_name); + } + } + + for (auto &out_name_pair : op->Outputs()) { + for (auto &out_arg_name : out_name_pair.second) { + other_args_set_.insert(out_arg_name); + } + } + } +} + +bool OpInOutInfo::IsInArgBufferNeeded(const std::string &in_arg_name) const { + return no_need_buffer_ins_.empty() || other_args_set_.count(in_arg_name) != 0; +} + static bool VarCanBeDeleted(const std::string &name, const BlockDesc &block, const std::unordered_set &skip_vars) { if (skip_vars.count(name) != 0) { diff --git a/paddle/fluid/framework/executor_gc_helper.h b/paddle/fluid/framework/executor_gc_helper.h index e354a83e5c8caca8c22f6d930fb978485b1f94a4..80bd68f7c83042cec9b855310d2e2e31e54334ce 100644 --- a/paddle/fluid/framework/executor_gc_helper.h +++ b/paddle/fluid/framework/executor_gc_helper.h @@ -33,38 +33,11 @@ class Scope; struct OpInOutInfo { public: - void Build(const OperatorBase *op) { - is_built_ = true; - auto &inferer = op->Info().NoNeedBufferVarsInferer(); - if (inferer) { - no_need_buffer_ins_ = inferer(op->Inputs(), op->Outputs(), op->Attrs()); - - if (no_need_buffer_ins_.empty()) return; - - for (auto &in_name_pair : op->Inputs()) { - if (no_need_buffer_ins_.count(in_name_pair.first) != 0) { - continue; - } - - for (auto &in_arg_name : in_name_pair.second) { - other_args_set_.insert(in_arg_name); - } - } - - for (auto &out_name_pair : op->Outputs()) { - for (auto &out_arg_name : out_name_pair.second) { - other_args_set_.insert(out_arg_name); - } - } - } - } + void Build(const OperatorBase *op); bool IsBuilt() const { return is_built_; } - bool IsInArgBufferNeeded(const std::string &in_arg_name) const { - return no_need_buffer_ins_.empty() || - other_args_set_.count(in_arg_name) != 0; - } + bool IsInArgBufferNeeded(const std::string &in_arg_name) const; private: // A set to record unused buffer input vars of op diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index d758e98b417e70a89e729e003c5554717b5c5100..622aeec142c3a2d50d2d07e4c4afaf46e46ff011 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -3,10 +3,11 @@ lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper l graph_to_program_pass variable_helper timer monitor nan_inf_utils) cc_library(workqueue SRCS workqueue.cc workqueue_utils.cc DEPS enforce) -cc_library(interpretercore_garbage_collector SRCS interpretercore_garbage_collector.cc DEPS workqueue ${DEVICE_EVENT_LIBS}) -cc_library(interpretercore_util SRCS interpretercore_util.cc DEPS ${INTERPRETERCORE_DEPS} workqueue) -cc_library(event_manager SRCS event_manager.cc DEPS ${DEVICE_EVENT_LIBS} glog) 
-cc_library(stream_analyzer SRCS stream_analyzer.cc DEPS ${DEVICE_EVENT_LIBS} glog device_context)
+cc_library(new_executor_defs SRCS new_executor_defs.cc DEPS enforce glog scope)
+cc_library(interpretercore_garbage_collector SRCS interpretercore_garbage_collector.cc DEPS workqueue ${DEVICE_EVENT_LIBS} executor_gc_helper)
+cc_library(interpretercore_util SRCS interpretercore_util.cc DEPS ${INTERPRETERCORE_DEPS} workqueue new_executor_defs)
+cc_library(event_manager SRCS event_manager.cc DEPS ${DEVICE_EVENT_LIBS} glog new_executor_defs)
+cc_library(stream_analyzer SRCS stream_analyzer.cc DEPS ${DEVICE_EVENT_LIBS} glog device_context new_executor_defs)
 cc_library(interpretercore SRCS interpretercore.cc DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util interpretercore_garbage_collector stream_analyzer event_manager)
 cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore)
 cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue)
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 89810fd303802c1eeec86062f2e43d620fef3692..84b765680fdbd96f2cb78d9502dd762fac960e43 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -121,6 +121,8 @@ void InterpreterCore::Convert() {
     for (auto var_id : gc_check_input_list) {
       vec_meta_info[var_id].var_ref_count_++;
       instr.AddGCCheckVar(var_id);
+      VLOG(4) << "clear " << global_scope_->GetNameById(var_id) << " after "
+              << instr.OpBase()->Type();
     }
   }
 
@@ -131,6 +133,8 @@ void InterpreterCore::Convert() {
     if (input_var2op_info_.at(id).size() == 0) {
       // output var not be used by any kernel
       vec_instruction_[i].AddGCCheckVar(id);
+      VLOG(4) << "clear " << global_scope_->GetNameById(id) << " after "
+              << vec_instruction_[i].OpBase()->Type();
       vec_meta_info[id].var_ref_count_++;
     }
   }
@@ -437,6 +441,8 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
 
     try {
       RunInstruction(instr_node);
+      // GC information
+      CheckGC(instr_node);
     } catch (platform::EnforceNotMet& ex) {
       framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex);
       exception_holder_.Catch(std::make_exception_ptr(std::move(ex)));
@@ -463,9 +469,6 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
     interpreter::RecordEvent(instr_node, place_);
     op_run_number_.fetch_add(1, std::memory_order_relaxed);
 
-    // GC infomation
-    CheckGC(instr_node);
-
     RunNextInstructions(instr_node, &ready_ops);
   }
 }
@@ -476,6 +479,9 @@ void InterpreterCore::CheckGC(const Instruction& instr) {
   auto& atomic_var_ref = async_work_queue_->AtomicVarRef();
 
   for (auto var_id : instr.GCCheckVars()) {
+    VLOG(4) << "GC " << global_scope_->GetNameById(var_id) << " "
+            << var_scope.VarDesc(var_id);
+
     bool is_ready =
         atomic_var_ref[var_id]->fetch_sub(1, std::memory_order_relaxed) == 1;
     // ignore all persistable var while GC
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc
new file mode 100644
index 0000000000000000000000000000000000000000..221ad2dd628b6fbaff707aa5be677541171f1db0
--- /dev/null
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc
@@ -0,0 +1,633 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/framework/new_executor/new_executor_defs.h" +#include "paddle/fluid/framework/rw_lock.h" + +// When in inference scenario, the scopes will not be written by two threads in +// a mean time, but a scope may be read by multiple threads concurrently, and +// the mutex will cause serious performance issue. +// So the mutex is disabled when `ON_INFER`. +#ifdef PADDLE_ON_INFERENCE +#define SCOPE_VARS_READER_LOCK +#define SCOPE_VARS_WRITER_LOCK +#else +#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); +#endif + +namespace paddle { +namespace framework { + +InterpretercoreInferShapeContext::InterpretercoreInferShapeContext( + const OperatorBase& op, const RuntimeContext& ctx) + : op_(op), ctx_(ctx), can_skip_lod_(false) {} + +bool InterpretercoreInferShapeContext::HasInput(const std::string& name) const { + // has only one input + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end()) { + return false; + } + const auto& in = it->second; + if (in.size() == 0) return false; + PADDLE_ENFORCE_EQ( + in.size(), 1UL, + platform::errors::InvalidArgument( + "Input %s should not contain more than one inputs.", name)); + return in[0] != nullptr; +} + +bool InterpretercoreInferShapeContext::HasOutput( + const std::string& name) const { + // has only one output + const auto& outs = ctx_.outputs; + auto it = outs.find(name); + if (it == outs.end()) { + return false; + } + const auto& out = it->second; + if (out.size() == 0) { + return false; + } + PADDLE_ENFORCE_EQ( + out.size(), 1UL, + platform::errors::InvalidArgument( + "Output %s should not contain more than one outputs.", name)); + return out[0] != nullptr; +} + +bool InterpretercoreInferShapeContext::HasInputs( + const std::string& name) const { + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end() || it->second.empty()) { + return false; + } + for (auto& input : it->second) { + if (input == nullptr) { + return false; + } + } + return true; +} + +bool InterpretercoreInferShapeContext::HasOutputs( + const std::string& name) const { + const auto& outs = ctx_.outputs; + auto it = outs.find(name); + if (it == outs.end() || it->second.empty()) { + return false; + } + for (auto& output : it->second) { + if (output == nullptr) { + return false; + } + } + return true; +} + +AttrReader InterpretercoreInferShapeContext::Attrs() const { + return AttrReader(op_.Attrs()); +} + +std::vector InterpretercoreInferShapeContext::Inputs( + const std::string& name) const { + return op_.Inputs(name); +} + +std::vector InterpretercoreInferShapeContext::Outputs( + const std::string& name) const { + return op_.Outputs(name); +} + +std::string InterpretercoreInferShapeContext::GetInputNameByIdx( + size_t idx) const { + auto& op_proto = + paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; + PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(), + platform::errors::OutOfRange( + "The index should be less than the size of inputs of " + "operator %s, but 
got index is %d and size is %d", + op_.Type(), idx, op_proto->inputs().size())); + return op_proto->inputs()[idx].name(); +} + +std::string InterpretercoreInferShapeContext::GetOutputNameByIdx( + size_t idx) const { + auto& op_proto = + paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; + PADDLE_ENFORCE_LT(idx, op_proto->outputs().size(), + platform::errors::OutOfRange( + "The index should be less than the size of outputs of " + "operator %s, but got index is %d and size is %d", + op_.Type(), idx, op_proto->outputs().size())); + return op_proto->outputs()[idx].name(); +} + +void InterpretercoreInferShapeContext::ShareDim(const std::string& in, + const std::string& out, + size_t i, size_t j) { + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE_NE(in_it, ctx_.inputs.end(), + platform::errors::NotFound("Input %s does not exist.", in)); + PADDLE_ENFORCE_NE( + out_it, ctx_.outputs.end(), + platform::errors::NotFound("Output %s does not exist.", out)); + PADDLE_ENFORCE_LT(i, in_it->second.size(), + platform::errors::InvalidArgument( + "The index of input dimension is out of range, " + "excepted index less than %zu, but received %zu.", + in_it->second.size(), i)); + PADDLE_ENFORCE_LT(j, out_it->second.size(), + platform::errors::InvalidArgument( + "The index of output dimension is out of range, " + "excepted index less than %zu, but received %zu.", + out_it->second.size(), j)); + + Variable* in_var = in_it->second[i]; + Variable* out_var = out_it->second[j]; + + PADDLE_ENFORCE_EQ( + in_var->Type(), out_var->Type(), + platform::errors::InvalidArgument( + "The type of input (%s) and output (%s) are inconsistent.", in, out)); + + if (in_var->IsType()) { + auto& in_sele_rows = in_var->Get(); + auto out_sele_rows = out_var->GetMutable(); + out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); + out_sele_rows->set_rows(in_sele_rows.rows()); + out_sele_rows->set_height(in_sele_rows.height()); + } else if (in_var->IsType()) { + auto& in_lod_tensor = in_var->Get(); + auto* out_lod_tensor = out_var->GetMutable(); + out_lod_tensor->Resize(in_lod_tensor.dims()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently, the input type of ShareDim only can be LoDTensor " + "or SelectedRows.")); + } +} + +void InterpretercoreInferShapeContext::ShareAllLoD( + const std::string& in, const std::string& out) const { + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE_NE(in_it, ctx_.inputs.end(), + platform::errors::NotFound( + "Input [%s] found error in Op [%s]", in, op_.Type())); + PADDLE_ENFORCE_NE(out_it, ctx_.outputs.end(), + platform::errors::NotFound( + "Output [%s] found error in Op [%s]", out, op_.Type())); + + auto& in_var_list = in_it->second; + auto& out_var_list = out_it->second; + + PADDLE_ENFORCE_EQ( + in_var_list.size(), out_var_list.size(), + platform::errors::PreconditionNotMet( + "Op [%s]: Input var size should be equal with output var size", + op_.Type())); + + auto& out_var_names = op_.Outputs(out); + + for (size_t i = 0; i < in_var_list.size(); ++i) { + if (out_var_names[i] == framework::kEmptyVarName) { + continue; + } + + Variable* in_var = in_var_list[i]; + if (!in_var->IsType()) return; + Variable* out_var = out_var_list[i]; + PADDLE_ENFORCE_EQ(out_var->IsType(), true, + platform::errors::PreconditionNotMet( + "The %d-th output of Output(%s) must be LoDTensor.", + i, out_var_names[i])); + auto& in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); 
+    out_tensor->set_lod(in_tensor.lod());
+#ifdef PADDLE_WITH_MKLDNN
+    if (in_tensor.layout() != DataLayout::kMKLDNN)
+#endif
+      out_tensor->set_layout(in_tensor.layout());
+  }
+}
+
+void InterpretercoreInferShapeContext::ShareLoD(const std::string& in,
+                                                const std::string& out,
+                                                size_t i, size_t j) const {
+  if (can_skip_lod_) {
+    return;
+  }
+  auto in_it = ctx_.inputs.find(in);
+  auto out_it = ctx_.outputs.find(out);
+  PADDLE_ENFORCE_NE(in_it, ctx_.inputs.end(),
+                    platform::errors::NotFound("Input %s does not exist.", in));
+  PADDLE_ENFORCE_NE(
+      out_it, ctx_.outputs.end(),
+      platform::errors::NotFound("Output %s does not exist.", out));
+  PADDLE_ENFORCE_LT(i, in_it->second.size(),
+                    platform::errors::InvalidArgument(
+                        "The index of input dimension is out of range, "
+                        "expected index less than %zu, but received %zu.",
+                        in_it->second.size(), i));
+  PADDLE_ENFORCE_LT(j, out_it->second.size(),
+                    platform::errors::InvalidArgument(
+                        "The index of output dimension is out of range, "
+                        "expected index less than %zu, but received %zu.",
+                        out_it->second.size(), j));
+
+  Variable* in_var = in_it->second.at(i);
+  if (!in_var->IsType<LoDTensor>()) return;
+  Variable* out_var = out_it->second.at(j);
+  PADDLE_ENFORCE_EQ(
+      out_var->IsType<LoDTensor>(), true,
+      platform::errors::InvalidArgument(
+          "The %zu-th output of Output(%s) must be LoDTensor.", j, out));
+  auto& in_tensor = in_var->Get<LoDTensor>();
+  auto* out_tensor = out_var->GetMutable<LoDTensor>();
+  out_tensor->set_lod(in_tensor.lod());
+
+// TODO(dzhwinter) : reuse ShareLoD in most operators.
+// Need to call ShareLayout explicitly in sequence related ops.
+// Shall we have a better method to share info between in/out Tensor?
+#ifdef PADDLE_WITH_MKLDNN
+  // Fix me: ugly workaround below
+  // Correct solution:
+  // set_layout() should NOT be called here (i.e. ShareLoD). Instead,
+  // layout of output tensor should be set "manually" in Compute()
+  // of each OPKernel. The reason layout should NOT be shared between
+  // input and output "automatically" (now by InferShape()->ShareLoD())
+  // is that layout transform may occur after InferShape().
+  // Workaround:
+  // Skip set_layout() when input layout is kMKLDNN
+  // This is to avoid kMKLDNN being populated wrongly into a non-MKLDNN
+  // OPKernel. In all MKLDNN OPKernels, set_layout(kMKLDNN) should be called
+  // in Compute()
+  if (in_tensor.layout() != DataLayout::kMKLDNN)
+#endif
+    out_tensor->set_layout(in_tensor.layout());
+}
+
+int32_t InterpretercoreInferShapeContext::GetLoDLevel(const std::string& in,
+                                                      size_t i) const {
+  PADDLE_THROW(platform::errors::PreconditionNotMet(
+      "GetLoDLevel is only used in compile time. The calculation of "
+      "output's actual lod is different among operators so that should be "
+      "set in the runtime kernel."));
+}
+
+void InterpretercoreInferShapeContext::SetLoDLevel(const std::string& out,
+                                                   int32_t lod_level,
+                                                   size_t j) const {
+  PADDLE_THROW(platform::errors::PreconditionNotMet(
+      "SetLoDLevel is only used in compile time. The calculation of "
+      "output's actual lod is different among operators so that should be "
+      "set in the runtime kernel."));
+}
+
+bool InterpretercoreInferShapeContext::IsRuntime() const { return true; }
+
+// TODO(paddle-dev): Can this be template?
+std::vector<InferShapeVarPtr> InterpretercoreInferShapeContext::GetInputVarPtrs(
+    const std::string& name) {
+  const std::vector<Variable*>& vars = InputVars(name);
+  std::vector<InferShapeVarPtr> res;
+  res.reserve(vars.size());
+  res.insert(res.begin(), vars.begin(), vars.end());
+  return res;
+}
+
+std::vector<InferShapeVarPtr>
+InterpretercoreInferShapeContext::GetOutputVarPtrs(const std::string& name) {
+  const std::vector<Variable*>& vars = OutputVars(name);
+  std::vector<InferShapeVarPtr> res;
+  res.reserve(vars.size());
+  res.insert(res.begin(), vars.begin(), vars.end());
+  return res;
+}
+
+DDim InterpretercoreInferShapeContext::GetInputDim(
+    const std::string& name) const {
+  const std::vector<Variable*>& vars = InputVars(name);
+  PADDLE_ENFORCE_EQ(
+      vars.size(), 1UL,
+      platform::errors::InvalidArgument(
+          "Input(%s) should hold one element, but now it holds %zu elements.",
+          name, vars.size()));
+  return this->GetDim(vars[0]);
+}
+
+std::vector<DDim> InterpretercoreInferShapeContext::GetInputsDim(
+    const std::string& name) const {
+  const std::vector<Variable*>& vars = InputVars(name);
+  return GetDims(vars);
+}
+
+std::vector<proto::VarType::Type>
+InterpretercoreInferShapeContext::GetInputsVarType(
+    const std::string& name) const {
+  return GetVarTypes(InputVars(name));
+}
+
+std::vector<proto::VarType::Type>
+InterpretercoreInferShapeContext::GetOutputsVarType(
+    const std::string& name) const {
+  return GetVarTypes(OutputVars(name));
+}
+
+void InterpretercoreInferShapeContext::SetOutputDim(const std::string& name,
+                                                    const DDim& dim) {
+  auto& vars = OutputVars(name);
+  PADDLE_ENFORCE_EQ(vars.size(), 1UL, platform::errors::InvalidArgument(
+                                          "Output(%s) should hold one element, "
+                                          "but now it holds %zu elements.",
+                                          name, vars.size()));
+  SetDim(vars[0], dim);
+}
+
+void InterpretercoreInferShapeContext::SetOutputsDim(
+    const std::string& name, const std::vector<DDim>& dims) {
+  auto& vars = OutputVars(name);
+  SetDims(vars, dims);
+}
+
+void InterpretercoreInferShapeContext::SetSkipLoD(bool skip) {
+  can_skip_lod_ = skip;
+}
+
+DDim InterpretercoreInferShapeContext::GetDim(Variable* var) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::InvalidArgument("Input variable is nullptr."));
+  if (var->IsType<LoDTensor>()) {
+    return var->Get<LoDTensor>().dims();
+  } else if (var->IsType<SelectedRows>()) {
+    return var->Get<SelectedRows>().GetCompleteDims();
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Only LoDTensor or SelectedRows support 'GetDim', but input "
+        "Variable's type is %s.",
+        ToTypeName(var->Type())));
+  }
+}
+
+std::vector<DDim> InterpretercoreInferShapeContext::GetDims(
+    const std::vector<Variable*>& vars) const {
+  std::vector<DDim> ret;
+  ret.reserve(vars.size());
+  std::transform(vars.begin(), vars.end(), std::back_inserter(ret),
+                 [this](Variable* var) { return this->GetDim(var); });
+  return ret;
+}
+
+std::vector<DDim> InterpretercoreInferShapeContext::GetRepeatedDims(
+    const std::string& name) const {
+  PADDLE_THROW(platform::errors::PreconditionNotMet(
+      "GetRepeatedDims method only can be used in compile time."));
+}
+
+void InterpretercoreInferShapeContext::SetDim(Variable* var, const DDim& dim) {
+  if (var->IsType<LoDTensor>()) {
+    var->GetMutable<LoDTensor>()->Resize(dim);
+  } else if (var->IsType<SelectedRows>()) {
+    var->GetMutable<SelectedRows>()->set_height(dim[0]);
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Variable type error, expect LoDTensor or SelectedRows, but received "
+        "(%s).",
+        ToTypeName(var->Type())));
+  }
+}
+
+void InterpretercoreInferShapeContext::SetDims(
+    const std::vector<Variable*>& vars, const std::vector<DDim>& dims) {
+  size_t length = vars.size();
+  PADDLE_ENFORCE_EQ(length, dims.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of input variables does not match the "
"number of input dimensions, the number of variables " + "is %zu, the number of dimensions is %zu.", + length, dims.size())); + for (size_t i = 0; i < length; ++i) { + if (vars[i] == nullptr) { + continue; + } + SetDim(vars[i], dims[i]); + } +} + +void InterpretercoreInferShapeContext::SetRepeatedDims( + const std::string& name, const std::vector& dims) { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "SetRepeatedDims method only can be used in compile time.")); +} + +std::vector InterpretercoreInferShapeContext::GetVarTypes( + const std::vector& vars) const { + std::vector retv; + retv.resize(vars.size()); + std::transform( + vars.begin(), vars.end(), retv.begin(), + std::bind(std::mem_fn(&InterpretercoreInferShapeContext::GetVarType), + this, std::placeholders::_1)); + return retv; +} + +proto::VarType::Type InterpretercoreInferShapeContext::GetVarType( + Variable* var) const { + return ToVarType(var->Type()); +} + +const std::vector& InterpretercoreInferShapeContext::InputVars( + const std::string& name) const { + auto it = ctx_.inputs.find(name); + PADDLE_ENFORCE_NE( + it, ctx_.inputs.end(), + platform::errors::NotFound("Operator (%s) does not have the input (%s).", + op_.Type(), name)); + return it->second; +} + +const std::vector& InterpretercoreInferShapeContext::OutputVars( + const std::string& name) const { + auto it = ctx_.outputs.find(name); + PADDLE_ENFORCE_NE( + it, ctx_.outputs.end(), + platform::errors::NotFound( + "Operator (%s) does not have the outputs (%s).", op_.Type(), name)); + return it->second; +} + +VariableScope::VariableScope(Scope* scope) { + // for @EMPTY@ variable + var_list_.push_back(nullptr); + name2id_[kEmptyVarName] = 0; + vec_meta_info_.emplace_back(0, nullptr); + scope_ = scope; + PADDLE_ENFORCE_NE( + scope, nullptr, + platform::errors::PreconditionNotMet( + "You have passed a nullptr to construct VariableScope.")); + listener_ = std::make_shared(this); + scope->AddListener(listener_); +} + +VariableScope::~VariableScope() { + if (scope_ && listener_) { + scope_->DelListener(listener_); + } +} + +const Scope* VariableScope::GetScope() const { return scope_; } + +Variable* VariableScope::FindVar(const std::string& name) const { + auto it = name2id_.find(name); + if (it != name2id_.end()) { + PADDLE_ENFORCE_LT(it->second, var_list_.size(), + platform::errors::NotFound( + "The id(%d) of variable(%s) should not be larger " + "than the size of variable list(%d).", + it->second, name, var_list_.size())); + return var_list_[it->second]; + } + return nullptr; +} + +// Get variable id by name, return -1 if not found +int VariableScope::GetIdByName(const std::string& name) const { + auto it = name2id_.find(name); + if (it != name2id_.end()) { + return it->second; + } + return -1; +} + +// Get variable name by id, return "" if not found +std::string VariableScope::GetNameById(int id) const { + // NOTE(zhiqiu): do not use vec_meta_info_[id].vardesc_->Name() since + // vec_meta_info_[id] may be nullptr, + // typically when the target variable is not existed in the original program + // desc, but created by interpretercore. + // For example, created and used by d2h_copy or h2d_copy operator. 
+  auto it = std::find_if(name2id_.begin(), name2id_.end(),
+                         [id](const auto& pair) { return pair.second == id; });
+  if (it != name2id_.end()) {
+    return it->first;
+  }
+  return "";
+}
+
+bool VariableScope::HasVar(const std::string& name) const {
+  return name2id_.find(name) != name2id_.end();
+}
+
+int VariableScope::VarId(const std::string& name) const {
+  CheckExist(name);
+  return name2id_.at(name);
+}
+
+Variable* VariableScope::Var(int id) const { return var_list_.at(id); }
+
+Variable* VariableScope::Var(const std::string& name) const {
+  return var_list_.at(VarId(name));
+}
+
+size_t VariableScope::VarSize() const { return var_list_.size(); }
+
+void VariableScope::AddVar(const std::string& name,
+                           framework::VarDesc* var_desc) {  // NOLINT
+  auto v = scope_->Var(name);
+  if (nullptr == var_desc) {
+    v->GetMutable<LoDTensor>();
+  } else {
+    InitializeVariable(
+        v,
+        var_desc
+            ->GetType());  // Scope doesn't initialize newly created variables
+  }
+  SetVarDesc(name, var_desc);
+}
+
+void VariableScope::AddVar(const std::string& name,
+                           const Variable& var) {  // NOLINT
+  // Though the name already exists in outer_scope_, we still need to
+  // add it again to create the name2id map.
+  scope_->Var(name);
+}
+
+void VariableScope::SetVarDesc(const std::string& name,
+                               framework::VarDesc* var_desc) {
+  CheckExist(name);
+  vec_meta_info_[VarId(name)].var_desc_ = var_desc;
+}
+
+paddle::framework::VarDesc* VariableScope::VarDesc(
+    const std::string& name) const {
+  return VarDesc(VarId(name));
+}
+
+paddle::framework::VarDesc* VariableScope::VarDesc(int id) const {
+  CheckExist(id);
+  return vec_meta_info_[id].var_desc_;
+}
+
+void VariableScope::CheckExist(int id) const {
+  PADDLE_ENFORCE_LT(id, var_list_.size(),
+                    platform::errors::PreconditionNotMet(
+                        "Required var_id < %d, but received var_id = %d.",
+                        var_list_.size(), id));
+}
+
+void VariableScope::CheckExist(const std::string& name) const {
+  PADDLE_ENFORCE_EQ(HasVar(name), true, platform::errors::NotFound(
+                                            "%s not in VariableScope.", name));
+}
+
+VariableScopeListener::VariableScopeListener(VariableScope* var_scope) {
+  var_scope_ = var_scope;
+}
+
+void VariableScopeListener::onCreateVariable(const std::string& name) {
+  auto v = var_scope_->scope_->GetVar(name);  // must exist in outer_scope_
+  if (!var_scope_->HasVar(name)) {            // may exist in variable scope.
+ VLOG(4) << "Calling VariableScope::onCreateVariable with var_name: " + << name; + var_scope_->name2id_[name] = var_scope_->VarSize(); + var_scope_->var_list_.emplace_back(v); + var_scope_->vec_meta_info_.emplace_back(0, nullptr); + } +} + +void VariableScopeListener::onDeleteVariable(const std::string& name) { + if (var_scope_->HasVar(name)) { + VLOG(4) << "Calling VariableScope::onDeleteVariable with var_name: " + << name; + } +} +void VariableScopeListener::onRenameVariable(const std::string& old_name, + const std::string& new_name) {} +void VariableScopeListener::onCreateScope(Scope* Scope) {} +void VariableScopeListener::onDeleteScope(Scope* Scope) {} +void VariableScopeListener::onClear() {} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index c765b7fe4d2acc822308528db294be30c4dbdc7b..4206f2733a06b3bc76dabdddded67bdd86ee2c99 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -19,10 +19,23 @@ #include #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" +// When in inference scenario, the scopes will not be written by two threads in +// a mean time, but a scope may be read by multiple threads concurrently, and +// the mutex will cause serious performance issue. +// So the mutex is disabled when `ON_INFER`. +#ifdef PADDLE_ON_INFERENCE +#define SCOPE_VARS_READER_LOCK +#define SCOPE_VARS_WRITER_LOCK +#else +#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); +#endif + namespace paddle { namespace framework { @@ -33,429 +46,90 @@ using OpKernelMap = class InterpretercoreInferShapeContext : public InferShapeContext { public: InterpretercoreInferShapeContext(const OperatorBase& op, - const RuntimeContext& ctx) - : op_(op), ctx_(ctx), can_skip_lod_(false) {} - - bool HasInput(const std::string& name) const override { - // has only one input - const auto& ins = ctx_.inputs; - auto it = ins.find(name); - if (it == ins.end()) { - return false; - } - const auto& in = it->second; - if (in.size() == 0) return false; - PADDLE_ENFORCE_EQ( - in.size(), 1UL, - platform::errors::InvalidArgument( - "Input %s should not contain more than one inputs.", name)); - return in[0] != nullptr; - } + const RuntimeContext& ctx); - bool HasOutput(const std::string& name) const override { - // has only one output - const auto& outs = ctx_.outputs; - auto it = outs.find(name); - if (it == outs.end()) { - return false; - } - const auto& out = it->second; - if (out.size() == 0) { - return false; - } - PADDLE_ENFORCE_EQ( - out.size(), 1UL, - platform::errors::InvalidArgument( - "Output %s should not contain more than one outputs.", name)); - return out[0] != nullptr; - } + bool HasInput(const std::string& name) const override; - bool HasInputs(const std::string& name) const override { - const auto& ins = ctx_.inputs; - auto it = ins.find(name); - if (it == ins.end() || it->second.empty()) { - return false; - } - for (auto& input : it->second) { - if (input == nullptr) { - return false; - } - } - return true; - } + bool HasOutput(const std::string& name) const override; - bool HasOutputs(const std::string& name) const override { - const auto& outs = 
ctx_.outputs; - auto it = outs.find(name); - if (it == outs.end() || it->second.empty()) { - return false; - } - for (auto& output : it->second) { - if (output == nullptr) { - return false; - } - } - return true; - } + bool HasInputs(const std::string& name) const override; - AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } + bool HasOutputs(const std::string& name) const override; - std::vector Inputs(const std::string& name) const override { - return op_.Inputs(name); - } + AttrReader Attrs() const override; - std::vector Outputs(const std::string& name) const override { - return op_.Outputs(name); - } + std::vector Inputs(const std::string& name) const override; - std::string GetInputNameByIdx(size_t idx) const override { - auto& op_proto = - paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; - PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(), - platform::errors::OutOfRange( - "The index should be less than the size of inputs of " - "operator %s, but got index is %d and size is %d", - op_.Type(), idx, op_proto->inputs().size())); - return op_proto->inputs()[idx].name(); - } + std::vector Outputs(const std::string& name) const override; - std::string GetOutputNameByIdx(size_t idx) const override { - auto& op_proto = - paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; - PADDLE_ENFORCE_LT( - idx, op_proto->outputs().size(), - platform::errors::OutOfRange( - "The index should be less than the size of outputs of " - "operator %s, but got index is %d and size is %d", - op_.Type(), idx, op_proto->outputs().size())); - return op_proto->outputs()[idx].name(); - } + std::string GetInputNameByIdx(size_t idx) const override; + + std::string GetOutputNameByIdx(size_t idx) const override; void ShareDim(const std::string& in, const std::string& out, size_t i = 0, - size_t j = 0) override { - auto in_it = ctx_.inputs.find(in); - auto out_it = ctx_.outputs.find(out); - PADDLE_ENFORCE_NE( - in_it, ctx_.inputs.end(), - platform::errors::NotFound("Input %s does not exist.", in)); - PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), - platform::errors::NotFound("Output %s does not exist.", out)); - PADDLE_ENFORCE_LT(i, in_it->second.size(), - platform::errors::InvalidArgument( - "The index of input dimension is out of range, " - "excepted index less than %zu, but received %zu.", - in_it->second.size(), i)); - PADDLE_ENFORCE_LT(j, out_it->second.size(), - platform::errors::InvalidArgument( - "The index of output dimension is out of range, " - "excepted index less than %zu, but received %zu.", - out_it->second.size(), j)); - - Variable* in_var = in_it->second[i]; - Variable* out_var = out_it->second[j]; - - PADDLE_ENFORCE_EQ( - in_var->Type(), out_var->Type(), - platform::errors::InvalidArgument( - "The type of input (%s) and output (%s) are inconsistent.", in, - out)); - - if (in_var->IsType()) { - auto& in_sele_rows = in_var->Get(); - auto out_sele_rows = out_var->GetMutable(); - out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); - out_sele_rows->set_rows(in_sele_rows.rows()); - out_sele_rows->set_height(in_sele_rows.height()); - } else if (in_var->IsType()) { - auto& in_lod_tensor = in_var->Get(); - auto* out_lod_tensor = out_var->GetMutable(); - out_lod_tensor->Resize(in_lod_tensor.dims()); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Currently, the input type of ShareDim only can be LoDTensor " - "or SelectedRows.")); - } - } + size_t j = 0) override; void ShareAllLoD(const std::string& in, - const std::string& out) const 
override { - auto in_it = ctx_.inputs.find(in); - auto out_it = ctx_.outputs.find(out); - PADDLE_ENFORCE_NE(in_it, ctx_.inputs.end(), - platform::errors::NotFound( - "Input [%s] found error in Op [%s]", in, op_.Type())); - PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), - platform::errors::NotFound("Output [%s] found error in Op [%s]", out, - op_.Type())); - - auto& in_var_list = in_it->second; - auto& out_var_list = out_it->second; - - PADDLE_ENFORCE_EQ( - in_var_list.size(), out_var_list.size(), - platform::errors::PreconditionNotMet( - "Op [%s]: Input var size should be equal with output var size", - op_.Type())); - - auto& out_var_names = op_.Outputs(out); - - for (size_t i = 0; i < in_var_list.size(); ++i) { - if (out_var_names[i] == framework::kEmptyVarName) { - continue; - } - - Variable* in_var = in_var_list[i]; - if (!in_var->IsType()) return; - Variable* out_var = out_var_list[i]; - PADDLE_ENFORCE_EQ(out_var->IsType(), true, - platform::errors::PreconditionNotMet( - "The %d-th output of Output(%s) must be LoDTensor.", - i, out_var_names[i])); - auto& in_tensor = in_var->Get(); - auto* out_tensor = out_var->GetMutable(); - out_tensor->set_lod(in_tensor.lod()); -#ifdef PADDLE_WITH_MKLDNN - if (in_tensor.layout() != DataLayout::kMKLDNN) -#endif - out_tensor->set_layout(in_tensor.layout()); - } - } + const std::string& out) const override; void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, - size_t j = 0) const override { - if (can_skip_lod_) { - return; - } - auto in_it = ctx_.inputs.find(in); - auto out_it = ctx_.outputs.find(out); - PADDLE_ENFORCE_NE( - in_it, ctx_.inputs.end(), - platform::errors::NotFound("Input %s does not exist.", in)); - PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), - platform::errors::NotFound("Output %s does not exist.", out)); - PADDLE_ENFORCE_LT(i, in_it->second.size(), - platform::errors::InvalidArgument( - "The index of input dimension is out of range, " - "excepted index less than %zu, but received %zu.", - in_it->second.size(), i)); - PADDLE_ENFORCE_LT(j, out_it->second.size(), - platform::errors::InvalidArgument( - "The index of output dimension is out of range, " - "excepted index less than %zu, but received %zu.", - out_it->second.size(), j)); - - Variable* in_var = in_it->second.at(i); - if (!in_var->IsType()) return; - Variable* out_var = out_it->second.at(j); - PADDLE_ENFORCE_EQ( - out_var->IsType(), true, - platform::errors::InvalidArgument( - "The %zu-th output of Output(%s) must be LoDTensor.", j, out)); - auto& in_tensor = in_var->Get(); - auto* out_tensor = out_var->GetMutable(); - out_tensor->set_lod(in_tensor.lod()); - -// TODO(dzhwinter) : reuse ShareLoD in most operators. -// Need to call ShareLayout explicitly in sequence related ops. -// Shall we have a better method to shared info between in/out Tensor? -#ifdef PADDLE_WITH_MKLDNN - // Fix me: ugly workaround below - // Correct solution: - // set_layout() should NOT be called here (i.e. ShareLoD). Instead, - // layout of output tensor should be set "manually" in Compute() - // of each OPKernel. The reason layout should NOT be shared between - // input and output "automatically" (now by InferShape()->ShareLoD()) - // is that layout transform may occur after InferShape(). - // Workaround: - // Skip set_layout() when input layout is kMKLDNN - // This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN - // OPKernel. 
In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called - // in Compute() - if (in_tensor.layout() != DataLayout::kMKLDNN) -#endif - out_tensor->set_layout(in_tensor.layout()); - } + size_t j = 0) const override; - int32_t GetLoDLevel(const std::string& in, size_t i = 0) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "GetLoDLevel is only used in compile time. The calculation of " - "output's actual lod is different among operators so that should be " - "set in the runtime kernel.")); - } + int32_t GetLoDLevel(const std::string& in, size_t i = 0) const override; void SetLoDLevel(const std::string& out, int32_t lod_level, - size_t j = 0) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "SetLoDLevel is only used in compile time. The calculation of " - "output's actual lod is different among operators so that should be " - "set in the runtime kernel.")); - } + size_t j = 0) const override; - bool IsRuntime() const override { return true; } + bool IsRuntime() const override; // TODO(paddle-dev): Can this be template? std::vector GetInputVarPtrs( - const std::string& name) override { - const std::vector& vars = InputVars(name); - std::vector res; - res.reserve(vars.size()); - res.insert(res.begin(), vars.begin(), vars.end()); - return res; - } + const std::string& name) override; std::vector GetOutputVarPtrs( - const std::string& name) override { - const std::vector& vars = OutputVars(name); - std::vector res; - res.reserve(vars.size()); - res.insert(res.begin(), vars.begin(), vars.end()); - return res; - } + const std::string& name) override; - DDim GetInputDim(const std::string& name) const override { - const std::vector& vars = InputVars(name); - PADDLE_ENFORCE_EQ( - vars.size(), 1UL, - platform::errors::InvalidArgument( - "Input(%s) should hold one element, but now it holds %zu elements.", - name, vars.size())); - return this->GetDim(vars[0]); - } + DDim GetInputDim(const std::string& name) const override; - std::vector GetInputsDim(const std::string& name) const override { - const std::vector& vars = InputVars(name); - return GetDims(vars); - } + std::vector GetInputsDim(const std::string& name) const override; std::vector GetInputsVarType( - const std::string& name) const override { - return GetVarTypes(InputVars(name)); - } + const std::string& name) const override; std::vector GetOutputsVarType( - const std::string& name) const override { - return GetVarTypes(OutputVars(name)); - } + const std::string& name) const override; - void SetOutputDim(const std::string& name, const DDim& dim) override { - auto& vars = OutputVars(name); - PADDLE_ENFORCE_EQ( - vars.size(), 1UL, - platform::errors::InvalidArgument("Output(%s) should hold one element, " - "but now it holds %zu elements.", - name, vars.size())); - SetDim(vars[0], dim); - } + void SetOutputDim(const std::string& name, const DDim& dim) override; void SetOutputsDim(const std::string& name, - const std::vector& dims) override { - auto& vars = OutputVars(name); - SetDims(vars, dims); - } + const std::vector& dims) override; - void SetSkipLoD(bool skip) { can_skip_lod_ = skip; } + void SetSkipLoD(bool skip); protected: - DDim GetDim(Variable* var) const { - PADDLE_ENFORCE_NOT_NULL( - var, platform::errors::InvalidArgument("Input variable is nullptr.")); - if (var->IsType()) { - return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Only LoDTensor or SelectedRows support 'GetDim', but 
input " - "Variable's type is %s.", - ToTypeName(var->Type()))); - } - } + DDim GetDim(Variable* var) const; - std::vector GetDims(const std::vector& vars) const { - std::vector ret; - ret.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(ret), - [this](Variable* var) { return this->GetDim(var); }); - return ret; - } + std::vector GetDims(const std::vector& vars) const; - std::vector GetRepeatedDims(const std::string& name) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "GetRepeatedDims method only ban be used in compile time.")); - } + std::vector GetRepeatedDims(const std::string& name) const override; - void SetDim(Variable* var, const DDim& dim) { - if (var->IsType()) { - var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Variable type error, expect LoDTensor or SelectedRows, but received " - "(%s).", - ToTypeName(var->Type()))); - } - } + void SetDim(Variable* var, const DDim& dim); void SetDims(const std::vector& vars, - const std::vector& dims) { - size_t length = vars.size(); - PADDLE_ENFORCE_EQ(length, dims.size(), - platform::errors::InvalidArgument( - "The number of input variables do not match the " - "number of input dimensions, the number of variables " - "is %zu, the number of dimensions is %zu.", - length, dims.size())); - for (size_t i = 0; i < length; ++i) { - if (vars[i] == nullptr) { - continue; - } - SetDim(vars[i], dims[i]); - } - } + const std::vector& dims); void SetRepeatedDims(const std::string& name, - const std::vector& dims) override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "SetRepeatedDims method only can be used in compile time.")); - } + const std::vector& dims) override; std::vector GetVarTypes( - const std::vector& vars) const { - std::vector retv; - retv.resize(vars.size()); - std::transform( - vars.begin(), vars.end(), retv.begin(), - std::bind(std::mem_fn(&InterpretercoreInferShapeContext::GetVarType), - this, std::placeholders::_1)); - return retv; - } + const std::vector& vars) const; - proto::VarType::Type GetVarType(Variable* var) const { - return ToVarType(var->Type()); - } + proto::VarType::Type GetVarType(Variable* var) const; private: - const std::vector& InputVars(const std::string& name) const { - auto it = ctx_.inputs.find(name); - PADDLE_ENFORCE_NE( - it, ctx_.inputs.end(), - platform::errors::NotFound( - "Operator (%s) does not have the input (%s).", op_.Type(), name)); - return it->second; - } + const std::vector& InputVars(const std::string& name) const; - const std::vector& OutputVars(const std::string& name) const { - auto it = ctx_.outputs.find(name); - PADDLE_ENFORCE_NE( - it, ctx_.outputs.end(), - platform::errors::NotFound( - "Operator (%s) does not have the outputs (%s).", op_.Type(), name)); - return it->second; - } + const std::vector& OutputVars(const std::string& name) const; const OperatorBase& op_; const RuntimeContext& ctx_; @@ -467,8 +141,28 @@ struct OpKernelFunc { }; struct VariableMetaInfo { - int var_ref_count_; - paddle::framework::VarDesc* vardesc_; + int var_ref_count_{0}; + framework::VarDesc* var_desc_{nullptr}; + + VariableMetaInfo() {} + VariableMetaInfo(int var_ref_count, framework::VarDesc* var_desc) + : var_ref_count_(var_ref_count), var_desc_(var_desc) {} +}; + +class VariableScope; +class VariableScopeListener : public ScopeListener { + public: + explicit VariableScopeListener(VariableScope* var_scope_); + void 
onCreateVariable(const std::string& name) override; + void onDeleteVariable(const std::string& name) override; + void onRenameVariable(const std::string& old_name, + const std::string& new_name) override; + void onCreateScope(Scope* Scope) override; + void onDeleteScope(Scope* Scope) override; + void onClear() override; + + private: + VariableScope* var_scope_; // not owned }; // TODO(zhiqiu): Maybe we need to add rwlock for VariableScope? @@ -477,171 +171,61 @@ struct VariableMetaInfo { // ScopeBase. Scope manager the variables and VariableScope is just a quick // access machanism. ScopeListener is the callback to sync changes in Original // Scope. We can make it a membership of VariableScope. Here we use inherent. -class VariableScope : public ScopeBase, public ScopeListener { +class VariableScope : public ScopeBase { public: - VariableScope(Scope* outer_scope) { - // for @EMPTY@ variable - var_list_.push_back(nullptr); - name2id_[kEmptyVarName] = 0; - VariableMetaInfo info; - info.var_ref_count_ = 0; - info.vardesc_ = nullptr; - vec_meta_info_.push_back(info); - outer_scope_ = outer_scope; - - PADDLE_ENFORCE_NE( - outer_scope_, nullptr, - platform::errors::PreconditionNotMet( - "You have passed a nullptr to construct VariableScope.")); - outer_scope->AddListener(this); - } + explicit VariableScope(Scope* scope); - ~VariableScope() { - if (outer_scope_ != nullptr) outer_scope_->DelListener(this); - } + const Scope* GetScope() const; - const Scope* GetScope() const { return outer_scope_; } - - Variable* FindVar(const std::string& name) const { - auto it = name2id_.find(name); - if (it != name2id_.end()) { - PADDLE_ENFORCE_LT(it->second, var_list_.size(), - platform::errors::NotFound( - "The id(%d) of variable(%s) should not be larger " - "than the size of variable list(%d).", - it->second, name, var_list_.size())); - return var_list_[it->second]; - } - return nullptr; - } + Variable* FindVar(const std::string& name) const; + + ~VariableScope(); // Get variable id by name, return -1 if not found - int GetIdByName(const std::string& name) const { - auto it = name2id_.find(name); - if (it != name2id_.end()) { - return it->second; - } - return -1; - } + int GetIdByName(const std::string& name) const; // Get variable name by id, return "" if not found - std::string GetNameById(int id) const { - // NOTE(zhiqiu): do not use vec_meta_info_[id].vardesc_->Name() since - // vec_meta_info_[id] may be nullptr, - // typically when the target variable is not existed in the original program - // desc, but created by interpretercore. - // For example, created and used by d2h_copy or h2d_copy operator. - auto it = - std::find_if(name2id_.begin(), name2id_.end(), - [id](const auto& pair) { return pair.second == id; }); - if (it != name2id_.end()) { - return it->first; - } - return ""; - } + std::string GetNameById(int id) const; - bool HasVar(const std::string& name) const { - return name2id_.find(name) != name2id_.end(); - } + bool HasVar(const std::string& name) const; - int VarId(const std::string& name) const { - CheckExist(name); - return name2id_.at(name); - } + int VarId(const std::string& name) const; - Variable* Var(int id) const { return var_list_.at(id); } + Variable* Var(int id) const; - Variable* Var(const std::string& name) const { - return var_list_.at(VarId(name)); - } + Variable* Var(const std::string& name) const; - size_t VarSize() const { return var_list_.size(); } - - void AddVar(const std::string& name, VarDesc* var_desc) { // NOLINT - // AddVar -> Scope::Var -> onCreateVariable. 
- VLOG(4) << "Add variable: " << name << " through AddVar()"; - auto v = outer_scope_->Var(name); - if (nullptr == var_desc) { - v->GetMutable(); - } else { - InitializeVariable( - v, - var_desc - ->GetType()); // Scope don't initialize variable recently created - } - SetVarDesc(name, var_desc); - } + size_t VarSize() const; - void AddVar(const std::string& name, Variable& var) { // NOLINT - // Though name existed in outer_scope_, we need - // add again to create name2id map. - outer_scope_->Var(name); - } + void AddVar(const std::string& name, VarDesc* var_desc); - void SetVarDesc(const std::string& name, framework::VarDesc* var_desc) { - CheckExist(name); - vec_meta_info_[VarId(name)].vardesc_ = var_desc; - } + void AddVar(const std::string& name, const Variable& var); - paddle::framework::VarDesc* VarDesc(const std::string& name) const { - return VarDesc(VarId(name)); - } + void SetVarDesc(const std::string& name, framework::VarDesc* var_desc); - paddle::framework::VarDesc* VarDesc(int id) const { - CheckExist(id); - return vec_meta_info_[id].vardesc_; - } + paddle::framework::VarDesc* VarDesc(const std::string& name) const; - void CheckExist(int id) const { - PADDLE_ENFORCE_LT(id, var_list_.size(), - platform::errors::PreconditionNotMet( - "Required var_id < %d, but received var_id = %d.", - var_list_.size(), id)); - } + paddle::framework::VarDesc* VarDesc(int id) const; - void CheckExist(const std::string& name) const { - PADDLE_ENFORCE_EQ( - HasVar(name), true, - platform::errors::NotFound("%s not in VariableScope.", name)); - } + void CheckExist(int id) const; + + void CheckExist(const std::string& name) const; - public: // callbacks from ScopeListener class - void onCreateVariable(const std::string& name) override { - auto v = outer_scope_->GetVar(name); // must exsit in outer_scope_ - if (!HasVar(name)) { // may exist in variable scope. 
- VLOG(4) << "Calling VariableScope::onCreateVariable with var_name: " - << name; - name2id_[name] = VarSize(); - var_list_.push_back(v); - - VariableMetaInfo info; - info.var_ref_count_ = 0; - info.vardesc_ = nullptr; // set nullptr, then modifty it in AddVar() - vec_meta_info_.push_back(info); - } - } - void onDeleteVariable(const std::string& name) override { - if (HasVar(name)) { - VLOG(4) << "Calling VariableScope::onDeleteVariable with var_name: " - << name; - } - } - void onRenameVariable(const std::string& old_name, - const std::string& new_name) override {} - void onCreateScope(Scope* Scope) override {} - void onDeleteScope(Scope* Scope) override {} - void onClear() override {} std::vector& MutableVecMetaInfo() { return vec_meta_info_; } const std::vector& VecMetaInfo() const { return vec_meta_info_; } + friend class VariableScopeListener; + private: std::vector var_list_; std::map name2id_; std::vector vec_meta_info_; - Scope* outer_scope_ = nullptr; + Scope* scope_ = nullptr; + // mutable RWLock vars_lock_; + std::shared_ptr listener_; }; class NextInstruction { diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 1c9f6b3d901bd4f517313502e5e1b752333ed219..d1c887148583adb7d9e5de980fb79d037a90c0f9 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -23,16 +23,14 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, : place_(place), startup_prog_(startup_prog), main_prog_(main_prog), - outer_scope_(scope), - global_scope_(scope) { - paddle::framework::InitDevices(); + global_scope_(VariableScope(scope)) { // init scope - BuildVariableOuterScope(startup_prog, &global_scope_, scope); + BuildVariableScope(startup_prog, &global_scope_); - if (outer_scope_ != nullptr) { - auto name_list = outer_scope_->LocalVarNames(); + if (scope != nullptr) { + auto name_list = scope->LocalVarNames(); for (auto name : name_list) { - auto v = outer_scope_->Var(name); + auto v = scope->Var(name); if (!global_scope_.HasVar(name)) { global_scope_.AddVar(name, *v); } @@ -62,9 +60,8 @@ framework::interpreter::CostInfo StandaloneExecutor::DryRun( return core->DryRun(feed_names, feed_tensors); } -void StandaloneExecutor::BuildVariableOuterScope( - const framework::ProgramDesc& pdesc, VariableScope* var_scope, - Scope* outer_scope) { +void StandaloneExecutor::BuildVariableScope(const framework::ProgramDesc& pdesc, + VariableScope* var_scope) { auto& global_block = pdesc.Block(0); for (auto& var : global_block.AllVars()) { diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index 1fbdf7b4b0ad60e0788d64a8ad898e688afd0270..9b535c9b6307341a91f60be1e08bb3e62fb99844 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -50,8 +50,8 @@ class StandaloneExecutor : public ExecutorBase { const std::vector& feed_tensors); private: - void BuildVariableOuterScope(const framework::ProgramDesc& pdesc, - VariableScope* var_scope, Scope* outer_scope); + void BuildVariableScope(const framework::ProgramDesc& pdesc, + VariableScope* var_scope); std::shared_ptr GetInterpreterCore( const std::vector& feed_names, @@ -60,7 +60,6 @@ class StandaloneExecutor : public ExecutorBase { const platform::Place& place_; const ProgramDesc& startup_prog_; const ProgramDesc& main_prog_; - Scope* 
outer_scope_; VariableScope global_scope_; std::unordered_map> programs_; diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 2aad8d245a69aa9a9f5433de48e3706f918303d8..20bc5c7789d1267f74d7a26e9cce21205cb5a29e 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -75,6 +75,7 @@ paddle::framework::ProgramDesc load_from_file(const std::string& file_name) { } int main(int argc, char* argv[]) { + paddle::framework::InitDevices(); std::cout << "main" << std::endl; int64_t batch_size = std::stoi(argv[1]); paddle::framework::InitDevices(); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 7a36354d7e6695070e60fde626f7dc260d300ee3..4bb94a4e7e5a1857ba10addddbb830b7ab7e8748 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -266,14 +266,14 @@ Variable* Scope::FindVarLocally(const std::string& name) const { return nullptr; } -void Scope::AddListener(ScopeListener* listener) { +void Scope::AddListener(const std::shared_ptr& listener) { auto it = std::find(listeners_.begin(), listeners_.end(), listener); if (it == listeners_.end()) { listeners_.push_back(listener); } } -void Scope::DelListener(ScopeListener* listener) { +void Scope::DelListener(const std::shared_ptr& listener) { listeners_.remove(listener); } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index ca486aec8c27e17669e7e1446d8acdbd93135891..892618b7e6cc19fee949826e84188bcff2f6de3f 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -144,9 +144,9 @@ class Scope : public ScopeBase { // Rename variable to a new name and return the new name std::string Rename(const std::string& origin_name) const; - void AddListener(ScopeListener* listener); + void AddListener(const std::shared_ptr& listener); - void DelListener(ScopeListener* listener); + void DelListener(const std::shared_ptr& listener); protected: struct KeyHasher { @@ -184,7 +184,7 @@ class Scope : public ScopeBase { // Scope in `kids_` are owned by this class. 
mutable std::list kids_; const Scope* parent_{nullptr}; - std::list listeners_; + std::list> listeners_; DISABLE_COPY_AND_ASSIGN(Scope); diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index 03062ab8e2b29876850b1a607ab492d28e25dd26..325d74bb5d7f4bd6d6953216824bb135f4b2d56f 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -275,7 +275,7 @@ class TestException(unittest.TestCase): for feed in feeds: out = exe.run(main_program, feed=feed, fetch_list=fetch_vars) - print(out) + print(main_program) return out def run_new_executor(self, feed): @@ -287,10 +287,10 @@ class TestException(unittest.TestCase): def test_exception(self): feed = [{ 'id': np.array([1, 2, 3, 4, 5]).astype(np.int64), - 'data': np.array([1, 2, 3, 4]).astype(np.float32), + 'data': np.array([1, 2, 3]).astype(np.float32), }, { 'id': np.array([1, 2, 3, 4, 11]).astype(np.int64), - 'data': np.array([1, 2, 3, 4]).astype(np.float32), + 'data': np.array([1, 2, 3]).astype(np.float32), }] self.assertRaises(ValueError, self.run_new_executor, feed) @@ -307,6 +307,18 @@ class TestException(unittest.TestCase): feed[1]['data'][0] = np.nan self.assertRaises(RuntimeError, self.run_new_executor, feed) + def test_scope(self): + feed = [{ + 'id': np.array([1, 2, 3, 4, 5]).astype(np.int64), + 'data': np.array([1, 2, 3]).astype(np.float32), + }, { + 'id': np.array([1, 2, 3, 4, 5]).astype(np.int64), + 'data': np.array([2, 2, 2]).astype(np.float32), + }] + self.run_new_executor(feed) + self.assertIsNotNone(paddle.static.global_scope().find_var( + 'embedding.tmp_2')) + if __name__ == "__main__": unittest.main()