Unverified commit 40d4d834, authored by wanghuancoder, committed by GitHub

code refactoring for new executor (#34970)

* code refactoring, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop
Parent 1b747de7
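For context, the refactored API introduced by this commit is exercised end to end by standalone_executor_test.cc further down in the diff. A minimal usage sketch, assuming the startup and main ProgramDescs have already been deserialized elsewhere (the RunOnce wrapper is illustrative, not part of the diff):

#include <vector>

#include "paddle/fluid/framework/new_executor/standalone_executor.h"

// Minimal sketch of driving the new executor; mirrors the benchmark loop
// in standalone_executor_test.cc below.
void RunOnce(const paddle::platform::Place& place,
             const paddle::framework::ProgramDesc& startup_prog,
             const paddle::framework::ProgramDesc& main_prog) {
  paddle::framework::Scope scope;
  // The constructor builds the variable scope and processes the startup program.
  paddle::framework::StandaloneExecutor exec(place, startup_prog, main_prog,
                                             &scope);

  std::vector<paddle::framework::Tensor> fetch_tensors;
  // Empty feeds/fetches, as in the test's timing loop.
  exec.Run(/*feed_names=*/{}, /*feed_tensors=*/{}, /*fetch_names=*/{},
           &fetch_tensors);
}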
@@ -25,6 +25,7 @@ add_subdirectory(ir)
add_subdirectory(details)
add_subdirectory(fleet)
add_subdirectory(io)
add_subdirectory(new_executor)
#ddim lib
proto_library(framework_proto SRCS framework.proto)
...
cc_library(interpretercore SRCS interpretercore.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${PYBIND_DEPS} profiler)
cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${PYBIND_DEPS} profiler)
# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
@@ -11,153 +11,257 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/new_executor/interpretercore.h"

namespace paddle {
namespace framework {

InterpreterCore::InterpreterCore(const platform::Place& place,
                                 const ProgramDesc& main_prog,
                                 VariableScope* global_scope,
                                 const std::vector<std::string>& feed_names,
                                 const std::vector<std::string>& fetch_names)
    : place_(place), main_program_(main_prog), global_scope_(global_scope) {
  is_build_ = false;
  feed_names_ = feed_names;
  fetch_names_ = fetch_names;

  // add feed op and fetch op to main_program

  // prune

  // optimize graph pass

  // convert to run graph
}

void InterpreterCore::Run(const std::vector<framework::Tensor>& feed_tensors,
                          std::vector<framework::Tensor>* fetch_tensors) {
  if (is_build_ == false) {
    BuildVariableScope(main_program_, global_scope_);
  }
  for (size_t i = 0; i < feed_names_.size(); ++i) {
    auto it = global_scope_->name2id.find(feed_names_[i]);
    assert(it != global_scope_->name2id.end());

    auto feed_tensor =
        global_scope_->var_list[it->second]->GetMutable<framework::LoDTensor>();
    feed_tensor->ShareDataWith(feed_tensors[i]);
  }

  if (is_build_ == false) {
    BuildOpFuncList(place_, main_program_, &op_list_, &vec_func_list_,
                    global_scope_);
    is_build_ = true;
    // convert vec func_list to graph
    Convert();
  } else {
    ExecuteInstructionList(vec_instruction_, *global_scope_, place_);
  }

  for (size_t i = 0; i < fetch_names_.size(); ++i) {
    auto it = global_scope_->name2id.find(fetch_names_[i]);
    assert(it != global_scope_->name2id.end());
    PADDLE_ENFORCE_NE(
        it, global_scope_->name2id.end(),
        platform::errors::NotFound(
            "Can't find (%d) the fetch var (%s) in scope", i, fetch_names_[i]));

    auto fetch_tensor =
        global_scope_->var_list[it->second]->GetMutable<framework::LoDTensor>();

    if (platform::is_gpu_place(fetch_tensor->place())) {
      Tensor out;
      platform::DeviceContextPool& pool =
          platform::DeviceContextPool::Instance();
      auto* dev_ctx = pool.Get(place_);
      dev_ctx->Wait();
      TensorCopySync(*fetch_tensor, platform::CPUPlace(), &out);
      dev_ctx->Wait();
      fetch_tensors->push_back(out);
    } else {
      Tensor out;
      TensorCopySync(*fetch_tensor, platform::CPUPlace(), &out);
      fetch_tensors->push_back(out);
    }
  }
}

void InterpreterCore::Convert() {
  input_var2op_info_.resize(global_scope_->var_list.size());

  vec_instruction_.reserve(vec_func_list_.size());
  dependecy_count_.resize(vec_func_list_.size());
  vec_meta_info_.resize(global_scope_->var_list.size());

  for (size_t i = 0; i < vec_func_list_.size(); ++i) {
    Instruction temp_inst;
    temp_inst.kernel_func_.compute_func_ = vec_func_list_[i].kernel_func_;
    temp_inst.kernel_func_.operator_base_ = op_list_[i];
    temp_inst.input_index_ = vec_func_list_[i].input_index;
    temp_inst.output_index_ = vec_func_list_[i].output_index;

    std::vector<size_t> gc_check_input_list;
    for (auto& item : vec_func_list_[i].input_index) {
      for (auto id : item.second) {
        input_var2op_info_[id].push_back(i);
        gc_check_input_list.push_back(id);
      }
    }
    std::sort(gc_check_input_list.begin(), gc_check_input_list.end());
    auto last =
        std::unique(gc_check_input_list.begin(), gc_check_input_list.end());
    gc_check_input_list.erase(last, gc_check_input_list.end());
    for (auto var_id : gc_check_input_list) {
      vec_meta_info_[var_id].var_ref_count_++;
    }

    temp_inst.gc_check_var_list.swap(gc_check_input_list);

    vec_instruction_.push_back(temp_inst);
  }

  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    std::vector<size_t> vec_temp;
    for (auto& item : vec_instruction_[i].output_index_) {
      for (auto id : item.second) {
        vec_temp = MergeVector(vec_temp, input_var2op_info_[id]);
      }
    }

    // In a Program, op order is very important information.
    // An op may only take the ops after it as its next ops.
    std::vector<size_t> filter_next;
    filter_next.reserve(vec_temp.size());
    for (auto item : vec_temp) {
      if (item > i) {
        filter_next.push_back(item);
      }
    }
    vec_instruction_[i].next_instruction_.direct_run_ = filter_next;

    // check output
    for (auto& item : vec_instruction_[i].output_index_) {
      for (auto id : item.second) {
        if (input_var2op_info_[id].size() == 0) {
          // output var is not used by any kernel
          vec_instruction_[i].gc_check_var_list.push_back(id);
          vec_meta_info_[id].var_ref_count_++;
        }
      }
    }

    for (auto inst_id : filter_next) {
      dependecy_count_[inst_id]++;
    }
  }
}

void InterpreterCore::RunInstruction(const Instruction& instr_node,
                                     const VariableScope& var_scope,
                                     const platform::Place& place) {
  auto op_base = instr_node.kernel_func_.operator_base_;
  // build runtime context
  VariableValueMap ins_map;
  for (auto& var_name_item : instr_node.input_index_) {
    std::vector<Variable*> input_vars;

    input_vars.reserve(var_name_item.second.size());
    for (auto& id : var_name_item.second) {
      input_vars.emplace_back(var_scope.var_list[id]);
    }
    ins_map.emplace(var_name_item.first, std::move(input_vars));
  }

  VariableValueMap outs_map;
  for (auto& var_name_item : instr_node.output_index_) {
    std::vector<Variable*> out_vars;

    out_vars.reserve(var_name_item.second.size());
    for (auto& id : var_name_item.second) {
      out_vars.emplace_back(var_scope.var_list[id]);
    }
    outs_map.emplace(var_name_item.first, std::move(out_vars));
  }

  RuntimeContext runtime_context({}, {});
  runtime_context.inputs.swap(ins_map);
  runtime_context.outputs.swap(outs_map);

  RuntimeInferShapeContext infer_shape_ctx(*op_base, runtime_context);

  static_cast<const framework::OperatorWithKernel*>(op_base)->InferShape(
      &infer_shape_ctx);

  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto* dev_ctx = pool.Get(place);
  Scope scope;

  auto exec_context =
      ExecutionContext(*op_base, scope, *dev_ctx, runtime_context);

  instr_node.kernel_func_.compute_func_(exec_context);
}

void InterpreterCore::ExecuteInstructionList(
    const std::vector<Instruction>& vec_instr, const VariableScope& var_scope,
    const platform::Place& place) {
  std::queue<size_t> working_queue;
  auto working_dependecy_count = dependecy_count_;
  for (size_t i = 0; i < dependecy_count_.size(); ++i) {
    if (dependecy_count_[i] == 0) {
      working_queue.push(i);
    }
  }

  auto working_var_ref = vec_meta_info_;

  size_t run_op_number = 0;
  while (!working_queue.empty()) {
    auto instr_id = working_queue.front();
    working_queue.pop();
    auto& instr_node = vec_instr[instr_id];
    RunInstruction(instr_node, var_scope, place);

    auto& next_instr = instr_node.next_instruction_.direct_run_;
    ++run_op_number;

    for (auto next_i : next_instr) {
      --working_dependecy_count[next_i];
      if (working_dependecy_count[next_i] == 0) {
        working_queue.push(next_i);
      }
    }

    // GC information
    auto& gc_check_list = instr_node.gc_check_var_list;
    for (auto var_id : gc_check_list) {
      --working_var_ref[var_id].var_ref_count_;
    }
  }

  for (size_t i = 0; i < working_var_ref.size(); ++i) {
    if (working_var_ref[i].var_ref_count_ != 0) {
      std::cerr << " var ref is not zero " << i << std::endl;
    }
  }
}

std::vector<size_t> InterpreterCore::MergeVector(
    const std::vector<size_t>& first, const std::vector<size_t>& second) {
  std::vector<size_t> out(first.size() + second.size());
  std::merge(first.begin(), first.end(), second.begin(), second.end(),
             out.begin());

  std::vector<size_t>::iterator it;
  it = std::unique(out.begin(), out.end());

  out.resize(std::distance(out.begin(), it));

  return out;
}

void InterpreterCore::BuildVariableScope(const framework::ProgramDesc& pdesc,
                                         VariableScope* var_scope) {
  auto& global_block = pdesc.Block(0);

  for (auto& var : global_block.AllVars()) {
@@ -167,19 +271,18 @@
    if (var_scope->name2id.find(var->Name()) == var_scope->name2id.end()) {
      var_scope->name2id[var->Name()] = var_scope->var_list.size();
      auto v = new Variable();
      InitializeVariable(v, var->GetType());
      var_scope->var_list.push_back(v);
    }
  }
}

void InterpreterCore::BuildOpFuncList(const platform::Place& place,
                                      const framework::ProgramDesc& pdesc,
                                      std::vector<OperatorBase*>* op_list,
                                      std::vector<OpFuncNode>* vec_func_list,
                                      VariableScope* var_scope) {
  auto& global_block = pdesc.Block(0);

  for (auto& op : global_block.AllOps()) {
@@ -291,7 +394,8 @@
      VariableNameMap copy_out_map;
      copy_out_map["Out"] = {new_var_name};
      AttributeMap attr_map;
      attr_map["dst_place_type"] =
          is_cpu_place(place) ? 0 : is_gpu_place(place) ? 1 : -1;

      std::map<std::string, std::vector<int>> copy_ins_name2id;
      copy_ins_name2id["X"] = ins_name2id[var_name_item.first];
@@ -363,267 +467,5 @@
  }
}
class InterpreterCore {
public:
InterpreterCore(const platform::Place& place, const ProgramDesc& prog,
const ProgramDesc& startup_prog, Scope* scope)
: place_(place), prog_(prog), outer_scope_(scope) {
paddle::framework::InitDevices();
is_build_ = false;
if (outer_scope_ != nullptr) {
auto name_list = outer_scope_->LocalVarNames();
for (auto name : name_list) {
auto v = outer_scope_->Var(name);
if (global_scope.name2id.find(name) == global_scope.name2id.end()) {
global_scope.name2id[name] = global_scope.var_list.size();
}
global_scope.var_list.push_back(v);
}
}
paddle::framework::build_variable_outer_scope(startup_prog, &global_scope,
outer_scope_);
std::vector<paddle::framework::OpFuncNode> vec_func_list;
std::vector<paddle::framework::OperatorBase*> op_list;
paddle::framework::build_op_func_list(
startup_prog, &op_list, &vec_func_list, &global_scope, place_);
// add variable to outer_scope
}
void run(const std::vector<std::string>& vec_name,
const std::vector<framework::Tensor>& vec_tensor,
const std::vector<std::string>& vec_fetch_name,
std::vector<framework::Tensor>* vec_out) {
if (is_build_ == false) {
paddle::framework::build_variable_scope(prog_, &global_scope);
}
for (size_t i = 0; i < vec_name.size(); ++i) {
auto it = global_scope.name2id.find(vec_name[i]);
assert(it != global_scope.name2id.end());
auto feed_tensor =
global_scope.var_list[it->second]->GetMutable<framework::LoDTensor>();
feed_tensor->ShareDataWith(vec_tensor[i]);
}
if (is_build_ == false) {
paddle::framework::build_op_func_list(prog_, &op_list, &vec_func_list,
&global_scope, place_);
is_build_ = true;
// convert vec func_list to graph
convert();
} else {
exec_instruction_list(vec_instruction_, global_scope, place_);
}
for (size_t i = 0; i < vec_fetch_name.size(); ++i) {
auto it = global_scope.name2id.find(vec_fetch_name[i]);
assert(it != global_scope.name2id.end());
PADDLE_ENFORCE_NE(it, global_scope.name2id.end(),
platform::errors::NotFound(
"Can't find (%d) the fetch var (%s) in scope", i,
vec_fetch_name[i]));
auto fetch_tensor =
global_scope.var_list[it->second]->GetMutable<framework::LoDTensor>();
if (platform::is_gpu_place(fetch_tensor->place())) {
Tensor out;
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place_);
dev_ctx->Wait();
TensorCopySync(*fetch_tensor, platform::CPUPlace(), &out);
dev_ctx->Wait();
vec_out->push_back(out);
} else {
Tensor out;
TensorCopySync(*fetch_tensor, platform::CPUPlace(), &out);
vec_out->push_back(out);
}
}
}
private:
void convert() {
input_var2op_info_.resize(global_scope.var_list.size());
vec_instruction_.reserve(vec_func_list.size());
dependecy_count_.resize(vec_func_list.size());
global_scope.vec_meta_info_.resize(global_scope.var_list.size());
for (size_t i = 0; i < vec_func_list.size(); ++i) {
Instruction temp_inst;
temp_inst.kernel_func_.compute_func_ = vec_func_list[i].kernel_func_;
temp_inst.kernel_func_.operator_base_ = op_list[i];
temp_inst.input_index_ = vec_func_list[i].input_index;
temp_inst.output_index_ = vec_func_list[i].output_index;
std::vector<size_t> gc_check_input_list;
for (auto& item : vec_func_list[i].input_index) {
for (auto id : item.second) {
input_var2op_info_[id].push_back(i);
gc_check_input_list.push_back(id);
}
}
std::sort(gc_check_input_list.begin(), gc_check_input_list.end());
auto last =
std::unique(gc_check_input_list.begin(), gc_check_input_list.end());
gc_check_input_list.erase(last, gc_check_input_list.end());
for (auto var_id : gc_check_input_list) {
global_scope.vec_meta_info_[var_id].var_ref_count_++;
}
temp_inst.gc_check_var_list.swap(gc_check_input_list);
vec_instruction_.push_back(temp_inst);
}
for (size_t i = 0; i < vec_instruction_.size(); ++i) {
std::vector<size_t> vec_temp;
for (auto& item : vec_instruction_[i].output_index_) {
for (auto id : item.second) {
vec_temp = merge_vec(vec_temp, input_var2op_info_[id]);
}
}
// In Program, op order is a very import information.
// Op can noly add op after it as next as next ops.
std::vector<size_t> filter_next;
filter_next.reserve(vec_temp.size());
for (auto item : vec_temp) {
if (item > i) {
filter_next.push_back(item);
}
}
vec_instruction_[i].next_instruction_.direct_run_ = filter_next;
// checkout ouput
for (auto& item : vec_instruction_[i].output_index_) {
for (auto id : item.second) {
if (input_var2op_info_[id].size() == 0) {
// output var not be used by any kernel
vec_instruction_[i].gc_check_var_list.push_back(id);
global_scope.vec_meta_info_[id].var_ref_count_++;
}
}
}
for (auto inst_id : filter_next) {
dependecy_count_[inst_id]++;
}
}
}
void run_instr(const Instruction& instr_node, const VariableScope& var_scope,
const platform::Place& place) {
auto op_base = instr_node.kernel_func_.operator_base_;
// build runtime cost
VariableValueMap ins_map;
for (auto& var_name_item : instr_node.input_index_) {
std::vector<Variable*> input_vars;
input_vars.reserve(var_name_item.second.size());
for (auto& id : var_name_item.second) {
input_vars.emplace_back(var_scope.var_list[id]);
}
ins_map.emplace(var_name_item.first, std::move(input_vars));
}
VariableValueMap outs_map;
for (auto& var_name_item : instr_node.output_index_) {
std::vector<Variable*> out_vars;
out_vars.reserve(var_name_item.second.size());
for (auto& id : var_name_item.second) {
out_vars.emplace_back(var_scope.var_list[id]);
}
outs_map.emplace(var_name_item.first, std::move(out_vars));
}
RuntimeContext runtime_context({}, {});
runtime_context.inputs.swap(ins_map);
runtime_context.outputs.swap(outs_map);
RuntimeInferShapeContext infer_shape_ctx(*op_base, runtime_context);
static_cast<const framework::OperatorWithKernel*>(op_base)->InferShape(
&infer_shape_ctx);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
Scope scope;
auto exec_context =
ExecutionContext(*op_base, scope, *dev_ctx, runtime_context);
instr_node.kernel_func_.compute_func_(exec_context);
}
void exec_instruction_list(const std::vector<Instruction>& vec_instr,
const VariableScope& var_scope,
const platform::Place& place) {
std::queue<size_t> working_queue;
auto working_dependecy_count = dependecy_count_;
for (size_t i = 0; i < dependecy_count_.size(); ++i) {
if (dependecy_count_[i] == 0) {
working_queue.push(i);
}
}
auto working_var_ref = global_scope.vec_meta_info_;
size_t run_op_number = 0;
while (!working_queue.empty()) {
auto instr_id = working_queue.front();
working_queue.pop();
auto& instr_node = vec_instr[instr_id];
run_instr(instr_node, var_scope, place);
auto& next_instr = instr_node.next_instruction_.direct_run_;
++run_op_number;
for (auto next_i : next_instr) {
--working_dependecy_count[next_i];
if (working_dependecy_count[next_i] == 0) {
working_queue.push(next_i);
}
}
// GC infomation
auto& gc_check_list = instr_node.gc_check_var_list;
for (auto var_id : gc_check_list) {
--working_var_ref[var_id].var_ref_count_;
}
}
for (size_t i = 0; i < working_var_ref.size(); ++i) {
if (working_var_ref[i].var_ref_count_ != 0) {
cerr << " var ref is not zero " << i << endl;
}
}
}
const platform::Place& place_;
const ProgramDesc& prog_;
paddle::framework::VariableScope global_scope;
std::vector<paddle::framework::OpFuncNode> vec_func_list;
std::vector<paddle::framework::OperatorBase*> op_list;
bool is_build_;
std::vector<Instruction> vec_instruction_;
InstructionInfo instruction_info_;
std::vector<size_t> dependecy_count_;
std::vector<VariableMetaInfo> ref_coun_info;
std::vector<std::vector<size_t>> input_var2op_info_;
Scope* outer_scope_;
};
}  // namespace framework
}  // namespace paddle
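The scheduling in ExecuteInstructionList above is a plain ready-queue traversal of the instruction DAG: dependecy_count_ holds the number of unfinished predecessors of each instruction, and next_instruction_.direct_run_ lists the successors to notify once it finishes. A self-contained sketch of the same idea, detached from the framework types (the Instr struct and its run callback are illustrative stand-ins, not part of the diff):

#include <cstddef>
#include <functional>
#include <queue>
#include <vector>

// Illustrative stand-in for the Instruction/NextInstruction pair in this diff.
struct Instr {
  std::function<void()> run;   // the compute kernel of this instruction
  std::vector<size_t> next;    // successor instruction ids (direct_run_)
};

// Runs every instruction once all of its predecessors have finished,
// mirroring the loop in InterpreterCore::ExecuteInstructionList.
void ExecuteByDependencyCount(const std::vector<Instr>& instrs,
                              std::vector<size_t> dep_count /* copied */) {
  std::queue<size_t> ready;
  for (size_t i = 0; i < dep_count.size(); ++i) {
    if (dep_count[i] == 0) ready.push(i);  // ops with no pending inputs
  }
  while (!ready.empty()) {
    size_t id = ready.front();
    ready.pop();
    instrs[id].run();
    for (size_t nxt : instrs[id].next) {
      if (--dep_count[nxt] == 0) ready.push(nxt);  // all predecessors done
    }
  }
}

The real implementation layers the per-variable var_ref_count_ bookkeeping on top of this loop so that garbage-collection candidates can be detected as instructions retire.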
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/new_executor/interpretercore_util.h"
#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace framework {
class InterpreterCore {
public:
InterpreterCore(const platform::Place& place, const ProgramDesc& main_prog,
VariableScope* global_scope,
const std::vector<std::string>& feed_names,
const std::vector<std::string>& fetch_names);
void Run(const std::vector<framework::Tensor>& feed_tensors,
std::vector<framework::Tensor>* fetch_tensors);
static void BuildOpFuncList(const platform::Place& place,
const framework::ProgramDesc& pdesc,
std::vector<OperatorBase*>* op_list,
std::vector<OpFuncNode>* vec_func_list,
VariableScope* var_scope);
private:
void Convert();
void RunInstruction(const Instruction& instr_node,
const VariableScope& var_scope,
const platform::Place& place);
void ExecuteInstructionList(const std::vector<Instruction>& vec_instr,
const VariableScope& var_scope,
const platform::Place& place);
std::vector<size_t> MergeVector(const std::vector<size_t>& first,
const std::vector<size_t>& second);
void BuildVariableScope(const framework::ProgramDesc& pdesc,
VariableScope* var_scope);
const platform::Place& place_;
const ProgramDesc& main_program_;
VariableScope* global_scope_;
std::vector<VariableMetaInfo> vec_meta_info_;
std::vector<paddle::framework::OpFuncNode> vec_func_list_;
std::vector<paddle::framework::OperatorBase*> op_list_;
std::vector<Instruction> vec_instruction_;
InstructionInfo instruction_info_;
std::vector<size_t> dependecy_count_;
std::vector<VariableMetaInfo> ref_coun_info_;
std::vector<std::vector<size_t>> input_var2op_info_;
bool is_build_;
std::vector<std::string> feed_names_;
std::vector<std::string> fetch_names_;
};
} // namespace framework
} // namespace paddle
@@ -13,7 +13,7 @@
// limitations under the License.

/*************************************************************************
  > File Name: interpretercore_util.h
  > Author: guanshanshan@baidu.com
  > Created Time: Fri 23 Jul 2021 06:19:19 AM UTC
 ************************************************************************/
...
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace framework {
using OpKernelComputeFunc = std::function<void(const ExecutionContext&)>;
using OpKernelMap =
std::unordered_map<OpKernelType, OpKernelComputeFunc, OpKernelType::Hash>;
struct OpKernelFunc {
OpKernelComputeFunc compute_func_;
OperatorBase* operator_base_;
};
struct VariableMetaInfo {
int var_ref_count_;
};
struct VariableScope {
std::vector<Variable*> var_list;
std::map<std::string, int> name2id;
};
struct NextInstruction {
std::vector<size_t> direct_run_;
};
struct EventInter {};
struct InstructionInfo {
std::vector<size_t> dependecy_count_;
};
struct EventRun {
EventInter event_inter;
std::vector<size_t> same_device_run_;
std::vector<size_t> synchronized_run;
};
struct Instruction {
OpKernelFunc kernel_func_;
std::map<std::string, std::vector<int>> input_index_;
std::map<std::string, std::vector<int>> output_index_;
std::vector<size_t> gc_check_var_list;
NextInstruction next_instruction_;
std::vector<EventInter> vec_event_list_;
};
struct OpFuncNode {
// int unsed;
std::map<std::string, std::vector<int>> input_index;
std::map<std::string, std::vector<int>> output_index;
OpKernelComputeFunc kernel_func_;
};
} // namespace framework
} // namespace paddle
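VariableScope above pairs a flat var_list with a name2id index, and the Convert()/RunInstruction() code relies on the two staying in sync: an id stored in input_index_/output_index_ must index var_list. A minimal sketch of that invariant, assuming these definitions are available via the header path used elsewhere in this diff (the RegisterVar helper is hypothetical, not part of the commit):

#include <string>

#include "paddle/fluid/framework/new_executor/new_executor_defs.h"

// Hypothetical helper: register a named variable and keep name2id/var_list
// in sync, the same bookkeeping InterpreterCore::BuildVariableScope performs
// per block variable (the real code also calls InitializeVariable with the
// variable's type).
int RegisterVar(paddle::framework::VariableScope* scope,
                const std::string& name) {
  auto it = scope->name2id.find(name);
  if (it != scope->name2id.end()) return it->second;  // already registered
  int id = static_cast<int>(scope->var_list.size());
  scope->name2id[name] = id;
  scope->var_list.push_back(new paddle::framework::Variable());
  return id;
}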
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/new_executor/standalone_executor.h"
namespace paddle {
namespace framework {
StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
const ProgramDesc& startup_prog,
const ProgramDesc& main_prog,
Scope* scope)
: place_(place),
startup_prog_(startup_prog),
main_prog_(main_prog),
outer_scope_(scope) {
paddle::framework::InitDevices();
// init scope
BuildVariableOuterScope(startup_prog, &global_scope_, scope);
if (outer_scope_ != nullptr) {
auto name_list = outer_scope_->LocalVarNames();
for (auto name : name_list) {
auto v = outer_scope_->Var(name);
if (global_scope_.name2id.find(name) == global_scope_.name2id.end()) {
global_scope_.name2id[name] = global_scope_.var_list.size();
}
global_scope_.var_list.push_back(v);
}
}
// run startup program
std::vector<paddle::framework::OpFuncNode> vec_func_list;
std::vector<paddle::framework::OperatorBase*> op_list;
InterpreterCore::BuildOpFuncList(place_, startup_prog, &op_list,
&vec_func_list, &global_scope_);
}
int StandaloneExecutor::Run(const std::vector<std::string>& feed_names,
const std::vector<framework::Tensor>& feed_tensors,
const std::vector<std::string>& fetch_names,
std::vector<framework::Tensor>* fetch_tensors) {
auto core = GetInterpreterCore(feed_names, fetch_names);
core->Run(feed_tensors, fetch_tensors);
return 0;
}
void StandaloneExecutor::BuildVariableOuterScope(
const framework::ProgramDesc& pdesc, VariableScope* var_scope,
Scope* outer_scope) {
auto& global_block = pdesc.Block(0);
for (auto& var : global_block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var_scope->name2id.find(var->Name()) == var_scope->name2id.end()) {
var_scope->name2id[var->Name()] = var_scope->var_list.size();
auto v = outer_scope->Var(var->Name());
InitializeVariable(v, var->GetType());
var_scope->var_list.push_back(v);
}
}
}
std::shared_ptr<InterpreterCore> StandaloneExecutor::GetInterpreterCore(
const std::vector<std::string>& feed_names,
const std::vector<std::string>& fetch_names) {
std::ostringstream oss;
oss << "feed:";
for (auto& feedname : feed_names) {
oss << feedname << ",";
}
oss << "fetch:";
for (auto& fetchname : fetch_names) {
oss << fetchname << ",";
}
auto iter = interpretercores_.find(oss.str());
if (iter == interpretercores_.end()) {
auto core = std::make_shared<InterpreterCore>(
place_, main_prog_, &global_scope_, feed_names, fetch_names);
interpretercores_.emplace(oss.str(), core);
return core;
} else {
return iter->second;
}
}
} // namespace framework
} // namespace paddle
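GetInterpreterCore above keys its cache on the concatenated feed and fetch names, so each distinct feed/fetch signature gets its own InterpreterCore instance. A stripped-down sketch of that keying scheme (the BuildKey helper is illustrative, not part of the diff):

#include <sstream>
#include <string>
#include <vector>

// Mirrors the cache key built in StandaloneExecutor::GetInterpreterCore,
// e.g. BuildKey({"a"}, {"c"}) == "feed:a,fetch:c,".
std::string BuildKey(const std::vector<std::string>& feed_names,
                     const std::vector<std::string>& fetch_names) {
  std::ostringstream oss;
  oss << "feed:";
  for (const auto& name : feed_names) oss << name << ",";
  oss << "fetch:";
  for (const auto& name : fetch_names) oss << name << ",";
  return oss.str();
}

Two runs with the same feeds and fetches therefore reuse one cached InterpreterCore, while a new combination triggers a fresh build.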
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/new_executor/interpretercore.h"
namespace paddle {
namespace framework {
class ExecutorBase {
public:
virtual ~ExecutorBase() {}
virtual int Run(const std::vector<std::string>& feed_names,
const std::vector<framework::Tensor>& feed_tensors,
const std::vector<std::string>& fetch_names,
std::vector<framework::Tensor>* fetch_tensors) = 0;
};
class StandaloneExecutor : public ExecutorBase {
public:
StandaloneExecutor(const platform::Place& place,
const ProgramDesc& startup_prog,
const ProgramDesc& main_prog, Scope* scope);
~StandaloneExecutor() {}
virtual int Run(const std::vector<std::string>& feed_names,
const std::vector<framework::Tensor>& feed_tensors,
const std::vector<std::string>& fetch_names,
std::vector<framework::Tensor>* fetch_tensors);
private:
void BuildVariableOuterScope(const framework::ProgramDesc& pdesc,
VariableScope* var_scope, Scope* outer_scope);
std::shared_ptr<InterpreterCore> GetInterpreterCore(
const std::vector<std::string>& feed_names,
const std::vector<std::string>& fetch_names);
const platform::Place& place_;
const ProgramDesc& startup_prog_;
const ProgramDesc& main_prog_;
Scope* outer_scope_;
VariableScope global_scope_;
std::unordered_map<std::string, std::shared_ptr<InterpreterCore>>
interpretercores_;
};
} // namespace framework
} // namespace paddle
@@ -21,68 +21,44 @@
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/new_executor/standalone_executor.h"

paddle::framework::ProgramDesc load_from_file(const std::string& file_name) {
  std::ifstream fin(file_name, std::ios::in | std::ios::binary);
  fin.seekg(0, std::ios::end);
  std::string buffer(fin.tellg(), ' ');
  fin.seekg(0, std::ios::beg);
  fin.read(&buffer[0], buffer.size());
  fin.close();

  paddle::framework::ProgramDesc program_desc(buffer);
  return program_desc;
}

int main() {
  paddle::framework::InitDevices();

  auto place = paddle::platform::CUDAPlace(0);
  auto test_prog = load_from_file("lm_startup_program");
  auto main_prog = load_from_file("lm_main_program");

  paddle::framework::Scope scope;
  paddle::framework::StandaloneExecutor exec(place, test_prog, main_prog,
                                             &scope);

  auto start = std::chrono::steady_clock::now();
  for (size_t i = 0; i < 2320; ++i) {
    if (i % 200 == 0) {
      std::cout << i << std::endl;
    }

    std::vector<paddle::framework::Tensor> vec_out;
    exec.Run({}, {}, {}, &vec_out);
  }
  auto end = std::chrono::steady_clock::now();
  std::chrono::duration<double> diff = end - start;

  std::cout << "time cost " << diff.count() << std::endl;

  return 1;
}
@@ -200,6 +200,7 @@ if(WITH_PYTHON)
  endif(WIN32)

  add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file})

  list(APPEND PYBIND_DEPS interpretercore standalone_executor)
  cc_library(paddle_pybind SHARED
    SRCS ${PYBIND_SRCS}
    DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
...
@@ -42,7 +42,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/new_executor/standalone_executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
@@ -1945,30 +1945,30 @@ All parameter, weight, gradient are variables in Paddle.
                     fetch_vars);
           });

  py::class_<framework::StandaloneExecutor>(m, "StandaloneExecutor")
      .def(py::init<const platform::Place &, const ProgramDesc &,
                    const ProgramDesc &, Scope *>())
      .def("run",
           [](StandaloneExecutor &self,
              const std::unordered_map<std::string, py::array> &input_dict,
              std::vector<std::string> fetch_names) {
             pybind11::gil_scoped_release release;

             std::vector<framework::Tensor> feed_tensors;
             std::vector<std::string> feed_names;

             for (auto &item : input_dict) {
               framework::LoDTensor t;
               SetTensorFromPyArray<platform::CPUPlace>(
                   &t, item.second, platform::CPUPlace(), false);
               feed_names.push_back(item.first);
               feed_tensors.push_back(t);
             }

             std::vector<framework::Tensor> fetch_tensors;
             self.Run(feed_names, feed_tensors, fetch_names, &fetch_tensors);
             std::vector<py::array> vec_ret;
             for (size_t i = 0; i < fetch_tensors.size(); ++i) {
               vec_ret.push_back(TensorToPyArray(fetch_tensors[i], true));
             }
             return vec_ret;
           });
...
@@ -15,7 +15,7 @@
import unittest
import paddle
from paddle.fluid import core
from paddle.fluid.core import StandaloneExecutor
import numpy as np
@@ -37,19 +37,25 @@ class LinearTestCase(unittest.TestCase):
        startup_program = paddle.fluid.default_startup_program()
        p = core.Place()
        p.set_place(self.place)
        standaloneexecutor = StandaloneExecutor(p, startup_program.desc,
                                                main_program.desc, core.Scope())

        out = standaloneexecutor.run({
            "a": np.ones(
                [2, 2], dtype="float32") * 2
        }, [c.name])

        for i in range(10):
            out = standaloneexecutor.run({
                "a": np.ones(
                    [2, 2], dtype="float32") * i
            }, [c.name])

        for i in range(10):
            out = standaloneexecutor.run({
                "a": np.ones(
                    [2, 2], dtype="float32") * i
            }, [a.name, c.name])


if __name__ == "__main__":
    unittest.main()