Unverified commit 2ca3d3f7, authored by zhangbo9674, committed by GitHub

[new executor]Support CINN use InterpreterCore (#48911)

* cinn use interpretercore

* fix bug

* fix compile bug

* fix scope bug

* refine code

* refine code by comment

* refine code by comment
Parent 0839bba3
......@@ -622,7 +622,8 @@ void BuildOpFuncList(const platform::Place& place,
// NOTE(Ruibiao): We do not encourage directly using scope in OP kernel.
// But some OPs do have such behavior (e.g., cinn_launch OP). Here
// special treatment for them.
if (op_with_kernel->Type() == "cinn_launch") {
if (op_with_kernel->Type() == "cinn_launch" ||
op_with_kernel->Type() == "cinn_instruction_run") {
VLOG(6) << "OP(" << op_with_kernel->Type()
<< ") use scope in kernel, "
"so pass a real scope to "
......
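Both `cinn_launch` and `cinn_instruction_run` read variables straight from the enclosing `Scope` inside their kernels, so the executor has to hand them the real scope instead of its transient local working scope. A minimal sketch of that access pattern, assuming the usual `framework::OpKernel` interface (the kernel body is illustrative, not the actual cinn_launch implementation):

```cpp
#include "paddle/fluid/framework/op_registry.h"

// Illustrative only: a kernel that touches the Scope directly must see the
// scope the executor really runs in, otherwise FindVar() misses variables
// created outside the local working scope.
template <typename T>
class ScopeUsingOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const framework::Scope& scope = ctx.scope();  // scope passed by the executor
    auto* var = scope.FindVar("some_var");        // hypothetical variable name
    (void)var;
  }
};
```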
......@@ -108,7 +108,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
const std::set<std::string>& skip_gc_vars,
framework::Scope* scope,
bool used_for_jit,
bool used_for_control_flow_op)
bool used_for_control_flow_op,
bool used_for_cinn)
: place_(place),
block_(block),
execution_config_(place, block.OpSize()),
......@@ -121,9 +122,9 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
execution_config_.used_for_jit = used_for_jit;
execution_config_.used_for_control_flow_op = used_for_control_flow_op;
execution_config_.create_local_scope = !used_for_jit &&
FLAGS_new_executor_use_local_scope &&
!used_for_control_flow_op;
execution_config_.create_local_scope =
!used_for_jit && FLAGS_new_executor_use_local_scope &&
!used_for_control_flow_op && !used_for_cinn;
execution_config_.skip_gc_vars = skip_gc_vars;
execution_config_.Log(/*log_level=*/8);
......@@ -425,8 +426,9 @@ void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) {
}
// set runtime_ctx and infershape_ctx_
if (instr_node->OpBase()->Type() == "cinn_launch") { // OP use scope in
// kernel
if (instr_node->OpBase()->Type() == "cinn_launch" ||
instr_node->OpBase()->Type() == "cinn_instruction_run") { // OP use scope
// in kernel
Scope* local_scope = HasLocalScope() ? var_scope_.GetMutableLocalScope()
: var_scope_.GetMutableScope();
instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope);
......
......@@ -46,7 +46,8 @@ class InterpreterCore {
const std::set<std::string>& skip_gc_vars,
Scope* scope,
bool used_for_jit = false,
bool used_for_control_flow_op = false);
bool used_for_control_flow_op = false,
bool used_for_cinn = false);
~InterpreterCore();
......
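For reference, a hedged construction sketch using the new parameter, mirroring the call site added in `CinnLaunchContext::InitializeInterpreterCore` further down this diff (`place`, `program_desc`, `skip_gc_vars`, and `scope` stand in for the caller's own objects):

```cpp
// The trailing flag tells InterpreterCore not to create a local working
// scope, matching the create_local_scope change above.
auto core = std::make_unique<framework::InterpreterCore>(
    place,
    program_desc->Block(0),
    skip_gc_vars,
    scope,
    /*used_for_jit=*/false,
    /*used_for_control_flow_op=*/false,
    /*used_for_cinn=*/true);
core->Run({});  // run with no feed names
```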
......@@ -15,6 +15,7 @@ cc_library(
build_strategy
device_context
parallel_executor
standalone_executor
transform_type
cinn)
......
......@@ -88,16 +88,15 @@ class TestCinnInstructionRunOp : public ::testing::Test {
cinn_launch_op->Run(scope, place);
}
void RunAndCheck(const platform::Place& place) {
void RunAndCheck(const platform::Place& place, framework::Scope* scope) {
// Run ops and check the computation results
framework::Scope scope;
InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, &scope);
scope.Var(test_op_out_name)->GetMutable<phi::DenseTensor>();
scope.Var(add_op_out_name)->GetMutable<phi::DenseTensor>();
elementwise_add_op->Run(scope, place);
cinn_launch_op->Run(scope, place);
CompareOpResult<float>(scope.GetVar(test_op_out_name),
scope.GetVar(add_op_out_name));
InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, scope);
scope->Var(test_op_out_name)->GetMutable<phi::DenseTensor>();
scope->Var(add_op_out_name)->GetMutable<phi::DenseTensor>();
elementwise_add_op->Run(*scope, place);
cinn_launch_op->Run(*scope, place);
CompareOpResult<float>(scope->GetVar(test_op_out_name),
scope->GetVar(add_op_out_name));
}
void TearDown() override { CinnCompiler::GetInstance()->Clear(); }
......@@ -106,17 +105,21 @@ class TestCinnInstructionRunOp : public ::testing::Test {
TEST_F(TestCinnInstructionRunOp, CPU) {
platform::CPUPlace place;
Compile(place);
RunAndCheck(place);
framework::Scope scope1;
RunAndCheck(place, &scope1);
// the second run on the same place is to check the cache logic
RunAndCheck(place);
framework::Scope scope2;
RunAndCheck(place, &scope2);
}
#ifdef PADDLE_WITH_CUDA
TEST_F(TestCinnInstructionRunOp, GPU) {
platform::CUDAPlace place;
Compile(place);
RunAndCheck(place);
RunAndCheck(place);
framework::Scope scope1;
RunAndCheck(place, &scope1);
framework::Scope scope2;
RunAndCheck(place, &scope2);
}
#endif
......
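The fresh scope per run is deliberate (a reading of the test change, not stated in the diff): with the InterpreterCore now cached inside `CinnLaunchContext`, two distinct scopes exercise the `scope != cached_scope_` rebind branch rather than a pure cache hit:

```cpp
// Sketch of the pattern used in both TEST_F bodies above.
framework::Scope scope1, scope2;
RunAndCheck(place, &scope1);  // first run builds and caches the executor
RunAndCheck(place, &scope2);  // second run rebinds it via reset_scope
```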
......@@ -88,8 +88,9 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph,
}
// Convert the CINN runtime program to a Paddle graph
runtime_graph_ = std::make_unique<framework::ir::Graph>(
BuildCompiledProgram(graph, compiled_obj));
runtime_program_desc_ = BuildCompiledProgram(graph, compiled_obj);
runtime_graph_ =
std::make_unique<framework::ir::Graph>(*runtime_program_desc_.get());
auto& outer_varinfo = graph.Get<Name2VarInfoMap>(kMemOptVarInfoFromMainGraph);
runtime_graph_->SetNotOwned<Name2VarInfoMap>(kMemOptVarInfoFromMainGraph,
&outer_varinfo);
......@@ -100,6 +101,7 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph,
// that means it can be erased after graph execution
if (!outer_varinfo.count(var_name)) {
skip_eager_vars_.emplace_back(var_name);
skip_gc_vars_.insert(var_name);
}
};
std::for_each(
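The context now keeps the same "do not free yet" variables in two forms because the two backends consume them differently: `skip_eager_vars_` (a vector) drives ParallelExecutor's eager deletion, while the new `skip_gc_vars_` (a set) is handed to InterpreterCore at construction. Both call sites appear later in this diff; a condensed sketch:

```cpp
// ParallelExecutor path: skip_eager_vars_ via the accessor.
pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
// InterpreterCore path: skip_gc_vars_ goes in through the constructor.
framework::InterpreterCore core(
    place, runtime_program_desc_->Block(0), skip_gc_vars_, scope);
```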
......@@ -313,12 +315,14 @@ void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) {
});
}
framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram(
std::unique_ptr<framework::ProgramDesc> CinnLaunchContext::BuildCompiledProgram(
const framework::ir::Graph& graph, const CinnCompiledObject& compiled_obj) {
CinnRuntimeProgram* runtime_program = compiled_obj.runtime_program.get();
// Step 0: Create an empty program_desc, there will be only one block
framework::ProgramDesc program_desc;
auto* block = program_desc.MutableBlock(0);
// framework::ProgramDesc program_desc;
std::unique_ptr<framework::ProgramDesc> program_desc(
new framework::ProgramDesc());
auto* block = program_desc->MutableBlock(0);
const std::vector<std::unique_ptr<CinnInstruction>>& instructions =
runtime_program->GetRunInstructions();
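The return type moves from a by-value `ProgramDesc` to a `unique_ptr` because the desc can no longer be a temporary (an inference from this diff rather than a stated rationale): `Block(0)` of that desc is what InterpreterCore later executes by reference, so the context stores it in the new `runtime_program_desc_` member:

```cpp
// Mirrors the constructor change above; the stored desc now outlives both
// the graph conversion and the cached InterpreterCore.
runtime_program_desc_ = BuildCompiledProgram(graph, compiled_obj);
runtime_graph_ =
    std::make_unique<framework::ir::Graph>(*runtime_program_desc_.get());
```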
......@@ -445,6 +449,46 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place,
return parallel_executor_.get();
}
framework::InterpreterCore* CinnLaunchContext::InitializeInterpreterCore(
const platform::Place& place, framework::Scope* scope) {
if (!interpreter_core_ || scope != cached_scope_) {
VLOG(1) << "interpreter_core_ is null or scope != cached_scope_: "
"interpreter_core_: "
<< interpreter_core_.get() << "; scope: " << scope
<< "; cached_scope_: " << cached_scope_;
for (auto&& var_name : internal_var_names_) {
auto* var = scope->FindVar(var_name);
if (var != nullptr) {
continue;
}
framework::InitializeVariable(scope->Var(var_name),
framework::proto::VarType::LOD_TENSOR);
}
if (!interpreter_core_) {
interpreter_core_ = std::make_unique<framework::InterpreterCore>(
place,
runtime_program_desc_->Block(0),
skip_gc_vars_,
scope,
/*used_for_jit*/ false,
/*used_for_control_flow_op*/ false,
/*used_for_cinn*/ true);
} else {
interpreter_core_->reset_scope(scope);
}
UpdateCapturedEnv(*scope, place);
}
for (auto&& var_name : initialized_beforehand_vars_) {
auto* var = scope->GetVar(var_name);
auto* buffer = GetCinnBufferOfVar(var_name);
auto dim = framework::DDim(buffer->dims, buffer->dimensions);
var->GetMutable<phi::DenseTensor>()->Resize(dim);
var->GetMutable<phi::DenseTensor>()->mutable_data(
place, framework::paddle2cinn::TransToPaddleDataType(buffer->type));
}
return interpreter_core_.get();
}
cinn_buffer_t* CinnLaunchContext::GetCinnBufferOfVar(
const std::string& var_name) {
auto it = paddle2cinn_varmap_.find(var_name);
......
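Taken together, `InitializeInterpreterCore` is a build-once, rebind-on-scope-change cache: missing internal variables are created in the incoming scope, the core is constructed on first use, and later calls with a different scope only call `reset_scope`. A hedged usage sketch of repeated launches (`scope1`/`scope2` are illustrative):

```cpp
auto* core = launch_context->InitializeInterpreterCore(place, &scope1);
core->Run({});  // first launch: core constructed, scope1 captured
core = launch_context->InitializeInterpreterCore(place, &scope2);
core->Run({});  // same cached core, rebound to scope2 via reset_scope
```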
......@@ -22,6 +22,7 @@
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/new_executor/interpretercore.h"
#include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/core/ddim.h"
......@@ -74,6 +75,9 @@ class CinnLaunchContext {
framework::ParallelExecutor* InitializePE(const platform::Place& place,
framework::Scope* scope);
framework::InterpreterCore* InitializeInterpreterCore(
const platform::Place& place, framework::Scope* scope);
// explicitly update several environment variables captured
// by callback of execution arguments
void UpdateCapturedEnv(const framework::Scope& scope,
......@@ -132,7 +136,7 @@ class CinnLaunchContext {
// Construct a Paddle ProgramDesc with the CINN runtime
// instructions included in the compiled CINN Program
framework::ProgramDesc BuildCompiledProgram(
std::unique_ptr<framework::ProgramDesc> BuildCompiledProgram(
const framework::ir::Graph& graph,
const CinnCompiledObject& compiled_obj);
......@@ -155,6 +159,10 @@ class CinnLaunchContext {
// the variable scope compiled from cinn
const std::shared_ptr<CinnScope> cinn_scope_;
std::unique_ptr<framework::ProgramDesc> runtime_program_desc_;
std::unique_ptr<framework::InterpreterCore> interpreter_core_;
std::set<std::string> skip_gc_vars_;
// the ir::Graph object converted from the program compiled by CINN
std::unique_ptr<framework::ir::Graph> runtime_graph_;
// a ParallelExecutor to execute the runtime graph
......
......@@ -31,6 +31,7 @@
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(enable_pe_launch_cinn);
DECLARE_bool(enable_interpretercore_launch_cinn);
namespace paddle {
namespace operators {
......@@ -135,12 +136,21 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
// Step 4. Execute the compiled CINN instructions by a PE or
// by the CINN compiled program in sequential order
if (FLAGS_enable_pe_launch_cinn) {
platform::RecordEvent record_event_4(
"Step 4. Execute the runtime graph by PE.");
VLOG(4) << "Execute the runtime graph by PE";
framework::Scope& exec_scope = scope.NewScope();
auto* pe = launch_context->InitializePE(place, &exec_scope);
pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
if (FLAGS_enable_interpretercore_launch_cinn) {
platform::RecordEvent record_event_4(
"Step 4. Execute the runtime program by InterpreterCore.");
VLOG(4) << "Execute the runtime program by InterpreterCore";
auto* interpreter_core = launch_context->InitializeInterpreterCore(
place, const_cast<framework::Scope*>(&scope));
interpreter_core->Run({});
} else {
platform::RecordEvent record_event_4(
"Step 4. Execute the runtime graph by PE.");
VLOG(4) << "Execute the runtime graph by PE";
framework::Scope& exec_scope = scope.NewScope();
auto* pe = launch_context->InitializePE(place, &exec_scope);
pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
}
} else {
platform::RecordEvent record_event_4(
"Step 4. Execute the compiled executable program.");
......
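Note the nesting: `FLAGS_enable_interpretercore_launch_cinn` is only consulted when `FLAGS_enable_pe_launch_cinn` is true, which is why the tests below disable both flags to reach CINN's own runtime. Condensed from the kernel above:

```cpp
if (FLAGS_enable_pe_launch_cinn) {
  if (FLAGS_enable_interpretercore_launch_cinn) {
    // run the compiled program with InterpreterCore
  } else {
    // run the runtime graph with ParallelExecutor
  }
} else {
  // run the CINN compiled program instruction-by-instruction
}
```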
......@@ -36,6 +36,7 @@ USE_OP(cinn_instruction_run);
USE_OP_ITSELF(elementwise_add);
DECLARE_double(eager_delete_tensor_gb);
DECLARE_bool(enable_pe_launch_cinn);
DECLARE_bool(enable_interpretercore_launch_cinn);
DECLARE_bool(enable_cinn_auto_tune);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
......@@ -74,31 +75,34 @@ class TestCinnLaunchOp : public ::testing::Test {
{{}});
}
void RunAndCheck(const platform::Place& place) {
void RunAndCheck(const platform::Place& place, framework::Scope* scope) {
// Run ops and check the computation results
framework::Scope scope;
InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, &scope);
scope.Var(test_op_out_name)->GetMutable<phi::DenseTensor>();
scope.Var(add_op_out_name)->GetMutable<phi::DenseTensor>();
elementwise_add_op->Run(scope, place);
cinn_launch_op->Run(scope, place);
CompareOpResult<float>(scope.GetVar(test_op_out_name),
scope.GetVar(add_op_out_name));
InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, scope);
scope->Var(test_op_out_name)->GetMutable<phi::DenseTensor>();
scope->Var(add_op_out_name)->GetMutable<phi::DenseTensor>();
elementwise_add_op->Run(*scope, place);
cinn_launch_op->Run(*scope, place);
CompareOpResult<float>(scope->GetVar(test_op_out_name),
scope->GetVar(add_op_out_name));
}
void TearDown() override { CinnCompiler::GetInstance()->Clear(); }
};
TEST_F(TestCinnLaunchOp, TestRunCPUInstructionByPE) {
RunAndCheck(platform::CPUPlace());
framework::Scope scope1;
RunAndCheck(platform::CPUPlace(), &scope1);
// the second run on the same place is to check the cache logic
RunAndCheck(platform::CPUPlace());
framework::Scope scope2;
RunAndCheck(platform::CPUPlace(), &scope2);
}
#ifdef PADDLE_WITH_CUDA
TEST_F(TestCinnLaunchOp, TestRunGPUInstructionByPE) {
RunAndCheck(platform::CUDAPlace());
RunAndCheck(platform::CUDAPlace());
framework::Scope scope1;
RunAndCheck(platform::CUDAPlace(), &scope1);
framework::Scope scope2;
RunAndCheck(platform::CUDAPlace(), &scope2);
}
#endif
......@@ -106,9 +110,11 @@ TEST_F(TestCinnLaunchOp, TestRunCPUInstructionByCinnProgram) {
// set FLAGS_enable_pe_launch_cinn=false to switch to use
// default scheduler of CINN to execute the compiled program
FLAGS_enable_pe_launch_cinn = false;
RunAndCheck(platform::CPUPlace());
RunAndCheck(platform::CPUPlace());
FLAGS_enable_interpretercore_launch_cinn = false;
framework::Scope scope1;
RunAndCheck(platform::CPUPlace(), &scope1);
framework::Scope scope2;
RunAndCheck(platform::CPUPlace(), &scope2);
}
#ifdef PADDLE_WITH_CUDA
......@@ -116,8 +122,11 @@ TEST_F(TestCinnLaunchOp, TestRunGPUInstructionByCinnProgram) {
// set FLAGS_enable_pe_launch_cinn=false to switch to use
// default scheduler of CINN to execute the compiled program
FLAGS_enable_pe_launch_cinn = false;
RunAndCheck(platform::CUDAPlace());
RunAndCheck(platform::CUDAPlace());
FLAGS_enable_interpretercore_launch_cinn = false;
framework::Scope scope1;
RunAndCheck(platform::CUDAPlace(), &scope1);
framework::Scope scope2;
RunAndCheck(platform::CUDAPlace(), &scope2);
}
#endif
......@@ -125,8 +134,10 @@ TEST_F(TestCinnLaunchOp, TestRunWithAutoTuneEnabled) {
FLAGS_enable_cinn_auto_tune = true;
// currently only check on cpu, will add a test for gpu after CINN ready
RunAndCheck(platform::CPUPlace());
RunAndCheck(platform::CPUPlace());
framework::Scope scope1;
RunAndCheck(platform::CPUPlace(), &scope1);
framework::Scope scope2;
RunAndCheck(platform::CPUPlace(), &scope2);
}
namespace details {
......
......@@ -978,6 +978,20 @@ PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn,
"It controls whether to execute cinn compiled "
"program with ParallelExecutor");
/*
* CINN related FLAG
* Name: FLAGS_enable_interpretercore_launch_cinn
* Since Version: 2.4
* Value Range: bool, default=true
* Example: FLAGS_enable_interpretercore_launch_cinn=true would execute the CINN
* compiled instructions of a paddle graph with InterpreterCore, otherwise with
* the CINN compiled runtime program in sequential order.
*/
PADDLE_DEFINE_EXPORTED_bool(enable_interpretercore_launch_cinn,
true,
"It controls whether to execute cinn compiled "
"program with InterpreterCore");
/*
* CINN related FLAG
* Name: FLAGS_enable_cinn_auto_tune
......
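For completeness, this is how the tests above opt out at runtime; the flags are process-global gflags, so a `DECLARE_bool` makes them assignable from any translation unit:

```cpp
DECLARE_bool(enable_pe_launch_cinn);
DECLARE_bool(enable_interpretercore_launch_cinn);

// As in the cinn_launch_op tests: fall back to CINN's own sequential
// runtime by disabling both executor paths.
FLAGS_enable_pe_launch_cinn = false;
FLAGS_enable_interpretercore_launch_cinn = false;
```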