From 4d042a83732b8c2d4ff9abfd3e103b6f0799831d Mon Sep 17 00:00:00 2001 From: TeFeng Chen Date: Thu, 24 Feb 2022 17:14:46 +0800 Subject: [PATCH] build a Paddle Graph from CINN compiled program for execution with PE (#39724) * build a Paddle Graph from CINN compiled program for execution with PE * update names of some variables * fix random fail in build_cinn_pass_test and update some comments * fix compiler error by merging phi pr --- .../framework/paddle2cinn/build_cinn_pass.cc | 9 +- .../framework/paddle2cinn/build_cinn_pass.h | 7 + .../paddle2cinn/build_cinn_pass_test.cc | 4 +- .../framework/paddle2cinn/cinn_compiler.cc | 6 +- paddle/fluid/operators/cinn/CMakeLists.txt | 4 +- .../operators/cinn/cinn_launch_context.cc | 249 +++++++++++++----- .../operators/cinn/cinn_launch_context.h | 94 ++++--- .../cinn/cinn_launch_context_test.cc | 241 ++++++++++++----- paddle/fluid/operators/cinn/cinn_launch_op.h | 74 ++---- paddle/fluid/operators/cinn/test_helper.h | 12 + 10 files changed, 477 insertions(+), 223 deletions(-) diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index d55950064a4..6e55727c8bf 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -44,11 +44,6 @@ DECLARE_string(deny_cinn_ops); namespace paddle { namespace framework { - -namespace ir { -class MemOptVarInfo; -} // namespace ir - namespace paddle2cinn { using framework::ir::Graph; @@ -398,9 +393,7 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, kNoNeedBufferFeeds, no_need_buffer_feeds.release()); // initialize empty map for kMemOptVarInfoFromMainGraph attribute, // it will be filled on the share_mem_opt_info_to_subgraph pass - subgraph->GetOrInit>>( - kMemOptVarInfoFromMainGraph); + subgraph->GetOrInit(kMemOptVarInfoFromMainGraph); return subgraph; } diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h 
b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h index 8cb920831cc..a902eacde82 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h @@ -18,6 +18,10 @@ limitations under the License. */ namespace paddle { namespace framework { +namespace ir { +class MemOptVarInfo; +} // namespace ir + namespace paddle2cinn { constexpr char kCinnLaunchOp[] = "cinn_launch"; @@ -27,6 +31,9 @@ constexpr char kInternalVars[] = "InternalVars"; constexpr char kOutputVars[] = "OutputVars"; constexpr char kMemOptVarInfoFromMainGraph[] = "mem_opt_var_info_from_main_graph"; +using Name2VarInfoMap = + std::unordered_map>; // A pass named BuildCinnPass, the function of this pass is: // diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index 919fc60d4cb..bf9d1baaf39 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -255,7 +255,9 @@ TEST(BuildCinnPassTest, AllOpSupportCinn) { ASSERT_EQ( std::unordered_set(cinn_op->inputs.begin(), cinn_op->inputs.end()), std::unordered_set({v0, v1, v2, v4})); - ASSERT_EQ(cinn_op->outputs, std::vector({v6, v7})); + ASSERT_EQ(std::unordered_set(cinn_op->outputs.begin(), + cinn_op->outputs.end()), + std::unordered_set({v6, v7})); ASSERT_EQ(v1->outputs, std::vector({cinn_op})); ASSERT_EQ(v6->inputs, std::vector({cinn_op})); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 716cd85e711..706815185a1 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -248,10 +248,10 @@ std::unique_ptr CinnCompiler::CompileGraph( *compiled_obj = {std::move(graph_compiler), std::move(compiled_res.runtime_program), scope, symbol.var_model_to_program_map()}; - compiled_obj->launch_context = - 
std::make_unique( - compiled_obj->paddle2cinn_varmap, compiled_obj->scope); compiled_obj->cached_index = compiled_num; + compiled_obj->launch_context = + std::make_unique(graph, + *compiled_obj); return compiled_obj; } diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index a2fc080faad..f1247ebdf23 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -1,13 +1,13 @@ include(operators) cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) -cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope cinn) +cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy parallel_executor cinn) SET(CINN_OP_DEPS string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) register_operators(DEPS ${CINN_OP_DEPS}) if (WITH_TESTING) - cc_test(cinn_launch_context_test SRCS cinn_launch_context_test.cc DEPS ddim lod_tensor scope cinn_launch_context) + cc_test(cinn_launch_context_test SRCS cinn_launch_context_test.cc DEPS ddim lod_tensor scope proto_desc graph cinn_launch_context cinn_instruction_run_op cinn) set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN") SET(CINN_RUN_ENVIRONMENT "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda") diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 0b677f79f7f..0a21d937aa1 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -17,22 +17,39 @@ #include #include #include +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/instruction.h" #include "cinn/hlir/framework/scope.h" #include "cinn/hlir/framework/tensor.h" #include "cinn/runtime/cinn_runtime.h" +#include 
"paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/core/ddim.h" namespace paddle { namespace operators::details { -using LoDTensor = framework::LoDTensor; +using framework::Scope; +using framework::LoDTensor; +using framework::ParallelExecutor; +using CinnInstruction = ::cinn::hlir::framework::Instruction; +using CinnRuntimeProgram = ::cinn::hlir::framework::Program; +using framework::paddle2cinn::Name2VarInfoMap; +using framework::paddle2cinn::kMemOptVarInfoFromMainGraph; -CinnLaunchContext::CinnLaunchContext( - const std::unordered_map& paddle2cinn_varmap, - const std::shared_ptr& cinn_scope) - : cinn_scope_(cinn_scope) { - // generate all names of the cinn execution arguments +CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, + const CinnCompiledObject& compiled_obj) + : cinn_scope_(compiled_obj.scope) { + // collect all names of the CINN execution arguments auto var_names = cinn_scope_->var_names(); cinn_argument_names_.reserve(var_names.size()); std::transform( @@ -40,7 +57,42 @@ CinnLaunchContext::CinnLaunchContext( std::inserter(cinn_argument_names_, cinn_argument_names_.end()), [](const auto& name_view) { return std::string(name_view.data()); }); // build name map between the original variables and compiled ones - BuildVarNameMap(paddle2cinn_varmap, cinn_argument_names_); + BuildVarNameMap(compiled_obj.paddle2cinn_varmap, cinn_argument_names_); + + const auto& input_var_names = + 
graph.Get>(framework::paddle2cinn::kInputVars); + const auto& output_var_names = + graph.Get>(framework::paddle2cinn::kOutputVars); + internal_var_names_ = + ExtractInternalVarNames(input_var_names, output_var_names); + // check completeness of output variables in compiled result + for (auto&& var_name : output_var_names) { + PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, + platform::errors::PreconditionNotMet( + "Variable(%s) not applied in CINN", var_name)); + } + + // initialize all execution arguments + InitializeArguments(); + // DEPRECATED(CtfGo): following callback assignment will be deprecated soon + for (auto&& var_name : input_var_names) { + if (IsVariableUsed(var_name)) { + AssignExternalVariable(var_name); + } + } + for (auto&& var_name : output_var_names) { + AssignExternalVariable(var_name); + } + for (auto&& var_name : internal_var_names_) { + AssignInternalVariable(var_name); + } + + // Convert the CINN runtime program to a Paddle graph + runtime_graph_ = std::make_unique( + BuildCompiledProgram(graph, compiled_obj)); + runtime_graph_->SetNotOwned( + kMemOptVarInfoFromMainGraph, + &graph.Get(kMemOptVarInfoFromMainGraph)); } void CinnLaunchContext::BuildVarNameMap( @@ -94,21 +146,15 @@ void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope, << std::addressof(place); } -bool CinnLaunchContext::IsArgumentsInitialized() const { - if (hold_buffers_.empty() || name2argument_.empty()) { - return false; - } - return true; -} - bool CinnLaunchContext::IsVariableUsed(const std::string& var_name) const { return paddle2cinn_varmap_.count(var_name) > 0; } -CinnTensor CinnLaunchContext::GetCinnTensor(const std::string& arg_name) { - PADDLE_ENFORCE_GT(cinn_argument_names_.count(arg_name), 0, - platform::errors::InvalidArgument( - "Variable(%s) not found in cinn scope.", arg_name)); +CinnTensor CinnLaunchContext::GetCinnTensorOfVar(const std::string& var_name) { + PADDLE_ENFORCE_EQ( + IsVariableUsed(var_name), true, + 
platform::errors::NotFound("Variable(%s) not applied in CINN", var_name)); + const auto& arg_name = paddle2cinn_varmap_.at(var_name); return cinn_scope_->GetTensor(arg_name); } @@ -132,10 +178,13 @@ std::unordered_set CinnLaunchContext::ExtractInternalVarNames( return remain_var_names; } -void CinnLaunchContext::CheckTensorEquivalent(const std::string& var_name, - const LoDTensor& paddle_tensor, - const CinnTensor& cinn_tensor) { +void CinnLaunchContext::CheckTensorEquivalent( + const std::string& var_name, const framework::LoDTensor& paddle_tensor) { + PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, + platform::errors::InvalidArgument( + "Variable(%s) not applied in cinn", var_name)); // check dimension + auto cinn_tensor = GetCinnTensorOfVar(var_name); auto cinn_dims = phi::make_ddim(cinn_tensor->shape().data()); PADDLE_ENFORCE_EQ(paddle_tensor.dims(), cinn_dims, platform::errors::PreconditionNotMet( @@ -146,22 +195,28 @@ void CinnLaunchContext::CheckTensorEquivalent(const std::string& var_name, // TODO(CtfGo): check the underlying data type after CINN ready } +void CinnLaunchContext::InitializeArguments() { + for (auto&& arg : cinn_argument_names_) { + auto cinn_buffer = std::make_unique(); + auto cinn_tensor = GetCinnTensorOfVar(cinn2paddle_varmap_.at(arg)); + // assign dimensions with corresponding compiled tensor + cinn_buffer->resize(cinn_tensor->shape().data().data(), + cinn_tensor->shape().data().size()); + VLOG(4) << string::Sprintf( + "Append an argument:name(%s),dims(%s),argument size:(%lu)", arg, + framework::DDim(cinn_buffer->dims, cinn_buffer->dimensions).to_str(), + name2argument_.size()); + name2argument_.emplace(arg, cinn_buffer.get()); + hold_buffers_.emplace_back(std::move(cinn_buffer)); + } +} + void CinnLaunchContext::AssignExternalVariable(const std::string& var_name) { PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, platform::errors::InvalidArgument( "Variable(%s) not applied in cinn", var_name)); - const auto& cinn_arg_name = 
paddle2cinn_varmap_.at(var_name); - - const auto& paddle_tensor = cached_scope_->GetVar(var_name)->Get(); - CinnTensor cinn_tensor = GetCinnTensor(cinn_arg_name); - if (paddle_tensor.IsInitialized()) { - CheckTensorEquivalent(var_name, paddle_tensor, cinn_tensor); - } - - auto cinn_buffer = std::make_unique(); - // assign dimensions and alloc/free callback of cinn_buffer_t - cinn_buffer->resize(cinn_tensor->shape().data().data(), - cinn_tensor->shape().data().size()); + auto* cinn_buffer = GetCinnBufferOfVar(var_name); + // assign external malloc/free callbacks of cinn_buffer_t cinn_buffer->external_malloc = new std::function( [this, var_name](void* ctx, cinn_buffer_t* buffer) { auto* tensor = cached_scope_->GetVar(var_name)->GetMutable(); @@ -177,22 +232,14 @@ void CinnLaunchContext::AssignExternalVariable(const std::string& var_name) { // Do nothing return 0; }); - - return AppendArgument(cinn_arg_name, std::move(cinn_buffer)); } void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) { PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, platform::errors::InvalidArgument( "Variable(%s) not applied in cinn", var_name)); - const auto& cinn_arg_name = paddle2cinn_varmap_.at(var_name); - - CinnTensor cinn_tensor = GetCinnTensor(cinn_arg_name); - auto cinn_buffer = std::make_unique(); - // assign dimensions and alloc/free callback of cinn_buffer_t - cinn_buffer->resize(cinn_tensor->shape().data().data(), - cinn_tensor->shape().data().size()); - + auto* cinn_buffer = GetCinnBufferOfVar(var_name); + // assign external malloc/free callbacks of cinn_buffer_t cinn_buffer->external_malloc = new std::function( [this, var_name](void* ctx, cinn_buffer_t* buffer) { auto* tensor = @@ -212,30 +259,106 @@ void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) { tensor->clear(); return 0; }); - return AppendArgument(cinn_arg_name, std::move(cinn_buffer)); } -void CinnLaunchContext::AppendArgument( - const std::string& arg_name, 
std::unique_ptr&& buffer) { - name2argument_.emplace(arg_name, buffer.get()); - hold_buffers_.emplace_back(std::move(buffer)); - VLOG(4) << string::Sprintf( - "Append an argument:name(%s),dims(%s),argument size:(%lu)", arg_name, - framework::DDim(buffer->dims, buffer->dimensions).to_str(), - name2argument_.size()); +framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( + const framework::ir::Graph& graph, const CinnCompiledObject& compiled_obj) { + CinnRuntimeProgram* runtime_program = compiled_obj.runtime_program.get(); + // Step 0: Create an empty program_desc, there will be only one block + framework::ProgramDesc program_desc; + auto* block = program_desc.MutableBlock(0); + const std::vector>& instructions = + runtime_program->GetRunInstructions(); + + // build a map that links the name of a Paddle variable to its VarDesc + const std::unordered_set& nodes = graph.Nodes(); + std::unordered_map original_vardescs; + for (auto* node : nodes) { + if (node->IsVar() && node->Var()) { + original_vardescs.emplace(node->Name(), node->Var()); + } + } + + // Step 1: Create a VarDesc for each execution argument: + // (1) For those variables that are input or output variables of the + // original subgraph, there must exist an original VarDesc, so + // we copy some useful info(such as IsParameter,Persistable) + // to the new VarDesc. + // (2) For all variables, the shape, data type of their VarDescs + // are set by values of the corresponding compiled tensors, + // including the in/out variables where the equality between their tensors + // and the CINN compiled ones is verified in corresponding cinn_launch_op. 
+ for (auto&& arg : cinn_argument_names_) { + const std::string& var_name = cinn2paddle_varmap_.at(arg); + framework::VarDesc* var_desc = block->Var(var_name); + var_desc->SetType(framework::proto::VarType::LOD_TENSOR); + + auto res = original_vardescs.find(var_name); + if (res != original_vardescs.end()) { + auto* ori_desc = res->second; + var_desc->SetPersistable(ori_desc->Persistable()); + var_desc->SetIsParameter(ori_desc->IsParameter()); + } + + auto cinn_tensor = GetCinnTensorOfVar(var_name); + // TODO(CtfGo): set the corresponding data type after CINN ready, + // currently set as FP32 in default + var_desc->SetDataType(framework::proto::VarType::FP32); + var_desc->SetShape(std::vector(cinn_tensor->shape().data().begin(), + cinn_tensor->shape().data().end())); + } + + // transform names of the input or output arguments of a CINN instruction + // to the corresponding Paddle variable names, and repack them as one vector + auto trans_and_pack_args_fn = + [this](const std::vector>& cinn_args_array) { + std::vector var_names; + for (auto&& cinn_args : cinn_args_array) { + for (auto&& arg : cinn_args) { + auto res = cinn2paddle_varmap_.find(arg); + PADDLE_ENFORCE_NE( + res, cinn2paddle_varmap_.end(), + platform::errors::NotFound("Argument(%s) not found", arg)); + var_names.emplace_back(res->second); + } + } + return var_names; + }; + + // Step 2: create a VarDesc of cinn_instruction_run op for + // each CINN instruction and append it to the main block + for (auto ins_idx = 0; ins_idx < instructions.size(); ++ins_idx) { + auto* ins = instructions.at(ins_idx).get(); + auto in_args = trans_and_pack_args_fn(ins->GetInArgs()); + auto out_args = trans_and_pack_args_fn(ins->GetOutArgs()); + + auto* op_desc = block->AppendOp(); + op_desc->SetType("cinn_instruction_run"); + op_desc->SetInput(kX, in_args); + op_desc->SetOutput(kOutputs, out_args); + op_desc->SetAttr(kCachedIndex, + {static_cast(compiled_obj.cached_index)}); + op_desc->SetAttr(kInstructionIndex, 
{static_cast(ins_idx)}); + } + + return program_desc; } -const std::map& -CinnLaunchContext::FinalizeArguments() const { - // Check all execution parameters are assigned valued. - std::for_each(cinn_argument_names_.begin(), cinn_argument_names_.end(), - [this](const auto& arg_name) { - PADDLE_ENFORCE_GT( - name2argument_.count(arg_name), 0, - platform::errors::NotFound( - "Argument(%s) is missed for execution", arg_name)); - }); - return name2argument_; +ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, + framework::Scope* scope) { + if (!parallel_executor_) { + framework::details::ExecutionStrategy exec_strategy; + framework::details::BuildStrategy build_strategy; + parallel_executor_ = std::make_unique( + place, scope, exec_strategy, build_strategy, runtime_graph_.get()); + } + + // update the scope bound to an OpHandle and rebuild temporary variables + std::unordered_map scope_map = { + {parallel_executor_->GetLocalScopes().front(), scope}}; + parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map); + parallel_executor_->PrepareVariables(scope); + return parallel_executor_.get(); } cinn_buffer_t* CinnLaunchContext::GetCinnBufferOfVar( diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index 502e6a92dc1..a4d613ea618 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -21,7 +21,7 @@ #include #include #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" @@ -35,10 +35,25 @@ class Program; } // namespace cinn::hlir::framework namespace paddle { +namespace framework { +class ProgramDesc; +class Scope; +class VarDesc; + +namespace ir { +class Graph; +} // namespace ir + +namespace paddle2cinn { +class CinnCompiledObject; +} // namespace 
paddle2cinn +} // namespace framework + namespace operators::details { using CinnTensor = ::cinn::hlir::framework::Tensor; using CinnScope = ::cinn::hlir::framework::Scope; +using CinnCompiledObject = framework::paddle2cinn::CinnCompiledObject; // This class is used to cache some reusable data among repeated // executions for efficiency and it also provides easy interfaces @@ -49,58 +64,71 @@ using CinnScope = ::cinn::hlir::framework::Scope; // Variable while a CINN variable is called an Argument. class CinnLaunchContext { public: - explicit CinnLaunchContext( - const std::unordered_map& paddle2cinn_varmap, - const std::shared_ptr& cinn_scope); + explicit CinnLaunchContext(const framework::ir::Graph& graph, + const CinnCompiledObject& compiled_obj); + + // Initialize a ParallelExecutor to execute the runtime graph, + // it will be constructed in the first call, and just update + // the execution scope in the following usage. + framework::ParallelExecutor* InitializePE(const platform::Place& place, + framework::Scope* scope); // explicitly update several environment variables captured // by callback of execution arguments void UpdateCapturedEnv(const framework::Scope& scope, const platform::Place& place); - // Return whether execution arguments has been initialized - bool IsArgumentsInitialized() const; - // Return whether a Paddle variable used in cinn execution bool IsVariableUsed(const std::string& var_name) const; - // Assign tensor buffer to input or output variables - void AssignExternalVariable(const std::string& var_name); - - // Assign tensor buffer to internal variables - void AssignInternalVariable(const std::string& var_name); + // Check the equality in type and dimension between the tensor + // in Paddle and the compiled tensor returned by CINN of a same variable + void CheckTensorEquivalent(const std::string& var_name, + const framework::LoDTensor& paddle_tensor); - // Extract internal variable names from all applied variables - // in execution by 
excluding the input and output variables - std::unordered_set ExtractInternalVarNames( - const std::vector& input_var_names, - const std::vector& output_var_names); + // Return internal variable names list + const std::unordered_set& GetInternalVarNames() const { + return internal_var_names_; + } // Finalize all execution arguments and return the name->argument map - const std::map& FinalizeArguments() const; + const std::map& FinalizeArguments() const { + return name2argument_; + } // Return the cinn_buffer_t* of a specific variable cinn_buffer_t* GetCinnBufferOfVar(const std::string& var_name); private: - // Get CinnTensor with CINN argument name - CinnTensor GetCinnTensor(const std::string& arg_name); + // Get corresponding compiled tensor of a Paddle variable name + CinnTensor GetCinnTensorOfVar(const std::string& var_name); + // Build the name maps of paddle->cinn and cinn->paddle // in reverse for all variables used in cinn execution void BuildVarNameMap( const std::unordered_map& compiled_varmap, const std::unordered_set& argument_names); - // Check whether the tensor in Paddle and the compiled - // tensor returned by CINN of a same variable - // are equivalent in type and dimension - void CheckTensorEquivalent(const std::string& var_name, - const framework::LoDTensor& paddle_tensor, - const CinnTensor& cinn_tensor); + // Extract internal variable names from all applied variables + // in execution by excluding the input and output variables + std::unordered_set ExtractInternalVarNames( + const std::vector& input_var_names, + const std::vector& output_var_names); + + // Initialize each execution argument with a cinn_buffer_t + void InitializeArguments(); - // Append an argument with (cinn name)->(cinn_buffer_t) pair - void AppendArgument(const std::string& arg_name, - std::unique_ptr&& buffer); + // Assign tensor buffer to input or output variables + void AssignExternalVariable(const std::string& var_name); + + // Assign tensor buffer to internal variables + 
void AssignInternalVariable(const std::string& var_name); + + // Construct a Paddle ProgramDesc with the CINN runtime + // instructions included in the compiled CINN Program + framework::ProgramDesc BuildCompiledProgram( + const framework::ir::Graph& graph, + const CinnCompiledObject& compiled_obj); private: const framework::Scope* cached_scope_ = nullptr; @@ -111,16 +139,22 @@ class CinnLaunchContext { std::unordered_map paddle2cinn_varmap_; // a name map from cinn execution arguments to paddle variables std::unordered_map cinn2paddle_varmap_; + // a list of internal variable names in Paddle + std::unordered_set internal_var_names_; // the names of the cinn arguments used in compiled executable program std::unordered_set cinn_argument_names_; // the variable scope compiled from cinn const std::shared_ptr cinn_scope_; + // the ir::Graph object converted from the program compiled by CINN + std::unique_ptr runtime_graph_; + // a ParallelExecutor to execute the runtime graph + std::unique_ptr parallel_executor_; + // because a cinn_pod_value_t does not own a cinn_buffer_t object, // an extra storage is necessary to keep those objects and they can // not be released until the runtime program finishes execution. std::vector> hold_buffers_; - // this map saves all execution arguments with their cinn names as key, // and it is passed to the Execute interface of a cinn runtime program. std::map name2argument_; diff --git a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc index 58a9c5db712..4976a59d1dd 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc @@ -13,87 +13,229 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/cinn/cinn_launch_context.h" +#include +#include +#include +#include "cinn/common/target.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/instruction.h" #include "cinn/hlir/framework/scope.h" #include "cinn/hlir/framework/tensor.h" #include "cinn/runtime/cinn_runtime.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" #include "paddle/phi/core/ddim.h" +USE_OP(cinn_instruction_run); namespace paddle { namespace operators::details { -using LoDTensor = framework::LoDTensor; +using framework::OpDesc; +using framework::ProgramDesc; +using framework::LoDTensor; +using framework::ir::Graph; +using framework::ParallelExecutor; +using framework::paddle2cinn::Name2VarInfoMap; using CinnShape = ::cinn::hlir::framework::Shape; +using CinnInstruction = ::cinn::hlir::framework::Instruction; +using CinnRuntimeProgram = ::cinn::hlir::framework::Program; -std::unique_ptr CreateDefaultLaunchContext() { +const Graph& InitDefaultSubgraph() { static std::once_flag initialized; - static std::unordered_map paddle2cinn_varmap; - static std::shared_ptr cinn_scope; - std::call_once(initialized, [&paddle2cinn_varmap, &cinn_scope]() { - auto& scope = cinn_scope; - scope = std::make_shared(); + static std::unique_ptr graph; + std::call_once(initialized, [&]() { + ProgramDesc program; + auto* block = program.MutableBlock(0); + auto* var1 = block->Var("var1"); + var1->SetPersistable(true); + block->Var("var2"); + block->Var("var3"); + block->Var("var4"); + auto* var5 = block->Var("var5"); + var5->SetIsParameter(true); + auto add_op = std::unique_ptr( + new 
OpDesc("elementwise_add", {{"X", {"var1"}}, {"Y", {"var2"}}}, + {{"Out", {"var3"}}}, {})); + block->AppendAllocatedOp(std::move(add_op)); + auto mul_op = std::unique_ptr(new OpDesc( + "mul", {{"X", {"var1"}}, {"Y", {"var2"}}}, {{"Out", {"var4"}}}, {})); + block->AppendAllocatedOp(std::move(mul_op)); + auto res_op = std::unique_ptr( + new OpDesc("elementwise_add", {{"X", {"var3"}}, {"Y", {"var4"}}}, + {{"Out", {"var5"}}}, {})); + block->AppendAllocatedOp(std::move(res_op)); + graph = std::make_unique(program); + + graph->Set>( + framework::paddle2cinn::kInputVars, + new std::vector({"var1", "var2"})); + graph->Set>( + framework::paddle2cinn::kInternalVars, + new std::vector({"var3", "var4"})); + graph->Set>( + framework::paddle2cinn::kOutputVars, + new std::vector({"var5"})); + graph->GetOrInit( + framework::paddle2cinn::kMemOptVarInfoFromMainGraph); + }); + return *graph.get(); +} +CinnCompiledObject* InitDefaultCompiledObject() { + static std::once_flag initialized; + static auto compiled_obj = std::make_unique(); + std::call_once(initialized, [result = compiled_obj.get()]() { + auto& scope = result->scope; + scope = std::make_shared(); scope->Var("cinn_var1"); scope->GetTensor("cinn_var1")->Resize(CinnShape({3, 4})); scope->Var("cinn_var2"); scope->GetTensor("cinn_var2")->Resize(CinnShape({6, 7, 8})); scope->Var("cinn_var3"); scope->GetTensor("cinn_var3")->Resize(CinnShape({10, 16})); + scope->Var("cinn_var4"); + scope->GetTensor("cinn_var4")->Resize(CinnShape({10, 16})); + scope->Var("cinn_var5"); + scope->GetTensor("cinn_var5")->Resize(CinnShape({10, 16})); - paddle2cinn_varmap = { - {"var1", "cinn_var1"}, {"var3", "cinn_var3"}, {"var4", "cinn_var4"}}; + // input variables: var1, var2; output: var5 + // internal variables: var3 and var4, here var3 is retained + // in result map, so the name will be used neither cinn_var3 + auto& paddle2cinn_varmap = result->paddle2cinn_varmap; + paddle2cinn_varmap = {{"var1", "cinn_var1"}, + {"var2", "cinn_var2"}, + {"var3", 
"cinn_var3"}, + {"var5", "cinn_var5"}}; + + auto& runtime_program = result->runtime_program; + std::vector> instructions; + instructions.emplace_back(new CinnInstruction( + cinn::common::DefaultHostTarget(), scope.get(), + {"cinn_var1", "cinn_var2"}, {"cinn_var3"}, "elementwise_add")); + instructions.emplace_back( + new CinnInstruction(cinn::common::DefaultHostTarget(), scope.get(), + {"cinn_var1", "cinn_var2"}, {"cinn_var4"}, "mul")); + instructions.emplace_back(new CinnInstruction( + cinn::common::DefaultHostTarget(), scope.get(), + {"cinn_var3", "cinn_var4"}, {"cinn_var5"}, "elementwise_add")); + runtime_program = + std::make_unique(scope, std::move(instructions)); + result->cached_index = 110; }); - return std::make_unique(paddle2cinn_varmap, cinn_scope); + return compiled_obj.get(); } -TEST(CinnLaunchContextTest, TestBasic) { - auto launch_context = CreateDefaultLaunchContext(); - // test IsVariableUsed +class CinnLaunchContextTest : public ::testing::Test { + public: + std::unique_ptr launch_context; + CinnCompiledObject* compiled_obj; + + void SetUp() override { + compiled_obj = InitDefaultCompiledObject(); + launch_context = std::make_unique(InitDefaultSubgraph(), + *compiled_obj); + } +}; + +TEST_F(CinnLaunchContextTest, TestConstructResult) { ASSERT_EQ(launch_context->IsVariableUsed("var1"), true); + ASSERT_EQ(launch_context->IsVariableUsed("var2"), true); + ASSERT_EQ(launch_context->IsVariableUsed("var3"), true); ASSERT_EQ(launch_context->IsVariableUsed("var4"), false); - // test UpdateCapturedEnv - platform::CPUPlace place; - framework::Scope scope; - ASSERT_NO_THROW(launch_context->UpdateCapturedEnv(scope, place)); - // test IsArgumentsInitialized - ASSERT_FALSE(launch_context->IsArgumentsInitialized()); + ASSERT_EQ(launch_context->IsVariableUsed("var5"), true); + + // check result of ExtractInternalVarNames + ASSERT_EQ(launch_context->GetInternalVarNames(), + std::unordered_set({"var3", "cinn_var4"})); + + // check completeness of arguments list, and 
also check + // the two name maps of the paddle->cinn and the reverse one + // through the IsVariableUsed interface + auto&& arguments = launch_context->FinalizeArguments(); + ASSERT_EQ(arguments.size(), 5); + auto check_argument_fn = [&arguments, this](const std::string& var_name, + const std::string& arg_name) { + ASSERT_EQ(launch_context->IsVariableUsed(var_name), true); + ASSERT_NO_THROW(launch_context->GetCinnBufferOfVar(var_name)); + ASSERT_GT(arguments.count(arg_name), 0); + EXPECT_EQ(launch_context->GetCinnBufferOfVar(var_name), + static_cast(arguments.at(arg_name))); + auto* buffer = launch_context->GetCinnBufferOfVar(var_name); + auto&& scope = compiled_obj->scope; + ASSERT_EQ(framework::DDim(buffer->dims, buffer->dimensions), + phi::make_ddim(scope->GetTensor(arg_name)->shape().data())); + }; + check_argument_fn("var1", "cinn_var1"); + check_argument_fn("var2", "cinn_var2"); + check_argument_fn("var3", "cinn_var3"); + check_argument_fn("cinn_var4", "cinn_var4"); + check_argument_fn("var5", "cinn_var5"); } -TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) { +TEST_F(CinnLaunchContextTest, TestCheckTensorEquivalent) { platform::CPUPlace place; framework::Scope scope; - auto launch_context = CreateDefaultLaunchContext(); launch_context->UpdateCapturedEnv(scope, place); auto* tensor1 = scope.Var("var1")->GetMutable(); // CheckTensorEquivalent: tensor dimension not equivalent tensor1->mutable_data(phi::make_ddim({3, 5}), place); - ASSERT_THROW(launch_context->AssignExternalVariable("var1"), + ASSERT_THROW(launch_context->CheckTensorEquivalent("var1", *tensor1), paddle::platform::EnforceNotMet); } -TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) { +TEST_F(CinnLaunchContextTest, TestBuildCompiledProgram) { platform::CPUPlace place; framework::Scope scope; - auto launch_context = CreateDefaultLaunchContext(); - launch_context->UpdateCapturedEnv(scope, place); - auto* tensor4 = scope.Var("var4")->GetMutable(); + ParallelExecutor* pe = nullptr; + 
ASSERT_NO_THROW((pe = launch_context->InitializePE(place, &scope))); - // not used - ASSERT_THROW(launch_context->AssignExternalVariable("var4"), - paddle::platform::EnforceNotMet); - // not found - ASSERT_THROW(launch_context->AssignInternalVariable("cinn_var4"), - paddle::platform::EnforceNotMet); + // check details of program built by compiled instructions + const ProgramDesc& program = pe->Graph().OriginProgram(); + ASSERT_EQ(program.Size(), 1); + const auto& block = program.Block(0); + // vars + std::set var_names = block.LocalVarNames(); + ASSERT_EQ(var_names.size(), 5); + for (auto&& var_name : var_names) { + auto* var = block.FindVar(var_name); + ASSERT_NE(var, nullptr); + auto* buffer = launch_context->GetCinnBufferOfVar(var_name); + ASSERT_EQ(framework::DDim(buffer->dims, buffer->dimensions), + phi::make_ddim(var->GetShape())); + } + ASSERT_TRUE(block.FindVar("var1")->Persistable()); + ASSERT_FALSE(block.FindVar("var5")->Persistable()); + ASSERT_TRUE(block.FindVar("var5")->IsParameter()); + ASSERT_FALSE(block.FindVar("var1")->IsParameter()); + // ops + ASSERT_EQ(block.OpSize(), 3); + auto* op1 = block.Op(0); + ASSERT_EQ(op1->Type(), "cinn_instruction_run"); + ASSERT_EQ(op1->Input(kX), std::vector({"var1", "var2"})); + ASSERT_EQ(op1->Output(kOutputs), std::vector({"var3"})); + ASSERT_EQ(op1->GetAttrIfExists(kCachedIndex), 110); + ASSERT_EQ(op1->GetAttrIfExists(kInstructionIndex), 0); + auto* op3 = block.Op(2); + ASSERT_EQ(op3->Type(), "cinn_instruction_run"); + ASSERT_EQ(op3->Input(kX), std::vector({"var3", "cinn_var4"})); + ASSERT_EQ(op3->Output(kOutputs), std::vector({"var5"})); + ASSERT_EQ(op3->GetAttrIfExists(kCachedIndex), 110); + ASSERT_EQ(op3->GetAttrIfExists(kInstructionIndex), 2); } -TEST(CinnLaunchContextTest, TestAppendArgument) { - platform::CPUPlace cpu_place; - platform::Place place(cpu_place); +// DEPRECATED(CtfGo): following test of callback assignment +// will be deprecated after we switch to pe +TEST_F(CinnLaunchContextTest, 
TestCallbackAssignment) { + platform::CPUPlace place; framework::Scope scope; - auto launch_context = CreateDefaultLaunchContext(); launch_context->UpdateCapturedEnv(scope, place); // assign external variables @@ -101,33 +243,8 @@ TEST(CinnLaunchContextTest, TestAppendArgument) { float* data1 = tensor1->mutable_data(phi::make_ddim({3, 4}), place); data1[0] = 9.99f; data1[10] = 19.99f; - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1")); - - auto* tensor3 = scope.Var("var3")->GetMutable(); - tensor3->mutable_data(phi::make_ddim({10, 16}), place); - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3")); - - // FinalizeArguments missed check - ASSERT_THROW(launch_context->FinalizeArguments(), - paddle::platform::EnforceNotMet); - // test get internal variables - auto internal_variable_names = - launch_context->ExtractInternalVarNames({"var1"}, {"var3"}); - ASSERT_EQ(internal_variable_names.size(), 1); - EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2"); - - auto* tensor2 = scope.Var("var2")->GetMutable(); - tensor2->mutable_data(phi::make_ddim({6, 7, 8}), place); - ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2")); - // check argument is set correctly and alloc/free callbacks work well - auto name2argument = launch_context->FinalizeArguments(); - ASSERT_EQ(name2argument.size(), 3); - ASSERT_EQ(name2argument.count("cinn_var1"), 1); - ASSERT_TRUE(launch_context->IsArgumentsInitialized()); - - auto* cinn_buffer = - static_cast(name2argument.at("cinn_var1")); + auto* cinn_buffer = launch_context->GetCinnBufferOfVar("var1"); ASSERT_EQ(cinn_buffer->memory, nullptr); cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer); ASSERT_NE(cinn_buffer->memory, nullptr); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 1db9f2f25e2..cf3b98c6679 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -105,63 
+105,29 @@ class CinnLaunchOpKernel : public framework::OpKernel { auto* launch_context = cinn_compiled_object.launch_context.get(); // Step 3. Prepare arguments needed for the compiled executable program. launch_context->UpdateCapturedEnv(scope, place); - if (!launch_context->IsArgumentsInitialized()) { - VLOG(4) << "CinnLaunchOp prepare arguments"; - - // 3.1 Prepare input variables: tensors of input variables have - // been initialized before graph compiled, just check the - // equiality between tensors of paddle and cinn. - for (const auto& var_name : input_no_need_buffer_variable_names) { - // the input variable declared as 'no need buffer' can not be used - PADDLE_ENFORCE_EQ( - launch_context->IsVariableUsed(var_name), false, - platform::errors::InvalidArgument( - "Input variable(%s) should not be used by cinn in execution", - var_name)); - } - - for (const auto& var_name : input_x_variable_names) { - // some input variables don't need for cinn because they are - // eliminated by optimized passes or some cinn operators use - // less variables - if (!launch_context->IsVariableUsed(var_name)) { - VLOG(4) << "Input variable" << var_name << " not used by cinn"; - continue; - } - - launch_context->AssignExternalVariable(var_name); - } - - // 3.2 Prepare output variables: all output variables should - // be initialized and allocated buffer before - // the runtime program start execution, the compilation result - // includes details of their buffer assginment and we use that to - // allocate space in Paddle. For those variables allocated yet, - // like persistable parameters, just check the equiality between - // Paddle allocation and CINN buffer assginment. 
- auto output_variable_names = ctx.OutputNames(kOutputs); - for (const auto var_name : output_variable_names) { - PADDLE_ENFORCE_EQ( - launch_context->IsVariableUsed(var_name), true, - platform::errors::InvalidArgument( - "Output variable(%s) not used by cinn", var_name)); - - launch_context->AssignExternalVariable(var_name); - } - - // 3.3 Prepare internal or temporary variables: Create a temporary - // scope to keep internal variables within graph or temporary - // variables needed by the compiled runtime program in addition. - // Here we directly use the names from CinnScope as Paddle variable - // names, because they will not be used outside the graph - // and should be destructed after computation finished. - auto internal_variable_names = launch_context->ExtractInternalVarNames( - input_x_variable_names, output_variable_names); - for (const auto& var_name : internal_variable_names) { - launch_context->AssignInternalVariable(var_name); + // 3.1 Input variables: tensors of input variables have + // been initialized before the graph was compiled, just check the + // equality between tensors of paddle and cinn. + for (const auto& var_name : input_x_variable_names) { + // some input variables aren't needed by cinn because they are + // eliminated by optimized passes or some cinn operators use + // less variables + if (!launch_context->IsVariableUsed(var_name)) { + VLOG(4) << "Input variable" << var_name << " not used by cinn"; + continue; } + launch_context->CheckTensorEquivalent(var_name, + *inputs_name2tensor.at(var_name)); } + // 3.2 Output variables: the output variables will be initialized + // and allocated buffer in callbacks which are defined in the + // external_malloc/free interface of cinn_buffer_t + // in their corresponding arguments. + // 3.3 Internal variables: A temporary scope is created in + // UpdateCapturedEnv to keep the internal variables and + // they are also initialized through callbacks + // Step 4. 
Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. details::SetCinnRuntimeFlags(); diff --git a/paddle/fluid/operators/cinn/test_helper.h b/paddle/fluid/operators/cinn/test_helper.h index eb3d725d554..9720a5309fa 100644 --- a/paddle/fluid/operators/cinn/test_helper.h +++ b/paddle/fluid/operators/cinn/test_helper.h @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" #include "paddle/fluid/framework/scope.h" #include "paddle/phi/core/ddim.h" @@ -31,6 +32,7 @@ using LoDTensor = framework::LoDTensor; using Variable = framework::Variable; using Graph = framework::ir::Graph; using Node = framework::ir::Node; +using framework::paddle2cinn::Name2VarInfoMap; std::unique_ptr CreateOnlyElementwiseAddGraph( const std::string& x_name, const std::string& y_name, @@ -71,6 +73,16 @@ std::unique_ptr CreateOnlyElementwiseAddGraph( y_node->inputs = {feed_op_node_y}; y_node->outputs = {elementwise_add_node}; out_node->inputs = {elementwise_add_node}; + // set necessary attributes + g->Set>( + framework::paddle2cinn::kInputVars, + new std::vector({x_name, y_name})); + g->Set>(framework::paddle2cinn::kInternalVars, + new std::vector({})); + g->Set>(framework::paddle2cinn::kOutputVars, + new std::vector({out_name})); + g->GetOrInit( + framework::paddle2cinn::kMemOptVarInfoFromMainGraph); return g; } -- GitLab