From 2d4fe1637bdab18b203e2274c9fafdc2e689cd48 Mon Sep 17 00:00:00 2001
From: TeFeng Chen <ctfeng66@163.com>
Date: Mon, 18 Apr 2022 10:24:58 +0800
Subject: [PATCH] cinn_launch_op: optimize the overhead of preparing variables
 before executing cinn compiled program (#41777)

* optimize preparation overhead before executing cinn compiled program

* update code notes

* fix flag annotation

* add a flag of auto-tune feature beforehand
---
 .../framework/paddle2cinn/cinn_compiler.cc    | 34 ++++++++
 .../framework/paddle2cinn/cinn_compiler.h     |  7 ++
 paddle/fluid/operators/cinn/CMakeLists.txt    |  2 +-
 .../operators/cinn/cinn_launch_context.cc     | 25 +++---
 paddle/fluid/operators/cinn/cinn_launch_op.h  | 40 ++++------
 .../operators/cinn/cinn_launch_op_test.cc     | 77 ++++++++++++-------
 paddle/fluid/platform/flags.cc                | 26 +++++++
 7 files changed, 151 insertions(+), 60 deletions(-)
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
index 6cde65f6ab5..83a5b6f8213 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
@@ -31,11 +31,13 @@
 #include "cinn/hlir/framework/graph_compiler.h"
 #include "cinn/hlir/framework/pass.h"
 #include "cinn/hlir/pass/use_pass.h"
+#include "gflags/gflags.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h"
 #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -45,6 +47,7 @@
 #include "paddle/fluid/string/string_helper.h"
 #include "paddle/phi/core/utils/rw_lock.h"
 
+DECLARE_bool(enable_pe_launch_cinn);
 namespace paddle {
 namespace framework {
 namespace paddle2cinn {
@@ -217,6 +220,33 @@ void CinnCompiler::Clear() {
   real_compiled_num_.store(0);
 }
 
+void CinnCompiler::CheckCompiledValid(
+    const ir::Graph& graph,
+    const std::map<std::string, const LoDTensor*>& input_tensors,
+    const CinnCompiledObject& compiled_obj) const {
+  const auto& input_var_names = graph.Get<std::vector<std::string>>(kInputVars);
+  const auto& output_var_names =
+      graph.Get<std::vector<std::string>>(kOutputVars);
+  auto* launch_context = compiled_obj.launch_context.get();
+  // 1. check that every output variable is assigned by the compiled program
+  for (const auto& var_name : output_var_names) {
+    PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true,
+                      platform::errors::PreconditionNotMet(
+                          "Variable(%s) not applied in CINN", var_name));
+  }
+  // 2. check that every used input variable was correctly deduced by CINN
+  for (const auto& var_name : input_var_names) {
+    // some input variables are unused by CINN because its optimization
+    // passes eliminated them or its operators need fewer inputs; skip them
+    if (!launch_context->IsVariableUsed(var_name)) {
+      VLOG(4) << "Input variable " << var_name << " not used by cinn";
+      continue;
+    }
+    launch_context->CheckTensorEquivalent(var_name,
+                                          *input_tensors.at(var_name));
+  }
+}
+
 std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
     const ir::Graph& graph,
     const std::map<std::string, const LoDTensor*>& input_tensors,
@@ -244,6 +274,9 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
       std::make_unique<GraphCompiler>(target, scope, cinn_graph);
   GraphCompiler::CompileOptions options;
   options.with_instantiate_variables = false;
+  if (!FLAGS_enable_pe_launch_cinn) {
+    options.with_buffer_handle_instruction_inserted = true;
+  }
   auto compiled_res =
       graph_compiler->Build(options, std::move(fetch_ids), stream);
   auto compiled_obj = std::make_unique<CinnCompiledObject>();
@@ -254,6 +287,7 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
   compiled_obj->launch_context =
       std::make_unique<operators::details::CinnLaunchContext>(graph,
                                                               *compiled_obj);
+  CheckCompiledValid(graph, input_tensors, *compiled_obj);
   return compiled_obj;
 }
 
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
index 5fa54b302a3..cf17e68156b 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
@@ -103,6 +103,13 @@ class CinnCompiler {
       const ::cinn::common::Target& target, std::int64_t compiled_num,
       void* stream = nullptr) const;
 
+  // Check that a compiled result is valid: the external (input and output)
+  // variables of the subgraph must be consistent with the compiled object.
+  void CheckCompiledValid(
+      const ir::Graph& graph,
+      const std::map<std::string, const LoDTensor*>& input_tensors,
+      const CinnCompiledObject& compiled_obj) const;
+
   std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_;
   std::unordered_map<CinnCacheKeyByAddress, std::int64_t, CinnCacheKey::Hash>
       cache_by_address_;
diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt
index 2406445e6cf..862a0d04fbd 100644
--- a/paddle/fluid/operators/cinn/CMakeLists.txt
+++ b/paddle/fluid/operators/cinn/CMakeLists.txt
@@ -3,7 +3,7 @@ include(operators)
 cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context)
 cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor transform_type cinn)
 
-SET(CINN_OP_DEPS parallel_executor string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type)
+SET(CINN_OP_DEPS parallel_executor string_helper variable_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type)
 register_operators(DEPS ${CINN_OP_DEPS})
 
 if (WITH_TESTING)
diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc
index b445527322f..a660d59fb4c 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_context.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc
@@ -33,6 +33,7 @@
 #include "paddle/fluid/framework/paddle2cinn/transform_type.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/cinn/cinn_op_helper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
@@ -69,13 +70,6 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph,
       graph.Get<std::vector<std::string>>(framework::paddle2cinn::kOutputVars);
   internal_var_names_ =
       ExtractInternalVarNames(input_var_names, output_var_names);
-  // check completeness of output variables in compiled result
-  for (auto&& var_name : output_var_names) {
-    PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true,
-                      platform::errors::PreconditionNotMet(
-                          "Variable(%s) not applied in CINN", var_name));
-  }
-
   // initialize all execution arguments
   InitializeArguments();
   // DEPRECATED(CtfGo): following callback assignment will be deprecated soon
@@ -235,7 +229,7 @@ void CinnLaunchContext::InitializeArguments() {
                         cinn_tensor->shape().data().size());
     cinn_buffer->type = cinn::runtime::ToRuntimeType(cinn_tensor->type());
     VLOG(4) << string::Sprintf(
-        "Append an argument:name(%s),dims(%s),type(%s)",
+        "Append an argument:name(%s),dims(%s),type(%s)", arg,
         framework::DDim(cinn_buffer->dims, cinn_buffer->dimensions).to_str(),
         cinn_tensor->type());
     name2argument_.emplace(arg, cinn_buffer.get());
@@ -400,7 +394,20 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place,
   std::unordered_map<Scope*, Scope*> scope_map = {
       {parallel_executor_->GetLocalScopes().front(), scope}};
   parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map);
-  parallel_executor_->PrepareVariables(scope);
+  // instead of using the PrepareVariables function of ParallelExecutor to
+  // initialize all variables, here we only initialize internal variables
+  // because external variables are already included in parent scope.
+  for (auto&& var_name : internal_var_names_) {
+    auto* var = scope->FindVar(var_name);
+    if (var != nullptr) {
+      VLOG(5) << "internal variable:" << var_name
+              << " has been initialized beforehand in global scope, skipped.";
+      continue;
+    }
+    framework::InitializeVariable(scope->Var(var_name),
+                                  framework::proto::VarType::LOD_TENSOR);
+  }
+
   for (auto&& var_name : initialized_beforehand_vars_) {
     auto* var = scope->GetVar(var_name);
     auto* buffer = GetCinnBufferOfVar(var_name);
diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h
index 5263aae03ed..024bf2bceb3 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_op.h
+++ b/paddle/fluid/operators/cinn/cinn_launch_op.h
@@ -18,7 +18,9 @@
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+
 #include "cinn/common/target.h"
+#include "gflags/gflags.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -26,6 +28,7 @@
 #include "paddle/fluid/operators/cinn/cinn_launch_context.h"
 #include "paddle/fluid/operators/cinn/cinn_op_helper.h"
 
+DECLARE_bool(enable_pe_launch_cinn);
 namespace paddle {
 namespace operators {
 
@@ -101,34 +104,23 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
     const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile(
         compilation_key, inputs_name2tensor, target, stream);
     details::DebugCinnCompiledResult(cinn_compiled_object);
-
     auto* launch_context = cinn_compiled_object.launch_context.get();
-    // Step 3. check the computational consistency of the subgraph
-    //         before and after the compilation
-    // 3.1 Input variables: tensors of input variables have
-    //     been initialized before graph compiled, just check the
-    //     equiality between tensors of paddle and cinn.
-    for (const auto& var_name : input_x_variable_names) {
-      // some input variables don't need for cinn because they are
-      // eliminated by optimized passes or some cinn operators use
-      // less variables
-      if (!launch_context->IsVariableUsed(var_name)) {
-        VLOG(4) << "Input variable" << var_name << " not used by cinn";
-        continue;
-      }
-      launch_context->CheckTensorEquivalent(var_name,
-                                            *inputs_name2tensor.at(var_name));
-    }
 
-    // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
+    // Step 3. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
     details::SetCinnRuntimeFlags();
 
-    // Step 5. use PE to execute the compiled CINN instructions
-    //         in nodes of the runtime graph
-    VLOG(4) << "Execute the runtime graph by PE";
-    framework::Scope& exec_scope = scope.NewScope();
-    auto* pe = launch_context->InitializePE(place, &exec_scope);
-    pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
+    // Step 4. Execute the compiled CINN instructions by a PE or
+    //         by the CINN compiled program in sequential order
+    if (FLAGS_enable_pe_launch_cinn) {
+      VLOG(4) << "Execute the runtime graph by PE";
+      framework::Scope& exec_scope = scope.NewScope();
+      auto* pe = launch_context->InitializePE(place, &exec_scope);
+      pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
+    } else {
+      VLOG(4) << "Execute the compiled executable program";
+      launch_context->UpdateCapturedEnv(scope, place);
+      LaunchCinnExecution(cinn_compiled_object, *launch_context, stream);
+    }
     VLOG(4) << "CinnLaunchOp launch execution done.";
   }
 };
diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
index 585f1caabed..3e363c56eb9 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
@@ -32,6 +32,7 @@ USE_OP(cinn_launch);
 USE_OP(cinn_instruction_run);
 USE_OP_ITSELF(elementwise_add);
 DECLARE_double(eager_delete_tensor_gb);
+DECLARE_bool(enable_pe_launch_cinn);
 
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
 #ifdef PADDLE_WITH_CUDA
@@ -42,43 +43,67 @@ namespace paddle::operators {
 
 using framework::paddle2cinn::CinnCompiler;
 
-TEST(CinnLaunchOpTest, TestWithElementwiseAdd) {
-  paddle::framework::InitDevices();
-  platform::SetNumThreads(1);
-  // cache test graph into CinnCompiler
-  const std::string& test_op_out_name = "cinn_launch_op_out";
-  const std::string& add_op_out_name = "add_op_out";
-  auto compilation_key = CinnCompiler::GetInstance()->AddGraph(
-      CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name));
-
-  // create cinn_launch_op and elementwise_add op
-  auto cinn_launch_op = paddle::framework::OpRegistry::CreateOp(
-      "cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}},
-      {{"compilation_key", compilation_key}});
-  auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp(
-      "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}},
-      {{"Out", {add_op_out_name}}}, {{}});
-
-  // Run ops and check the computation results
-  auto run_and_check_fn = [&](const platform::Place& place) {
+class TestCinnLaunchOp : public ::testing::Test {
+ public:
+  const char* test_op_out_name = "cinn_launch_op_out";
+  const char* add_op_out_name = "add_op_out";
+  std::unique_ptr<framework::OperatorBase> cinn_launch_op;
+  std::unique_ptr<framework::OperatorBase> elementwise_add_op;
+
+  void SetUp() override {
+    paddle::framework::InitDevices();
+    platform::SetNumThreads(1);
+    // cache test graph into CinnCompiler
+    auto compilation_key = CinnCompiler::GetInstance()->AddGraph(
+        CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name));
+
+    // create cinn_launch_op and elementwise_add op
+    cinn_launch_op = paddle::framework::OpRegistry::CreateOp(
+        "cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}},
+        {{"compilation_key", compilation_key}});
+    elementwise_add_op = paddle::framework::OpRegistry::CreateOp(
+        "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}},
+        {{"Out", {add_op_out_name}}}, {{}});
+  }
+
+  void RunAndCheck(const platform::Place& place) {
+    // run both ops into distinct outputs, then compare their results
     framework::Scope scope;
     InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, &scope);
     scope.Var(test_op_out_name)->GetMutable<LoDTensor>();
     scope.Var(add_op_out_name)->GetMutable<LoDTensor>();
-    cinn_launch_op->Run(scope, place);
     elementwise_add_op->Run(scope, place);
+    cinn_launch_op->Run(scope, place);
     CompareOpResult<float>(scope.GetVar(test_op_out_name),
                            scope.GetVar(add_op_out_name));
-  };
-  FLAGS_eager_delete_tensor_gb = -1;
+  }
+
+  void TearDown() override { CinnCompiler::GetInstance()->Clear(); }
+};
 
+TEST_F(TestCinnLaunchOp, TestRunInstructionByPE) {
+  FLAGS_enable_pe_launch_cinn = true;  // do not depend on test order
   // CPU
-  run_and_check_fn(platform::CPUPlace());
-  run_and_check_fn(platform::CPUPlace());
+  RunAndCheck(platform::CPUPlace());
+  RunAndCheck(platform::CPUPlace());  // second run checks the cache logic
+#ifdef PADDLE_WITH_CUDA
+  // GPU
+  RunAndCheck(platform::CUDAPlace());
+  RunAndCheck(platform::CUDAPlace());
+#endif
+}
+
+TEST_F(TestCinnLaunchOp, TestRunInstructionByCinnProgram) {
+  // set FLAGS_enable_pe_launch_cinn=false so the compiled program is run by
+  // CINN's default scheduler; NOTE: the flag is not restored afterwards
+  FLAGS_enable_pe_launch_cinn = false;
+
+  RunAndCheck(platform::CPUPlace());
+  RunAndCheck(platform::CPUPlace());
 #ifdef PADDLE_WITH_CUDA
   // GPU
-  run_and_check_fn(platform::CUDAPlace());
-  run_and_check_fn(platform::CUDAPlace());
+  RunAndCheck(platform::CUDAPlace());
+  RunAndCheck(platform::CUDAPlace());
 #endif
 }
 
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index a43eaa41cfe..f89452853b4 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -751,6 +751,32 @@ PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops, "",
  */
 PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops, "",
                               "It controls the cinn op subset to be not used.");
+
+/*
+ * CINN related FLAG
+ * Name: FLAGS_enable_pe_launch_cinn
+ * Since Version: 2.3
+ * Value Range: bool, default=true
+ * Example: FLAGS_enable_pe_launch_cinn=true executes the CINN compiled
+ * instructions of a paddle graph with ParallelExecutor; false executes the
+ * CINN compiled runtime program in sequential order instead.
+ */
+PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn, true,
+                            "It controls whether to execute cinn compiled "
+                            "program with ParallelExecutor");
+
+/*
+ * CINN related FLAG
+ * Name: FLAGS_enable_cinn_auto_tune
+ * Since Version: 2.3
+ * Value Range: bool, default=false
+ * Example: FLAGS_enable_cinn_auto_tune=true uses CINN with its
+ * auto-tune feature enabled; false (the default) leaves it disabled.
+ */
+PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, false,
+                            "It controls whether to use cinn with "
+                            "its auto-tune feature enabled");
+
 #endif
 
 DEFINE_int32(record_pool_max_size, 2000000,
-- 
GitLab