cinn_launch_op: optimize the overhead of preparing variables before executing...

cinn_launch_op: optimize the overhead of preparing variables before executing cinn compiled program (#41777) (#41910) cherry-pick #41777 * optimize preparation overhead before executing cinn compiled program

cinn_launch_op: optimize the overhead of preparing variables before executing...
cinn_launch_op: optimize the overhead of preparing variables before executing cinn compiled program (#41777) (#41910) cherry-pick #41777 * optimize preparation overhead before executing cinn compiled program
dab7dfbf · TeFeng Chen · GitHub · 0fb06e46 · dab7dfbf · dab7dfbf
7 changed files
--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
@@ -31,11 +31,13 @@
 #include "cinn/hlir/framework/graph_compiler.h"
 #include "cinn/hlir/framework/pass.h"
 #include "cinn/hlir/pass/use_pass.h"
+#include "gflags/gflags.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h"
 #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -45,6 +47,7 @@
 #include "paddle/fluid/string/string_helper.h"
 #include "paddle/phi/core/utils/rw_lock.h"
+DECLARE_bool(enable_pe_launch_cinn);
 namespace paddle {
 namespace framework {
 namespace paddle2cinn {
@@ -217,6 +220,33 @@ void CinnCompiler::Clear() {
  real_compiled_num_.store(0);
 }
+// Validates a freshly compiled object against the subgraph it was built from:
+//   1. every output variable of `graph` must be used by the compiled program
+//      (enforced with PADDLE_ENFORCE_EQ);
+//   2. CheckTensorEquivalent is invoked for every input variable that the
+//      compiled program still uses, with the matching entry of
+//      `input_tensors`.
+void CinnCompiler::CheckCompiledValid(
+    const ir::Graph& graph,
+    const std::map<std::string, const LoDTensor*>& input_tensors,
+    const CinnCompiledObject& compiled_obj) const {
+  const auto& input_var_names = graph.Get<std::vector<std::string>>(kInputVars);
+  const auto& output_var_names =
+      graph.Get<std::vector<std::string>>(kOutputVars);
+  auto* launch_context = compiled_obj.launch_context.get();
+  // 1. check all of the output variables will be assigned by compiled program
+  for (auto&& var_name : output_var_names) {
+    PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true,
+                      platform::errors::PreconditionNotMet(
+                          "Variable(%s) not applied in CINN", var_name));
+  }
+  // 2. check all of the used input variables were correctly deduced by CINN.
+  for (const auto& var_name : input_var_names) {
+    // some input variables are not used by CINN, either because its
+    // optimization passes eliminated them or because its operators need
+    // fewer inputs
+    if (!launch_context->IsVariableUsed(var_name)) {
+      VLOG(4) << "Input variable " << var_name << " not used by cinn";
+      continue;
+    }
+    // std::map::at throws if no tensor was supplied for a used input
+    launch_context->CheckTensorEquivalent(var_name,
+                                          *input_tensors.at(var_name));
+  }
+}
 std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
    const ir::Graph& graph,
    const std::map<std::string, const LoDTensor*>& input_tensors,
@@ -244,6 +274,9 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
      std::make_unique<GraphCompiler>(target, scope, cinn_graph);
  GraphCompiler::CompileOptions options;
  options.with_instantiate_variables = false;
+  if (!FLAGS_enable_pe_launch_cinn) {
+    options.with_buffer_handle_instruction_inserted = true;
+  }
  auto compiled_res =
      graph_compiler->Build(options, std::move(fetch_ids), stream);
  auto compiled_obj = std::make_unique<CinnCompiledObject>();
@@ -254,6 +287,7 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
  compiled_obj->launch_context =
      std::make_unique<operators::details::CinnLaunchContext>(graph,
                                                              *compiled_obj);
+  CheckCompiledValid(graph, input_tensors, *compiled_obj);
  return compiled_obj;
 }

--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
@@ -103,6 +103,13 @@ class CinnCompiler {
      const ::cinn::common::Target& target, std::int64_t compiled_num,
      void* stream = nullptr) const;
+  // check whether a compiled result is valid by comparing
+  // the consistency of external variables of the subgraph
+  void CheckCompiledValid(
+      const ir::Graph& graph,
+      const std::map<std::string, const LoDTensor*>& input_tensors,
+      const CinnCompiledObject& compiled_obj) const;
  std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_;
  std::unordered_map<CinnCacheKeyByAddress, std::int64_t, CinnCacheKey::Hash>
      cache_by_address_;

--- a/paddle/fluid/operators/cinn/CMakeLists.txt
+++ b/paddle/fluid/operators/cinn/CMakeLists.txt
@@ -3,7 +3,7 @@ include(operators)
 cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context)
 cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor transform_type cinn)
-SET(CINN_OP_DEPS parallel_executor string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type)
+SET(CINN_OP_DEPS parallel_executor string_helper variable_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type)
 register_operators(DEPS ${CINN_OP_DEPS})
 if (WITH_TESTING)

--- a/paddle/fluid/operators/cinn/cinn_launch_context.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc
@@ -33,6 +33,7 @@
 #include "paddle/fluid/framework/paddle2cinn/transform_type.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/cinn/cinn_op_helper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
@@ -69,13 +70,6 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph,
      graph.Get<std::vector<std::string>>(framework::paddle2cinn::kOutputVars);
  internal_var_names_ =
      ExtractInternalVarNames(input_var_names, output_var_names);
-  // check completeness of output variables in compiled result
-  for (auto&& var_name : output_var_names) {
-    PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true,
-                      platform::errors::PreconditionNotMet(
-                          "Variable(%s) not applied in CINN", var_name));
-  }
  // initialize all execution arguments
  InitializeArguments();
  // DEPRECATED(CtfGo): following callback assignment will be deprecated soon
@@ -235,7 +229,7 @@ void CinnLaunchContext::InitializeArguments() {
                        cinn_tensor->shape().data().size());
    cinn_buffer->type = cinn::runtime::ToRuntimeType(cinn_tensor->type());
    VLOG(4) << string::Sprintf(
-        "Append an argument:name(%s),dims(%s),type(%s)",
+        "Append an argument:name(%s),dims(%s),type(%s)", arg,
        framework::DDim(cinn_buffer->dims, cinn_buffer->dimensions).to_str(),
        cinn_tensor->type());
    name2argument_.emplace(arg, cinn_buffer.get());
@@ -400,7 +394,20 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place,
  std::unordered_map<Scope*, Scope*> scope_map = {
      {parallel_executor_->GetLocalScopes().front(), scope}};
  parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map);
-  parallel_executor_->PrepareVariables(scope);
+  // instead of using the PrepareVariables function of ParallelExecutor to
+  // initialize all variables, here we only initialize internal variables
+  // because external variables are already included in parent scope.
+  for (auto&& var_name : internal_var_names_) {
+    auto* var = scope->FindVar(var_name);
+    if (var != nullptr) {
+      VLOG(5) << "internal variable:" << var_name
+              << " has been initialized beforehand in global scope, skipped.";
+      continue;
+    }
+    framework::InitializeVariable(scope->Var(var_name),
+                                  framework::proto::VarType::LOD_TENSOR);
+  }
  for (auto&& var_name : initialized_beforehand_vars_) {
    auto* var = scope->GetVar(var_name);
    auto* buffer = GetCinnBufferOfVar(var_name);

--- a/paddle/fluid/operators/cinn/cinn_launch_op.h
+++ b/paddle/fluid/operators/cinn/cinn_launch_op.h
@@ -18,7 +18,9 @@
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include "cinn/common/target.h"
+#include "gflags/gflags.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -26,6 +28,7 @@
 #include "paddle/fluid/operators/cinn/cinn_launch_context.h"
 #include "paddle/fluid/operators/cinn/cinn_op_helper.h"
+DECLARE_bool(enable_pe_launch_cinn);
 namespace paddle {
 namespace operators {
@@ -101,34 +104,23 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
    const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile(
        compilation_key, inputs_name2tensor, target, stream);
    details::DebugCinnCompiledResult(cinn_compiled_object);
    auto* launch_context = cinn_compiled_object.launch_context.get();
-    // Step 3. check the computational consistency of the subgraph
-    //         before and after the compilation
-    // 3.1 Input variables: tensors of input variables have
-    //     been initialized before graph compiled, just check the
-    //     equiality between tensors of paddle and cinn.
-    for (const auto& var_name : input_x_variable_names) {
-      // some input variables don't need for cinn because they are
-      // eliminated by optimized passes or some cinn operators use
-      // less variables
-      if (!launch_context->IsVariableUsed(var_name)) {
-        VLOG(4) << "Input variable" << var_name << " not used by cinn";
-        continue;
-      }
-      launch_context->CheckTensorEquivalent(var_name,
-                                            *inputs_name2tensor.at(var_name));
-    }
-    // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
+    // Step 3. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
    details::SetCinnRuntimeFlags();
-    // Step 5. use PE to execute the compiled CINN instructions
+    // Step 4. Execute the compiled CINN instructions by a PE or
-    //         in nodes of the runtime graph
+    //         by the CINN compiled program in sequential order
-    VLOG(4) << "Execute the runtime graph by PE";
+    if (FLAGS_enable_pe_launch_cinn) {
-    framework::Scope& exec_scope = scope.NewScope();
+      VLOG(4) << "Execute the runtime graph by PE";
-    auto* pe = launch_context->InitializePE(place, &exec_scope);
+      framework::Scope& exec_scope = scope.NewScope();
-    pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
+      auto* pe = launch_context->InitializePE(place, &exec_scope);
+      pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
+    } else {
+      VLOG(4) << "Execute the compiled executable program";
+      launch_context->UpdateCapturedEnv(scope, place);
+      LaunchCinnExecution(cinn_compiled_object, *launch_context, stream);
+    }
    VLOG(4) << "CinnLaunchOp launch execution done.";
  }
 };

--- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
@@ -32,6 +32,7 @@ USE_OP(cinn_launch);
 USE_OP(cinn_instruction_run);
 USE_OP_ITSELF(elementwise_add);
 DECLARE_double(eager_delete_tensor_gb);
+DECLARE_bool(enable_pe_launch_cinn);
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
 #ifdef PADDLE_WITH_CUDA
@@ -42,43 +43,67 @@ namespace paddle::operators {
 using framework::paddle2cinn::CinnCompiler;
-TEST(CinnLaunchOpTest, TestWithElementwiseAdd) {
+// Test fixture that creates a cinn_launch op and a reference elementwise_add
+// op over the same inputs ("x", "y") so that their outputs can be compared.
+class TestCinnLaunchOp : public ::testing::Test {
-  paddle::framework::InitDevices();
+ public:
-  platform::SetNumThreads(1);
+  // the two ops must write to distinct variables, otherwise RunAndCheck
+  // would compare a tensor with itself and could never fail
+  const char* test_op_out_name = "cinn_launch_op_out";
-  // cache test graph into CinnCompiler
+  const char* add_op_out_name = "add_op_out";
-  const std::string& test_op_out_name = "cinn_launch_op_out";
+  std::unique_ptr<framework::OperatorBase> cinn_launch_op;
-  const std::string& add_op_out_name = "add_op_out";
+  std::unique_ptr<framework::OperatorBase> elementwise_add_op;
-  auto compilation_key = CinnCompiler::GetInstance()->AddGraph(
-      CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name));
+  void SetUp() override {
+    paddle::framework::InitDevices();
-  // create cinn_launch_op and elementwise_add op
+    platform::SetNumThreads(1);
-  auto cinn_launch_op = paddle::framework::OpRegistry::CreateOp(
+    // cache test graph into CinnCompiler
-      "cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}},
+    auto compilation_key = CinnCompiler::GetInstance()->AddGraph(
-      {{"compilation_key", compilation_key}});
+        CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name));
-  auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp(
-      "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}},
+    // create cinn_launch_op and elementwise_add op
-      {{"Out", {add_op_out_name}}}, {{}});
+    cinn_launch_op = paddle::framework::OpRegistry::CreateOp(
+        "cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}},
-  // Run ops and check the computation results
+        {{"compilation_key", compilation_key}});
-  auto run_and_check_fn = [&](const platform::Place& place) {
+    elementwise_add_op = paddle::framework::OpRegistry::CreateOp(
+        "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}},
+        {{"Out", {add_op_out_name}}}, {{}});
+  }
+  void RunAndCheck(const platform::Place& place) {
+    // Run ops and check the computation results
    framework::Scope scope;
    InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, &scope);
    scope.Var(test_op_out_name)->GetMutable<LoDTensor>();
    scope.Var(add_op_out_name)->GetMutable<LoDTensor>();
-    cinn_launch_op->Run(scope, place);
    elementwise_add_op->Run(scope, place);
+    cinn_launch_op->Run(scope, place);
    CompareOpResult<float>(scope.GetVar(test_op_out_name),
                           scope.GetVar(add_op_out_name));
-  };
+  }
-  FLAGS_eager_delete_tensor_gb = -1;
+  void TearDown() override { CinnCompiler::GetInstance()->Clear(); }
+};
+TEST_F(TestCinnLaunchOp, TestRunInstructionByPE) {
  // CPU
-  run_and_check_fn(platform::CPUPlace());
+  RunAndCheck(platform::CPUPlace());
-  run_and_check_fn(platform::CPUPlace());
+  // the second run on the same place is to check the cache logic
+  RunAndCheck(platform::CPUPlace());
+#ifdef PADDLE_WITH_CUDA
+  // GPU
+  RunAndCheck(platform::CUDAPlace());
+  RunAndCheck(platform::CUDAPlace());
+#endif
+}
+TEST_F(TestCinnLaunchOp, TestRunInstructionByCinnProgram) {
+  // set FLAGS_enable_pe_launch_cinn=false to switch to use
+  // default scheduler of CINN to execute the compiled program
+  FLAGS_enable_pe_launch_cinn = false;
+  RunAndCheck(platform::CPUPlace());
+  RunAndCheck(platform::CPUPlace());
 #ifdef PADDLE_WITH_CUDA
  // GPU
-  run_and_check_fn(platform::CUDAPlace());
+  RunAndCheck(platform::CUDAPlace());
-  run_and_check_fn(platform::CUDAPlace());
+  RunAndCheck(platform::CUDAPlace());
 #endif
 }

--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -751,6 +751,32 @@ PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops, "",
 */
 PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops, "",
                              "It controls the cinn op subset to be not used.");
+/*
+ * CINN related FLAG
+ * Name: FLAGS_enable_pe_launch_cinn
+ * Since Version: 2.3
+ * Value Range: bool, default=true
+ * Example: FLAGS_enable_pe_launch_cinn=true would execute the CINN compiled
+ * instructions of a paddle graph with ParallelExecutor, otherwise with the
+ * CINN compiled runtime program in sequential order.
+ */
+PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn, true,
+                            "It controls whether to execute cinn compiled "
+                            "program with ParallelExecutor");
+/*
+ * CINN related FLAG
+ * Name: FLAGS_enable_cinn_auto_tune
+ * Since Version: 2.3
+ * Value Range: bool, default=false
+ * Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its
+ * auto-tune feature enabled
+ */
+PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, false,
+                            "It controls whether to use cinn with "
+                            "its auto-tune feature enabled");
 #endif
 DEFINE_int32(record_pool_max_size, 2000000,