From dab7dfbf888b37ae4ffb1a9b72c0cc1f49cf3cbd Mon Sep 17 00:00:00 2001 From: TeFeng Chen Date: Tue, 19 Apr 2022 10:30:39 +0800 Subject: [PATCH] cinn_launch_op: optimize the overhead of preparing variables before executing cinn compiled program (#41777) (#41910) cherry-pick #41777 * optimize preparation overhead before executing cinn compiled program --- .../framework/paddle2cinn/cinn_compiler.cc | 34 ++++++++ .../framework/paddle2cinn/cinn_compiler.h | 7 ++ paddle/fluid/operators/cinn/CMakeLists.txt | 2 +- .../operators/cinn/cinn_launch_context.cc | 25 +++--- paddle/fluid/operators/cinn/cinn_launch_op.h | 40 ++++------ .../operators/cinn/cinn_launch_op_test.cc | 77 ++++++++++++------- paddle/fluid/platform/flags.cc | 26 +++++++ 7 files changed, 151 insertions(+), 60 deletions(-) diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 6cde65f6ab5..83a5b6f8213 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -31,11 +31,13 @@ #include "cinn/hlir/framework/graph_compiler.h" #include "cinn/hlir/framework/pass.h" #include "cinn/hlir/pass/use_pass.h" +#include "gflags/gflags.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor.h" @@ -45,6 +47,7 @@ #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/utils/rw_lock.h" +DECLARE_bool(enable_pe_launch_cinn); namespace paddle { namespace framework { namespace paddle2cinn { @@ -217,6 +220,33 @@ void CinnCompiler::Clear() { real_compiled_num_.store(0); } +void 
CinnCompiler::CheckCompiledValid( + const ir::Graph& graph, + const std::map& input_tensors, + const CinnCompiledObject& compiled_obj) const { + const auto& input_var_names = graph.Get>(kInputVars); + const auto& output_var_names = + graph.Get>(kOutputVars); + auto* launch_context = compiled_obj.launch_context.get(); + // 1. check all of the output variables will be assigned by compiled program + for (auto&& var_name : output_var_names) { + PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true, + platform::errors::PreconditionNotMet( + "Variable(%s) not applied in CINN", var_name)); + } + // 2. check all of the used input variables were correctly deduced by CINN. + for (const auto& var_name : input_var_names) { + // some input variables were not used by CINN because they were eliminated + // by its optimized passes or some operators of it need less inputs + if (!launch_context->IsVariableUsed(var_name)) { + VLOG(4) << "Input variable" << var_name << " not used by cinn"; + continue; + } + launch_context->CheckTensorEquivalent(var_name, + *input_tensors.at(var_name)); + } +} + std::unique_ptr CinnCompiler::CompileGraph( const ir::Graph& graph, const std::map& input_tensors, @@ -244,6 +274,9 @@ std::unique_ptr CinnCompiler::CompileGraph( std::make_unique(target, scope, cinn_graph); GraphCompiler::CompileOptions options; options.with_instantiate_variables = false; + if (!FLAGS_enable_pe_launch_cinn) { + options.with_buffer_handle_instruction_inserted = true; + } auto compiled_res = graph_compiler->Build(options, std::move(fetch_ids), stream); auto compiled_obj = std::make_unique(); @@ -254,6 +287,7 @@ std::unique_ptr CinnCompiler::CompileGraph( compiled_obj->launch_context = std::make_unique(graph, *compiled_obj); + CheckCompiledValid(graph, input_tensors, *compiled_obj); return compiled_obj; } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index 5fa54b302a3..cf17e68156b 100644 --- 
a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -103,6 +103,13 @@ class CinnCompiler { const ::cinn::common::Target& target, std::int64_t compiled_num, void* stream = nullptr) const; + // check whether a compiled result is valid by comparing + // the consistency of external variables of the subgraph + void CheckCompiledValid( + const ir::Graph& graph, + const std::map& input_tensors, + const CinnCompiledObject& compiled_obj) const; + std::unordered_map> graphs_; std::unordered_map cache_by_address_; diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index 2406445e6cf..862a0d04fbd 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -3,7 +3,7 @@ include(operators) cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor transform_type cinn) -SET(CINN_OP_DEPS parallel_executor string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type) +SET(CINN_OP_DEPS parallel_executor string_helper variable_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type) register_operators(DEPS ${CINN_OP_DEPS}) if (WITH_TESTING) diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index b445527322f..a660d59fb4c 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -33,6 +33,7 @@ #include "paddle/fluid/framework/paddle2cinn/transform_type.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h" #include 
"paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" @@ -69,13 +70,6 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, graph.Get>(framework::paddle2cinn::kOutputVars); internal_var_names_ = ExtractInternalVarNames(input_var_names, output_var_names); - // check completeness of output variables in compiled result - for (auto&& var_name : output_var_names) { - PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, - platform::errors::PreconditionNotMet( - "Variable(%s) not applied in CINN", var_name)); - } - // initialize all execution arguments InitializeArguments(); // DEPRECATED(CtfGo): following callback assignment will be deprecated soon @@ -235,7 +229,7 @@ void CinnLaunchContext::InitializeArguments() { cinn_tensor->shape().data().size()); cinn_buffer->type = cinn::runtime::ToRuntimeType(cinn_tensor->type()); VLOG(4) << string::Sprintf( - "Append an argument:name(%s),dims(%s),type(%s)", + "Append an argument:name(%s),dims(%s),type(%s)", arg, framework::DDim(cinn_buffer->dims, cinn_buffer->dimensions).to_str(), cinn_tensor->type()); name2argument_.emplace(arg, cinn_buffer.get()); @@ -400,7 +394,20 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, std::unordered_map scope_map = { {parallel_executor_->GetLocalScopes().front(), scope}}; parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map); - parallel_executor_->PrepareVariables(scope); + // instead of using the PrepareVariables function of ParallelExecutor to + // initialize all variables, here we only initialize internal variables + // because external variables are already included in parent scope. 
+ for (auto&& var_name : internal_var_names_) { + auto* var = scope->FindVar(var_name); + if (var != nullptr) { + VLOG(5) << "internal variable:" << var_name + << " has been initialized beforehand in global scope, skipped."; + continue; + } + framework::InitializeVariable(scope->Var(var_name), + framework::proto::VarType::LOD_TENSOR); + } + for (auto&& var_name : initialized_beforehand_vars_) { auto* var = scope->GetVar(var_name); auto* buffer = GetCinnBufferOfVar(var_name); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 5263aae03ed..024bf2bceb3 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -18,7 +18,9 @@ #include #include #include + #include "cinn/common/target.h" +#include "gflags/gflags.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -26,6 +28,7 @@ #include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h" +DECLARE_bool(enable_pe_launch_cinn); namespace paddle { namespace operators { @@ -101,34 +104,23 @@ class CinnLaunchOpKernel : public framework::OpKernel { const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile( compilation_key, inputs_name2tensor, target, stream); details::DebugCinnCompiledResult(cinn_compiled_object); - auto* launch_context = cinn_compiled_object.launch_context.get(); - // Step 3. check the computational consistency of the subgraph - // before and after the compilation - // 3.1 Input variables: tensors of input variables have - // been initialized before graph compiled, just check the - // equiality between tensors of paddle and cinn. 
- for (const auto& var_name : input_x_variable_names) { - // some input variables don't need for cinn because they are - // eliminated by optimized passes or some cinn operators use - // less variables - if (!launch_context->IsVariableUsed(var_name)) { - VLOG(4) << "Input variable" << var_name << " not used by cinn"; - continue; - } - launch_context->CheckTensorEquivalent(var_name, - *inputs_name2tensor.at(var_name)); - } - // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. + // Step 3. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. details::SetCinnRuntimeFlags(); - // Step 5. use PE to execute the compiled CINN instructions - // in nodes of the runtime graph - VLOG(4) << "Execute the runtime graph by PE"; - framework::Scope& exec_scope = scope.NewScope(); - auto* pe = launch_context->InitializePE(place, &exec_scope); - pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); + // Step 4. Execute the compiled CINN instructions by a PE or + // by the CINN compiled program in sequential order + if (FLAGS_enable_pe_launch_cinn) { + VLOG(4) << "Execute the runtime graph by PE"; + framework::Scope& exec_scope = scope.NewScope(); + auto* pe = launch_context->InitializePE(place, &exec_scope); + pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); + } else { + VLOG(4) << "Execute the compiled executable program"; + launch_context->UpdateCapturedEnv(scope, place); + LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); + } VLOG(4) << "CinnLaunchOp launch execution done."; } }; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 585f1caabed..3e363c56eb9 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -32,6 +32,7 @@ USE_OP(cinn_launch); USE_OP(cinn_instruction_run); USE_OP_ITSELF(elementwise_add); DECLARE_double(eager_delete_tensor_gb); +DECLARE_bool(enable_pe_launch_cinn); 
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); #ifdef PADDLE_WITH_CUDA @@ -42,43 +43,67 @@ namespace paddle::operators { using framework::paddle2cinn::CinnCompiler; -TEST(CinnLaunchOpTest, TestWithElementwiseAdd) { - paddle::framework::InitDevices(); - platform::SetNumThreads(1); - // cache test graph into CinnCompiler - const std::string& test_op_out_name = "cinn_launch_op_out"; - const std::string& add_op_out_name = "add_op_out"; - auto compilation_key = CinnCompiler::GetInstance()->AddGraph( - CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name)); - - // create cinn_launch_op and elementwise_add op - auto cinn_launch_op = paddle::framework::OpRegistry::CreateOp( - "cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}}, - {{"compilation_key", compilation_key}}); - auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp( - "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}}, - {{"Out", {add_op_out_name}}}, {{}}); - - // Run ops and check the computation results - auto run_and_check_fn = [&](const platform::Place& place) { +class TestCinnLaunchOp : public ::testing::Test { + public: + const char* test_op_out_name = "cinn_launch_op_out"; /* must differ from add_op_out_name so the two ops write separate variables and the comparison is meaningful */ + const char* add_op_out_name = "add_op_out"; + std::unique_ptr cinn_launch_op; + std::unique_ptr elementwise_add_op; + + void SetUp() override { + paddle::framework::InitDevices(); + platform::SetNumThreads(1); + // cache test graph into CinnCompiler + auto compilation_key = CinnCompiler::GetInstance()->AddGraph( + CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name)); + + // create cinn_launch_op and elementwise_add op + cinn_launch_op = paddle::framework::OpRegistry::CreateOp( + "cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}}, + {{"compilation_key", compilation_key}}); + elementwise_add_op = paddle::framework::OpRegistry::CreateOp( + "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}}, + {{"Out", {add_op_out_name}}}, {{}}); + } + + void RunAndCheck(const platform::Place& place) { + // Run ops and check 
the computation results framework::Scope scope; InitVariablesWithRandomValue({"x", "y"}, {10, 20}, place, &scope); scope.Var(test_op_out_name)->GetMutable(); scope.Var(add_op_out_name)->GetMutable(); - cinn_launch_op->Run(scope, place); elementwise_add_op->Run(scope, place); + cinn_launch_op->Run(scope, place); CompareOpResult(scope.GetVar(test_op_out_name), scope.GetVar(add_op_out_name)); - }; - FLAGS_eager_delete_tensor_gb = -1; + } + + void TearDown() override { CinnCompiler::GetInstance()->Clear(); } +}; +TEST_F(TestCinnLaunchOp, TestRunInstructionByPE) { // CPU - run_and_check_fn(platform::CPUPlace()); - run_and_check_fn(platform::CPUPlace()); + RunAndCheck(platform::CPUPlace()); + // the second run on the same place is to check the cache logic + RunAndCheck(platform::CPUPlace()); +#ifdef PADDLE_WITH_CUDA + // GPU + RunAndCheck(platform::CUDAPlace()); + RunAndCheck(platform::CUDAPlace()); +#endif +} + +TEST_F(TestCinnLaunchOp, TestRunInstructionByCinnProgram) { + // set FLAGS_enable_pe_launch_cinn=false to switch to use + // default scheduler of CINN to execute the compiled program + FLAGS_enable_pe_launch_cinn = false; + + RunAndCheck(platform::CPUPlace()); + RunAndCheck(platform::CPUPlace()); #ifdef PADDLE_WITH_CUDA // GPU - run_and_check_fn(platform::CUDAPlace()); - run_and_check_fn(platform::CUDAPlace()); + RunAndCheck(platform::CUDAPlace()); + RunAndCheck(platform::CUDAPlace()); #endif } diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 7fb3fc4b1ed..87bad9cbdfc 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -751,6 +751,32 @@ PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops, "", */ PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops, "", "It controls the cinn op subset to be not used."); + +/* + * CINN related FLAG + * Name: FLAGS_enable_pe_launch_cinn + * Since Version: 2.3 + * Value Range: bool, default=true + * Example: FLAGS_enable_pe_launch_cinn=true would execute the CINN compiled + * 
instructions of a paddle graph with ParallelExecutor, otherwise with the + * CINN compiled runtime program in sequential order. + */ +PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn, true, + "It controls whether to execute cinn compiled " + "program with ParallelExecutor"); + +/* + * CINN related FLAG + * Name: FLAGS_enable_cinn_auto_tune + * Since Version: 2.3 + * Value Range: bool, default=false + * Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its + * auto-tune feature enabled + */ +PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, false, + "It controls whether to use cinn with " + "its auto-tune feature enabled"); + #endif DEFINE_int32(record_pool_max_size, 2000000, -- GitLab