未验证 提交 dab7dfbf 编写于 作者: T TeFeng Chen 提交者: GitHub

cinn_launch_op: optimize the overhead of preparing variables before executing...

cinn_launch_op: optimize the overhead of preparing variables before executing cinn compiled program (#41777) (#41910)

cherry-pick #41777
* optimize preparation overhead before executing cinn compiled program
上级 0fb06e46
...@@ -31,11 +31,13 @@ ...@@ -31,11 +31,13 @@
#include "cinn/hlir/framework/graph_compiler.h" #include "cinn/hlir/framework/graph_compiler.h"
#include "cinn/hlir/framework/pass.h" #include "cinn/hlir/framework/pass.h"
#include "cinn/hlir/pass/use_pass.h" #include "cinn/hlir/pass/use_pass.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h"
#include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
...@@ -45,6 +47,7 @@ ...@@ -45,6 +47,7 @@
#include "paddle/fluid/string/string_helper.h" #include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/core/utils/rw_lock.h" #include "paddle/phi/core/utils/rw_lock.h"
DECLARE_bool(enable_pe_launch_cinn);
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace paddle2cinn { namespace paddle2cinn {
...@@ -217,6 +220,33 @@ void CinnCompiler::Clear() { ...@@ -217,6 +220,33 @@ void CinnCompiler::Clear() {
real_compiled_num_.store(0); real_compiled_num_.store(0);
} }
// Validates a compiled result against the external (input/output) variables
// of the subgraph it was compiled from.
//
// graph:         subgraph carrying the kInputVars / kOutputVars attributes.
// input_tensors: input variable name -> tensor that was passed to compilation.
// compiled_obj:  compiled result whose launch_context is inspected.
//
// Raises PreconditionNotMet (through PADDLE_ENFORCE_EQ) when an output
// variable is not used by the compiled program. Input variables that the
// compiled program uses are handed to CinnLaunchContext::CheckTensorEquivalent.
void CinnCompiler::CheckCompiledValid(
    const ir::Graph& graph,
    const std::map<std::string, const LoDTensor*>& input_tensors,
    const CinnCompiledObject& compiled_obj) const {
  const auto& input_var_names = graph.Get<std::vector<std::string>>(kInputVars);
  const auto& output_var_names =
      graph.Get<std::vector<std::string>>(kOutputVars);
  auto* launch_context = compiled_obj.launch_context.get();

  // 1. check all of the output variables will be assigned by compiled program
  for (const auto& var_name : output_var_names) {
    PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true,
                      platform::errors::PreconditionNotMet(
                          "Variable(%s) not applied in CINN", var_name));
  }

  // 2. check all of the used input variables were correctly deduced by CINN.
  for (const auto& var_name : input_var_names) {
    // some input variables were not used by CINN because they were eliminated
    // by its optimized passes or some operators of it need less inputs
    if (!launch_context->IsVariableUsed(var_name)) {
      // keep a separating space so the variable name is readable in the log
      VLOG(4) << "Input variable " << var_name << " not used by cinn";
      continue;
    }
    // std::map::at throws std::out_of_range if the caller did not supply a
    // tensor for a used input variable
    launch_context->CheckTensorEquivalent(var_name,
                                          *input_tensors.at(var_name));
  }
}
std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph( std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
const ir::Graph& graph, const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors, const std::map<std::string, const LoDTensor*>& input_tensors,
...@@ -244,6 +274,9 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph( ...@@ -244,6 +274,9 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
std::make_unique<GraphCompiler>(target, scope, cinn_graph); std::make_unique<GraphCompiler>(target, scope, cinn_graph);
GraphCompiler::CompileOptions options; GraphCompiler::CompileOptions options;
options.with_instantiate_variables = false; options.with_instantiate_variables = false;
if (!FLAGS_enable_pe_launch_cinn) {
options.with_buffer_handle_instruction_inserted = true;
}
auto compiled_res = auto compiled_res =
graph_compiler->Build(options, std::move(fetch_ids), stream); graph_compiler->Build(options, std::move(fetch_ids), stream);
auto compiled_obj = std::make_unique<CinnCompiledObject>(); auto compiled_obj = std::make_unique<CinnCompiledObject>();
...@@ -254,6 +287,7 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph( ...@@ -254,6 +287,7 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
compiled_obj->launch_context = compiled_obj->launch_context =
std::make_unique<operators::details::CinnLaunchContext>(graph, std::make_unique<operators::details::CinnLaunchContext>(graph,
*compiled_obj); *compiled_obj);
CheckCompiledValid(graph, input_tensors, *compiled_obj);
return compiled_obj; return compiled_obj;
} }
......
...@@ -103,6 +103,13 @@ class CinnCompiler { ...@@ -103,6 +103,13 @@ class CinnCompiler {
const ::cinn::common::Target& target, std::int64_t compiled_num, const ::cinn::common::Target& target, std::int64_t compiled_num,
void* stream = nullptr) const; void* stream = nullptr) const;
// Check whether a compiled result is valid by comparing the consistency of
// the external variables of the subgraph:
//   - every output variable of `graph` must be used by the compiled program,
//     otherwise a PreconditionNotMet error is raised;
//   - every input variable that the compiled program uses is checked against
//     its tensor in `input_tensors` via CheckTensorEquivalent of the launch
//     context; input variables not used by the compiled program are skipped.
void CheckCompiledValid(
const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
const CinnCompiledObject& compiled_obj) const;
std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_; std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_;
std::unordered_map<CinnCacheKeyByAddress, std::int64_t, CinnCacheKey::Hash> std::unordered_map<CinnCacheKeyByAddress, std::int64_t, CinnCacheKey::Hash>
cache_by_address_; cache_by_address_;
......
...@@ -3,7 +3,7 @@ include(operators) ...@@ -3,7 +3,7 @@ include(operators)
cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context)
cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor transform_type cinn) cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor transform_type cinn)
SET(CINN_OP_DEPS parallel_executor string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type) SET(CINN_OP_DEPS parallel_executor string_helper variable_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type)
register_operators(DEPS ${CINN_OP_DEPS}) register_operators(DEPS ${CINN_OP_DEPS})
if (WITH_TESTING) if (WITH_TESTING)
......
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include "paddle/fluid/framework/paddle2cinn/transform_type.h" #include "paddle/fluid/framework/paddle2cinn/transform_type.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/cinn/cinn_op_helper.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
...@@ -69,13 +70,6 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, ...@@ -69,13 +70,6 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph,
graph.Get<std::vector<std::string>>(framework::paddle2cinn::kOutputVars); graph.Get<std::vector<std::string>>(framework::paddle2cinn::kOutputVars);
internal_var_names_ = internal_var_names_ =
ExtractInternalVarNames(input_var_names, output_var_names); ExtractInternalVarNames(input_var_names, output_var_names);
// check completeness of output variables in compiled result
for (auto&& var_name : output_var_names) {
PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true,
platform::errors::PreconditionNotMet(
"Variable(%s) not applied in CINN", var_name));
}
// initialize all execution arguments // initialize all execution arguments
InitializeArguments(); InitializeArguments();
// DEPRECATED(CtfGo): following callback assignment will be deprecated soon // DEPRECATED(CtfGo): following callback assignment will be deprecated soon
...@@ -235,7 +229,7 @@ void CinnLaunchContext::InitializeArguments() { ...@@ -235,7 +229,7 @@ void CinnLaunchContext::InitializeArguments() {
cinn_tensor->shape().data().size()); cinn_tensor->shape().data().size());
cinn_buffer->type = cinn::runtime::ToRuntimeType(cinn_tensor->type()); cinn_buffer->type = cinn::runtime::ToRuntimeType(cinn_tensor->type());
VLOG(4) << string::Sprintf( VLOG(4) << string::Sprintf(
"Append an argument:name(%s),dims(%s),type(%s)", "Append an argument:name(%s),dims(%s),type(%s)", arg,
framework::DDim(cinn_buffer->dims, cinn_buffer->dimensions).to_str(), framework::DDim(cinn_buffer->dims, cinn_buffer->dimensions).to_str(),
cinn_tensor->type()); cinn_tensor->type());
name2argument_.emplace(arg, cinn_buffer.get()); name2argument_.emplace(arg, cinn_buffer.get());
...@@ -400,7 +394,20 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, ...@@ -400,7 +394,20 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place,
std::unordered_map<Scope*, Scope*> scope_map = { std::unordered_map<Scope*, Scope*> scope_map = {
{parallel_executor_->GetLocalScopes().front(), scope}}; {parallel_executor_->GetLocalScopes().front(), scope}};
parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map); parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map);
parallel_executor_->PrepareVariables(scope); // instead of using the PrepareVariables function of ParallelExecutor to
// initialize all variables, here we only initialize internal variables
// because external variables are already included in parent scope.
for (auto&& var_name : internal_var_names_) {
auto* var = scope->FindVar(var_name);
if (var != nullptr) {
VLOG(5) << "internal variable:" << var_name
<< " has been initialized beforehand in global scope, skipped.";
continue;
}
framework::InitializeVariable(scope->Var(var_name),
framework::proto::VarType::LOD_TENSOR);
}
for (auto&& var_name : initialized_beforehand_vars_) { for (auto&& var_name : initialized_beforehand_vars_) {
auto* var = scope->GetVar(var_name); auto* var = scope->GetVar(var_name);
auto* buffer = GetCinnBufferOfVar(var_name); auto* buffer = GetCinnBufferOfVar(var_name);
......
...@@ -18,7 +18,9 @@ ...@@ -18,7 +18,9 @@
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include "cinn/common/target.h" #include "cinn/common/target.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
...@@ -26,6 +28,7 @@ ...@@ -26,6 +28,7 @@
#include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/operators/cinn/cinn_launch_context.h"
#include "paddle/fluid/operators/cinn/cinn_op_helper.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h"
DECLARE_bool(enable_pe_launch_cinn);
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -101,34 +104,23 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> { ...@@ -101,34 +104,23 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile( const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile(
compilation_key, inputs_name2tensor, target, stream); compilation_key, inputs_name2tensor, target, stream);
details::DebugCinnCompiledResult(cinn_compiled_object); details::DebugCinnCompiledResult(cinn_compiled_object);
auto* launch_context = cinn_compiled_object.launch_context.get(); auto* launch_context = cinn_compiled_object.launch_context.get();
// Step 3. check the computational consistency of the subgraph
// before and after the compilation
// 3.1 Input variables: tensors of input variables have
// been initialized before graph compiled, just check the
// equiality between tensors of paddle and cinn.
for (const auto& var_name : input_x_variable_names) {
// some input variables don't need for cinn because they are
// eliminated by optimized passes or some cinn operators use
// less variables
if (!launch_context->IsVariableUsed(var_name)) {
VLOG(4) << "Input variable" << var_name << " not used by cinn";
continue;
}
launch_context->CheckTensorEquivalent(var_name,
*inputs_name2tensor.at(var_name));
}
// Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. // Step 3. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
details::SetCinnRuntimeFlags(); details::SetCinnRuntimeFlags();
// Step 5. use PE to execute the compiled CINN instructions // Step 4. Execute the compiled CINN instructions by a PE or
// in nodes of the runtime graph // by the CINN compiled program in sequential order
VLOG(4) << "Execute the runtime graph by PE"; if (FLAGS_enable_pe_launch_cinn) {
framework::Scope& exec_scope = scope.NewScope(); VLOG(4) << "Execute the runtime graph by PE";
auto* pe = launch_context->InitializePE(place, &exec_scope); framework::Scope& exec_scope = scope.NewScope();
pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); auto* pe = launch_context->InitializePE(place, &exec_scope);
pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
} else {
VLOG(4) << "Execute the compiled executable program";
launch_context->UpdateCapturedEnv(scope, place);
LaunchCinnExecution(cinn_compiled_object, *launch_context, stream);
}
VLOG(4) << "CinnLaunchOp launch execution done."; VLOG(4) << "CinnLaunchOp launch execution done.";
} }
}; };
......
...@@ -32,6 +32,7 @@ USE_OP(cinn_launch); ...@@ -32,6 +32,7 @@ USE_OP(cinn_launch);
USE_OP(cinn_instruction_run); USE_OP(cinn_instruction_run);
USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add);
DECLARE_double(eager_delete_tensor_gb); DECLARE_double(eager_delete_tensor_gb);
DECLARE_bool(enable_pe_launch_cinn);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -42,43 +43,67 @@ namespace paddle::operators { ...@@ -42,43 +43,67 @@ namespace paddle::operators {
using framework::paddle2cinn::CinnCompiler; using framework::paddle2cinn::CinnCompiler;
TEST(CinnLaunchOpTest, TestWithElementwiseAdd) { class TestCinnLaunchOp : public ::testing::Test {
paddle::framework::InitDevices(); public:
platform::SetNumThreads(1); const char* test_op_out_name = "add_op_out";
// cache test graph into CinnCompiler const char* add_op_out_name = "add_op_out";
const std::string& test_op_out_name = "cinn_launch_op_out"; std::unique_ptr<framework::OperatorBase> cinn_launch_op;
const std::string& add_op_out_name = "add_op_out"; std::unique_ptr<framework::OperatorBase> elementwise_add_op;
auto compilation_key = CinnCompiler::GetInstance()->AddGraph(
CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name)); void SetUp() override {
paddle::framework::InitDevices();
// create cinn_launch_op and elementwise_add op platform::SetNumThreads(1);
auto cinn_launch_op = paddle::framework::OpRegistry::CreateOp( // cache test graph into CinnCompiler
"cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}}, auto compilation_key = CinnCompiler::GetInstance()->AddGraph(
{{"compilation_key", compilation_key}}); CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name));
auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp(
"elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}}, // create cinn_launch_op and elementwise_add op
{{"Out", {add_op_out_name}}}, {{}}); cinn_launch_op = paddle::framework::OpRegistry::CreateOp(
"cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}},
// Run ops and check the computation results {{"compilation_key", compilation_key}});
auto run_and_check_fn = [&](const platform::Place& place) { elementwise_add_op = paddle::framework::OpRegistry::CreateOp(
"elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}},
{{"Out", {add_op_out_name}}}, {{}});
}
void RunAndCheck(const platform::Place& place) {
// Run ops and check the computation results
framework::Scope scope; framework::Scope scope;
InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, &scope); InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, &scope);
scope.Var(test_op_out_name)->GetMutable<LoDTensor>(); scope.Var(test_op_out_name)->GetMutable<LoDTensor>();
scope.Var(add_op_out_name)->GetMutable<LoDTensor>(); scope.Var(add_op_out_name)->GetMutable<LoDTensor>();
cinn_launch_op->Run(scope, place);
elementwise_add_op->Run(scope, place); elementwise_add_op->Run(scope, place);
cinn_launch_op->Run(scope, place);
CompareOpResult<float>(scope.GetVar(test_op_out_name), CompareOpResult<float>(scope.GetVar(test_op_out_name),
scope.GetVar(add_op_out_name)); scope.GetVar(add_op_out_name));
}; }
FLAGS_eager_delete_tensor_gb = -1;
void TearDown() override { CinnCompiler::GetInstance()->Clear(); }
};
// Executes the compiled CINN instructions through the ParallelExecutor path
// and checks the result against elementwise_add.
TEST_F(TestCinnLaunchOp, TestRunInstructionByPE) {
  // Pin the flag explicitly: it is a process-wide global that another test in
  // this file sets to false, so relying on its default would make this test
  // depend on execution order.
  FLAGS_enable_pe_launch_cinn = true;
  // CPU
  RunAndCheck(platform::CPUPlace());
  // the second run on the same place is to check the cache logic
  RunAndCheck(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
  // GPU
  RunAndCheck(platform::CUDAPlace());
  RunAndCheck(platform::CUDAPlace());
#endif
}
// Executes the compiled program with the default scheduler of CINN instead of
// the ParallelExecutor and checks the result against elementwise_add.
TEST_F(TestCinnLaunchOp, TestRunInstructionByCinnProgram) {
  // FLAGS_enable_pe_launch_cinn is a process-wide global; restore its previous
  // value on every exit path so the change does not leak into other tests.
  struct FlagRestorer {
    bool saved;
    ~FlagRestorer() { FLAGS_enable_pe_launch_cinn = saved; }
  } flag_restorer{FLAGS_enable_pe_launch_cinn};

  // set FLAGS_enable_pe_launch_cinn=false to switch to use
  // default scheduler of CINN to execute the compiled program
  FLAGS_enable_pe_launch_cinn = false;

  // CPU; the second run on the same place exercises the compilation cache
  RunAndCheck(platform::CPUPlace());
  RunAndCheck(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
  // GPU
  RunAndCheck(platform::CUDAPlace());
  RunAndCheck(platform::CUDAPlace());
#endif
}
......
...@@ -751,6 +751,32 @@ PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops, "", ...@@ -751,6 +751,32 @@ PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops, "",
*/ */
PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops, "", PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops, "",
"It controls the cinn op subset to be not used."); "It controls the cinn op subset to be not used.");
/*
 * CINN related FLAG
 * Name: FLAGS_enable_pe_launch_cinn
 * Since Version: 2.3
 * Value Range: bool, default=true
 * Example: FLAGS_enable_pe_launch_cinn=true executes the CINN compiled
 * instructions of a paddle graph with ParallelExecutor;
 * FLAGS_enable_pe_launch_cinn=false executes them with the CINN compiled
 * runtime program in sequential order.
 */
PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn, true,
"It controls whether to execute cinn compiled "
"program with ParallelExecutor");
/*
 * CINN related FLAG
 * Name: FLAGS_enable_cinn_auto_tune
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its
 * auto-tune feature enabled; the default (false) leaves auto-tune disabled.
 * NOTE(review): no reader of this flag is visible in this change -- confirm
 * where it is consumed.
 */
PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, false,
"It controls whether to use cinn with "
"its auto-tune feature enabled");
#endif #endif
DEFINE_int32(record_pool_max_size, 2000000, DEFINE_int32(record_pool_max_size, 2000000,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册