未验证 提交 dab7dfbf 编写于 作者: T TeFeng Chen 提交者: GitHub

cinn_launch_op: optimize the overhead of preparing variables before executing...

cinn_launch_op: optimize the overhead of preparing variables before executing cinn compiled program (#41777) (#41910)

cherry-pick #41777
* optimize preparation overhead before executing cinn compiled program
上级 0fb06e46
......@@ -31,11 +31,13 @@
#include "cinn/hlir/framework/graph_compiler.h"
#include "cinn/hlir/framework/pass.h"
#include "cinn/hlir/pass/use_pass.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h"
#include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/tensor.h"
......@@ -45,6 +47,7 @@
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/core/utils/rw_lock.h"
DECLARE_bool(enable_pe_launch_cinn);
namespace paddle {
namespace framework {
namespace paddle2cinn {
......@@ -217,6 +220,33 @@ void CinnCompiler::Clear() {
real_compiled_num_.store(0);
}
// Validate a compiled result against the original subgraph: every output
// variable of the subgraph must be produced by the compiled program, and
// every input variable that CINN kept must be equivalent to the paddle
// tensor it was compiled from.
void CinnCompiler::CheckCompiledValid(
    const ir::Graph& graph,
    const std::map<std::string, const LoDTensor*>& input_tensors,
    const CinnCompiledObject& compiled_obj) const {
  const auto& input_var_names = graph.Get<std::vector<std::string>>(kInputVars);
  const auto& output_var_names =
      graph.Get<std::vector<std::string>>(kOutputVars);
  auto* launch_context = compiled_obj.launch_context.get();
  // 1. check all of the output variables will be assigned by the
  // compiled program
  for (auto&& var_name : output_var_names) {
    PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true,
                      platform::errors::PreconditionNotMet(
                          "Variable(%s) not applied in CINN", var_name));
  }
  // 2. check all of the used input variables were correctly deduced by CINN.
  for (const auto& var_name : input_var_names) {
    // some input variables were not used by CINN because they were eliminated
    // by its optimization passes or because some operators need fewer inputs
    if (!launch_context->IsVariableUsed(var_name)) {
      // fix: add a space so the log reads "Input variable foo ..." instead of
      // "Input variablefoo ..."
      VLOG(4) << "Input variable " << var_name << " not used by cinn";
      continue;
    }
    launch_context->CheckTensorEquivalent(var_name,
                                          *input_tensors.at(var_name));
  }
}
std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
......@@ -244,6 +274,9 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
std::make_unique<GraphCompiler>(target, scope, cinn_graph);
GraphCompiler::CompileOptions options;
options.with_instantiate_variables = false;
if (!FLAGS_enable_pe_launch_cinn) {
options.with_buffer_handle_instruction_inserted = true;
}
auto compiled_res =
graph_compiler->Build(options, std::move(fetch_ids), stream);
auto compiled_obj = std::make_unique<CinnCompiledObject>();
......@@ -254,6 +287,7 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
compiled_obj->launch_context =
std::make_unique<operators::details::CinnLaunchContext>(graph,
*compiled_obj);
CheckCompiledValid(graph, input_tensors, *compiled_obj);
return compiled_obj;
}
......
......@@ -103,6 +103,13 @@ class CinnCompiler {
const ::cinn::common::Target& target, std::int64_t compiled_num,
void* stream = nullptr) const;
// check whether a compiled result is valid by comparing
// the consistency of external variables of the subgraph
void CheckCompiledValid(
const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
const CinnCompiledObject& compiled_obj) const;
std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_;
std::unordered_map<CinnCacheKeyByAddress, std::int64_t, CinnCacheKey::Hash>
cache_by_address_;
......
......@@ -3,7 +3,7 @@ include(operators)
cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context)
cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor transform_type cinn)
SET(CINN_OP_DEPS parallel_executor string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type)
SET(CINN_OP_DEPS parallel_executor string_helper variable_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type)
register_operators(DEPS ${CINN_OP_DEPS})
if (WITH_TESTING)
......
......@@ -33,6 +33,7 @@
#include "paddle/fluid/framework/paddle2cinn/transform_type.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/cinn/cinn_op_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
......@@ -69,13 +70,6 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph,
graph.Get<std::vector<std::string>>(framework::paddle2cinn::kOutputVars);
internal_var_names_ =
ExtractInternalVarNames(input_var_names, output_var_names);
// check completeness of output variables in compiled result
for (auto&& var_name : output_var_names) {
PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true,
platform::errors::PreconditionNotMet(
"Variable(%s) not applied in CINN", var_name));
}
// initialize all execution arguments
InitializeArguments();
// DEPRECATED(CtfGo): following callback assignment will be deprecated soon
......@@ -235,7 +229,7 @@ void CinnLaunchContext::InitializeArguments() {
cinn_tensor->shape().data().size());
cinn_buffer->type = cinn::runtime::ToRuntimeType(cinn_tensor->type());
VLOG(4) << string::Sprintf(
"Append an argument:name(%s),dims(%s),type(%s)",
"Append an argument:name(%s),dims(%s),type(%s)", arg,
framework::DDim(cinn_buffer->dims, cinn_buffer->dimensions).to_str(),
cinn_tensor->type());
name2argument_.emplace(arg, cinn_buffer.get());
......@@ -400,7 +394,20 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place,
std::unordered_map<Scope*, Scope*> scope_map = {
{parallel_executor_->GetLocalScopes().front(), scope}};
parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map);
parallel_executor_->PrepareVariables(scope);
// instead of using the PrepareVariables function of ParallelExecutor to
// initialize all variables, here we only initialize internal variables
// because external variables are already included in parent scope.
for (auto&& var_name : internal_var_names_) {
auto* var = scope->FindVar(var_name);
if (var != nullptr) {
VLOG(5) << "internal variable:" << var_name
<< " has been initialized beforehand in global scope, skipped.";
continue;
}
framework::InitializeVariable(scope->Var(var_name),
framework::proto::VarType::LOD_TENSOR);
}
for (auto&& var_name : initialized_beforehand_vars_) {
auto* var = scope->GetVar(var_name);
auto* buffer = GetCinnBufferOfVar(var_name);
......
......@@ -18,7 +18,9 @@
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "cinn/common/target.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
......@@ -26,6 +28,7 @@
#include "paddle/fluid/operators/cinn/cinn_launch_context.h"
#include "paddle/fluid/operators/cinn/cinn_op_helper.h"
DECLARE_bool(enable_pe_launch_cinn);
namespace paddle {
namespace operators {
......@@ -101,34 +104,23 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile(
compilation_key, inputs_name2tensor, target, stream);
details::DebugCinnCompiledResult(cinn_compiled_object);
auto* launch_context = cinn_compiled_object.launch_context.get();
// Step 3. check the computational consistency of the subgraph
// before and after the compilation
// 3.1 Input variables: tensors of input variables have
// been initialized before graph compiled, just check the
// equality between tensors of paddle and cinn.
for (const auto& var_name : input_x_variable_names) {
// some input variables don't need for cinn because they are
// eliminated by optimized passes or some cinn operators use
// less variables
if (!launch_context->IsVariableUsed(var_name)) {
VLOG(4) << "Input variable" << var_name << " not used by cinn";
continue;
}
launch_context->CheckTensorEquivalent(var_name,
*inputs_name2tensor.at(var_name));
}
// Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
// Step 3. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
details::SetCinnRuntimeFlags();
// Step 5. use PE to execute the compiled CINN instructions
// in nodes of the runtime graph
VLOG(4) << "Execute the runtime graph by PE";
framework::Scope& exec_scope = scope.NewScope();
auto* pe = launch_context->InitializePE(place, &exec_scope);
pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
// Step 4. Execute the compiled CINN instructions by a PE or
// by the CINN compiled program in sequential order
if (FLAGS_enable_pe_launch_cinn) {
VLOG(4) << "Execute the runtime graph by PE";
framework::Scope& exec_scope = scope.NewScope();
auto* pe = launch_context->InitializePE(place, &exec_scope);
pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
} else {
VLOG(4) << "Execute the compiled executable program";
launch_context->UpdateCapturedEnv(scope, place);
LaunchCinnExecution(cinn_compiled_object, *launch_context, stream);
}
VLOG(4) << "CinnLaunchOp launch execution done.";
}
};
......
......@@ -32,6 +32,7 @@ USE_OP(cinn_launch);
USE_OP(cinn_instruction_run);
USE_OP_ITSELF(elementwise_add);
DECLARE_double(eager_delete_tensor_gb);
DECLARE_bool(enable_pe_launch_cinn);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
#ifdef PADDLE_WITH_CUDA
......@@ -42,43 +43,67 @@ namespace paddle::operators {
using framework::paddle2cinn::CinnCompiler;
TEST(CinnLaunchOpTest, TestWithElementwiseAdd) {
paddle::framework::InitDevices();
platform::SetNumThreads(1);
// cache test graph into CinnCompiler
const std::string& test_op_out_name = "cinn_launch_op_out";
const std::string& add_op_out_name = "add_op_out";
auto compilation_key = CinnCompiler::GetInstance()->AddGraph(
CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name));
// create cinn_launch_op and elementwise_add op
auto cinn_launch_op = paddle::framework::OpRegistry::CreateOp(
"cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}},
{{"compilation_key", compilation_key}});
auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp(
"elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}},
{{"Out", {add_op_out_name}}}, {{}});
// Run ops and check the computation results
auto run_and_check_fn = [&](const platform::Place& place) {
// Test fixture that compiles a single elementwise_add subgraph with CINN
// once (SetUp) and compares the cinn_launch_op result against the native
// elementwise_add op on a given place (RunAndCheck).
class TestCinnLaunchOp : public ::testing::Test {
 public:
  // NOTE: the two output names must differ, otherwise CompareOpResult
  // would compare a variable against itself and the check is vacuous.
  const char* test_op_out_name = "test_op_out";
  const char* add_op_out_name = "add_op_out";
  std::unique_ptr<framework::OperatorBase> cinn_launch_op;
  std::unique_ptr<framework::OperatorBase> elementwise_add_op;

  void SetUp() override {
    paddle::framework::InitDevices();
    platform::SetNumThreads(1);
    // cache test graph into CinnCompiler
    auto compilation_key = CinnCompiler::GetInstance()->AddGraph(
        CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name));

    // create cinn_launch_op and elementwise_add op
    cinn_launch_op = paddle::framework::OpRegistry::CreateOp(
        "cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}},
        {{"compilation_key", compilation_key}});
    elementwise_add_op = paddle::framework::OpRegistry::CreateOp(
        "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}},
        {{"Out", {add_op_out_name}}}, {{}});
  }

  // Run both ops on the same random inputs and check the computation
  // results are identical.
  void RunAndCheck(const platform::Place& place) {
    framework::Scope scope;
    InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, &scope);
    scope.Var(test_op_out_name)->GetMutable<LoDTensor>();
    scope.Var(add_op_out_name)->GetMutable<LoDTensor>();
    elementwise_add_op->Run(scope, place);
    cinn_launch_op->Run(scope, place);
    CompareOpResult<float>(scope.GetVar(test_op_out_name),
                           scope.GetVar(add_op_out_name));
  }

  void TearDown() override { CinnCompiler::GetInstance()->Clear(); }
};
// Execute the CINN compiled instructions through ParallelExecutor
// (FLAGS_enable_pe_launch_cinn keeps its default value of true).
TEST_F(TestCinnLaunchOp, TestRunInstructionByPE) {
  // CPU
  RunAndCheck(platform::CPUPlace());
  // the second run on the same place is to check the cache logic
  RunAndCheck(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
  // GPU
  RunAndCheck(platform::CUDAPlace());
  RunAndCheck(platform::CUDAPlace());
#endif
}
// Execute the CINN compiled program directly in sequential order
// instead of through ParallelExecutor.
TEST_F(TestCinnLaunchOp, TestRunInstructionByCinnProgram) {
  // set FLAGS_enable_pe_launch_cinn=false to switch to use
  // the default scheduler of CINN to execute the compiled program
  FLAGS_enable_pe_launch_cinn = false;

  RunAndCheck(platform::CPUPlace());
  // the second run on the same place is to check the cache logic
  RunAndCheck(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
  // GPU
  RunAndCheck(platform::CUDAPlace());
  RunAndCheck(platform::CUDAPlace());
#endif
}
......
......@@ -751,6 +751,32 @@ PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops, "",
*/
PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops, "",
"It controls the cinn op subset to be not used.");
/*
* CINN related FLAG
* Name: FLAGS_enable_pe_launch_cinn
* Since Version: 2.3
* Value Range: bool, default=true
* Example: FLAGS_enable_pe_launch_cinn=true would execute the CINN compiled
* instructions of a paddle graph with ParallelExecutor, otherwise with the
* CINN compiled runtime program in sequential order.
*/
PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn, true,
"It controls whether to execute cinn compiled "
"program with ParallelExecutor");
/*
* CINN related FLAG
* Name: FLAGS_enable_cinn_auto_tune
* Since Version: 2.3
* Value Range: bool, default=false
* Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its
* auto-tune feature enabled
*/
PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, false,
"It controls whether to use cinn with "
"its auto-tune feature enabled");
#endif
DEFINE_int32(record_pool_max_size, 2000000,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册