Unverified commit 2ca3d3f7, authored by zhangbo9674, committed by GitHub

[new executor]Support CINN use InterpreterCore (#48911)

* cinn use interpretercore

* fix bug

* fix compile bug

* fix scope bug

* refine code

* refine code by comment

* refine code by comment
Parent 0839bba3
......@@ -622,7 +622,8 @@ void BuildOpFuncList(const platform::Place& place,
// NOTE(Ruibiao): We do not encourage directly using scope in OP kernel.
// But some OPs do have such behavior (e.g., cinn_launch OP). Here
// special treatment for them.
if (op_with_kernel->Type() == "cinn_launch") {
if (op_with_kernel->Type() == "cinn_launch" ||
op_with_kernel->Type() == "cinn_instruction_run") {
VLOG(6) << "OP(" << op_with_kernel->Type()
<< ") use scope in kernel, "
"so pass a real scope to "
......
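Both `cinn_launch` and `cinn_instruction_run` read variables straight from the enclosing `Scope` inside their kernels, so the executor has to hand them the real scope instead of its transient local working scope. A minimal sketch of that access pattern, assuming the usual `framework::OpKernel` interface (the kernel body is illustrative, not the actual cinn_launch implementation):

```cpp
#include "paddle/fluid/framework/op_registry.h"

// Illustrative only: a kernel that touches the Scope directly must see the
// scope the executor really runs in, otherwise FindVar() misses variables
// created outside the local working scope.
template <typename T>
class ScopeUsingOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const framework::Scope& scope = ctx.scope();  // scope passed by the executor
    auto* var = scope.FindVar("some_var");        // hypothetical variable name
    (void)var;
  }
};
```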
......@@ -108,7 +108,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
const std::set<std::string>& skip_gc_vars,
framework::Scope* scope,
bool used_for_jit,
bool used_for_control_flow_op)
bool used_for_control_flow_op,
bool used_for_cinn)
: place_(place),
block_(block),
execution_config_(place, block.OpSize()),
......@@ -121,9 +122,9 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
execution_config_.used_for_jit = used_for_jit;
execution_config_.used_for_control_flow_op = used_for_control_flow_op;
execution_config_.create_local_scope = !used_for_jit &&
FLAGS_new_executor_use_local_scope &&
!used_for_control_flow_op;
execution_config_.create_local_scope =
!used_for_jit && FLAGS_new_executor_use_local_scope &&
!used_for_control_flow_op && !used_for_cinn;
execution_config_.skip_gc_vars = skip_gc_vars;
execution_config_.Log(/*log_level=*/8);
......@@ -425,8 +426,9 @@ void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) {
}
// set runtime_ctx and infershape_ctx_
if (instr_node->OpBase()->Type() == "cinn_launch") { // OP use scope in
// kernel
if (instr_node->OpBase()->Type() == "cinn_launch" ||
instr_node->OpBase()->Type() == "cinn_instruction_run") { // OP use scope
// in kernel
Scope* local_scope = HasLocalScope() ? var_scope_.GetMutableLocalScope()
: var_scope_.GetMutableScope();
instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope);
......
......@@ -46,7 +46,8 @@ class InterpreterCore {
const std::set<std::string>& skip_gc_vars,
Scope* scope,
bool used_for_jit = false,
bool used_for_control_flow_op = false);
bool used_for_control_flow_op = false,
bool used_for_cinn = false);
~InterpreterCore();
......
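For reference, a hedged construction sketch using the new parameter, mirroring the call site added in `CinnLaunchContext::InitializeInterpreterCore` further down this diff (`place`, `program_desc`, `skip_gc_vars`, and `scope` stand in for the caller's own objects):

```cpp
// The trailing flag tells InterpreterCore not to create a local working
// scope, matching the create_local_scope change above.
auto core = std::make_unique<framework::InterpreterCore>(
    place,
    program_desc->Block(0),
    skip_gc_vars,
    scope,
    /*used_for_jit=*/false,
    /*used_for_control_flow_op=*/false,
    /*used_for_cinn=*/true);
core->Run({});  // run with no feed names
```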
......@@ -15,6 +15,7 @@ cc_library(
build_strategy
device_context
parallel_executor
standalone_executor
transform_type
cinn)
......
......@@ -88,16 +88,15 @@ class TestCinnInstructionRunOp : public ::testing::Test {
cinn_launch_op->Run(scope, place);
}
void RunAndCheck(const platform::Place& place) {
void RunAndCheck(const platform::Place& place, framework::Scope* scope) {
// Run ops and check the computation results
framework::Scope scope;
InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, &scope);
scope.Var(test_op_out_name)->GetMutable<phi::DenseTensor>();
scope.Var(add_op_out_name)->GetMutable<phi::DenseTensor>();
elementwise_add_op->Run(scope, place);
cinn_launch_op->Run(scope, place);
CompareOpResult<float>(scope.GetVar(test_op_out_name),
scope.GetVar(add_op_out_name));
InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, scope);
scope->Var(test_op_out_name)->GetMutable<phi::DenseTensor>();
scope->Var(add_op_out_name)->GetMutable<phi::DenseTensor>();
elementwise_add_op->Run(*scope, place);
cinn_launch_op->Run(*scope, place);
CompareOpResult<float>(scope->GetVar(test_op_out_name),
scope->GetVar(add_op_out_name));
}
void TearDown() override { CinnCompiler::GetInstance()->Clear(); }
......@@ -106,17 +105,21 @@ class TestCinnInstructionRunOp : public ::testing::Test {
TEST_F(TestCinnInstructionRunOp, CPU) {
platform::CPUPlace place;
Compile(place);
RunAndCheck(place);
framework::Scope scope1;
RunAndCheck(place, &scope1);
// the second run on the same place is to check the cache logic
RunAndCheck(place);
framework::Scope scope2;
RunAndCheck(place, &scope2);
}
#ifdef PADDLE_WITH_CUDA
TEST_F(TestCinnInstructionRunOp, GPU) {
platform::CUDAPlace place;
Compile(place);
RunAndCheck(place);
RunAndCheck(place);
framework::Scope scope1;
RunAndCheck(place, &scope1);
framework::Scope scope2;
RunAndCheck(place, &scope2);
}
#endif
......
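The fresh scope per run is deliberate (a reading of the test change, not stated in the diff): with the InterpreterCore now cached inside `CinnLaunchContext`, two distinct scopes exercise the `scope != cached_scope_` rebind branch rather than a pure cache hit:

```cpp
// Sketch of the pattern used in both TEST_F bodies above.
framework::Scope scope1, scope2;
RunAndCheck(place, &scope1);  // first run builds and caches the executor
RunAndCheck(place, &scope2);  // second run rebinds it via reset_scope
```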
......@@ -88,8 +88,9 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph,
}
// Convert the CINN runtime program to a Paddle graph
runtime_graph_ = std::make_unique<framework::ir::Graph>(
BuildCompiledProgram(graph, compiled_obj));
runtime_program_desc_ = BuildCompiledProgram(graph, compiled_obj);
runtime_graph_ =
std::make_unique<framework::ir::Graph>(*runtime_program_desc_.get());
auto& outer_varinfo = graph.Get<Name2VarInfoMap>(kMemOptVarInfoFromMainGraph);
runtime_graph_->SetNotOwned<Name2VarInfoMap>(kMemOptVarInfoFromMainGraph,
&outer_varinfo);
......@@ -100,6 +101,7 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph,
// that means it can be erased after graph execution
if (!outer_varinfo.count(var_name)) {
skip_eager_vars_.emplace_back(var_name);
skip_gc_vars_.insert(var_name);
}
};
std::for_each(
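The context now keeps the same "do not free yet" variables in two forms because the two backends consume them differently: `skip_eager_vars_` (a vector) drives ParallelExecutor's eager deletion, while the new `skip_gc_vars_` (a set) is handed to InterpreterCore at construction. Both call sites appear later in this diff; a condensed sketch:

```cpp
// ParallelExecutor path: skip_eager_vars_ via the accessor.
pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
// InterpreterCore path: skip_gc_vars_ goes in through the constructor.
framework::InterpreterCore core(
    place, runtime_program_desc_->Block(0), skip_gc_vars_, scope);
```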
......@@ -313,12 +315,14 @@ void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) {
});
}
framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram(
std::unique_ptr<framework::ProgramDesc> CinnLaunchContext::BuildCompiledProgram(
const framework::ir::Graph& graph, const CinnCompiledObject& compiled_obj) {
CinnRuntimeProgram* runtime_program = compiled_obj.runtime_program.get();
// Step 0: Create an empty program_desc, there will be only one block
framework::ProgramDesc program_desc;
auto* block = program_desc.MutableBlock(0);
// framework::ProgramDesc program_desc;
std::unique_ptr<framework::ProgramDesc> program_desc(
new framework::ProgramDesc());
auto* block = program_desc->MutableBlock(0);
const std::vector<std::unique_ptr<CinnInstruction>>& instructions =
runtime_program->GetRunInstructions();
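The return type moves from a by-value `ProgramDesc` to a `unique_ptr` because the desc can no longer be a temporary (an inference from this diff rather than a stated rationale): `Block(0)` of that desc is what InterpreterCore later executes by reference, so the context stores it in the new `runtime_program_desc_` member:

```cpp
// Mirrors the constructor change above; the stored desc now outlives both
// the graph conversion and the cached InterpreterCore.
runtime_program_desc_ = BuildCompiledProgram(graph, compiled_obj);
runtime_graph_ =
    std::make_unique<framework::ir::Graph>(*runtime_program_desc_.get());
```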
......@@ -445,6 +449,46 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place,
return parallel_executor_.get();
}
framework::InterpreterCore* CinnLaunchContext::InitializeInterpreterCore(
const platform::Place& place, framework::Scope* scope) {
if (!interpreter_core_ || scope != cached_scope_) {
VLOG(1) << "interpreter_core_ is null or scope != cached_scope_: "
"interpreter_core_: "
<< interpreter_core_.get() << "; scope: " << scope
<< "; cached_scope_: " << cached_scope_;
for (auto&& var_name : internal_var_names_) {
auto* var = scope->FindVar(var_name);
if (var != nullptr) {
continue;
}
framework::InitializeVariable(scope->Var(var_name),
framework::proto::VarType::LOD_TENSOR);
}
if (!interpreter_core_) {
interpreter_core_ = std::make_unique<framework::InterpreterCore>(
place,
runtime_program_desc_->Block(0),
skip_gc_vars_,
scope,
/*used_for_jit*/ false,
/*used_for_control_flow_op*/ false,
/*used_for_cinn*/ true);
} else {
interpreter_core_->reset_scope(scope);
}
UpdateCapturedEnv(*scope, place);
}
for (auto&& var_name : initialized_beforehand_vars_) {
auto* var = scope->GetVar(var_name);
auto* buffer = GetCinnBufferOfVar(var_name);
auto dim = framework::DDim(buffer->dims, buffer->dimensions);
var->GetMutable<phi::DenseTensor>()->Resize(dim);
var->GetMutable<phi::DenseTensor>()->mutable_data(
place, framework::paddle2cinn::TransToPaddleDataType(buffer->type));
}
return interpreter_core_.get();
}
cinn_buffer_t* CinnLaunchContext::GetCinnBufferOfVar(
const std::string& var_name) {
auto it = paddle2cinn_varmap_.find(var_name);
......
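Taken together, `InitializeInterpreterCore` is a build-once, rebind-on-scope-change cache: missing internal variables are created in the incoming scope, the core is constructed on first use, and later calls with a different scope only call `reset_scope`. A hedged usage sketch of repeated launches (`scope1`/`scope2` are illustrative):

```cpp
auto* core = launch_context->InitializeInterpreterCore(place, &scope1);
core->Run({});  // first launch: core constructed, scope1 captured
core = launch_context->InitializeInterpreterCore(place, &scope2);
core->Run({});  // same cached core, rebound to scope2 via reset_scope
```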
......@@ -22,6 +22,7 @@
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/new_executor/interpretercore.h"
#include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/core/ddim.h"
......@@ -74,6 +75,9 @@ class CinnLaunchContext {
framework::ParallelExecutor* InitializePE(const platform::Place& place,
framework::Scope* scope);
framework::InterpreterCore* InitializeInterpreterCore(
const platform::Place& place, framework::Scope* scope);
// explicitly update several environment variables captured
// by callback of execution arguments
void UpdateCapturedEnv(const framework::Scope& scope,
......@@ -132,7 +136,7 @@ class CinnLaunchContext {
// Construct a Paddle ProgramDesc with the CINN runtime
// instructions included in the compiled CINN Program
framework::ProgramDesc BuildCompiledProgram(
std::unique_ptr<framework::ProgramDesc> BuildCompiledProgram(
const framework::ir::Graph& graph,
const CinnCompiledObject& compiled_obj);
......@@ -155,6 +159,10 @@ class CinnLaunchContext {
// the variable scope compiled from cinn
const std::shared_ptr<CinnScope> cinn_scope_;
std::unique_ptr<framework::ProgramDesc> runtime_program_desc_;
std::unique_ptr<framework::InterpreterCore> interpreter_core_;
std::set<std::string> skip_gc_vars_;
// the ir::Graph object converted from the program compiled by CINN
std::unique_ptr<framework::ir::Graph> runtime_graph_;
// a ParallelExecutor to execute the runtime graph
......
......@@ -31,6 +31,7 @@
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(enable_pe_launch_cinn);
DECLARE_bool(enable_interpretercore_launch_cinn);
namespace paddle {
namespace operators {
......@@ -135,12 +136,21 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
// Step 4. Execute the compiled CINN instructions by a PE or
// by the CINN compiled program in sequential order
if (FLAGS_enable_pe_launch_cinn) {
platform::RecordEvent record_event_4(
"Step 4. Execute the runtime graph by PE.");
VLOG(4) << "Execute the runtime graph by PE";
framework::Scope& exec_scope = scope.NewScope();
auto* pe = launch_context->InitializePE(place, &exec_scope);
pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
if (FLAGS_enable_interpretercore_launch_cinn) {
platform::RecordEvent record_event_4(
"Step 4. Execute the runtime program by InterpreterCore.");
VLOG(4) << "Execute the runtime program by InterpreterCore";
auto* interpreter_core = launch_context->InitializeInterpreterCore(
place, const_cast<framework::Scope*>(&scope));
interpreter_core->Run({});
} else {
platform::RecordEvent record_event_4(
"Step 4. Execute the runtime graph by PE.");
VLOG(4) << "Execute the runtime graph by PE";
framework::Scope& exec_scope = scope.NewScope();
auto* pe = launch_context->InitializePE(place, &exec_scope);
pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
}
} else {
platform::RecordEvent record_event_4(
"Step 4. Execute the compiled executable program.");
......
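Note the nesting: `FLAGS_enable_interpretercore_launch_cinn` is only consulted when `FLAGS_enable_pe_launch_cinn` is true, which is why the tests below disable both flags to reach CINN's own runtime. Condensed from the kernel above:

```cpp
if (FLAGS_enable_pe_launch_cinn) {
  if (FLAGS_enable_interpretercore_launch_cinn) {
    // run the compiled program with InterpreterCore
  } else {
    // run the runtime graph with ParallelExecutor
  }
} else {
  // run the CINN compiled program instruction-by-instruction
}
```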
......@@ -36,6 +36,7 @@ USE_OP(cinn_instruction_run);
USE_OP_ITSELF(elementwise_add);
DECLARE_double(eager_delete_tensor_gb);
DECLARE_bool(enable_pe_launch_cinn);
DECLARE_bool(enable_interpretercore_launch_cinn);
DECLARE_bool(enable_cinn_auto_tune);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
......@@ -74,31 +75,34 @@ class TestCinnLaunchOp : public ::testing::Test {
{{}});
}
void RunAndCheck(const platform::Place& place) {
void RunAndCheck(const platform::Place& place, framework::Scope* scope) {
// Run ops and check the computation results
framework::Scope scope;
InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, &scope);
scope.Var(test_op_out_name)->GetMutable<phi::DenseTensor>();
scope.Var(add_op_out_name)->GetMutable<phi::DenseTensor>();
elementwise_add_op->Run(scope, place);
cinn_launch_op->Run(scope, place);
CompareOpResult<float>(scope.GetVar(test_op_out_name),
scope.GetVar(add_op_out_name));
InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, scope);
scope->Var(test_op_out_name)->GetMutable<phi::DenseTensor>();
scope->Var(add_op_out_name)->GetMutable<phi::DenseTensor>();
elementwise_add_op->Run(*scope, place);
cinn_launch_op->Run(*scope, place);
CompareOpResult<float>(scope->GetVar(test_op_out_name),
scope->GetVar(add_op_out_name));
}
void TearDown() override { CinnCompiler::GetInstance()->Clear(); }
};
TEST_F(TestCinnLaunchOp, TestRunCPUInstructionByPE) {
RunAndCheck(platform::CPUPlace());
framework::Scope scope1;
RunAndCheck(platform::CPUPlace(), &scope1);
// the second run on the same place is to check the cache logic
RunAndCheck(platform::CPUPlace());
framework::Scope scope2;
RunAndCheck(platform::CPUPlace(), &scope2);
}
#ifdef PADDLE_WITH_CUDA
TEST_F(TestCinnLaunchOp, TestRunGPUInstructionByPE) {
RunAndCheck(platform::CUDAPlace());
RunAndCheck(platform::CUDAPlace());
framework::Scope scope1;
RunAndCheck(platform::CUDAPlace(), &scope1);
framework::Scope scope2;
RunAndCheck(platform::CUDAPlace(), &scope2);
}
#endif
......@@ -106,9 +110,11 @@ TEST_F(TestCinnLaunchOp, TestRunCPUInstructionByCinnProgram) {
// set FLAGS_enable_pe_launch_cinn=false to switch to use
// default scheduler of CINN to execute the compiled program
FLAGS_enable_pe_launch_cinn = false;
RunAndCheck(platform::CPUPlace());
RunAndCheck(platform::CPUPlace());
FLAGS_enable_interpretercore_launch_cinn = false;
framework::Scope scope1;
RunAndCheck(platform::CPUPlace(), &scope1);
framework::Scope scope2;
RunAndCheck(platform::CPUPlace(), &scope2);
}
#ifdef PADDLE_WITH_CUDA
......@@ -116,8 +122,11 @@ TEST_F(TestCinnLaunchOp, TestRunGPUInstructionByCinnProgram) {
// set FLAGS_enable_pe_launch_cinn=false to switch to use
// default scheduler of CINN to execute the compiled program
FLAGS_enable_pe_launch_cinn = false;
RunAndCheck(platform::CUDAPlace());
RunAndCheck(platform::CUDAPlace());
FLAGS_enable_interpretercore_launch_cinn = false;
framework::Scope scope1;
RunAndCheck(platform::CUDAPlace(), &scope1);
framework::Scope scope2;
RunAndCheck(platform::CUDAPlace(), &scope2);
}
#endif
......@@ -125,8 +134,10 @@ TEST_F(TestCinnLaunchOp, TestRunWithAutoTuneEnabled) {
FLAGS_enable_cinn_auto_tune = true;
// currently only check on cpu, will add a test for gpu after CINN ready
RunAndCheck(platform::CPUPlace());
RunAndCheck(platform::CPUPlace());
framework::Scope scope1;
RunAndCheck(platform::CPUPlace(), &scope1);
framework::Scope scope2;
RunAndCheck(platform::CPUPlace(), &scope2);
}
namespace details {
......
......@@ -978,6 +978,20 @@ PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn,
"It controls whether to execute cinn compiled "
"program with ParallelExecutor");
/*
* CINN related FLAG
* Name: FLAGS_enable_interpretercore_launch_cinn
* Since Version: 2.4
* Value Range: bool, default=true
* Example: FLAGS_enable_interpretercore_launch_cinn=true would execute the CINN
* compiled instructions of a paddle graph with InterpreterCore, otherwise with
* the CINN compiled runtime program in sequential order.
*/
PADDLE_DEFINE_EXPORTED_bool(enable_interpretercore_launch_cinn,
true,
"It controls whether to execute cinn compiled "
"program with InterpreterCore");
/*
* CINN related FLAG
* Name: FLAGS_enable_cinn_auto_tune
......
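For completeness, this is how the tests above opt out at runtime; the flags are process-global gflags, so a `DECLARE_bool` makes them assignable from any translation unit:

```cpp
DECLARE_bool(enable_pe_launch_cinn);
DECLARE_bool(enable_interpretercore_launch_cinn);

// As in the cinn_launch_op tests: fall back to CINN's own sequential
// runtime by disabling both executor paths.
FLAGS_enable_pe_launch_cinn = false;
FLAGS_enable_interpretercore_launch_cinn = false;
```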