diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 4e0359144c2838c2deb1a7ce0bc8be2dfa759fc0..651ebf4c4379940319fd2affd3f264672e5a7f1d 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -622,7 +622,8 @@ void BuildOpFuncList(const platform::Place& place, // NOTE(Ruibiao): We do not encourage directly using scope in OP kernel. // But some OPs do have such behavior (e.g., cinn_launch OP). Here // special treatment for them. - if (op_with_kernel->Type() == "cinn_launch") { + if (op_with_kernel->Type() == "cinn_launch" || + op_with_kernel->Type() == "cinn_instruction_run") { VLOG(6) << "OP(" << op_with_kernel->Type() << ") use scope in kernel, " "so pass a real scope to " diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 4642a684663b64296e373b59b0f241659a24cd73..5ad12071bd3c2d22468cce2155ba97ccec444293 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -108,7 +108,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place, const std::set<std::string>& skip_gc_vars, framework::Scope* scope, bool used_for_jit, - bool used_for_control_flow_op) + bool used_for_control_flow_op, + bool used_for_cinn) : place_(place), block_(block), execution_config_(place, block.OpSize()), @@ -121,9 +122,9 @@ InterpreterCore::InterpreterCore(const platform::Place& place, execution_config_.used_for_jit = used_for_jit; execution_config_.used_for_control_flow_op = used_for_control_flow_op; - execution_config_.create_local_scope = !used_for_jit && - FLAGS_new_executor_use_local_scope && - !used_for_control_flow_op; + execution_config_.create_local_scope = + !used_for_jit && FLAGS_new_executor_use_local_scope && + 
!used_for_control_flow_op && !used_for_cinn; execution_config_.skip_gc_vars = skip_gc_vars; execution_config_.Log(/*log_level=*/8); @@ -425,8 +426,9 @@ void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) { } // set runtime_ctx and infershape_ctx_ - if (instr_node->OpBase()->Type() == "cinn_launch") { // OP use scope in - // kernel + if (instr_node->OpBase()->Type() == "cinn_launch" || + instr_node->OpBase()->Type() == "cinn_instruction_run") { // OP use scope + // in kernel Scope* local_scope = HasLocalScope() ? var_scope_.GetMutableLocalScope() : var_scope_.GetMutableScope(); instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope); diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 80db521d60d04acbef55c16ca0bae20c5d31fd6e..a09942387a95c6474d9a3b56c4c8a5f6d50a0a54 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -46,7 +46,8 @@ class InterpreterCore { const std::set<std::string>& skip_gc_vars, Scope* scope, bool used_for_jit = false, - bool used_for_control_flow_op = false); + bool used_for_control_flow_op = false, + bool used_for_cinn = false); ~InterpreterCore(); diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index e4063436c03363269f8593428c87bef0c374008c..b0b084c6928b0d5b4951c9e5ab985dc7b442f347 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -15,6 +15,7 @@ cc_library( build_strategy device_context parallel_executor + standalone_executor transform_type cinn) diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc index 6469ac3ab212c0f8ff84058141b6bce6133ba3ec..0abb39573cb38ad17f957e1d3fef3015a99b9d7f 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc +++ 
b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc @@ -88,16 +88,15 @@ class TestCinnInstructionRunOp : public ::testing::Test { cinn_launch_op->Run(scope, place); } - void RunAndCheck(const platform::Place& place) { + void RunAndCheck(const platform::Place& place, framework::Scope* scope) { // Run ops and check the computation results - framework::Scope scope; - InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, &scope); - scope.Var(test_op_out_name)->GetMutable<LoDTensor>(); - scope.Var(add_op_out_name)->GetMutable<LoDTensor>(); - elementwise_add_op->Run(scope, place); - cinn_launch_op->Run(scope, place); - CompareOpResult<float>(scope.GetVar(test_op_out_name), - scope.GetVar(add_op_out_name)); + InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, scope); + scope->Var(test_op_out_name)->GetMutable<LoDTensor>(); + scope->Var(add_op_out_name)->GetMutable<LoDTensor>(); + elementwise_add_op->Run(*scope, place); + cinn_launch_op->Run(*scope, place); + CompareOpResult<float>(scope->GetVar(test_op_out_name), + scope->GetVar(add_op_out_name)); } void TearDown() override { CinnCompiler::GetInstance()->Clear(); } @@ -106,17 +105,21 @@ TEST_F(TestCinnInstructionRunOp, CPU) { platform::CPUPlace place; Compile(place); - RunAndCheck(place); + framework::Scope scope1; + RunAndCheck(place, &scope1); // the second run on the same place is to check the cache logic - RunAndCheck(place); + framework::Scope scope2; + RunAndCheck(place, &scope2); } #ifdef PADDLE_WITH_CUDA TEST_F(TestCinnInstructionRunOp, GPU) { platform::CUDAPlace place; Compile(place); - RunAndCheck(place); - RunAndCheck(place); + framework::Scope scope1; + RunAndCheck(place, &scope1); + framework::Scope scope2; + RunAndCheck(place, &scope2); } #endif diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 3b9d7d00edad4db9ca3dd98b8af7ac08d087b399..982fedfe23d8c47d66bb3415e13324eebb6ec674 100644 --- 
a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -88,8 +88,9 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, } // Convert the CINN runtime program to a Paddle graph - runtime_graph_ = std::make_unique<framework::ir::Graph>( - BuildCompiledProgram(graph, compiled_obj)); + runtime_program_desc_ = BuildCompiledProgram(graph, compiled_obj); + runtime_graph_ = + std::make_unique<framework::ir::Graph>(*runtime_program_desc_.get()); auto& outer_varinfo = graph.Get<Name2VarInfoMap>(kMemOptVarInfoFromMainGraph); runtime_graph_->SetNotOwned<Name2VarInfoMap>(kMemOptVarInfoFromMainGraph, &outer_varinfo); @@ -100,6 +101,7 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, // that means it can be erased after graph execution if (!outer_varinfo.count(var_name)) { skip_eager_vars_.emplace_back(var_name); + skip_gc_vars_.insert(var_name); } }; std::for_each( @@ -313,12 +315,14 @@ void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) { }); } -framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( +std::unique_ptr<framework::ProgramDesc> CinnLaunchContext::BuildCompiledProgram( const framework::ir::Graph& graph, const CinnCompiledObject& compiled_obj) { CinnRuntimeProgram* runtime_program = compiled_obj.runtime_program.get(); // Step 0: Create an empty program_desc, there will be only one block - framework::ProgramDesc program_desc; - auto* block = program_desc.MutableBlock(0); + // framework::ProgramDesc program_desc; + std::unique_ptr<framework::ProgramDesc> program_desc( + new framework::ProgramDesc()); + auto* block = program_desc->MutableBlock(0); const std::vector<std::unique_ptr<CinnInstruction>>& instructions = runtime_program->GetRunInstructions(); @@ -445,6 +449,46 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, return parallel_executor_.get(); } +framework::InterpreterCore* CinnLaunchContext::InitializeInterpreterCore( + const platform::Place& place, framework::Scope* scope) { + if (!interpreter_core_ || scope != cached_scope_) { + VLOG(1) << 
"interpreter_core_ is null or scope != cached_scope_: " "interpreter_core_: " << interpreter_core_.get() << "; scope: " << scope << "; cached_scope_: " << cached_scope_; + for (auto&& var_name : internal_var_names_) { + auto* var = scope->FindVar(var_name); + if (var != nullptr) { + continue; + } + framework::InitializeVariable(scope->Var(var_name), + framework::proto::VarType::LOD_TENSOR); + } + if (!interpreter_core_) { + interpreter_core_ = std::make_unique<framework::InterpreterCore>( + place, + runtime_program_desc_->Block(0), + skip_gc_vars_, + scope, + /*used_for_jit*/ false, + /*used_for_control_flow_op*/ false, + /*used_for_cinn*/ true); + } else { + interpreter_core_->reset_scope(scope); + } + UpdateCapturedEnv(*scope, place); + } + for (auto&& var_name : initialized_beforehand_vars_) { + auto* var = scope->GetVar(var_name); + auto* buffer = GetCinnBufferOfVar(var_name); + auto dim = framework::DDim(buffer->dims, buffer->dimensions); + var->GetMutable<LoDTensor>()->Resize(dim); + var->GetMutable<LoDTensor>()->mutable_data( + place, framework::paddle2cinn::TransToPaddleDataType(buffer->type)); + } + return interpreter_core_.get(); +} + cinn_buffer_t* CinnLaunchContext::GetCinnBufferOfVar( const std::string& var_name) { auto it = paddle2cinn_varmap_.find(var_name); diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index e343d1dd9c34842f28500e6122f26942e2554c62..d6ce95de0859d0e3d2e63e0dbaef8ae7bbac5036 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -22,6 +22,7 @@ #include <vector> #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/new_executor/interpretercore.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" @@ -74,6 +75,9 @@ class CinnLaunchContext { framework::ParallelExecutor* InitializePE(const platform::Place& place, framework::Scope* scope); + 
framework::InterpreterCore* InitializeInterpreterCore( + const platform::Place& place, framework::Scope* scope); + // explicitly update several environment variables captured // by callback of execution arguments void UpdateCapturedEnv(const framework::Scope& scope, @@ -132,7 +136,7 @@ // Construct a Paddle ProgramDesc with the CINN runtime // instructions included in the compiled CINN Program - framework::ProgramDesc BuildCompiledProgram( + std::unique_ptr<framework::ProgramDesc> BuildCompiledProgram( const framework::ir::Graph& graph, const CinnCompiledObject& compiled_obj); @@ -155,6 +159,10 @@ // the variable scope compiled from cinn const std::shared_ptr<CinnScope> cinn_scope_; + std::unique_ptr<framework::ProgramDesc> runtime_program_desc_; + std::unique_ptr<framework::InterpreterCore> interpreter_core_; + std::set<std::string> skip_gc_vars_; + // the ir::Graph object converted from the program compiled by CINN std::unique_ptr<framework::ir::Graph> runtime_graph_; // a ParallelExecutor to execute the runtime graph diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index e27ef6079174bec73746b293c6dcd73c0835ba87..dc740e215d69d1b663bf560058028524a2dc1180 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -31,6 +31,7 @@ #include "paddle/fluid/platform/profiler.h" DECLARE_bool(enable_pe_launch_cinn); +DECLARE_bool(enable_interpretercore_launch_cinn); namespace paddle { namespace operators { @@ -135,12 +136,21 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> { // Step 4. Execute the compiled CINN instructions by a PE or // by the CINN compiled program in sequential order if (FLAGS_enable_pe_launch_cinn) { - platform::RecordEvent record_event_4( - "Step 4. 
Execute the runtime graph by PE."); - VLOG(4) << "Execute the runtime graph by PE"; - framework::Scope& exec_scope = scope.NewScope(); - auto* pe = launch_context->InitializePE(place, &exec_scope); - pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); + if (FLAGS_enable_interpretercore_launch_cinn) { + platform::RecordEvent record_event_4( + "Step 4. Execute the runtime program by InterpreterCore."); + VLOG(4) << "Execute the runtime program by InterpreterCore"; + auto* interpreter_core = launch_context->InitializeInterpreterCore( + place, const_cast<framework::Scope*>(&scope)); + interpreter_core->Run({}); + } else { + platform::RecordEvent record_event_4( + "Step 4. Execute the runtime graph by PE."); + VLOG(4) << "Execute the runtime graph by PE"; + framework::Scope& exec_scope = scope.NewScope(); + auto* pe = launch_context->InitializePE(place, &exec_scope); + pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); + } } else { platform::RecordEvent record_event_4( "Step 4. Execute the compiled executable program."); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index dad26dc637400ca80b727e35b41b03ff7a3faf05..e10f4a85a9a6affe04c288768aa43559095eb8c6 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -36,6 +36,7 @@ USE_OP(cinn_instruction_run); USE_OP_ITSELF(elementwise_add); DECLARE_double(eager_delete_tensor_gb); DECLARE_bool(enable_pe_launch_cinn); +DECLARE_bool(enable_interpretercore_launch_cinn); DECLARE_bool(enable_cinn_auto_tune); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); @@ -74,31 +75,34 @@ class TestCinnLaunchOp : public ::testing::Test { {{}}); } - void RunAndCheck(const platform::Place& place) { + void RunAndCheck(const platform::Place& place, framework::Scope* scope) { // Run ops and check the computation results - framework::Scope scope; - InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, &scope); - 
scope.Var(test_op_out_name)->GetMutable<LoDTensor>(); - scope.Var(add_op_out_name)->GetMutable<LoDTensor>(); - elementwise_add_op->Run(scope, place); - cinn_launch_op->Run(scope, place); - CompareOpResult<float>(scope.GetVar(test_op_out_name), - scope.GetVar(add_op_out_name)); + InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, scope); + scope->Var(test_op_out_name)->GetMutable<LoDTensor>(); + scope->Var(add_op_out_name)->GetMutable<LoDTensor>(); + elementwise_add_op->Run(*scope, place); + cinn_launch_op->Run(*scope, place); + CompareOpResult<float>(scope->GetVar(test_op_out_name), + scope->GetVar(add_op_out_name)); } void TearDown() override { CinnCompiler::GetInstance()->Clear(); } }; TEST_F(TestCinnLaunchOp, TestRunCPUInstructionByPE) { - RunAndCheck(platform::CPUPlace()); + framework::Scope scope1; + RunAndCheck(platform::CPUPlace(), &scope1); // the second run on the same place is to check the cache logic - RunAndCheck(platform::CPUPlace()); + framework::Scope scope2; + RunAndCheck(platform::CPUPlace(), &scope2); } #ifdef PADDLE_WITH_CUDA TEST_F(TestCinnLaunchOp, TestRunGPUInstructionByPE) { - RunAndCheck(platform::CUDAPlace()); - RunAndCheck(platform::CUDAPlace()); + framework::Scope scope1; + RunAndCheck(platform::CUDAPlace(), &scope1); + framework::Scope scope2; + RunAndCheck(platform::CUDAPlace(), &scope2); } #endif @@ -106,9 +110,11 @@ TEST_F(TestCinnLaunchOp, TestRunCPUInstructionByCinnProgram) { // set FLAGS_enable_pe_launch_cinn=false to switch to use // default scheduler of CINN to execute the compiled program FLAGS_enable_pe_launch_cinn = false; - - RunAndCheck(platform::CPUPlace()); - RunAndCheck(platform::CPUPlace()); + FLAGS_enable_interpretercore_launch_cinn = false; + framework::Scope scope1; + RunAndCheck(platform::CPUPlace(), &scope1); + framework::Scope scope2; + RunAndCheck(platform::CPUPlace(), &scope2); } #ifdef PADDLE_WITH_CUDA @@ -116,8 +122,11 @@ TEST_F(TestCinnLaunchOp, TestRunGPUInstructionByCinnProgram) { // set FLAGS_enable_pe_launch_cinn=false to switch to use // default scheduler 
of CINN to execute the compiled program FLAGS_enable_pe_launch_cinn = false; - RunAndCheck(platform::CUDAPlace()); - RunAndCheck(platform::CUDAPlace()); + FLAGS_enable_interpretercore_launch_cinn = false; + framework::Scope scope1; + RunAndCheck(platform::CUDAPlace(), &scope1); + framework::Scope scope2; + RunAndCheck(platform::CUDAPlace(), &scope2); } #endif @@ -125,8 +134,10 @@ TEST_F(TestCinnLaunchOp, TestRunWithAutoTuneEnabled) { FLAGS_enable_cinn_auto_tune = true; // currently only check on cpu, will add a test for gpu after CINN ready - RunAndCheck(platform::CPUPlace()); - RunAndCheck(platform::CPUPlace()); + framework::Scope scope1; + RunAndCheck(platform::CPUPlace(), &scope1); + framework::Scope scope2; + RunAndCheck(platform::CPUPlace(), &scope2); } namespace details { diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 29c9c63e7fd22ac90f134c876148a2d845071b7c..6850d91b8dd4519453096363634fdc6ca92a979d 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -978,6 +978,20 @@ PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn, "It controls whether to execute cinn compiled " "program with ParallelExecutor"); +/* + * CINN related FLAG + * Name: FLAGS_enable_interpretercore_launch_cinn + * Since Version: 2.4 + * Value Range: bool, default=true + * Example: FLAGS_enable_interpretercore_launch_cinn=true would execute the CINN + * compiled instructions of a paddle graph with InterpreterCore, otherwise with + * the CINN compiled runtime program in sequential order. + */ +PADDLE_DEFINE_EXPORTED_bool(enable_interpretercore_launch_cinn, + true, + "It controls whether to execute cinn compiled " + "program with InterpreterCore"); + /* * CINN related FLAG * Name: FLAGS_enable_cinn_auto_tune