From 167d511f074633992764f51c3be416a0d9169ff7 Mon Sep 17 00:00:00 2001 From: TeFeng Chen Date: Thu, 3 Mar 2022 19:01:38 +0800 Subject: [PATCH] cinn_launch_op: switch to execution by PE (#39911) * switch to PE execution in cinn launch * fix outer variables erased * skip the map bug temporarily for test * temporary solution for batch_norm bug * update comment * fix compile error * cinn_instruction_run_op_test: update code to skip external alloc/free instructions generated --- .../framework/paddle2cinn/cinn_compiler.cc | 1 - paddle/fluid/operators/cinn/CMakeLists.txt | 6 +-- .../cinn/cinn_instruction_run_op_test.cc | 2 +- .../operators/cinn/cinn_launch_context.cc | 46 +++++++++++++++++-- .../operators/cinn/cinn_launch_context.h | 10 ++++ paddle/fluid/operators/cinn/cinn_launch_op.h | 21 ++++----- .../operators/cinn/cinn_launch_op_test.cc | 4 ++ 7 files changed, 69 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 706815185a..c015e90f71 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -241,7 +241,6 @@ std::unique_ptr CinnCompiler::CompileGraph( std::make_unique(target, scope, cinn_graph); GraphCompiler::CompileOptions options; options.with_instantiate_variables = false; - options.with_buffer_handle_instruction_inserted = true; auto compiled_res = graph_compiler->Build(options, std::move(fetch_ids), stream); auto compiled_obj = std::make_unique(); diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index f1247ebdf2..2092f65212 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -1,9 +1,9 @@ include(operators) cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) -cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph 
build_strategy parallel_executor cinn) +cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor cinn) -SET(CINN_OP_DEPS string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) +SET(CINN_OP_DEPS parallel_executor string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) register_operators(DEPS ${CINN_OP_DEPS}) if (WITH_TESTING) @@ -11,7 +11,7 @@ if (WITH_TESTING) set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN") SET(CINN_RUN_ENVIRONMENT "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda") - cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op elementwise_add_op) + cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op gflags) set_tests_properties(cinn_launch_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT "${CINN_RUN_ENVIRONMENT}") cc_test(cinn_instruction_run_op_test SRCS cinn_instruction_run_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op) diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc index 7c4bdc09a5..2afee35112 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc @@ -50,7 +50,7 @@ TEST(CinnInstructionOpTest, TestWithElementwiseAdd) { auto cinn_instruction_run_op = paddle::framework::OpRegistry::CreateOp( "cinn_instruction_run", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}}, - {{"cached_index", 0}, {"instruction_index", 1}}); + {{"cached_index", 0}, {"instruction_index", 0}}); auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp( "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {add_op_out_name}}}, 
{{}}); diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 0a21d937aa..b76dd60409 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -31,6 +31,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/core/ddim.h" @@ -90,9 +91,30 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, // Convert the CINN runtime program to a Paddle graph runtime_graph_ = std::make_unique( BuildCompiledProgram(graph, compiled_obj)); - runtime_graph_->SetNotOwned( - kMemOptVarInfoFromMainGraph, - &graph.Get(kMemOptVarInfoFromMainGraph)); + auto& outer_varinfo = graph.Get(kMemOptVarInfoFromMainGraph); + runtime_graph_->SetNotOwned(kMemOptVarInfoFromMainGraph, + &outer_varinfo); + // collect skip_eager_vars + skip_eager_vars_.reserve(input_var_names.size() + output_var_names.size()); + auto add_skip_var_fn = [&outer_varinfo, this](const std::string& var_name) { + // if a var exists at outer_varinfo map, + // that means it can be erased after graph execution + if (!outer_varinfo.count(var_name)) { + skip_eager_vars_.emplace_back(var_name); + } + }; + std::for_each(input_var_names.begin(), input_var_names.end(), + add_skip_var_fn); + std::for_each(output_var_names.begin(), output_var_names.end(), + add_skip_var_fn); + VLOG(4) << string::Sprintf( + "Distribution of variables in the graph compiled:" + "input[%lu],internal[%lu],output[%lu]," + "outer_eager_deletion[%lu],skip_eager_deletion[%lu]," + "initialized_beforehand[%lu]", + input_var_names.size(), internal_var_names_.size(), + output_var_names.size(), outer_varinfo.size(), skip_eager_vars_.size(), + 
initialized_beforehand_vars_.size()); } void CinnLaunchContext::BuildVarNameMap( @@ -288,6 +310,7 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( // are set by values of the corresponding compiled tensors, // including the in/out variables where the equiality between their tensors // and the CINN compiled ones is verified in corresponding cinn_launch_op. + std::unordered_set has_refer_vars; for (auto&& arg : cinn_argument_names_) { const std::string& var_name = cinn2paddle_varmap_.at(arg); framework::VarDesc* var_desc = block->Var(var_name); @@ -298,6 +321,7 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( auto* ori_desc = res->second; var_desc->SetPersistable(ori_desc->Persistable()); var_desc->SetIsParameter(ori_desc->IsParameter()); + has_refer_vars.insert(var_name); } auto cinn_tensor = GetCinnTensorOfVar(var_name); @@ -331,6 +355,12 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( auto* ins = instructions.at(ins_idx).get(); auto in_args = trans_and_pack_args_fn(ins->GetInArgs()); auto out_args = trans_and_pack_args_fn(ins->GetOutArgs()); + for (auto&& var_name : in_args) { + if (!has_refer_vars.count(var_name)) { + initialized_beforehand_vars_.emplace_back(var_name); + } + } + has_refer_vars.insert(out_args.begin(), out_args.end()); auto* op_desc = block->AppendOp(); op_desc->SetType("cinn_instruction_run"); @@ -348,16 +378,26 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, framework::Scope* scope) { if (!parallel_executor_) { framework::details::ExecutionStrategy exec_strategy; + exec_strategy.num_threads_ = 1; + exec_strategy.use_device_ = platform::Place2DeviceType(place); framework::details::BuildStrategy build_strategy; parallel_executor_ = std::make_unique( place, scope, exec_strategy, build_strategy, runtime_graph_.get()); } // update the scope bound to an OpHandle and rebuild temporary variables + VLOG(4) << "Reset scope and initialize temporary variables"; 
std::unordered_map scope_map = { {parallel_executor_->GetLocalScopes().front(), scope}}; parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map); parallel_executor_->PrepareVariables(scope); + for (auto&& var_name : initialized_beforehand_vars_) { + auto* var = scope->GetVar(var_name); + auto* buffer = GetCinnBufferOfVar(var_name); + auto dim = framework::DDim(buffer->dims, buffer->dimensions); + var->GetMutable()->Resize(dim); + var->GetMutable()->mutable_data(place); + } return parallel_executor_.get(); } diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index a4d613ea61..ed5e4383d8 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -86,6 +86,11 @@ class CinnLaunchContext { void CheckTensorEquivalent(const std::string& var_name, const framework::LoDTensor& paddle_tensor); + // Return the name list of variables skipped eager deletion + const std::vector& GetSkipEagerVars() const { + return skip_eager_vars_; + } + // Return internal variable names list const std::unordered_set& GetInternalVarNames() const { return internal_var_names_; @@ -143,6 +148,9 @@ class CinnLaunchContext { std::unordered_set internal_var_names_; // the names of the cinn arguments used in compiled executable program std::unordered_set cinn_argument_names_; + // TODO(CtfGo): remove this list after fixing batch_norm bug + // due to duplicate association in the same variable. 
+ std::vector initialized_beforehand_vars_; // the variable scope compiled from cinn const std::shared_ptr cinn_scope_; @@ -150,6 +158,8 @@ class CinnLaunchContext { std::unique_ptr runtime_graph_; // a ParallelExecutor to execute the runtime graph std::unique_ptr parallel_executor_; + // the name list of skip_eager_vars in runtime + std::vector skip_eager_vars_; // because a cinn_pod_value_t does not own a cinn_buffer_t object, // an extra stroage is necessary to keep those objects and they can diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index cf3b98c667..5263aae03e 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -103,8 +103,8 @@ class CinnLaunchOpKernel : public framework::OpKernel { details::DebugCinnCompiledResult(cinn_compiled_object); auto* launch_context = cinn_compiled_object.launch_context.get(); - // Step 3. Prepare arguments needed for the compiled executable program. - launch_context->UpdateCapturedEnv(scope, place); + // Step 3. check the computational consistency of the subgraph + // before and after the compilation // 3.1 Input variables: tensors of input variables have // been initialized before graph compiled, just check the // equiality between tensors of paddle and cinn. @@ -120,20 +120,15 @@ class CinnLaunchOpKernel : public framework::OpKernel { *inputs_name2tensor.at(var_name)); } - // 3.2 Output variables: the output variables will be initialized - // and allocated buffer in callbacks which are defined in the - // external_malloc/free interface of cinn_buffer_t - // in their corresponding arguments. - // 3.3 Internal variables: A temporary scope is created in - // UpdateCapturedEnv to keep the internal variables and - // they are also initialized through callbacks - // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. details::SetCinnRuntimeFlags(); - // Step 5. 
Launch CINN to execute the compiled executable program - VLOG(4) << "Run Cinn compiled executable program with stream: " << stream; - details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); + // Step 5. use PE to execute the compiled CINN instructions + // in nodes of the runtime graph + VLOG(4) << "Execute the runtime graph by PE"; + framework::Scope& exec_scope = scope.NewScope(); + auto* pe = launch_context->InitializePE(place, &exec_scope); + pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); VLOG(4) << "CinnLaunchOp launch execution done."; } }; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index f5b6161ff3..460d417e61 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" @@ -27,7 +28,9 @@ limitations under the License. */ #include "paddle/phi/core/ddim.h" USE_OP(cinn_launch); +USE_OP(cinn_instruction_run); USE_OP_ITSELF(elementwise_add); +DECLARE_double(eager_delete_tensor_gb); namespace paddle::operators { @@ -61,6 +64,7 @@ TEST(CinnLaunchOpTest, TestWithElementwiseAdd) { CompareOpResult(scope.GetVar(test_op_out_name), scope.GetVar(add_op_out_name)); }; + FLAGS_eager_delete_tensor_gb = -1; // CPU run_and_check_fn(platform::CPUPlace()); -- GitLab