From 4d042a83732b8c2d4ff9abfd3e103b6f0799831d Mon Sep 17 00:00:00 2001 From: TeFeng Chen Date: Thu, 24 Feb 2022 17:14:46 +0800 Subject: [PATCH] build a Paddle Graph from CINN compiled program for execution with PE (#39724) * build a Paddle Graph from CINN compiled program for execution with PE * update names of some variables * fix random fail in build_cinn_pass_test and update some comments * fix compiler error by merging phi pr --- .../framework/paddle2cinn/build_cinn_pass.cc | 9 +- .../framework/paddle2cinn/build_cinn_pass.h | 7 + .../paddle2cinn/build_cinn_pass_test.cc | 4 +- .../framework/paddle2cinn/cinn_compiler.cc | 6 +- paddle/fluid/operators/cinn/CMakeLists.txt | 4 +- .../operators/cinn/cinn_launch_context.cc | 249 +++++++++++++----- .../operators/cinn/cinn_launch_context.h | 94 ++++--- .../cinn/cinn_launch_context_test.cc | 241 ++++++++++++----- paddle/fluid/operators/cinn/cinn_launch_op.h | 74 ++---- paddle/fluid/operators/cinn/test_helper.h | 12 + 10 files changed, 477 insertions(+), 223 deletions(-) diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index d55950064a4..6e55727c8bf 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -44,11 +44,6 @@ DECLARE_string(deny_cinn_ops); namespace paddle { namespace framework { - -namespace ir { -class MemOptVarInfo; -} // namespace ir - namespace paddle2cinn { using framework::ir::Graph; @@ -398,9 +393,7 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, kNoNeedBufferFeeds, no_need_buffer_feeds.release()); // initialize empty map for kMemOptVarInfoFromMainGraph attribute, // it will be filled on the share_mem_opt_info_to_subgraph pass - subgraph->GetOrInit>>( - kMemOptVarInfoFromMainGraph); + subgraph->GetOrInit(kMemOptVarInfoFromMainGraph); return subgraph; } diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h 
b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h index 8cb920831cc..a902eacde82 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h @@ -18,6 +18,10 @@ limitations under the License. */ namespace paddle { namespace framework { +namespace ir { +class MemOptVarInfo; +} // namespace ir + namespace paddle2cinn { constexpr char kCinnLaunchOp[] = "cinn_launch"; @@ -27,6 +31,9 @@ constexpr char kInternalVars[] = "InternalVars"; constexpr char kOutputVars[] = "OutputVars"; constexpr char kMemOptVarInfoFromMainGraph[] = "mem_opt_var_info_from_main_graph"; +using Name2VarInfoMap = + std::unordered_map>; // A pass named BuildCinnPass, the function of this pass is: // diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index 919fc60d4cb..bf9d1baaf39 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -255,7 +255,9 @@ TEST(BuildCinnPassTest, AllOpSupportCinn) { ASSERT_EQ( std::unordered_set(cinn_op->inputs.begin(), cinn_op->inputs.end()), std::unordered_set({v0, v1, v2, v4})); - ASSERT_EQ(cinn_op->outputs, std::vector({v6, v7})); + ASSERT_EQ(std::unordered_set(cinn_op->outputs.begin(), + cinn_op->outputs.end()), + std::unordered_set({v6, v7})); ASSERT_EQ(v1->outputs, std::vector({cinn_op})); ASSERT_EQ(v6->inputs, std::vector({cinn_op})); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 716cd85e711..706815185a1 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -248,10 +248,10 @@ std::unique_ptr CinnCompiler::CompileGraph( *compiled_obj = {std::move(graph_compiler), std::move(compiled_res.runtime_program), scope, symbol.var_model_to_program_map()}; - compiled_obj->launch_context = - 
std::make_unique( - compiled_obj->paddle2cinn_varmap, compiled_obj->scope); compiled_obj->cached_index = compiled_num; + compiled_obj->launch_context = + std::make_unique(graph, + *compiled_obj); return compiled_obj; } diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index a2fc080faad..f1247ebdf23 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -1,13 +1,13 @@ include(operators) cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) -cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope cinn) +cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy parallel_executor cinn) SET(CINN_OP_DEPS string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) register_operators(DEPS ${CINN_OP_DEPS}) if (WITH_TESTING) - cc_test(cinn_launch_context_test SRCS cinn_launch_context_test.cc DEPS ddim lod_tensor scope cinn_launch_context) + cc_test(cinn_launch_context_test SRCS cinn_launch_context_test.cc DEPS ddim lod_tensor scope proto_desc graph cinn_launch_context cinn_instruction_run_op cinn) set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN") SET(CINN_RUN_ENVIRONMENT "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda") diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 0b677f79f7f..0a21d937aa1 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -17,22 +17,39 @@ #include #include #include +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/instruction.h" #include "cinn/hlir/framework/scope.h" #include "cinn/hlir/framework/tensor.h" #include "cinn/runtime/cinn_runtime.h" +#include 
"paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/core/ddim.h" namespace paddle { namespace operators::details { -using LoDTensor = framework::LoDTensor; +using framework::Scope; +using framework::LoDTensor; +using framework::ParallelExecutor; +using CinnInstruction = ::cinn::hlir::framework::Instruction; +using CinnRuntimeProgram = ::cinn::hlir::framework::Program; +using framework::paddle2cinn::Name2VarInfoMap; +using framework::paddle2cinn::kMemOptVarInfoFromMainGraph; -CinnLaunchContext::CinnLaunchContext( - const std::unordered_map& paddle2cinn_varmap, - const std::shared_ptr& cinn_scope) - : cinn_scope_(cinn_scope) { - // generate all names of the cinn execution arguments +CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, + const CinnCompiledObject& compiled_obj) + : cinn_scope_(compiled_obj.scope) { + // collect all names of the CINN execution arguments auto var_names = cinn_scope_->var_names(); cinn_argument_names_.reserve(var_names.size()); std::transform( @@ -40,7 +57,42 @@ CinnLaunchContext::CinnLaunchContext( std::inserter(cinn_argument_names_, cinn_argument_names_.end()), [](const auto& name_view) { return std::string(name_view.data()); }); // build name map between the original variables and compiled ones - BuildVarNameMap(paddle2cinn_varmap, cinn_argument_names_); + BuildVarNameMap(compiled_obj.paddle2cinn_varmap, cinn_argument_names_); + + const auto& input_var_names = + 
graph.Get>(framework::paddle2cinn::kInputVars); + const auto& output_var_names = + graph.Get>(framework::paddle2cinn::kOutputVars); + internal_var_names_ = + ExtractInternalVarNames(input_var_names, output_var_names); + // check completeness of output variables in compiled result + for (auto&& var_name : output_var_names) { + PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, + platform::errors::PreconditionNotMet( + "Variable(%s) not applied in CINN", var_name)); + } + + // initialize all execution arguments + InitializeArguments(); + // DEPRECATED(CtfGo): following callback assignment will be deprecated soon + for (auto&& var_name : input_var_names) { + if (IsVariableUsed(var_name)) { + AssignExternalVariable(var_name); + } + } + for (auto&& var_name : output_var_names) { + AssignExternalVariable(var_name); + } + for (auto&& var_name : internal_var_names_) { + AssignInternalVariable(var_name); + } + + // Convert the CINN runtime program to a Paddle graph + runtime_graph_ = std::make_unique( + BuildCompiledProgram(graph, compiled_obj)); + runtime_graph_->SetNotOwned( + kMemOptVarInfoFromMainGraph, + &graph.Get(kMemOptVarInfoFromMainGraph)); } void CinnLaunchContext::BuildVarNameMap( @@ -94,21 +146,15 @@ void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope, << std::addressof(place); } -bool CinnLaunchContext::IsArgumentsInitialized() const { - if (hold_buffers_.empty() || name2argument_.empty()) { - return false; - } - return true; -} - bool CinnLaunchContext::IsVariableUsed(const std::string& var_name) const { return paddle2cinn_varmap_.count(var_name) > 0; } -CinnTensor CinnLaunchContext::GetCinnTensor(const std::string& arg_name) { - PADDLE_ENFORCE_GT(cinn_argument_names_.count(arg_name), 0, - platform::errors::InvalidArgument( - "Variable(%s) not found in cinn scope.", arg_name)); +CinnTensor CinnLaunchContext::GetCinnTensorOfVar(const std::string& var_name) { + PADDLE_ENFORCE_EQ( + IsVariableUsed(var_name), true, + 
platform::errors::NotFound("Variable(%s) not applied in CINN", var_name)); + const auto& arg_name = paddle2cinn_varmap_.at(var_name); return cinn_scope_->GetTensor(arg_name); } @@ -132,10 +178,13 @@ std::unordered_set CinnLaunchContext::ExtractInternalVarNames( return remain_var_names; } -void CinnLaunchContext::CheckTensorEquivalent(const std::string& var_name, - const LoDTensor& paddle_tensor, - const CinnTensor& cinn_tensor) { +void CinnLaunchContext::CheckTensorEquivalent( + const std::string& var_name, const framework::LoDTensor& paddle_tensor) { + PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, + platform::errors::InvalidArgument( + "Variable(%s) not applied in cinn", var_name)); // check dimension + auto cinn_tensor = GetCinnTensorOfVar(var_name); auto cinn_dims = phi::make_ddim(cinn_tensor->shape().data()); PADDLE_ENFORCE_EQ(paddle_tensor.dims(), cinn_dims, platform::errors::PreconditionNotMet( @@ -146,22 +195,28 @@ void CinnLaunchContext::CheckTensorEquivalent(const std::string& var_name, // TODO(CtfGo): check the underlying data type after CINN ready } +void CinnLaunchContext::InitializeArguments() { + for (auto&& arg : cinn_argument_names_) { + auto cinn_buffer = std::make_unique(); + auto cinn_tensor = GetCinnTensorOfVar(cinn2paddle_varmap_.at(arg)); + // assign dimensions with corresponding compiled tensor + cinn_buffer->resize(cinn_tensor->shape().data().data(), + cinn_tensor->shape().data().size()); + VLOG(4) << string::Sprintf( + "Append an argument:name(%s),dims(%s),argument size:(%lu)", arg, + framework::DDim(cinn_buffer->dims, cinn_buffer->dimensions).to_str(), + name2argument_.size()); + name2argument_.emplace(arg, cinn_buffer.get()); + hold_buffers_.emplace_back(std::move(cinn_buffer)); + } +} + void CinnLaunchContext::AssignExternalVariable(const std::string& var_name) { PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, platform::errors::InvalidArgument( "Variable(%s) not applied in cinn", var_name)); - const auto& cinn_arg_name = 
paddle2cinn_varmap_.at(var_name); - - const auto& paddle_tensor = cached_scope_->GetVar(var_name)->Get(); - CinnTensor cinn_tensor = GetCinnTensor(cinn_arg_name); - if (paddle_tensor.IsInitialized()) { - CheckTensorEquivalent(var_name, paddle_tensor, cinn_tensor); - } - - auto cinn_buffer = std::make_unique(); - // assign dimensions and alloc/free callback of cinn_buffer_t - cinn_buffer->resize(cinn_tensor->shape().data().data(), - cinn_tensor->shape().data().size()); + auto* cinn_buffer = GetCinnBufferOfVar(var_name); + // assign external malloc/free callbacks of cinn_buffer_t cinn_buffer->external_malloc = new std::function( [this, var_name](void* ctx, cinn_buffer_t* buffer) { auto* tensor = cached_scope_->GetVar(var_name)->GetMutable(); @@ -177,22 +232,14 @@ void CinnLaunchContext::AssignExternalVariable(const std::string& var_name) { // Do nothing return 0; }); - - return AppendArgument(cinn_arg_name, std::move(cinn_buffer)); } void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) { PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, platform::errors::InvalidArgument( "Variable(%s) not applied in cinn", var_name)); - const auto& cinn_arg_name = paddle2cinn_varmap_.at(var_name); - - CinnTensor cinn_tensor = GetCinnTensor(cinn_arg_name); - auto cinn_buffer = std::make_unique(); - // assign dimensions and alloc/free callback of cinn_buffer_t - cinn_buffer->resize(cinn_tensor->shape().data().data(), - cinn_tensor->shape().data().size()); - + auto* cinn_buffer = GetCinnBufferOfVar(var_name); + // assign external malloc/free callbacks of cinn_buffer_t cinn_buffer->external_malloc = new std::function( [this, var_name](void* ctx, cinn_buffer_t* buffer) { auto* tensor = @@ -212,30 +259,106 @@ void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) { tensor->clear(); return 0; }); - return AppendArgument(cinn_arg_name, std::move(cinn_buffer)); } -void CinnLaunchContext::AppendArgument( - const std::string& arg_name, 
std::unique_ptr&& buffer) { - name2argument_.emplace(arg_name, buffer.get()); - hold_buffers_.emplace_back(std::move(buffer)); - VLOG(4) << string::Sprintf( - "Append an argument:name(%s),dims(%s),argument size:(%lu)", arg_name, - framework::DDim(buffer->dims, buffer->dimensions).to_str(), - name2argument_.size()); +framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( + const framework::ir::Graph& graph, const CinnCompiledObject& compiled_obj) { + CinnRuntimeProgram* runtime_program = compiled_obj.runtime_program.get(); + // Step 0: Create an empty program_desc, there will be only one block + framework::ProgramDesc program_desc; + auto* block = program_desc.MutableBlock(0); + const std::vector>& instructions = + runtime_program->GetRunInstructions(); + + // build a map that links the name of a Paddle variable to its VarDesc + const std::unordered_set& nodes = graph.Nodes(); + std::unordered_map original_vardescs; + for (auto* node : nodes) { + if (node->IsVar() && node->Var()) { + original_vardescs.emplace(node->Name(), node->Var()); + } + } + + // Step 1: Create a VarDesc for each execution argument: + // (1) For those variables that are input or output variables of the + // original subgraph, there must exist an original VarDesc, so + // we copy some useful info(such as IsParameter,Persistable) + // to the new VarDesc. + // (2) For all variables, the shape, data type of their VarDescs + // are set by values of the corresponding compiled tensors, + // including the in/out variables where the equality between their tensors + // and the CINN compiled ones is verified in corresponding cinn_launch_op. 
+ for (auto&& arg : cinn_argument_names_) { + const std::string& var_name = cinn2paddle_varmap_.at(arg); + framework::VarDesc* var_desc = block->Var(var_name); + var_desc->SetType(framework::proto::VarType::LOD_TENSOR); + + auto res = original_vardescs.find(var_name); + if (res != original_vardescs.end()) { + auto* ori_desc = res->second; + var_desc->SetPersistable(ori_desc->Persistable()); + var_desc->SetIsParameter(ori_desc->IsParameter()); + } + + auto cinn_tensor = GetCinnTensorOfVar(var_name); + // TODO(CtfGo): set the corresponding data type after CINN ready, + // currently set as FP32 in default + var_desc->SetDataType(framework::proto::VarType::FP32); + var_desc->SetShape(std::vector(cinn_tensor->shape().data().begin(), + cinn_tensor->shape().data().end())); + } + + // transform names of the input or output arguments of a CINN instruction + // to the corresponding Paddle variable names, and repack them as one vector + auto trans_and_pack_args_fn = + [this](const std::vector>& cinn_args_array) { + std::vector var_names; + for (auto&& cinn_args : cinn_args_array) { + for (auto&& arg : cinn_args) { + auto res = cinn2paddle_varmap_.find(arg); + PADDLE_ENFORCE_NE( + res, cinn2paddle_varmap_.end(), + platform::errors::NotFound("Argument(%s) not found", arg)); + var_names.emplace_back(res->second); + } + } + return var_names; + }; + + // Step 2: create a VarDesc of cinn_instruction_run op for + // each CINN instruction and append it to the main block + for (auto ins_idx = 0; ins_idx < instructions.size(); ++ins_idx) { + auto* ins = instructions.at(ins_idx).get(); + auto in_args = trans_and_pack_args_fn(ins->GetInArgs()); + auto out_args = trans_and_pack_args_fn(ins->GetOutArgs()); + + auto* op_desc = block->AppendOp(); + op_desc->SetType("cinn_instruction_run"); + op_desc->SetInput(kX, in_args); + op_desc->SetOutput(kOutputs, out_args); + op_desc->SetAttr(kCachedIndex, + {static_cast(compiled_obj.cached_index)}); + op_desc->SetAttr(kInstructionIndex, 
{static_cast(ins_idx)}); + } + + return program_desc; } -const std::map& -CinnLaunchContext::FinalizeArguments() const { - // Check all execution parameters are assigned valued. - std::for_each(cinn_argument_names_.begin(), cinn_argument_names_.end(), - [this](const auto& arg_name) { - PADDLE_ENFORCE_GT( - name2argument_.count(arg_name), 0, - platform::errors::NotFound( - "Argument(%s) is missed for execution", arg_name)); - }); - return name2argument_; +ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, + framework::Scope* scope) { + if (!parallel_executor_) { + framework::details::ExecutionStrategy exec_strategy; + framework::details::BuildStrategy build_strategy; + parallel_executor_ = std::make_unique( + place, scope, exec_strategy, build_strategy, runtime_graph_.get()); + } + + // update the scope bound to an OpHandle and rebuild temporary variables + std::unordered_map scope_map = { + {parallel_executor_->GetLocalScopes().front(), scope}}; + parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map); + parallel_executor_->PrepareVariables(scope); + return parallel_executor_.get(); } cinn_buffer_t* CinnLaunchContext::GetCinnBufferOfVar( diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index 502e6a92dc1..a4d613ea618 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -21,7 +21,7 @@ #include #include #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" @@ -35,10 +35,25 @@ class Program; } // namespace cinn::hlir::framework namespace paddle { +namespace framework { +class ProgramDesc; +class Scope; +class VarDesc; + +namespace ir { +class Graph; +} // namespace ir + +namespace paddle2cinn { +class CinnCompiledObject; +} // namespace 
paddle2cinn +} // namespace framework + namespace operators::details { using CinnTensor = ::cinn::hlir::framework::Tensor; using CinnScope = ::cinn::hlir::framework::Scope; +using CinnCompiledObject = framework::paddle2cinn::CinnCompiledObject; // This class is used to cache some reusable data among repeated // executions for efficiency and it also provides easy interfaces @@ -49,58 +64,71 @@ using CinnScope = ::cinn::hlir::framework::Scope; // Variable while a CINN variable is called an Argument. class CinnLaunchContext { public: - explicit CinnLaunchContext( - const std::unordered_map& paddle2cinn_varmap, - const std::shared_ptr& cinn_scope); + explicit CinnLaunchContext(const framework::ir::Graph& graph, + const CinnCompiledObject& compiled_obj); + + // Initialize a ParallelExecutor to execute the runtime graph, + // it will be constructed in the first call, and just update + // the execution scope in the following usage. + framework::ParallelExecutor* InitializePE(const platform::Place& place, + framework::Scope* scope); // explicitly update several environment variables captured // by callback of execution arguments void UpdateCapturedEnv(const framework::Scope& scope, const platform::Place& place); - // Return whether execution arguments has been initialized - bool IsArgumentsInitialized() const; - // Return whether a Paddle variable used in cinn execution bool IsVariableUsed(const std::string& var_name) const; - // Assign tensor buffer to input or output variables - void AssignExternalVariable(const std::string& var_name); - - // Assign tensor buffer to internal variables - void AssignInternalVariable(const std::string& var_name); + // Check the equality in type and dimension between the tensor + // in Paddle and the compiled tensor returned by CINN of a same variable + void CheckTensorEquivalent(const std::string& var_name, + const framework::LoDTensor& paddle_tensor); - // Extract internal variable names from all applied variables - // in execution by 
excluding the input and output variables - std::unordered_set ExtractInternalVarNames( - const std::vector& input_var_names, - const std::vector& output_var_names); + // Return internal variable names list + const std::unordered_set& GetInternalVarNames() const { + return internal_var_names_; + } // Finalize all execution arguments and return the name->argument map - const std::map& FinalizeArguments() const; + const std::map& FinalizeArguments() const { + return name2argument_; + } // Return the cinn_buffer_t* of a specific variable cinn_buffer_t* GetCinnBufferOfVar(const std::string& var_name); private: - // Get CinnTensor with CINN argument name - CinnTensor GetCinnTensor(const std::string& arg_name); + // Get corresponding compiled tensor of a Paddle variable name + CinnTensor GetCinnTensorOfVar(const std::string& var_name); + // Build the name maps of paddle->cinn and cinn->paddle // in reverse for all variables used in cinn execution void BuildVarNameMap( const std::unordered_map& compiled_varmap, const std::unordered_set& argument_names); - // Check whether the tensor in Paddle and the compiled - // tensor returned by CINN of a same variable - // are equivalent in type and dimension - void CheckTensorEquivalent(const std::string& var_name, - const framework::LoDTensor& paddle_tensor, - const CinnTensor& cinn_tensor); + // Extract internal variable names from all applied variables + // in execution by excluding the input and output variables + std::unordered_set ExtractInternalVarNames( + const std::vector& input_var_names, + const std::vector& output_var_names); + + // Initialize each execution argument with a cinn_buffer_t + void InitializeArguments(); - // Append an argument with (cinn name)->(cinn_buffer_t) pair - void AppendArgument(const std::string& arg_name, - std::unique_ptr&& buffer); + // Assign tensor buffer to input or output variables + void AssignExternalVariable(const std::string& var_name); + + // Assign tensor buffer to internal variables + 
void AssignInternalVariable(const std::string& var_name); + + // Construct a Paddle ProgramDesc with the CINN runtime + // instructions included in the compiled CINN Program + framework::ProgramDesc BuildCompiledProgram( + const framework::ir::Graph& graph, + const CinnCompiledObject& compiled_obj); private: const framework::Scope* cached_scope_ = nullptr; @@ -111,16 +139,22 @@ class CinnLaunchContext { std::unordered_map paddle2cinn_varmap_; // a name map from cinn execution arguments to paddle variables std::unordered_map cinn2paddle_varmap_; + // a list of internal variable names in Paddle + std::unordered_set internal_var_names_; // the names of the cinn arguments used in compiled executable program std::unordered_set cinn_argument_names_; // the variable scope compiled from cinn const std::shared_ptr cinn_scope_; + // the ir::Graph object converted from the program compiled by CINN + std::unique_ptr runtime_graph_; + // a ParallelExecutor to execute the runtime graph + std::unique_ptr parallel_executor_; + // because a cinn_pod_value_t does not own a cinn_buffer_t object, // an extra storage is necessary to keep those objects and they can // not be released until the runtime program finishes execution. std::vector> hold_buffers_; - // this map saves all execution arguments with their cinn names as key, // and it is passed to the Execute interface of a cinn runtime program. std::map name2argument_; diff --git a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc index 58a9c5db712..4976a59d1dd 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc @@ -13,87 +13,229 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/cinn/cinn_launch_context.h" +#include +#include +#include +#include "cinn/common/target.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/instruction.h" #include "cinn/hlir/framework/scope.h" #include "cinn/hlir/framework/tensor.h" #include "cinn/runtime/cinn_runtime.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" #include "paddle/phi/core/ddim.h" +USE_OP(cinn_instruction_run); namespace paddle { namespace operators::details { -using LoDTensor = framework::LoDTensor; +using framework::OpDesc; +using framework::ProgramDesc; +using framework::LoDTensor; +using framework::ir::Graph; +using framework::ParallelExecutor; +using framework::paddle2cinn::Name2VarInfoMap; using CinnShape = ::cinn::hlir::framework::Shape; +using CinnInstruction = ::cinn::hlir::framework::Instruction; +using CinnRuntimeProgram = ::cinn::hlir::framework::Program; -std::unique_ptr CreateDefaultLaunchContext() { +const Graph& InitDefaultSubgraph() { static std::once_flag initialized; - static std::unordered_map paddle2cinn_varmap; - static std::shared_ptr cinn_scope; - std::call_once(initialized, [&paddle2cinn_varmap, &cinn_scope]() { - auto& scope = cinn_scope; - scope = std::make_shared(); + static std::unique_ptr graph; + std::call_once(initialized, [&]() { + ProgramDesc program; + auto* block = program.MutableBlock(0); + auto* var1 = block->Var("var1"); + var1->SetPersistable(true); + block->Var("var2"); + block->Var("var3"); + block->Var("var4"); + auto* var5 = block->Var("var5"); + var5->SetIsParameter(true); + auto add_op = std::unique_ptr( + new 
OpDesc("elementwise_add", {{"X", {"var1"}}, {"Y", {"var2"}}}, + {{"Out", {"var3"}}}, {})); + block->AppendAllocatedOp(std::move(add_op)); + auto mul_op = std::unique_ptr(new OpDesc( + "mul", {{"X", {"var1"}}, {"Y", {"var2"}}}, {{"Out", {"var4"}}}, {})); + block->AppendAllocatedOp(std::move(mul_op)); + auto res_op = std::unique_ptr( + new OpDesc("elementwise_add", {{"X", {"var3"}}, {"Y", {"var4"}}}, + {{"Out", {"var5"}}}, {})); + block->AppendAllocatedOp(std::move(res_op)); + graph = std::make_unique(program); + + graph->Set>( + framework::paddle2cinn::kInputVars, + new std::vector({"var1", "var2"})); + graph->Set>( + framework::paddle2cinn::kInternalVars, + new std::vector({"var3", "var4"})); + graph->Set>( + framework::paddle2cinn::kOutputVars, + new std::vector({"var5"})); + graph->GetOrInit( + framework::paddle2cinn::kMemOptVarInfoFromMainGraph); + }); + return *graph.get(); +} +CinnCompiledObject* InitDefaultCompiledObject() { + static std::once_flag initialized; + static auto compiled_obj = std::make_unique(); + std::call_once(initialized, [result = compiled_obj.get()]() { + auto& scope = result->scope; + scope = std::make_shared(); scope->Var("cinn_var1"); scope->GetTensor("cinn_var1")->Resize(CinnShape({3, 4})); scope->Var("cinn_var2"); scope->GetTensor("cinn_var2")->Resize(CinnShape({6, 7, 8})); scope->Var("cinn_var3"); scope->GetTensor("cinn_var3")->Resize(CinnShape({10, 16})); + scope->Var("cinn_var4"); + scope->GetTensor("cinn_var4")->Resize(CinnShape({10, 16})); + scope->Var("cinn_var5"); + scope->GetTensor("cinn_var5")->Resize(CinnShape({10, 16})); - paddle2cinn_varmap = { - {"var1", "cinn_var1"}, {"var3", "cinn_var3"}, {"var4", "cinn_var4"}}; + // input variables: var1, var2; output: var5 + // internal variables: var3 and var4, here var3 is retained + // in result map, so the name will be used neither cinn_var3 + auto& paddle2cinn_varmap = result->paddle2cinn_varmap; + paddle2cinn_varmap = {{"var1", "cinn_var1"}, + {"var2", "cinn_var2"}, + {"var3", 
"cinn_var3"}, + {"var5", "cinn_var5"}}; + + auto& runtime_program = result->runtime_program; + std::vector> instructions; + instructions.emplace_back(new CinnInstruction( + cinn::common::DefaultHostTarget(), scope.get(), + {"cinn_var1", "cinn_var2"}, {"cinn_var3"}, "elementwise_add")); + instructions.emplace_back( + new CinnInstruction(cinn::common::DefaultHostTarget(), scope.get(), + {"cinn_var1", "cinn_var2"}, {"cinn_var4"}, "mul")); + instructions.emplace_back(new CinnInstruction( + cinn::common::DefaultHostTarget(), scope.get(), + {"cinn_var3", "cinn_var4"}, {"cinn_var5"}, "elementwise_add")); + runtime_program = + std::make_unique(scope, std::move(instructions)); + result->cached_index = 110; }); - return std::make_unique(paddle2cinn_varmap, cinn_scope); + return compiled_obj.get(); } -TEST(CinnLaunchContextTest, TestBasic) { - auto launch_context = CreateDefaultLaunchContext(); - // test IsVariableUsed +class CinnLaunchContextTest : public ::testing::Test { + public: + std::unique_ptr launch_context; + CinnCompiledObject* compiled_obj; + + void SetUp() override { + compiled_obj = InitDefaultCompiledObject(); + launch_context = std::make_unique(InitDefaultSubgraph(), + *compiled_obj); + } +}; + +TEST_F(CinnLaunchContextTest, TestConstructResult) { ASSERT_EQ(launch_context->IsVariableUsed("var1"), true); + ASSERT_EQ(launch_context->IsVariableUsed("var2"), true); + ASSERT_EQ(launch_context->IsVariableUsed("var3"), true); ASSERT_EQ(launch_context->IsVariableUsed("var4"), false); - // test UpdateCapturedEnv - platform::CPUPlace place; - framework::Scope scope; - ASSERT_NO_THROW(launch_context->UpdateCapturedEnv(scope, place)); - // test IsArgumentsInitialized - ASSERT_FALSE(launch_context->IsArgumentsInitialized()); + ASSERT_EQ(launch_context->IsVariableUsed("var5"), true); + + // check result of ExtractInternalVarNames + ASSERT_EQ(launch_context->GetInternalVarNames(), + std::unordered_set({"var3", "cinn_var4"})); + + // check completeness of arguments list, and 
also check + // the two name maps of the paddle->cinn and the reverse one + // through the IsVariableUsed interface + auto&& arguments = launch_context->FinalizeArguments(); + ASSERT_EQ(arguments.size(), 5); + auto check_argument_fn = [&arguments, this](const std::string& var_name, + const std::string& arg_name) { + ASSERT_EQ(launch_context->IsVariableUsed(var_name), true); + ASSERT_NO_THROW(launch_context->GetCinnBufferOfVar(var_name)); + ASSERT_GT(arguments.count(arg_name), 0); + EXPECT_EQ(launch_context->GetCinnBufferOfVar(var_name), + static_cast(arguments.at(arg_name))); + auto* buffer = launch_context->GetCinnBufferOfVar(var_name); + auto&& scope = compiled_obj->scope; + ASSERT_EQ(framework::DDim(buffer->dims, buffer->dimensions), + phi::make_ddim(scope->GetTensor(arg_name)->shape().data())); + }; + check_argument_fn("var1", "cinn_var1"); + check_argument_fn("var2", "cinn_var2"); + check_argument_fn("var3", "cinn_var3"); + check_argument_fn("cinn_var4", "cinn_var4"); + check_argument_fn("var5", "cinn_var5"); } -TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) { +TEST_F(CinnLaunchContextTest, TestCheckTensorEquivalent) { platform::CPUPlace place; framework::Scope scope; - auto launch_context = CreateDefaultLaunchContext(); launch_context->UpdateCapturedEnv(scope, place); auto* tensor1 = scope.Var("var1")->GetMutable(); // CheckTensorEquivalent: tensor dimension not equivalent tensor1->mutable_data(phi::make_ddim({3, 5}), place); - ASSERT_THROW(launch_context->AssignExternalVariable("var1"), + ASSERT_THROW(launch_context->CheckTensorEquivalent("var1", *tensor1), paddle::platform::EnforceNotMet); } -TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) { +TEST_F(CinnLaunchContextTest, TestBuildCompiledProgram) { platform::CPUPlace place; framework::Scope scope; - auto launch_context = CreateDefaultLaunchContext(); - launch_context->UpdateCapturedEnv(scope, place); - auto* tensor4 = scope.Var("var4")->GetMutable(); + ParallelExecutor* pe = nullptr; + 
ASSERT_NO_THROW((pe = launch_context->InitializePE(place, &scope))); - // not used - ASSERT_THROW(launch_context->AssignExternalVariable("var4"), - paddle::platform::EnforceNotMet); - // not found - ASSERT_THROW(launch_context->AssignInternalVariable("cinn_var4"), - paddle::platform::EnforceNotMet); + // check details of program built by compiled instructions + const ProgramDesc& program = pe->Graph().OriginProgram(); + ASSERT_EQ(program.Size(), 1); + const auto& block = program.Block(0); + // vars + std::set var_names = block.LocalVarNames(); + ASSERT_EQ(var_names.size(), 5); + for (auto&& var_name : var_names) { + auto* var = block.FindVar(var_name); + ASSERT_NE(var, nullptr); + auto* buffer = launch_context->GetCinnBufferOfVar(var_name); + ASSERT_EQ(framework::DDim(buffer->dims, buffer->dimensions), + phi::make_ddim(var->GetShape())); + } + ASSERT_TRUE(block.FindVar("var1")->Persistable()); + ASSERT_FALSE(block.FindVar("var5")->Persistable()); + ASSERT_TRUE(block.FindVar("var5")->IsParameter()); + ASSERT_FALSE(block.FindVar("var1")->IsParameter()); + // ops + ASSERT_EQ(block.OpSize(), 3); + auto* op1 = block.Op(0); + ASSERT_EQ(op1->Type(), "cinn_instruction_run"); + ASSERT_EQ(op1->Input(kX), std::vector({"var1", "var2"})); + ASSERT_EQ(op1->Output(kOutputs), std::vector({"var3"})); + ASSERT_EQ(op1->GetAttrIfExists(kCachedIndex), 110); + ASSERT_EQ(op1->GetAttrIfExists(kInstructionIndex), 0); + auto* op3 = block.Op(2); + ASSERT_EQ(op3->Type(), "cinn_instruction_run"); + ASSERT_EQ(op3->Input(kX), std::vector({"var3", "cinn_var4"})); + ASSERT_EQ(op3->Output(kOutputs), std::vector({"var5"})); + ASSERT_EQ(op3->GetAttrIfExists(kCachedIndex), 110); + ASSERT_EQ(op3->GetAttrIfExists(kInstructionIndex), 2); } -TEST(CinnLaunchContextTest, TestAppendArgument) { - platform::CPUPlace cpu_place; - platform::Place place(cpu_place); +// DEPRECATED(CtfGo): following test of callback assignment +// will be deprecated after we switch to pe +TEST_F(CinnLaunchContextTest, 
TestCallbackAssignment) { + platform::CPUPlace place; framework::Scope scope; - auto launch_context = CreateDefaultLaunchContext(); launch_context->UpdateCapturedEnv(scope, place); // assign external variables @@ -101,33 +243,8 @@ TEST(CinnLaunchContextTest, TestAppendArgument) { float* data1 = tensor1->mutable_data(phi::make_ddim({3, 4}), place); data1[0] = 9.99f; data1[10] = 19.99f; - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1")); - - auto* tensor3 = scope.Var("var3")->GetMutable(); - tensor3->mutable_data(phi::make_ddim({10, 16}), place); - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3")); - - // FinalizeArguments missed check - ASSERT_THROW(launch_context->FinalizeArguments(), - paddle::platform::EnforceNotMet); - // test get internal variables - auto internal_variable_names = - launch_context->ExtractInternalVarNames({"var1"}, {"var3"}); - ASSERT_EQ(internal_variable_names.size(), 1); - EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2"); - - auto* tensor2 = scope.Var("var2")->GetMutable(); - tensor2->mutable_data(phi::make_ddim({6, 7, 8}), place); - ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2")); - // check argument is set correctly and alloc/free callbacks work well - auto name2argument = launch_context->FinalizeArguments(); - ASSERT_EQ(name2argument.size(), 3); - ASSERT_EQ(name2argument.count("cinn_var1"), 1); - ASSERT_TRUE(launch_context->IsArgumentsInitialized()); - - auto* cinn_buffer = - static_cast(name2argument.at("cinn_var1")); + auto* cinn_buffer = launch_context->GetCinnBufferOfVar("var1"); ASSERT_EQ(cinn_buffer->memory, nullptr); cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer); ASSERT_NE(cinn_buffer->memory, nullptr); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 1db9f2f25e2..cf3b98c6679 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -105,63 
+105,29 @@ class CinnLaunchOpKernel : public framework::OpKernel { auto* launch_context = cinn_compiled_object.launch_context.get(); // Step 3. Prepare arguments needed for the compiled executable program. launch_context->UpdateCapturedEnv(scope, place); - if (!launch_context->IsArgumentsInitialized()) { - VLOG(4) << "CinnLaunchOp prepare arguments"; - - // 3.1 Prepare input variables: tensors of input variables have - // been initialized before graph compiled, just check the - // equiality between tensors of paddle and cinn. - for (const auto& var_name : input_no_need_buffer_variable_names) { - // the input variable declared as 'no need buffer' can not be used - PADDLE_ENFORCE_EQ( - launch_context->IsVariableUsed(var_name), false, - platform::errors::InvalidArgument( - "Input variable(%s) should not be used by cinn in execution", - var_name)); - } - - for (const auto& var_name : input_x_variable_names) { - // some input variables don't need for cinn because they are - // eliminated by optimized passes or some cinn operators use - // less variables - if (!launch_context->IsVariableUsed(var_name)) { - VLOG(4) << "Input variable" << var_name << " not used by cinn"; - continue; - } - - launch_context->AssignExternalVariable(var_name); - } - - // 3.2 Prepare output variables: all output variables should - // be initialized and allocated buffer before - // the runtime program start execution, the compilation result - // includes details of their buffer assginment and we use that to - // allocate space in Paddle. For those variables allocated yet, - // like persistable parameters, just check the equiality between - // Paddle allocation and CINN buffer assginment. 
- auto output_variable_names = ctx.OutputNames(kOutputs); - for (const auto var_name : output_variable_names) { - PADDLE_ENFORCE_EQ( - launch_context->IsVariableUsed(var_name), true, - platform::errors::InvalidArgument( - "Output variable(%s) not used by cinn", var_name)); - - launch_context->AssignExternalVariable(var_name); - } - - // 3.3 Prepare internal or temporary variables: Create a temporary - // scope to keep internal variables within graph or temporary - // variables needed by the compiled runtime program in addition. - // Here we directly use the names from CinnScope as Paddle variable - // names, because they will not be used outside the graph - // and should be destructed after computation finished. - auto internal_variable_names = launch_context->ExtractInternalVarNames( - input_x_variable_names, output_variable_names); - for (const auto& var_name : internal_variable_names) { - launch_context->AssignInternalVariable(var_name); + // 3.1 Input variables: tensors of input variables have + // been initialized before the graph was compiled, just check the + // equality between tensors of paddle and cinn. + for (const auto& var_name : input_x_variable_names) { + // some input variables aren't needed by cinn because they are + // eliminated by optimized passes or some cinn operators use + // less variables + if (!launch_context->IsVariableUsed(var_name)) { + VLOG(4) << "Input variable" << var_name << " not used by cinn"; + continue; } + launch_context->CheckTensorEquivalent(var_name, + *inputs_name2tensor.at(var_name)); } + // 3.2 Output variables: the output variables will be initialized + // and allocated buffer in callbacks which are defined in the + // external_malloc/free interface of cinn_buffer_t + // in their corresponding arguments. + // 3.3 Internal variables: A temporary scope is created in + // UpdateCapturedEnv to keep the internal variables and + // they are also initialized through callbacks + // Step 4. 
Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. details::SetCinnRuntimeFlags(); diff --git a/paddle/fluid/operators/cinn/test_helper.h b/paddle/fluid/operators/cinn/test_helper.h index eb3d725d554..9720a5309fa 100644 --- a/paddle/fluid/operators/cinn/test_helper.h +++ b/paddle/fluid/operators/cinn/test_helper.h @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" #include "paddle/fluid/framework/scope.h" #include "paddle/phi/core/ddim.h" @@ -31,6 +32,7 @@ using LoDTensor = framework::LoDTensor; using Variable = framework::Variable; using Graph = framework::ir::Graph; using Node = framework::ir::Node; +using framework::paddle2cinn::Name2VarInfoMap; std::unique_ptr CreateOnlyElementwiseAddGraph( const std::string& x_name, const std::string& y_name, @@ -71,6 +73,16 @@ std::unique_ptr CreateOnlyElementwiseAddGraph( y_node->inputs = {feed_op_node_y}; y_node->outputs = {elementwise_add_node}; out_node->inputs = {elementwise_add_node}; + // set necessary attributes + g->Set>( + framework::paddle2cinn::kInputVars, + new std::vector({x_name, y_name})); + g->Set>(framework::paddle2cinn::kInternalVars, + new std::vector({})); + g->Set>(framework::paddle2cinn::kOutputVars, + new std::vector({out_name})); + g->GetOrInit( + framework::paddle2cinn::kMemOptVarInfoFromMainGraph); return g; } -- GitLab