diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 2fc2deb087e89a52b3e451e30e80a0cd7cc671e0..e0a80d3c79854301fe55e63fc4655fe76cdd9caf 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1131,7 +1131,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // phase if (FLAGS_run_pten_kernel && pten::KernelFactory::Instance().HasCompatiblePtenKernel(type_)) { - if (pt_kernel_signature_.get() == nullptr || pt_kernel_.get() == nullptr) { + if (pt_kernel_signature_ == nullptr || pt_kernel_ == nullptr) { ChoosePtenKernel(exe_ctx); } run_pten_kernel_ = pt_kernel_->IsValid(); @@ -1178,8 +1178,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::RecordEvent record_event("compute", platform::EventRole::kInnerOp); if (run_pten_kernel_) { - auto op_kernel_ctx = BuildPtenKernelContext(*runtime_ctx, *dev_ctx); - (*pt_kernel_)(&op_kernel_ctx); + if (pt_kernel_context_ == nullptr) { + pt_kernel_context_.reset(new pten::KernelContext()); + } + BuildPtenKernelContext(*runtime_ctx, dev_ctx); + (*pt_kernel_)(pt_kernel_context_.get()); + pt_kernel_context_->ClearData(); } else { (*kernel_func_)( ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx)); @@ -1765,8 +1769,8 @@ KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs( return KernelSignatureMap::Instance().Get(Type()); } -pten::KernelContext OperatorWithKernel::BuildPtenKernelContext( - const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const { +void OperatorWithKernel::BuildPtenKernelContext( + const RuntimeContext& ctx, platform::DeviceContext* dev_ctx) const { // TODO(chenweihang): now only work for very simple case, // many cases need to be deal with later: // 1. the input and output are not tensor @@ -1774,7 +1778,7 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext( // 3. needless attributes remove // 4. use pt Tensor directly // 5. kernel input is not DenseTensor - pten::KernelContext op_kernel_ctx(dev_ctx); + pt_kernel_context_->SetDeviceContext(dev_ctx); auto& input_names = std::get<0>(pt_kernel_signature_->args); auto& attr_names = std::get<1>(pt_kernel_signature_->args); @@ -1803,30 +1807,53 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext( attr_names.size(), attr_defs.size())); for (size_t i = 0; i < input_names.size(); ++i) { - auto in_def = input_defs.at(i); - VLOG(2) << "in_def: " << in_def.backend << ", " << in_def.dtype << ", " - << in_def.layout; - - auto ins_vector = ctx.inputs.at(input_names[i]); - - paddle::SmallVector> tmp_inputs; - for (auto var : ins_vector) { - tmp_inputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(*var, in_def)); + auto& in_def = input_defs.at(i); + auto& ins_vector = ctx.inputs.at(input_names[i]); + if (pt_kernel_context_->InputsSize() <= i) { + paddle::SmallVector> tmp_inputs; + for (auto* var : ins_vector) { + tmp_inputs.emplace_back( + experimental::MakePtenTensorBaseFromVar(*var, in_def)); + } + pt_kernel_context_->EmplaceBackInputs(std::move(tmp_inputs)); + } else { + size_t input_size = pt_kernel_context_->InputsSize(); + for (size_t j = 0; j < ins_vector.size(); ++j) { + if (input_size > i + j) { + experimental::ReMakePtenDenseTensorFromVar( + *ins_vector[j], in_def, + pt_kernel_context_->MutableInputAt(i + j)); + } + // TODO(chenweihang): adapt multi-input case later + } + pt_kernel_context_->MutableInputRangeAt(i) = + std::make_pair(i, i + ins_vector.size()); } - op_kernel_ctx.EmplaceBackInputs(std::move(tmp_inputs)); } for (size_t i = 0; i < output_names.size(); ++i) { - auto out_def = output_defs.at(i); - auto outs_vector = ctx.outputs.at(output_names[i]); - - paddle::SmallVector> tmp_outputs; - for (auto var : outs_vector) { - tmp_outputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(var, out_def)); + auto& out_def = output_defs.at(i); + auto& outs_vector = ctx.outputs.at(output_names[i]); + if (pt_kernel_context_->OutputsSize() <= i) { + paddle::SmallVector> tmp_outputs; + for (auto* var : outs_vector) { + tmp_outputs.emplace_back( + experimental::MakePtenTensorBaseFromVar(var, out_def)); + } + pt_kernel_context_->EmplaceBackOutputs(std::move(tmp_outputs)); + } else { + size_t output_size = pt_kernel_context_->OutputsSize(); + for (size_t j = 0; j < outs_vector.size(); ++j) { + if (output_size > i + j) { + experimental::ReMakePtenDenseTensorFromVar( + outs_vector[j], out_def, + pt_kernel_context_->MutableOutputAt(i + j)); + } + // TODO(chenweihang): adapt multi-output case later + } + pt_kernel_context_->MutableOutputRangeAt(i) = + std::make_pair(i, i + outs_vector.size()); } - op_kernel_ctx.EmplaceBackOutputs(std::move(tmp_outputs)); } for (size_t i = 0; i < attr_names.size(); ++i) { @@ -1836,11 +1863,11 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext( // TODO(zhangyunfei): Scalar should hold scaler type, and we should check // attribtue type by attr_defs if (std::type_index(attr.type()) == std::type_index(typeid(float))) { - op_kernel_ctx.EmplaceBackAttr( + pt_kernel_context_->EmplaceBackAttr( std::move(pten::Scalar(BOOST_GET_CONST(float, attr)))); } else if (std::type_index(attr.type()) == std::type_index(typeid(std::string))) { - op_kernel_ctx.EmplaceBackAttr( + pt_kernel_context_->EmplaceBackAttr( std::move(pten::Scalar(BOOST_GET_CONST(std::string, attr)))); } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -1851,11 +1878,11 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext( } else { // TODO(chenweihang): support other attrs later if (attr_defs[i].type_index == std::type_index(typeid(int))) { - op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { - op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { - op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); } else { PADDLE_THROW(platform::errors::Unimplemented( "unsupported cast op attribute `%s` when construct " @@ -1864,8 +1891,6 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext( } } } - - return op_kernel_ctx; } } // namespace framework diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index b75dcade6fccf703b36e705b6171b65d93f0d723..4c071b777fe8359cba4276dc53ca690df6d1c1de 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -586,8 +586,8 @@ class OperatorWithKernel : public OperatorBase { /* member functions for adapting to pten lib */ void ChoosePtenKernel(const ExecutionContext& ctx) const; - pten::KernelContext BuildPtenKernelContext( - const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const; + void BuildPtenKernelContext(const RuntimeContext& ctx, + platform::DeviceContext* dev_ctx) const; protected: mutable std::unique_ptr kernel_type_; @@ -605,6 +605,9 @@ class OperatorWithKernel : public OperatorBase { mutable bool run_pten_kernel_ = false; mutable std::unique_ptr pt_kernel_signature_; mutable std::unique_ptr pt_kernel_; + // In order to reduce the compatibility phase + // performance overhead, temporarily cache KernelContext + mutable std::unique_ptr pt_kernel_context_; }; extern bool OpSupportGPU(const std::string& op_type); diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index c45f92496b3e827fc6f6f342f72da90afee6930e..8f196636af4894deee2044586fb7903e2780ba5a 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,14 +1,13 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags flags) - IF(WITH_XPU) -cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten_utils) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils) ELSE() -cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten_utils) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils) ENDIF() cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer ) -cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal) +cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector) cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator) cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator) cc_library(imperative_profiler SRCS profiler.cc DEPS flags) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 53ae5b8127fdba5dd68ddc6748dc35e9fe7ae8ec..b584b928f96b9733bdeb36821092944323520f4b 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -356,6 +356,8 @@ void VarBase::BumpInplaceVersion() { MutableVar()->BumpInplaceVersion(); } +pten::KernelContext OpBase::pt_kernel_context_; + void OpBase::SetType(const std::string& type) { op_ = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); } @@ -371,7 +373,8 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, - const platform::Place& place) { + const platform::Place& place, + pten::KernelContext* pt_kernel_context) { auto* op_kernel = dynamic_cast(&op); PADDLE_ENFORCE_NOT_NULL( op_kernel, platform::errors::PermissionDenied( @@ -412,8 +415,8 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, * after the execution of op, but the original input is directly * overwritten in the previous dynamic graph implemention. */ - auto prepared_op = - PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs, default_attrs); + auto prepared_op = PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs, + default_attrs, pt_kernel_context); auto tmp_ins_ptr = PrepareData(*op_kernel, ins, prepared_op.kernel_type()); if (tmp_ins_ptr == nullptr) { @@ -441,7 +444,8 @@ void OpBase::Run(const framework::OperatorBase& op, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const platform::Place& place) { - OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place); + OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place, + &pt_kernel_context_); } void OpBase::Run(const framework::OperatorBase& op, @@ -450,7 +454,8 @@ void OpBase::Run(const framework::OperatorBase& op, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const platform::Place& place) { - OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place); + OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place, + &pt_kernel_context_); } void ClearNoNeedBufferInputs(OpBase* op) { diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 16580627ed1964c6cfc81a48b15f26d0b2459a78..9108155a043b7a56ca7db608a601cfa6c3b8d714 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -36,6 +36,7 @@ #include "paddle/fluid/imperative/variable_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/include/core.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index acb125a82925d7971b7b03ee90198f87c1a5b9c0..4122e2af3dedaee0b0dfd74923870b7137fe73a3 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -25,6 +25,7 @@ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/variable_wrapper.h" #include "paddle/fluid/platform/place.h" +#include "paddle/pten/include/core.h" namespace paddle { namespace imperative { @@ -183,6 +184,8 @@ class OpBase { const framework::AttributeMap& default_attrs, const platform::Place& place); + static pten::KernelContext* GetKernelContext() { return &pt_kernel_context_; } + private: static const std::string& UnknownOpType() { static std::string kUnknownOpType{"unknown"}; @@ -197,6 +200,9 @@ class OpBase { std::unique_ptr op_; platform::Place place_; size_t id_{-1UL}; + // In order to reduce the compatibility phase + // performance overhead, temporarily cache KernelContext + static pten::KernelContext pt_kernel_context_; }; class GradOpNode { diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 7c0aaed25ab14fdfd977355fcde49877a54e1e86..c9e211809a4064fa25da8b5825bd792f7318ec96 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/imperative/infer_shape_context.h" +#include "paddle/fluid/imperative/tracer.h" #include "paddle/pten/common/scalar.h" #include "paddle/utils/small_vector.h" #ifdef PADDLE_WITH_XPU @@ -112,6 +113,7 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, const framework::OpKernelType& kernel_type, const framework::KernelSignature& kernel_signature, const pten::Kernel& pt_kernel, + pten::KernelContext* pt_kernel_context, platform::DeviceContext* dev_ctx) : op_(op), ctx_(ctx), @@ -120,7 +122,8 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, dev_ctx_(dev_ctx), run_pten_kernel_(true), pt_kernel_signature_(kernel_signature), - pt_kernel_(pt_kernel) {} + pt_kernel_(pt_kernel), + pt_kernel_context_(pt_kernel_context) {} template PreparedOp PrepareImpl(const NameVarMap& ins, @@ -128,7 +131,8 @@ PreparedOp PrepareImpl(const NameVarMap& ins, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs) { + const framework::AttributeMap& default_attrs, + pten::KernelContext* pt_kernel_context) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -171,7 +175,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, // TODO(chenweihang): using CPUKernel when miss device kernel case return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, - pt_kernel, dev_ctx); + pt_kernel, pt_kernel_context, dev_ctx); } else { VLOG(1) << "Dynamic mode ChoosePtenKernel - kernel `" << pt_kernel_name << "` not found."; @@ -230,8 +234,10 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs) { - return PrepareImpl(ins, outs, op, place, attrs, default_attrs); + const framework::AttributeMap& default_attrs, + pten::KernelContext* pt_kernel_context) { + return PrepareImpl(ins, outs, op, place, attrs, default_attrs, + pt_kernel_context); } PreparedOp PreparedOp::Prepare(const NameVarMap& ins, @@ -239,18 +245,19 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs) { + const framework::AttributeMap& default_attrs, + pten::KernelContext* pt_kernel_context) { return PrepareImpl(ins, outs, op, place, attrs, - default_attrs); + default_attrs, pt_kernel_context); } template -static pten::KernelContext BuildDygraphPtenKernelContext( +static void BuildDygraphPtenKernelContext( const framework::KernelSignature& pt_kernel_signature, const pten::Kernel& pt_kernel, const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, - const platform::DeviceContext& dev_ctx) { + platform::DeviceContext* dev_ctx, pten::KernelContext* kernel_ctx) { // TODO(chenweihang): now only work for very simple case, // many cases need to be deal with later: // 1. the input and output are not tensor @@ -258,7 +265,7 @@ static pten::KernelContext BuildDygraphPtenKernelContext( // 3. needless attributes remove // 4. use pt Tensor directly // 5. kernel input is not DenseTensor - pten::KernelContext op_kernel_ctx(dev_ctx); + kernel_ctx->SetDeviceContext(dev_ctx); auto& input_names = std::get<0>(pt_kernel_signature.args); auto& attr_names = std::get<1>(pt_kernel_signature.args); @@ -289,27 +296,53 @@ static pten::KernelContext BuildDygraphPtenKernelContext( for (size_t i = 0; i < input_names.size(); ++i) { auto& in_def = input_defs.at(i); auto& ins_vector = ins.at(input_names[i]); - - paddle::SmallVector> tmp_inputs; - for (auto var : ins_vector) { - const auto& variable = var->Var(); - tmp_inputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(variable, in_def)); + if (kernel_ctx->InputsSize() <= i) { + paddle::SmallVector> tmp_inputs; + for (const auto& var : ins_vector) { + const auto& variable = var->Var(); + tmp_inputs.emplace_back( + experimental::MakePtenTensorBaseFromVar(variable, in_def)); + } + kernel_ctx->EmplaceBackInputs(std::move(tmp_inputs)); + } else { + size_t input_size = kernel_ctx->InputsSize(); + for (size_t j = 0; j < ins_vector.size(); ++j) { + if (input_size > i + j) { + experimental::ReMakePtenDenseTensorFromVar( + ins_vector[j]->Var(), in_def, + kernel_ctx->MutableInputAt(i + j)); + } + // TODO(chenweihang): adapt multi-input case later + } + kernel_ctx->MutableInputRangeAt(i) = + std::make_pair(i, i + ins_vector.size()); } - op_kernel_ctx.EmplaceBackInputs(std::move(tmp_inputs)); } for (size_t i = 0; i < output_names.size(); ++i) { auto& out_def = output_defs.at(i); auto& outs_vector = outs.at(output_names[i]); - - paddle::SmallVector> tmp_outputs; - for (auto var : outs_vector) { - auto* variable = var->MutableVar(); - tmp_outputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(variable, out_def)); + if (kernel_ctx->OutputsSize() <= i) { + paddle::SmallVector> tmp_outputs; + for (auto& var : outs_vector) { + auto* variable = var->MutableVar(); + tmp_outputs.emplace_back( + experimental::MakePtenTensorBaseFromVar(variable, out_def)); + } + kernel_ctx->EmplaceBackOutputs(std::move(tmp_outputs)); + } else { + size_t output_size = kernel_ctx->OutputsSize(); + for (size_t j = 0; j < outs_vector.size(); ++j) { + if (output_size > i + j) { + experimental::ReMakePtenDenseTensorFromVar( + outs_vector[j]->MutableVar(), out_def, + kernel_ctx->MutableOutputAt(i + j)); + } + // TODO(chenweihang): adapt multi-output case later + } + kernel_ctx->MutableOutputRangeAt(i) = + std::make_pair(i, i + outs_vector.size()); } - op_kernel_ctx.EmplaceBackOutputs(std::move(tmp_outputs)); } for (size_t i = 0; i < attr_names.size(); ++i) { @@ -319,11 +352,11 @@ static pten::KernelContext BuildDygraphPtenKernelContext( // TODO(zhangyunfei): Scalar should hold scaler type, and we should check // attribtue type by attr_defs if (std::type_index(attr.type()) == std::type_index(typeid(float))) { - op_kernel_ctx.EmplaceBackAttr( + kernel_ctx->EmplaceBackAttr( std::move(pten::Scalar(BOOST_GET_CONST(float, attr)))); } else if (std::type_index(attr.type()) == std::type_index(typeid(std::string))) { - op_kernel_ctx.EmplaceBackAttr( + kernel_ctx->EmplaceBackAttr( std::move(pten::Scalar(BOOST_GET_CONST(std::string, attr)))); } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -334,11 +367,11 @@ static pten::KernelContext BuildDygraphPtenKernelContext( } else { // TODO(chenweihang): support other attrs later if (attr_defs[i].type_index == std::type_index(typeid(int))) { - op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { - op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { - op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); } else { PADDLE_THROW(platform::errors::Unimplemented( "unsupported cast op attribute `%s` when construct " @@ -347,8 +380,6 @@ static pten::KernelContext BuildDygraphPtenKernelContext( } } } - - return op_kernel_ctx; } template @@ -409,20 +440,23 @@ template static void PreparedOpRunPtImpl( const framework::OperatorBase& op, const framework::KernelSignature& pt_kernel_signature, - const pten::Kernel& pt_kernel, platform::DeviceContext* dev_ctx, - const NameVarMap& ins, const NameVarMap& outs, - const framework::AttributeMap& attrs, + const pten::Kernel& pt_kernel, pten::KernelContext* pt_kernel_context, + platform::DeviceContext* dev_ctx, const NameVarMap& ins, + const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { DygraphInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs, op.Type()); static_cast(op).InferShape( &infer_shape_ctx); - auto op_kernel_ctx = BuildDygraphPtenKernelContext( - pt_kernel_signature, pt_kernel, ins, outs, attrs, default_attrs, - *dev_ctx); + BuildDygraphPtenKernelContext(pt_kernel_signature, pt_kernel, ins, + outs, attrs, default_attrs, dev_ctx, + pt_kernel_context); + + pt_kernel(pt_kernel_context); - pt_kernel(&op_kernel_ctx); + // Ensure that it does not affect the VarBase life cycle management + pt_kernel_context->ClearData(); // TODO(chenweihang): add debug flags later // TODO(chenweihang): deal with complex cases later @@ -434,7 +468,8 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& default_attrs) { if (run_pten_kernel_) { PreparedOpRunPtImpl(op_, pt_kernel_signature_, pt_kernel_, - dev_ctx_, ins, outs, attrs, default_attrs); + pt_kernel_context_, dev_ctx_, ins, outs, attrs, + default_attrs); } else { PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, ins, outs, attrs, default_attrs); @@ -447,8 +482,8 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& default_attrs) { if (run_pten_kernel_) { PreparedOpRunPtImpl(op_, pt_kernel_signature_, pt_kernel_, - dev_ctx_, ins, outs, attrs, - default_attrs); + pt_kernel_context_, dev_ctx_, ins, + outs, attrs, default_attrs); } else { PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, ins, outs, attrs, default_attrs); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 144f921861f9e1c0d4ace3a2d2ae089425e1e80b..5262b265b1b5397216d6b16abddac8c880acc3f9 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -155,21 +155,25 @@ class PreparedOp { const framework::RuntimeContext& ctx, const framework::OpKernelType& kernel_type, const framework::KernelSignature& kernel_signature, - const pten::Kernel& pt_kernel, platform::DeviceContext* dev_ctx); + const pten::Kernel& pt_kernel, + pten::KernelContext* pt_kernel_context, + platform::DeviceContext* dev_ctx); static PreparedOp Prepare(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs); + const framework::AttributeMap& default_attrs, + pten::KernelContext* pt_kernel_context = nullptr); static PreparedOp Prepare(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs); + const framework::AttributeMap& default_attrs, + pten::KernelContext* pt_kernel_context = nullptr); void Run(const NameVarMap& in, const NameVarMap& out, const framework::AttributeMap& attrs, @@ -194,6 +198,9 @@ class PreparedOp { bool run_pten_kernel_{false}; framework::KernelSignature pt_kernel_signature_; pten::Kernel pt_kernel_; + // In order to reduce the compatibility phase + // performance overhead, temporarily cache KernelContext + pten::KernelContext* pt_kernel_context_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 0f363d0ea1bff87c05c15912134e9c01bada521e..1d06a63e38f8d1ec4ed52b158fbfd62c135ac59c 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -213,6 +213,8 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, OpBase::Run(*op, new_ins, outs, attrs, default_attrs, place); } catch (platform::EnforceNotMet& exception) { framework::AppendErrorOpHint(type, &exception); + // Compatible impl: clear pten kernel context data when throw error + OpBase::GetKernelContext()->ClearData(); throw std::move(exception); } catch (std::exception& ex) { PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/pten/api/lib/creation.cc b/paddle/pten/api/lib/creation.cc index 047b19010a26c99b05de84cbef6fe69c06f73f6a..e2cd611dbda5f53e81e75626be04ce64f41f4a71 100644 --- a/paddle/pten/api/lib/creation.cc +++ b/paddle/pten/api/lib/creation.cc @@ -38,7 +38,7 @@ Tensor full(const std::vector& shape, // 2. Get Device Context auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(*dev_ctx); + auto kernel_context = pten::KernelContext(dev_ctx); // 3. Auto data transform kernel_context.EmplaceBackAttr(value); @@ -75,7 +75,7 @@ Tensor full_like(const Tensor& x, // 2. Get Device Context auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(*dev_ctx); + auto kernel_context = pten::KernelContext(dev_ctx); // 3. Auto data transform auto dense_x = std::dynamic_pointer_cast(x.impl()); diff --git a/paddle/pten/api/lib/linalg.cc b/paddle/pten/api/lib/linalg.cc index 587b9cd0f2726fc80d7674a8faa1f222c5e5d2fc..0ede7b8a68b416c0101579321e4fd507a69dc897 100644 --- a/paddle/pten/api/lib/linalg.cc +++ b/paddle/pten/api/lib/linalg.cc @@ -38,7 +38,7 @@ Tensor dot(const Tensor& x, const Tensor& y) { // 2. Get Device Context auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(*dev_ctx); + auto kernel_context = pten::KernelContext(dev_ctx); // 3. Auto data transform auto dense_x = std::dynamic_pointer_cast(x.impl()); @@ -76,7 +76,7 @@ Tensor matmul(const Tensor& x, // 2. Get Device Context auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(*dev_ctx); + auto kernel_context = pten::KernelContext(dev_ctx); // 3. Auto data transform auto dense_x = std::dynamic_pointer_cast(x.impl()); diff --git a/paddle/pten/api/lib/manipulation.cc b/paddle/pten/api/lib/manipulation.cc index 9f071ce8c2d14be013d69e14fa387457b28b27cf..dd16f4f7f5825beb3e82f4eb5368a0b720d6f5ad 100644 --- a/paddle/pten/api/lib/manipulation.cc +++ b/paddle/pten/api/lib/manipulation.cc @@ -34,7 +34,7 @@ Tensor flatten(const Tensor& x, int start_axis, int stop_axis) { // 2. Get Device Context auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(*dev_ctx); + auto kernel_context = pten::KernelContext(dev_ctx); // 3. Auto data transform auto dense_x = std::dynamic_pointer_cast(x.impl()); diff --git a/paddle/pten/api/lib/math.cc b/paddle/pten/api/lib/math.cc index 6cb7849e529e038eb8e253749d3876c1bec87029..8102bbaaa58eaecc0b9551032f70d00057f3b856 100644 --- a/paddle/pten/api/lib/math.cc +++ b/paddle/pten/api/lib/math.cc @@ -36,7 +36,7 @@ Tensor mean(const Tensor& x) { // 2. Get Device Context auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(*dev_ctx); + auto kernel_context = pten::KernelContext(dev_ctx); // 3. Auto data transform auto dense_x = std::dynamic_pointer_cast(x.impl()); diff --git a/paddle/pten/api/lib/utils/storage.h b/paddle/pten/api/lib/utils/storage.h index 0a88c893f4dcf96e058cd85b0c43767dec9a6197..242ea6476ae983781d3d9eb1e959b5091b2495f4 100644 --- a/paddle/pten/api/lib/utils/storage.h +++ b/paddle/pten/api/lib/utils/storage.h @@ -75,6 +75,24 @@ class SharedStorage : public pten::Storage { return allocation_; } + // Temporary method: For compatible with fluid Tensor and improve performance + void ResetAllocation(std::shared_ptr allocation, + size_t offset) { + allocation_ = allocation; + data_ = pten::Allocation( + reinterpret_cast(reinterpret_cast(allocation->ptr()) + + offset), + allocation->place()); + size_ = allocation->size(); + } + + // Temporary method: For compatible with fluid Tensor and improve performance + void Reset() { + allocation_.reset(); + data_.Clear(); + size_ = 0; + } + private: int64_t size_{0}; std::shared_ptr allocation_; diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index 628fde3a1a4ddb089979356292e23df01f3afb4b..52554bf7af0cadeff416546ec7c21cfe2988a189 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -14,6 +14,10 @@ limitations under the License. */ #include "paddle/pten/api/lib/utils/tensor_utils.h" +#include + +#include "paddle/pten/core/compat_utils.h" + namespace paddle { namespace experimental { @@ -126,5 +130,101 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) { MovesStorage(src, static_cast(dst)); } +void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, + pten::DenseTensor* dst) { + auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); + meta->dims = src.dims(); + // Since the type of DenseTensorMeta is const, const_cast must be used + const_cast(meta->type) = pten::TransToPtenDataType(src.type()); + // Since the type of DenseTensorMeta is const, const_cast must be used + const_cast(meta->layout) = + pten::TransToPtenDataLayout(src.layout()); + auto* shared_storage = static_cast( + pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst)); + PADDLE_ENFORCE_NOT_NULL( + shared_storage, + platform::errors::NotFound( + "Target DenseTensor's shared storage is nullptr.")); + shared_storage->ResetAllocation(src.Holder(), src.offset()); +} + +void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src, + pten::DenseTensor* dst) { + auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); + meta->dims = src.dims(); + // Since the type of DenseTensorMeta is const, const_cast must be used + const_cast(meta->type) = pten::TransToPtenDataType(src.type()); + // Since the type of DenseTensorMeta is const, const_cast must be used + const_cast(meta->layout) = + pten::TransToPtenDataLayout(src.layout()); + SetLoD(&(meta->lod), src.lod()); + auto* shared_storage = static_cast( + pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst)); + PADDLE_ENFORCE_NOT_NULL( + shared_storage, + platform::errors::NotFound( + "Target DenseTensor's shared storage is nullptr.")); + shared_storage->ResetAllocation(src.Holder(), src.offset()); +} + +void ReMakePtenDenseTensorFromVar(const framework::Variable& variable, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst) { + auto expected_place = pten::TransToFluidPlace(arg_def.backend); + + if (variable.IsType()) { + const auto& tensor = variable.Get(); + if (!platform::is_same_place(tensor.place(), expected_place)) { + framework::LoDTensor tmp_tensor; + framework::TensorCopySync(tensor, expected_place, &tmp_tensor); + ReMakePtenDenseTensor(tmp_tensor, dst); + } else { + ReMakePtenDenseTensor(tensor, dst); + } + } else if (variable.IsType()) { + // TODO(chenweihang): now we don't deal with row and height + // by xiaowei's advice + const auto& tensor = variable.Get(); + if (!platform::is_same_place(tensor.value().place(), expected_place)) { + framework::Tensor tmp_tensor; + TensorCopySync(tensor.value(), expected_place, &tmp_tensor); + // TODO(chenweihang): adapt SelectedRows by xiaowei's design + ReMakePtenDenseTensor(tmp_tensor, dst); + } else { + ReMakePtenDenseTensor(tensor.value(), dst); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported shared input `%s` type now when call pt kernel.", + framework::ToTypeName(variable.Type()))); + } +} + +void ReMakePtenDenseTensorFromVar(framework::Variable* variable, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst) { + // mutable_data before run kernel, to avoid share output form + // KernelContext to original tensor + if (variable->template IsType()) { + auto* tensor = variable->template GetMutable(); + // TODO(chenweihang): use original var type if arg_def.dtype is UNDEFINED + tensor->mutable_data(pten::TransToFluidPlace(arg_def.backend), + pten::TransToProtoVarType(arg_def.dtype)); + ReMakePtenDenseTensor(*tensor, dst); + } else if (variable->template IsType()) { + auto* tensor = variable->template GetMutable(); + tensor->mutable_value()->mutable_data( + pten::TransToFluidPlace(arg_def.backend), + pten::TransToProtoVarType(arg_def.dtype)); + // TODO(chenweihang): adapt SelectedRows by xiaowei's design, + // here the row and height will lost in output! + ReMakePtenDenseTensor(tensor->value(), dst); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported shared output `%s` type now when call pt kernel.", + framework::ToTypeName(variable->Type()))); + } +} + } // namespace experimental } // namespace paddle diff --git a/paddle/pten/api/lib/utils/tensor_utils.h b/paddle/pten/api/lib/utils/tensor_utils.h index 625d6702f8b6d4814b2fbc52154ed4e4efc6fbdd..c1840d97fd2e33859fc5dfcff556c72a1ddab0ac 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.h +++ b/paddle/pten/api/lib/utils/tensor_utils.h @@ -44,5 +44,29 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst); void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst); +/** + * In order to improve the compatibility state performance, some tricky tool + * functions are added. + * + * The ReMake** function takes out the LoDTensor information and directly + * replaces it with the corresponding member of the DenseTensor to avoid + * the overhead caused by frequent construction and destruction of the + * DenseTensor. + */ + +void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, + pten::DenseTensor* dst); + +void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src, + pten::DenseTensor* dst); + +void ReMakePtenDenseTensorFromVar(const framework::Variable& variable, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst); + +void ReMakePtenDenseTensorFromVar(framework::Variable* variable, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst); + } // namespace experimental } // namespace paddle diff --git a/paddle/pten/core/compat_utils.h b/paddle/pten/core/compat_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..289c311bf3eba27f942c24657f9197f7c4b071e3 --- /dev/null +++ b/paddle/pten/core/compat_utils.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/api/lib/utils/storage.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/storage.h" +#include "paddle/pten/core/tensor_meta.h" + +namespace pten { + +/** + * In order to meet some adaptation requirements of the compatible state, + * these class is added to provide some tool functions. + * + * These utility functions may be deleted in the future, It is not recommended + * to be widely used in the framework + */ + +class CompatibleDenseTensorUtils { + public: + static Storage* UnsafeGetMutableStorage(DenseTensor* tensor) { + return tensor->storage_.get(); + } + + static DenseTensorMeta* GetMutableMeta(DenseTensor* tensor) { + return &(tensor->meta_); + } + + // only can deal with SharedStorage now + static void ClearStorage(DenseTensor* tensor) { + // use static_cast to improve performance, replace by dynamic_cast later + static_cast(tensor->storage_.get()) + ->Reset(); + } +}; + +} // namespace pten diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 46932ecac2ad0dd97cbc72c4e6a29ed16264f918..e8e57b333ae99e0cd836ddf0cab1b4f09664a749 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -21,6 +21,8 @@ limitations under the License. */ namespace pten { +class CompatibleDenseTensorUtils; + /// \brief The Dense tensor store values in a contiguous sequential block /// of memory where all values are represented. Tensors or multi-dimensional /// arrays are used in math operators. @@ -164,6 +166,9 @@ class DenseTensor : public TensorBase, /// \return The const data pointer value of raw type. const void* data() const; + private: + friend class CompatibleDenseTensorUtils; + private: DenseTensorMeta meta_; intrusive_ptr storage_; diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h index ac1ed668f7bf5abbd3f0a9724a2921bb8a96bb41..973640906e0de0a121f5e87d7832e14db241969c 100644 --- a/paddle/pten/core/kernel_context.h +++ b/paddle/pten/core/kernel_context.h @@ -14,8 +14,10 @@ #pragma once +#include #include +#include "paddle/pten/core/compat_utils.h" #include "paddle/pten/core/tensor_base.h" #include "paddle/utils/any.h" #include "paddle/utils/small_vector.h" @@ -39,16 +41,14 @@ using DataLayout = paddle::experimental::DataLayout; */ class KernelContext { public: - explicit KernelContext(const DeviceContext& dev_ctx) : dev_ctx_(dev_ctx) {} - KernelContext(const DeviceContext& dev_ctx, - const paddle::SmallVector>& inputs, - const paddle::SmallVector>& outputs, - const paddle::SmallVector& attrs) - : dev_ctx_(dev_ctx), inputs_(inputs), outputs_(outputs), attrs_(attrs) {} + KernelContext() = default; + explicit KernelContext(DeviceContext* dev_ctx) : dev_ctx_(dev_ctx) {} + + void SetDeviceContext(DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } template const CtxType& GetDeviceContext() const { - return static_cast(dev_ctx_); + return static_cast(*dev_ctx_); } void EmplaceBackInput(std::shared_ptr input) { @@ -59,14 +59,14 @@ class KernelContext { } void EmplaceBackInputs( - const paddle::SmallVector>& inputs) { + paddle::SmallVector> inputs) { int index = inputs_.size(); - for (auto in : inputs) { - inputs_.emplace_back(std::move(in)); - } // Record the start and end index of the input input_range_.emplace_back( std::pair(index, index + inputs.size())); + inputs_.insert(inputs_.end(), + std::make_move_iterator(inputs.begin()), + std::make_move_iterator(inputs.end())); } void EmplaceBackOutput(std::shared_ptr output) { @@ -77,14 +77,14 @@ class KernelContext { } void EmplaceBackOutputs( - const paddle::SmallVector>& outputs) { + paddle::SmallVector> outputs) { int index = outputs_.size(); - for (auto out : outputs) { - outputs_.emplace_back(std::move(out)); - } // Record the start and end index of the input output_range_.emplace_back( std::pair(index, index + outputs.size())); + outputs_.insert(outputs_.end(), + std::make_move_iterator(outputs.begin()), + std::make_move_iterator(outputs.end())); } void EmplaceBackAttr(paddle::any attr) { @@ -115,6 +115,19 @@ class KernelContext { return output_range_.at(idx); } + std::pair& MutableInputRangeAt(size_t idx) { + return input_range_[idx]; + } + + std::pair& MutableOutputRangeAt(size_t idx) { + return output_range_[idx]; + } + + template + TensorType* MutableInputAt(size_t idx) { + return static_cast(inputs_.at(idx).get()); + } + template TensorType* MutableOutputAt(size_t idx) { return static_cast(outputs_.at(idx).get()); @@ -140,12 +153,30 @@ class KernelContext { } } + // Temporary method: For compatible with fluid Tensor and improve performance + // Only deal with DenseTensor now + void ClearData() { + for (auto& in : inputs_) { + CompatibleDenseTensorUtils::ClearStorage( + static_cast(in.get())); + } + for (auto& out : outputs_) { + CompatibleDenseTensorUtils::ClearStorage( + static_cast(out.get())); + } + attrs_.clear(); + } + + size_t InputsSize() const { return inputs_.size(); } + size_t OutputsSize() const { return outputs_.size(); } + size_t AttrsSize() const { return attrs_.size(); } + private: bool IsDuplicable() const { return input_range_.size() != inputs_.size(); } private: // DeviceContext base class - const DeviceContext& dev_ctx_; + DeviceContext* dev_ctx_; // TODO(chenweihang): Tensor -> Tensor*, Tensor should by managed `scope` // Note: can't use API Tensor here, the inference don't use this API Tensor @@ -156,11 +187,6 @@ class KernelContext { // Only contains input like list[Tensor] need `range` paddle::SmallVector> input_range_; paddle::SmallVector> output_range_; - - // Only static graph need `name` - // TODO(chenweihang): replaced by paddle::string_view - paddle::SmallVector input_names_; - paddle::SmallVector output_names_; }; } // namespace pten