From 37e2f02713a825260492d7f179b361f38b129d44 Mon Sep 17 00:00:00 2001
From: zyfncg
Date: Wed, 27 Apr 2022 16:04:34 +0800
Subject: [PATCH] Optimize performance of dygraph (v4) (#42196)

* optimize performance of dygraph

* optimize performance of dygraph and elementwise_add

* optimize the trace op

* fix bug

* fix bug

* fix unittest bug

* fix code format
---
 paddle/fluid/framework/data_type.cc           |  4 +-
 paddle/fluid/framework/data_type.h            |  2 +-
 paddle/fluid/framework/op_registry.cc         | 12 ++--
 paddle/fluid/framework/op_registry.h          |  2 +-
 paddle/fluid/framework/phi_utils.cc           | 26 ++++----
 paddle/fluid/imperative/layer.cc              |  2 +-
 paddle/fluid/imperative/prepared_operator.cc  | 63 +++++++++++--------
 paddle/fluid/imperative/prepared_operator.h   |  4 ++
 paddle/fluid/imperative/tracer.cc             |  2 +-
 paddle/phi/core/compat/convert_utils.cc       | 43 ++++++-------
 paddle/phi/core/dense_tensor.cc               |  5 +-
 paddle/phi/core/kernel_utils.h                | 12 ++--
 paddle/phi/kernels/funcs/broadcast_function.h |  1 +
 .../kernels/impl/elementwise_kernel_impl.h    |  2 +
 14 files changed, 102 insertions(+), 78 deletions(-)

diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc
index 75ab747794..fda588db4d 100644
--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -109,8 +109,8 @@ size_t SizeOfType(proto::VarType::Type type) {
 }
 
 // Now only supports promotion of complex type
-bool NeedPromoteTypes(const proto::VarType::Type a,
-                      const proto::VarType::Type b) {
+inline bool NeedPromoteTypes(const proto::VarType::Type& a,
+                             const proto::VarType::Type& b) {
   return (IsComplexType(a) || IsComplexType(b));
 }
 
diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h
index 124f2a86e9..81a7f6a41b 100644
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -200,7 +200,7 @@ inline std::ostream& operator<<(std::ostream& out,
   return out;
 }
 
-extern inline bool IsComplexType(const proto::VarType::Type type) {
+extern inline bool IsComplexType(const proto::VarType::Type& type) {
   return (type == proto::VarType::COMPLEX64 ||
           type == proto::VarType::COMPLEX128);
 }
diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc
index d69edef784..d14254b735 100644
--- a/paddle/fluid/framework/op_registry.cc
+++ b/paddle/fluid/framework/op_registry.cc
@@ -21,13 +21,17 @@ namespace framework {
 
 std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
     const std::string& type, const VariableNameMap& inputs,
-    const VariableNameMap& outputs, AttributeMap attrs, bool attr_check) {
+    const VariableNameMap& outputs, const AttributeMap& attrs,
+    bool attr_check) {
   auto& info = OpInfoMap::Instance().Get(type);
   if (attr_check && info.Checker() != nullptr) {
-    info.Checker()->Check(&attrs);
+    auto tmp_attrs = attrs;
+    info.Checker()->Check(&tmp_attrs);
+    return std::unique_ptr<OperatorBase>(
+        info.Creator()(type, inputs, outputs, tmp_attrs));
   }
-  auto op = info.Creator()(type, inputs, outputs, attrs);
-  return std::unique_ptr<OperatorBase>(op);
+  return std::unique_ptr<OperatorBase>(
+      info.Creator()(type, inputs, outputs, attrs));
 }
 
 static VariableNameMap ConvertOpDescVarsToVarNameMap(
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index eb40a49b40..a1f07f9f25 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -129,7 +129,7 @@ class OpRegistry {
   static std::unique_ptr<OperatorBase> CreateOp(const std::string& type,
                                                 const VariableNameMap& inputs,
                                                 const VariableNameMap& outputs,
-                                                AttributeMap attrs,
+                                                const AttributeMap& attrs,
                                                 bool attr_check = true);
 
   static std::unique_ptr<OperatorBase> CreateOp(const proto::OpDesc& op_desc);
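
Note on the CreateOp change: taking attrs by value copied the whole AttributeMap on every op creation, even when no attribute checker ran. With const AttributeMap&, the copy happens only on the checker path, where defaults may be filled in. A minimal sketch of the pattern, with stand-in types rather than Paddle's actual API:

    #include <map>
    #include <string>

    using AttributeMap = std::map<std::string, int>;  // simplified stand-in

    // Before: every call pays for a full map copy at the call boundary.
    void CreateOpByValue(AttributeMap attrs) { (void)attrs; }

    // After: the copy is deferred to the one branch that must mutate attrs.
    void CreateOpByRef(const AttributeMap& attrs, bool need_check) {
      if (need_check) {
        AttributeMap tmp_attrs = attrs;  // copy only when defaults are filled
        // ... run the checker against tmp_attrs ...
        (void)tmp_attrs;
      }
      // fast path: no copy at all
    }
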
diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc
index 6fbb89cc8b..3eda00006f 100644
--- a/paddle/fluid/framework/phi_utils.cc
+++ b/paddle/fluid/framework/phi_utils.cc
@@ -81,19 +81,21 @@ OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) {
 phi::KernelKey TransOpKernelTypeToPhiKernelKey(
     const OpKernelType& kernel_type) {
   phi::Backend backend = phi::TransToPhiBackend(kernel_type.place_);
-  if (kernel_type.library_type_ == LibraryType::kMKLDNN) {
-    backend = phi::Backend::MKLDNN;
-  } else if (kernel_type.library_type_ == LibraryType::kCUDNN) {
-    backend = phi::Backend::GPUDNN;
-  } else if (kernel_type.library_type_ == LibraryType::kKP) {
-    backend = phi::Backend::KPS;
-  } else {
-    // do nothing
+  switch (kernel_type.library_type_) {
+    case LibraryType::kCUDNN:
+      backend = phi::Backend::GPUDNN;
+      break;
+    case LibraryType::kMKLDNN:
+      backend = phi::Backend::MKLDNN;
+      break;
+    case LibraryType::kKP:
+      backend = phi::Backend::KPS;
+      break;
+    default:
+      break;
   }
-  paddle::experimental::DataLayout layout = kernel_type.data_layout_;
-  paddle::experimental::DataType dtype =
-      paddle::framework::TransToPhiDataType(kernel_type.data_type_);
-  return phi::KernelKey(backend, layout, dtype);
+  return phi::KernelKey(backend, kernel_type.data_layout_,
+                        framework::TransToPhiDataType(kernel_type.data_type_));
 }
 
 phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key,
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 038ea57524..e928cbb654 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -459,7 +459,7 @@ static void OpBaseRunImpl(const framework::OperatorBase& op,
                           const framework::AttributeMap& attrs,
                           const framework::AttributeMap& default_attrs,
                           const platform::Place& place) {
-  auto* op_kernel = dynamic_cast<const framework::OperatorWithKernel*>(&op);
+  auto* op_kernel = static_cast<const framework::OperatorWithKernel*>(&op);
   PADDLE_ENFORCE_NOT_NULL(
       op_kernel, platform::errors::PermissionDenied(
                      "Only support operator with kernel in Dygraph mode."));
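
Note on the layer.cc change: dynamic_cast performs an RTTI lookup on every dygraph op launch, while static_cast is free at runtime. The trade-off is that static_cast never yields nullptr for a mismatched type, so the PADDLE_ENFORCE_NOT_NULL above can no longer fire for a wrong operator type; the cast is sound only under the dygraph invariant that every traced op is an OperatorWithKernel. A hypothetical micro-illustration of the trade-off, not Paddle code:

    struct OperatorBase { virtual ~OperatorBase() = default; };
    struct OperatorWithKernel : OperatorBase {};

    const OperatorWithKernel* AsKernelOp(const OperatorBase& op) {
      // dynamic_cast: per-call RTTI walk, returns nullptr on mismatch.
      // static_cast: zero runtime cost, but undefined behavior on mismatch,
      // so the caller must guarantee the dynamic type.
      return static_cast<const OperatorWithKernel*>(&op);
    }
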
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 1fef559f21..bf69f6cf5a 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -40,6 +40,13 @@ static const phi::Kernel empty_kernel;
 static const framework::RuntimeContext empty_ctx({}, {});
 static const framework::Scope empty_scope;
 
+const phi::KernelFactory& PreparedOp::phi_kernel_factory =
+    phi::KernelFactory::Instance();
+const phi::OpUtilsMap& PreparedOp::phi_op_utils_map =
+    phi::OpUtilsMap::Instance();
+const phi::DefaultKernelSignatureMap& PreparedOp::default_phi_kernel_sig_map =
+    phi::DefaultKernelSignatureMap::Instance();
+
 const std::shared_ptr<VariableWrapper>& GetVariableWrapper(
     const std::shared_ptr<paddle::imperative::VarBase>& var) {
   return var->SharedVar();
@@ -139,12 +146,14 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
       phi_kernel_(phi_kernel) {}
 
 template <typename VarType>
-PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
-                       const NameVarMap<VarType>& outs,
-                       const framework::OperatorWithKernel& op,
-                       const platform::Place& place,
-                       const framework::AttributeMap& attrs,
-                       const framework::AttributeMap& default_attrs) {
+PreparedOp PrepareImpl(
+    const NameVarMap<VarType>& ins, const NameVarMap<VarType>& outs,
+    const framework::OperatorWithKernel& op, const platform::Place& place,
+    const framework::AttributeMap& attrs,
+    const framework::AttributeMap& default_attrs,
+    const phi::KernelFactory& phi_kernel_factory,
+    const phi::OpUtilsMap& phi_op_utils_map,
+    const phi::DefaultKernelSignatureMap& default_phi_kernel_sig_map) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
@@ -184,15 +193,15 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
 
   bool has_phi_kernel = false;
 
-  const auto* arg_map_fn =
-      phi::OpUtilsMap::Instance().GetArgumentMappingFn(op.Type());
+  const auto* arg_map_fn = phi_op_utils_map.GetArgumentMappingFn(op.Type());
+
   if (arg_map_fn) {
     has_phi_kernel = true;
     kernel_signature = (*arg_map_fn)(
         framework::ExecutionArgumentMappingContext(dygraph_exe_ctx));
   } else {
     default_kernel_signature =
-        phi::DefaultKernelSignatureMap::Instance().GetNullable(op.Type());
+        default_phi_kernel_sig_map.GetNullable(op.Type());
     if (default_kernel_signature) {
       has_phi_kernel = true;
       kernel_signature = *default_kernel_signature;
@@ -228,8 +237,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
             << ", using_kernel_key:" << expected_kernel_key;
     phi::KernelKey try_pt_kernel_key =
         TransOpKernelTypeToPhiKernelKey(expected_kernel_key);
-    if (!phi::KernelFactory::Instance().HasKernel(pt_kernel_name,
-                                                  try_pt_kernel_key)) {
+    if (!phi_kernel_factory.HasKernel(pt_kernel_name, try_pt_kernel_key)) {
      expected_kernel_key.library_type_ = expected_kernel_key_library_type;
      VLOG(3) << "modify XPU KP kernel: " << op.Type() << " is failed "
              << expected_kernel_key;
@@ -239,8 +247,8 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
 #endif
 
   pt_kernel_key = TransOpKernelTypeToPhiKernelKey(expected_kernel_key);
-  auto& phi_kernel = phi::KernelFactory::Instance().SelectKernel(
-      pt_kernel_name, pt_kernel_key);
+  auto& phi_kernel =
+      phi_kernel_factory.SelectKernel(pt_kernel_name, pt_kernel_key);
 
   if (phi_kernel.IsValid()
 #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
@@ -295,11 +303,11 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
       || (is_xpu_unsupport && !is_xpu_kp_support)
 #endif
       ) {
-    if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op.Type())) {
+    if (has_phi_kernel) {
       auto pt_cpu_kernel_key =
           FallBackToCpu(expected_kernel_key, pt_kernel_key, op);
-      auto& pt_cpu_kernel = phi::KernelFactory::Instance().SelectKernel(
-          pt_kernel_name, pt_cpu_kernel_key);
+      auto& pt_cpu_kernel =
+          phi_kernel_factory.SelectKernel(pt_kernel_name, pt_cpu_kernel_key);
       if (pt_cpu_kernel.IsValid()) {
         VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name
                 << " | kernel key: " << pt_cpu_kernel_key
@@ -408,7 +416,9 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VarBase>& ins,
                                const platform::Place& place,
                                const framework::AttributeMap& attrs,
                                const framework::AttributeMap& default_attrs) {
-  return PrepareImpl<VarBase>(ins, outs, op, place, attrs, default_attrs);
+  return PrepareImpl<VarBase>(ins, outs, op, place, attrs, default_attrs,
+                              phi_kernel_factory, phi_op_utils_map,
+                              default_phi_kernel_sig_map);
 }
 
 PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
@@ -417,8 +427,9 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
                                const platform::Place& place,
                                const framework::AttributeMap& attrs,
                                const framework::AttributeMap& default_attrs) {
-  return PrepareImpl<VariableWrapper>(ins, outs, op, place, attrs,
-                                      default_attrs);
+  return PrepareImpl<VariableWrapper>(
+      ins, outs, op, place, attrs, default_attrs, phi_kernel_factory,
+      phi_op_utils_map, default_phi_kernel_sig_map);
 }
 
 PreparedOp PreparedOp::Prepare(const NameVarMap<egr::EagerVariable>& ins,
@@ -427,8 +438,9 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<egr::EagerVariable>& ins,
                                const platform::Place& place,
                                const framework::AttributeMap& attrs,
                                const framework::AttributeMap& default_attrs) {
-  return PrepareImpl<egr::EagerVariable>(ins, outs, op, place, attrs,
-                                         default_attrs);
+  return PrepareImpl<egr::EagerVariable>(
+      ins, outs, op, place, attrs, default_attrs, phi_kernel_factory,
+      phi_op_utils_map, default_phi_kernel_sig_map);
 }
 
 template <typename VarType>
 static void PreparedOpRunImpl(
@@ -441,7 +453,6 @@ static void PreparedOpRunImpl(
     const NameVarMap<VarType>& outs, const framework::AttributeMap& attrs,
     const framework::AttributeMap& default_attrs) {
   // TODO(zjl): remove scope in dygraph
-  framework::Scope scope;
 
   {
     platform::RecordEvent record_event("infer_shape",
@@ -458,8 +469,8 @@ static void PreparedOpRunImpl(
                                        platform::TracerEventType::OperatorInner,
                                        1, platform::EventRole::kInnerOp);
 
-    func(DygraphExecutionContext<VarType>(op, scope, *dev_ctx, ctx, ins, outs,
-                                          attrs, default_attrs));
+    func(DygraphExecutionContext<VarType>(op, empty_scope, *dev_ctx, ctx, ins,
+                                          outs, attrs, default_attrs));
   }
 
   if (FLAGS_check_nan_inf) {
@@ -503,7 +514,7 @@ static void PreparedOpRunPtImpl(
     const NameVarMap<VarType>& outs, const framework::AttributeMap& attrs,
     const framework::AttributeMap& default_attrs) {
   {
-    platform::RecordEvent record_event(op.Type() + "::infer_shape",
+    platform::RecordEvent record_event("infer_shape",
                                        platform::TracerEventType::OperatorInner,
                                        1, platform::EventRole::kInnerOp);
     DygraphInferShapeContext<VarType> infer_shape_ctx(
@@ -513,7 +524,7 @@ static void PreparedOpRunPtImpl(
   }
 
   {
-    platform::RecordEvent record_event(op.Type() + "::compute",
+    platform::RecordEvent record_event("compute",
                                        platform::TracerEventType::OperatorInner,
                                        1, platform::EventRole::kInnerOp);
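
Note on the cached singletons: phi::KernelFactory::Instance() and the other registries were previously looked up inside PrepareImpl for every traced op, and each call to a function with a function-local static re-checks that static's initialization guard. Binding the singletons once to static class members turns every later use into a plain reference. A minimal sketch of the pattern, with simplified names:

    class Registry {
     public:
      static Registry& Instance() {
        static Registry inst;  // init guard checked on every call
        return inst;
      }
    };

    class HotPath {
      // Bound once at program start-up; later uses are plain loads with no
      // function call or init-guard check on the hot path.
      static const Registry& cached_registry;
    };
    const Registry& HotPath::cached_registry = Registry::Instance();
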
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index 8d930d6ed2..9e729fee69 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -214,6 +214,10 @@ class PreparedOp {
   const phi::KernelSignature* default_kernel_signature_;
   phi::KernelSignature kernel_signature_;
   const phi::Kernel& phi_kernel_;
+
+  static const phi::KernelFactory& phi_kernel_factory;
+  static const phi::OpUtilsMap& phi_op_utils_map;
+  static const phi::DefaultKernelSignatureMap& default_phi_kernel_sig_map;
 };
 
 const inline framework::Attribute& GetAttr(
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index 47274f8a31..6c31b02550 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -192,7 +192,7 @@ void Tracer::TraceOpImpl(const std::string& type,
                          paddle::framework::AttributeMap* passed_default_attrs_,
                          bool use_default_attr_map) {
   platform::RecordEvent op_type_record_event(
-      type + " trace_op", platform::TracerEventType::Operator, 1);
+      "trace_op", platform::TracerEventType::Operator, 1);
   platform::ScopedFlushDenormal flush;
   VLOG(1) << "Trace Op: " << type;
   if (FLAGS_use_mkldnn) {
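
Note on the profiler labels: building type + " trace_op" here (and op.Type() + "::infer_shape" above) concatenates a fresh std::string for every traced op, typically with a heap allocation, and it happens whether or not profiling is enabled. A fixed literal removes that per-op work, at the cost of no longer carrying the op type in the event name. A rough sketch of the cost being removed, assuming a RecordEvent(const std::string&)-style hook as a stand-in for the profiler:

    #include <string>

    void RecordEvent(const std::string& name) { (void)name; }  // stand-in hook

    void TraceOld(const std::string& type) {
      RecordEvent(type + " trace_op");  // per-op concatenation and allocation
    }

    void TraceNew(const std::string& /*type*/) {
      RecordEvent("trace_op");  // fixed label, no per-op concatenation
    }
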
diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc
index 4388bd1f75..18c39bfae1 100644
--- a/paddle/phi/core/compat/convert_utils.cc
+++ b/paddle/phi/core/compat/convert_utils.cc
@@ -28,27 +28,28 @@ namespace phi {
 
 Backend TransToPhiBackend(const phi::Place& place) {
   auto allocation_type = place.GetType();
-  if (allocation_type == phi::AllocationType::CPU) {
-    return Backend::CPU;
-  } else if (allocation_type == phi::AllocationType::GPU) {
-    return Backend::GPU;
-  } else if (allocation_type == phi::AllocationType::GPUPINNED) {
-    return Backend::GPU;
-  } else if (allocation_type == phi::AllocationType::XPU) {
-    return Backend::XPU;
-  } else if (allocation_type == phi::AllocationType::NPU) {
-    return Backend::NPU;
-  } else if (allocation_type == phi::AllocationType::IPU) {
-    return Backend::IPU;
-  } else if (allocation_type == phi::AllocationType::MLU) {
-    return Backend::MLU;
-  } else if (allocation_type == phi::AllocationType::CUSTOM) {
-    return static_cast<Backend>(
-        static_cast<size_t>(Backend::NUM_BACKENDS) +
-        GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType()));
-  } else {
-    PADDLE_THROW(phi::errors::InvalidArgument(
-        "Unsupported transform %s to phi Backend.", place));
+  switch (allocation_type) {
+    case phi::AllocationType::GPU:
+      return Backend::GPU;
+    case AllocationType::CPU:
+      return Backend::CPU;
+    case AllocationType::GPUPINNED:
+      return Backend::GPU;
+    case AllocationType::XPU:
+      return Backend::XPU;
+    case AllocationType::NPU:
+      return Backend::NPU;
+    case AllocationType::IPU:
+      return Backend::IPU;
+    case AllocationType::MLU:
+      return Backend::MLU;
+    case AllocationType::CUSTOM:
+      return static_cast<Backend>(
+          static_cast<size_t>(Backend::NUM_BACKENDS) +
+          GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType()));
+    default:
+      PADDLE_THROW(phi::errors::InvalidArgument(
+          "Unsupported transform %s to phi Backend.", place));
   }
 }
diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc
index 2b9a5f5e0e..6c9291f816 100644
--- a/paddle/phi/core/dense_tensor.cc
+++ b/paddle/phi/core/dense_tensor.cc
@@ -135,7 +135,6 @@ void* DenseTensor::AllocateFrom(Allocator* allocator,
 
 template <typename T>
 const T* DenseTensor::data() const {
-  check_memory_size();
   PADDLE_ENFORCE_EQ(
       dtype(),
       paddle::experimental::CppTypeToDataType<T>::Type(),
@@ -147,13 +146,13 @@ const T* DenseTensor::data() const {
 
 template <typename T>
 T* DenseTensor::data() {
-  check_memory_size();
+  T* ret = static_cast<T*>(data());
   PADDLE_ENFORCE(
       (dtype() == paddle::experimental::CppTypeToDataType<T>::Type()),
       phi::errors::InvalidArgument(
           "The type of data we are trying to retrieve does not match the "
           "type of data currently contained in the container."));
-  return static_cast<T*>(data());
+  return ret;
 }
 
 void* DenseTensor::data() {
diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h
index ddc58f512b..f548d1da2d 100644
--- a/paddle/phi/core/kernel_utils.h
+++ b/paddle/phi/core/kernel_utils.h
@@ -75,7 +75,7 @@ namespace phi {
                     "Kernel's Input should appear before Attributes.");      \
       static_assert(out_idx == 0,                                            \
                     "Kernel's Input should appear before Outputs.");         \
-      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);           \
+      const std::pair<int, int>& range = ctx->InputRangeAt(in_idx);          \
       const tensor_type& arg = ctx->InputAt<tensor_type>(range.first);       \
       KernelCallHelper<Tail...>::                                            \
           template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>(      \
@@ -96,7 +96,7 @@ namespace phi {
                     "Kernel's Input should appear before Attributes.");      \
       static_assert(out_idx == 0,                                            \
                     "Kernel's Input should appear before Outputs.");         \
-      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);           \
+      const std::pair<int, int>& range = ctx->InputRangeAt(in_idx);          \
       auto arg = ctx->OptionalInputAt<tensor_type>(range.first);             \
       KernelCallHelper<Tail...>::                                            \
           template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>(      \
@@ -117,7 +117,7 @@ namespace phi {
                     "Kernel's Input should appear before Attributes.");      \
       static_assert(out_idx == 0,                                            \
                     "Kernel's Input should appear before Outputs.");         \
-      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);           \
+      const std::pair<int, int>& range = ctx->InputRangeAt(in_idx);          \
       std::vector<const tensor_type*> arg = std::move(                       \
           ctx->InputsBetween<tensor_type>(range.first, range.second));       \
       KernelCallHelper<Tail...>::                                            \
@@ -141,7 +141,7 @@ namespace phi {
                     "Kernel's Input should appear before Attributes.");      \
       static_assert(out_idx == 0,                                            \
                     "Kernel's Input should appear before Outputs.");         \
-      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);           \
+      const std::pair<int, int>& range = ctx->InputRangeAt(in_idx);          \
       paddle::optional<const std::vector<const tensor_type*>> arg =          \
           ctx->OptionalInputsBetween<tensor_type>(range.first, range.second); \
       KernelCallHelper<Tail...>::                                            \
           template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>(      \
@@ -195,7 +195,7 @@ namespace phi {
               int out_idx,                                                   \
               typename... PreviousArgs>                                      \
     static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {        \
-      const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);         \
+      const std::pair<int, int>& range = ctx->OutputRangeAt(out_idx);        \
       tensor_type* arg = ctx->MutableOutputAt<tensor_type>(range.first);     \
       KernelCallHelper<Tail...>::                                            \
           template Compute<dev_ctx_idx, in_idx, attr_idx, out_idx + 1>(      \
@@ -212,7 +212,7 @@ namespace phi {
               int out_idx,                                                   \
               typename... PreviousArgs>                                      \
     static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {        \
-      const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);         \
+      const std::pair<int, int>& range = ctx->OutputRangeAt(out_idx);        \
       std::vector<tensor_type*> arg = std::move(                             \
           ctx->MutableOutputBetween<tensor_type>(range.first, range.second)); \
       KernelCallHelper<Tail...>::                                            \
diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h
index 10216f80c0..aafa40a3d0 100644
--- a/paddle/phi/kernels/funcs/broadcast_function.h
+++ b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -592,6 +592,7 @@ void BroadcastKernel(const KPDevice &ctx,
                      int axis,
                      Functor func) {
   std::vector<int> dims_size;
+  dims_size.reserve(ins.size());
   bool no_broadcast_flag = true;
   for (auto *in : ins) {
     no_broadcast_flag &= ins[0]->dims() == in->dims();
diff --git a/paddle/phi/kernels/impl/elementwise_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_kernel_impl.h
index b126ca9b84..4f1e7af582 100644
--- a/paddle/phi/kernels/impl/elementwise_kernel_impl.h
+++ b/paddle/phi/kernels/impl/elementwise_kernel_impl.h
@@ -55,7 +55,9 @@ namespace phi {
                    int axis,                                          \
                    DenseTensor* out) {                                \
     std::vector<const DenseTensor*> inputs;                           \
+    inputs.reserve(2);                                                \
     std::vector<DenseTensor*> outputs;                                \
+    outputs.reserve(1);                                               \
     inputs.emplace_back(&x);                                          \
     inputs.emplace_back(&y);                                          \
     outputs.emplace_back(out);                                        \
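
Note on the container changes: binding the InputRangeAt/OutputRangeAt results to const std::pair<int, int>& skips one pair copy per kernel argument, and the new reserve calls size each vector once up front instead of letting push_back/emplace_back grow it incrementally. A minimal sketch of the reserve pattern, with stand-in types:

    #include <vector>

    void BuildDims(const std::vector<const int*>& ins) {
      std::vector<int> dims_size;
      dims_size.reserve(ins.size());  // single up-front allocation
      for (const int* in : ins) {
        dims_size.push_back(*in);  // never triggers a reallocation
      }
    }
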
--
GitLab