diff --git a/cmake/operators.cmake b/cmake/operators.cmake index e8d7ba1401ebe921f588391798e93ec03ec43783..c560dddfef5e79a113b00466be67bcd508afdbee 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -510,7 +510,7 @@ function(op_library TARGET) if(WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) # Append first implemented MKLDNN activation operator if(${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(gelu, MKLDNN);\n") elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n") diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index a3b1f730dfc24bdfe9490043f2319d4793e28a9e..d78c0c4356266b7d6567c2b1faada889aae3905b 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -20,17 +20,18 @@ #include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(leaky_relu); -USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); +PD_DECLARE_KERNEL(leaky_relu, OneDNN, ALL_LAYOUT); USE_OP_ITSELF(gelu); USE_OP_ITSELF(relu); USE_OP_ITSELF(tanh); -USE_OP_DEVICE_KERNEL(tanh, MKLDNN); +PD_DECLARE_KERNEL(tanh, OneDNN, ALL_LAYOUT); PD_DECLARE_ARG_MAPPING_FN(gelu); namespace paddle { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 6c85cee0b049d40095b44c8d6f12306eda0be9a9..a2bade9809b21c1843d8243e13aa1ffb4d72977d 100644 --- a/paddle/fluid/framework/operator.cc +++ 
b/paddle/fluid/framework/operator.cc @@ -2200,7 +2200,9 @@ Scope* OperatorWithKernel::PrepareData( (in_def->backend != phi::Backend::GPUDNN || tensor_backend != phi::Backend::GPU) && (in_def->backend != phi::Backend::KPS || - tensor_backend != phi::Backend::XPU)) || + tensor_backend != phi::Backend::XPU) && + (in_def->backend != phi::Backend::ONEDNN || + tensor_backend != phi::Backend::CPU)) || tensor_in->place().GetType() == AllocationType::GPUPINNED) { new_expected_kernel_key = std::make_unique( expected_kernel_key.data_type_, diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index e417d3ad2f23bd5fa8a175e5f0fcb35fa58d424f..e35568eb50c9aeabf47317e97062205992c34fc9 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -259,5 +259,5 @@ TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) { USE_OP_ITSELF(split); USE_OP_ITSELF(relu); #ifdef PADDLE_WITH_MKLDNN -USE_OP_DEVICE_KERNEL(relu, MKLDNN); +PD_DECLARE_KERNEL(relu, OneDNN, ALL_LAYOUT); #endif diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index eb0d03ce00a978c904d20c29a1a3728a5fc875b3..6fba33e10ffcf41bc94c3b6078136128d5785b82 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -196,100 +196,21 @@ struct SoftplusMKLDNNFunctor : public BaseActivationFunctor { } }; -template -using ReluMKLDNNFunctor = - MKLDNNActivationFunc; - template using Relu6MKLDNNFunctor = MKLDNNActivationFunc; -template -using SwishMKLDNNFunctor = - MKLDNNActivationFunc; - -template -using HardSwishMKLDNNFunctor = - MKLDNNActivationFunc; - -template -using MishMKLDNNFunctor = - MKLDNNActivationFunc; - -template -using SigmoidMKLDNNFunctor = - MKLDNNActivationFunc; - -template -using TanhMKLDNNFunctor = - MKLDNNActivationFunc; - -template -using SqrtMKLDNNFunctor = - 
MKLDNNActivationFunc; - template using AbsMKLDNNFunctor = MKLDNNActivationFunc; -template -using EluMKLDNNFunctor = MKLDNNActivationFunc; - -template -using ExpMKLDNNFunctor = MKLDNNActivationFunc; - -template -using RoundMKLDNNFunctor = - MKLDNNActivationFunc; - -template -using ReluMKLDNNGradFunctor = - MKLDNNActivationGradFunc; - template using Relu6MKLDNNGradFunctor = MKLDNNActivationGradFunc; -template -using SwishMKLDNNGradFunctor = - MKLDNNActivationGradFunc; - -template -using HardSwishMKLDNNGradFunctor = - MKLDNNActivationGradFunc; - -template -using MishMKLDNNGradFunctor = - MKLDNNActivationGradFunc; - -template -using SigmoidMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< - T, - dnnl::algorithm::eltwise_logistic_use_dst_for_bwd>; - -template -using TanhMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< - T, - dnnl::algorithm::eltwise_tanh_use_dst_for_bwd>; - -template -using SqrtMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< - T, - dnnl::algorithm::eltwise_sqrt_use_dst_for_bwd>; - template using AbsMKLDNNGradFunctor = MKLDNNActivationGradFunc; -template -using EluMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< - T, - dnnl::algorithm::eltwise_elu_use_dst_for_bwd>; - -template -using ExpMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< - T, - dnnl::algorithm::eltwise_exp_use_dst_for_bwd>; - } // namespace operators } // namespace paddle @@ -316,26 +237,13 @@ namespace ops = paddle::operators; ::paddle::platform::CPUPlace, \ ops::MKLDNNActivationKernel>); -#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ - __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); \ - __macro(elu, EluMKLDNNFunctor, EluMKLDNNGradUseOutFunctor); \ - __macro(exp, ExpMKLDNNFunctor, ExpMKLDNNGradUseOutFunctor); \ - __macro(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); \ - __macro(hard_swish, HardSwishMKLDNNFunctor, HardSwishMKLDNNGradFunctor); \ - __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ - __macro(mish, 
MishMKLDNNFunctor, MishMKLDNNGradFunctor); \ - __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ - __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); \ - __macro(sigmoid, SigmoidMKLDNNFunctor, SigmoidMKLDNNGradUseOutFunctor); \ - __macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradUseOutFunctor); \ - __macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor); \ - __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradUseOutFunctor); +#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ + __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); \ + __macro(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); \ + __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); -// round eltwise primitive doesn't support BF16, nor does it support grad -REGISTER_ACTIVATION_MKLDNN_KERNEL_FWD_ONLY(round, RoundMKLDNNFunctor); - namespace ops = paddle::operators; REGISTER_OP_KERNEL( softplus, diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 076460c4b76423c3c3f8627d90a0bd89f9924493..f05bd2635116c11cb48b73b4bef82d94f16e4252 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -25,13 +25,14 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(elementwise_mul); USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); USE_OP_ITSELF(relu); -USE_OP_DEVICE_KERNEL(relu, MKLDNN); +PD_DECLARE_KERNEL(relu, OneDNN, ALL_LAYOUT); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(conv2d); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index 
2a9f07494c5d4c0ea2e681fcfaca094f56d32718..196e018507069c94ddefbc5bcb84156ae0a8b805 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -30,7 +30,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(relu); -USE_OP_DEVICE_KERNEL(relu, MKLDNN); +PD_DECLARE_KERNEL(relu, OneDNN, ALL_LAYOUT); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 6303a137453846834ddf98247ce8f2df0f75f66e..db590807179d915e105a8f3cb7e40a50433fe974 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -30,7 +30,7 @@ USE_OP_ITSELF(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); USE_OP_ITSELF(relu); -USE_OP_DEVICE_KERNEL(relu, MKLDNN); +PD_DECLARE_KERNEL(relu, OneDNN, ALL_LAYOUT); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); USE_OP_ITSELF(shape); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 1c3208e4414bcb97da872e504c046fd51978b505..a88dc2a24863bfa2e828071bb53d93af89cb2550 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -25,6 +25,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/pool_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/kernels/funcs/onednn/mkldnn_reuse.h" namespace paddle { namespace platform { @@ -38,216 +39,8 @@ template -class MKLDNNHandlerNoCachingT { - public: - MKLDNNHandlerNoCachingT(dnnl::engine engine, platform::Place cpu_place) - : engine_(engine), place_(cpu_place), fwd_pd_(nullptr), bwd_pd_(nullptr) { - platform::MKLDNNDeviceContext::tls().log_lib_version(); - } - - std::shared_ptr AcquireForwardPrimitive() { - return std::make_shared(*fwd_pd_); - } - - std::shared_ptr AcquireBackwardPrimitive() { - return std::make_shared(*bwd_pd_); - } - - std::shared_ptr AcquireBackwardWeightsPrimitive() { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, - platform::errors::Unavailable("BWD_PD should be set when " - "getting BWD prim .")); - return std::make_shared(*bwd_w_pd_); - } - - std::shared_ptr AcquireSrcMemory( - const framework::Tensor* input) { - const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(fwd_pd_->src_desc(), - to_void_cast(input_data)); - } - - template - std::shared_ptr AcquireDstMemory(framework::Tensor* output) { - T_out* ptr = - output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); - return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr); - } - - template - std::shared_ptr AcquireDstMemory(void) { - return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc()); - } - - template - std::shared_ptr AcquireDstMemory( - const framework::Tensor* output) { - const T_out* output_data = output->data(); - return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(), - to_void_cast(output_data)); - } - - std::shared_ptr AcquireDiffDstMemory( - const framework::Tensor* diffdst) { - const T* ptr = diffdst->data(); - return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_dst_desc(), - to_void_cast(ptr)); - } - - std::shared_ptr AcquireDiffSrcMemory( - framework::Tensor* diffsrc) { - 
T* ptr = - diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); - return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_desc(), ptr); - } - - // Buffer of given Tensor is used for oneDNN computation - std::shared_ptr AcquireDiffWeightsMemory( - framework::Tensor* diff_weights) { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, - platform::errors::Unavailable( - "BWD_W_PD should be set when getting BWD grad of weights.")); - T* ptr = diff_weights->mutable_data( - place_, bwd_w_pd_->diff_weights_desc().get_size()); - return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), - ptr); - } - - // Buffer is allocated by oneDNN to store computation results - std::shared_ptr AcquireDiffWeightsMemory(void) { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, - platform::errors::Unavailable( - "BWD_W_PD should be set when getting BWD grad of weights.")); - return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc()); - } - - protected: - // If your primitive descriptor requires attributes, pass them as a - // first argument and paramters to descriptor constructor in the following - // arguments. Otherwise, all arguments will be forwarded to descriptor - // constructor, including the first one. - template - void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) { - CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); - } - - // Using sfinae to specialise variadic function. Workaround for not having - // if constexpr in C++ 11. - template - typename std::enable_if::type, - dnnl::primitive_attr>::value>::type - CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { - auto fwd_desc = typename TForward::desc(std::forward(args)...); - fwd_pd_ = std::make_shared( - fwd_desc, first, engine_); - } - - template - typename std::enable_if::type, - dnnl::primitive_attr>::value>::type - CreateForwardPrimitiveDescriptor(First&& first, Args&&... 
args) { - auto fwd_desc = typename TForward::desc(std::forward(first), - std::forward(args)...); - fwd_pd_ = - std::make_shared(fwd_desc, engine_); - } - - template - void AcquireBackwardPrimitiveDescriptor(Args&&... args) { - // fwd_pd_ is set during grad by calling - // AcquireForwardPrimitiveDescriptor - PADDLE_ENFORCE_NOT_NULL(fwd_pd_, - platform::errors::Unavailable( - "Get MKLDNN Forward primitive %s failed.")); - auto bwd_desc = typename TBackward::desc(std::forward(args)...); - bwd_pd_ = std::make_shared( - bwd_desc, engine_, *fwd_pd_); - } - - template - void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) { - // fwd_pd_ is set during grad by calling - // AcquireForwardPrimitiveDescriptor - PADDLE_ENFORCE_NOT_NULL(fwd_pd_, - platform::errors::Unavailable( - "Get MKLDNN Forward primitive %s failed.")); - auto bwd_desc = - typename TBackward_params::desc(std::forward(args)...); - bwd_w_pd_ = std::make_shared( - bwd_desc, engine_, *fwd_pd_); - } - - std::shared_ptr AcquireMemoryFromPrimitive( - dnnl::memory::desc md, void* ptr) { - return std::make_shared(md, engine_, ptr); - } - - std::shared_ptr AcquireMemoryFromPrimitive( - dnnl::memory::desc md) { - return std::make_shared(md, engine_); - } - - void AcquireReorder(const std::shared_ptr& user_memory_p, - const std::shared_ptr& target_memory_p) { - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - platform::RecordEvent record_reorder("int_reorder", - platform::TracerEventType::UserDefined, - 2, - platform::EventRole::kUniqueOp); - reorder_p->execute( - astream, - {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); - astream.wait(); - } - - template - std::shared_ptr AcquireMemoryWithReorder( - const dnnl::memory::desc& user_md, - const dnnl::memory::desc& target_md, - void* ptr, - bool is_persistent = false, - std::function(const F*)> custom_reorder_func = {}) { - std::shared_ptr 
target_memory_p; - if (custom_reorder_func) { - auto reordered_data = - custom_reorder_func(reinterpret_cast(ptr)); - ptr = reinterpret_cast(reordered_data.get()); - } - auto user_memory_p = std::make_shared(user_md, engine_, ptr); - if (user_md != target_md) { - target_memory_p = std::make_shared(target_md, engine_); - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - platform::RecordEvent record_reorder( - "int_reorder", - platform::TracerEventType::UserDefined, - 2, - platform::EventRole::kUniqueOp); - reorder_p->execute( - astream, - {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); - astream.wait(); - } else { - target_memory_p = user_memory_p; - } - return target_memory_p; - } - - dnnl::engine engine_; - platform::Place place_; - std::shared_ptr fwd_pd_; - std::shared_ptr bwd_pd_; - std::shared_ptr bwd_w_pd_; -}; +using MKLDNNHandlerNoCachingT = phi::funcs:: + MKLDNNHandlerNoCachingT; template +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/phi/backends/onednn/onednn_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +namespace funcs { + +using user_function = std::function(const float*)>; +using memory = dnnl::memory; +using Place = phi::Place; + +template +class MKLDNNHandlerNoCachingT { + public: + MKLDNNHandlerNoCachingT(dnnl::engine engine, Place cpu_place) + : engine_(engine), place_(cpu_place), fwd_pd_(nullptr), bwd_pd_(nullptr) { + phi::OneDNNContext::tls().log_lib_version(); + } + + std::shared_ptr AcquireForwardPrimitive() { + return std::make_shared(*fwd_pd_); + } + + std::shared_ptr AcquireBackwardPrimitive() { + return std::make_shared(*bwd_pd_); + } + + std::shared_ptr AcquireBackwardWeightsPrimitive() { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + 
phi::errors::Unavailable("BWD_PD should be set when " + "getting BWD prim .")); + return std::make_shared(*bwd_w_pd_); + } + + std::shared_ptr AcquireSrcMemory(const DenseTensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive( + fwd_pd_->src_desc(), paddle::platform::to_void_cast(input_data)); + } + + template + std::shared_ptr AcquireDstMemory(DenseTensor* output) { + T_out* ptr = + output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr); + } + + template + std::shared_ptr AcquireDstMemory(void) { + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc()); + } + + template + std::shared_ptr AcquireDstMemory(const DenseTensor* output) { + const T_out* output_data = output->data(); + return this->AcquireMemoryFromPrimitive( + bwd_pd_->dst_desc(), + paddle::platform::to_void_cast(output_data)); + } + + std::shared_ptr AcquireDiffDstMemory( + const DenseTensor* diffdst) { + const T* ptr = diffdst->data(); + return this->AcquireMemoryFromPrimitive( + bwd_pd_->diff_dst_desc(), paddle::platform::to_void_cast(ptr)); + } + + std::shared_ptr AcquireDiffSrcMemory(DenseTensor* diffsrc) { + T* ptr = + diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_desc(), ptr); + } + + // Buffer of given Tensor is used for oneDNN computation + std::shared_ptr AcquireDiffWeightsMemory( + DenseTensor* diff_weights) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + phi::errors::Unavailable( + "BWD_W_PD should be set when getting BWD grad of weights.")); + T* ptr = diff_weights->mutable_data( + place_, bwd_w_pd_->diff_weights_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), + ptr); + } + + // Buffer is allocated by oneDNN to store computation results + std::shared_ptr AcquireDiffWeightsMemory(void) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + 
phi::errors::Unavailable( + "BWD_W_PD should be set when getting BWD grad of weights.")); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc()); + } + + protected: + // If your primitive descriptor requires attributes, pass them as a + // first argument and parameters to descriptor constructor in the following + // arguments. Otherwise, all arguments will be forwarded to descriptor + // constructor, including the first one. + template + void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) { + CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); + } + + // Using sfinae to specialise variadic function. Workaround for not having + // if constexpr in C++ 11. + template + typename std::enable_if::type, + dnnl::primitive_attr>::value>::type + CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { + auto fwd_desc = typename TForward::desc(std::forward(args)...); + fwd_pd_ = std::make_shared( + fwd_desc, first, engine_); + } + + template + typename std::enable_if::type, + dnnl::primitive_attr>::value>::type + CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { + auto fwd_desc = typename TForward::desc(std::forward(first), + std::forward(args)...); + fwd_pd_ = + std::make_shared(fwd_desc, engine_); + } + + template + void AcquireBackwardPrimitiveDescriptor(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, + phi::errors::Unavailable("Get MKLDNN Forward primitive %s failed.")); + auto bwd_desc = typename TBackward::desc(std::forward(args)...); + bwd_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + } + + template + void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... 
args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, + phi::errors::Unavailable("Get MKLDNN Forward primitive %s failed.")); + auto bwd_desc = + typename TBackward_params::desc(std::forward(args)...); + bwd_w_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + dnnl::memory::desc md, void* ptr) { + return std::make_shared(md, engine_, ptr); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + dnnl::memory::desc md) { + return std::make_shared(md, engine_); + } + + void AcquireReorder(const std::shared_ptr& user_memory_p, + const std::shared_ptr& target_memory_p) { + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + + auto& astream = phi::OneDNNContext::tls().get_stream(); + + paddle::platform::RecordEvent record_reorder( + "int_reorder", + paddle::platform::TracerEventType::UserDefined, + 2, + paddle::platform::EventRole::kUniqueOp); + reorder_p->execute( + astream, + {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); + astream.wait(); + } + + template + std::shared_ptr AcquireMemoryWithReorder( + const dnnl::memory::desc& user_md, + const dnnl::memory::desc& target_md, + void* ptr, + bool is_persistent = false, + std::function(const F*)> custom_reorder_func = {}) { + std::shared_ptr target_memory_p; + if (custom_reorder_func) { + auto reordered_data = + custom_reorder_func(reinterpret_cast(ptr)); + ptr = reinterpret_cast(reordered_data.get()); + } + auto user_memory_p = std::make_shared(user_md, engine_, ptr); + if (user_md != target_md) { + target_memory_p = std::make_shared(target_md, engine_); + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + + auto& astream = phi::OneDNNContext::tls().get_stream(); + paddle::platform::RecordEvent record_reorder( + "int_reorder", + paddle::platform::TracerEventType::UserDefined, + 2, + paddle::platform::EventRole::kUniqueOp); + 
reorder_p->execute( + astream, + {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); + astream.wait(); + } else { + target_memory_p = user_memory_p; + } + return target_memory_p; + } + + dnnl::engine engine_; + Place place_; + std::shared_ptr fwd_pd_; + std::shared_ptr bwd_pd_; + std::shared_ptr bwd_w_pd_; +}; + +template +class ActivationMKLDNNHandler + : public MKLDNNHandlerNoCachingT { + public: + ActivationMKLDNNHandler(dnnl::algorithm algorithm, + float alpha, + float beta, + const dnnl::engine engine, + Place cpu_place, + const DenseTensor* x) + : MKLDNNHandlerNoCachingT(engine, cpu_place) { + this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, + algorithm, + x->mem_desc(), + alpha, + beta); + } + + ActivationMKLDNNHandler(dnnl::algorithm algorithm, + float alpha, + float beta, + const dnnl::engine engine, + Place cpu_place, + const DenseTensor* x, + const DenseTensor* dout) + : MKLDNNHandlerNoCachingT(engine, cpu_place) { + this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, + algorithm, + x->mem_desc(), + alpha, + beta); + this->AcquireBackwardPrimitiveDescriptor( + algorithm, dout->mem_desc(), x->mem_desc(), alpha, beta); + } + + std::shared_ptr AcquireBackwardSrcMemory( + const DenseTensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive( + this->bwd_pd_->src_desc(), + paddle::platform::to_void_cast(input_data)); + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/onednn/activation_grad_kernel.cc b/paddle/phi/kernels/onednn/activation_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..2eff072e647fc11c9debfbdeb895149ab7dc6cf3 --- /dev/null +++ b/paddle/phi/kernels/onednn/activation_grad_kernel.cc @@ -0,0 +1,251 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/activation_grad_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" +#include "paddle/phi/kernels/funcs/onednn/mkldnn_reuse.h" + +namespace phi { + +#define DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + DenseTensor* dx) { \ + functor_class functor; \ + functor(dev_ctx, x, dout, 0, 0, dx); \ + } + +#define DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + functor_class functor; \ + functor(dev_ctx, x, dout, attr, 0, dx); \ + } + +#define DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + DenseTensor* dx) { \ + functor_class functor; \ + functor(dev_ctx, out, dout, 0, 0, dx); \ + } + +#define DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const 
Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + functor_class functor; \ + functor(dev_ctx, out, dout, attr, 0, dx); \ + } + +template +void eltwise_grad(const OneDNNContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + float alpha, + float beta, + DenseTensor* dx, + dnnl::algorithm algorithm) { + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + funcs::ActivationMKLDNNHandler handler( + algorithm, alpha, beta, mkldnn_engine, dev_ctx.GetPlace(), &x, &dout); + + auto src_memory_p = handler.AcquireBackwardSrcMemory(&x); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(&dout); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); + auto activation_backward_p = handler.AcquireBackwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + activation_backward_p->execute(astream, + {{DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, + {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); + astream.wait(); + + dx->set_mem_desc(diff_src_memory_p->get_desc()); +} + +template +void eltwise_grad_use_out(const OneDNNContext& dev_ctx, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + float beta, + DenseTensor* dx, + dnnl::algorithm algorithm) { + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + funcs::ActivationMKLDNNHandler handler( + algorithm, alpha, beta, mkldnn_engine, dev_ctx.GetPlace(), &out, &dout); + + auto dst_memory_p = handler.AcquireBackwardSrcMemory(&out); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(&dout); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); + auto activation_backward_p = handler.AcquireBackwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + activation_backward_p->execute(astream, + {{DNNL_ARG_DST, *dst_memory_p}, + {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, + {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); + astream.wait(); + + 
dx->set_mem_desc(diff_src_memory_p->get_desc()); +} + +template +struct MKLDNNActivationGradFunc : public funcs::BaseActivationFunctor { + void operator()(const OneDNNContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + float alpha, + float beta, + DenseTensor* dx) const { + eltwise_grad(dev_ctx, x, dout, alpha, beta, dx, algorithm); + } +}; + +template +struct MKLDNNActivationGradUseOutFunc : public funcs::BaseActivationFunctor { + void operator()(const OneDNNContext& dev_ctx, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + float beta, + DenseTensor* dx) const { + eltwise_grad_use_out(dev_ctx, out, dout, alpha, beta, dx, algorithm); + } +}; + +template +using ReluMKLDNNGradFunctor = + MKLDNNActivationGradFunc; + +template +using SwishMKLDNNGradFunctor = + MKLDNNActivationGradFunc; + +template +using HardSwishMKLDNNGradFunctor = + MKLDNNActivationGradFunc; + +template +using MishMKLDNNGradFunctor = + MKLDNNActivationGradFunc; + +template +using SigmoidMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, + dnnl::algorithm::eltwise_logistic_use_dst_for_bwd>; + +template +using TanhMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, + dnnl::algorithm::eltwise_tanh_use_dst_for_bwd>; + +template +using SqrtMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, + dnnl::algorithm::eltwise_sqrt_use_dst_for_bwd>; + +template +using EluMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, + dnnl::algorithm::eltwise_elu_use_dst_for_bwd>; + +template +using ExpMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, + dnnl::algorithm::eltwise_exp_use_dst_for_bwd>; + +DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhMKLDNNGradUseOutFunctor); +DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, SqrtMKLDNNGradUseOutFunctor); +DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, + SigmoidMKLDNNGradUseOutFunctor); +DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, ExpMKLDNNGradUseOutFunctor); 
+DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluMKLDNNGradFunctor); + +DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, + ReluMKLDNNGradFunctor, + alpha); +DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, + MishMKLDNNGradFunctor, + threshold); +DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, + SwishMKLDNNGradFunctor, + beta); +template +void HardSwishGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + float threshold, + float scale, + float offset, + DenseTensor* dx) { + HardSwishMKLDNNGradFunctor functor; + functor(dev_ctx, x, dout, threshold, 0, dx); +} + +template +void EluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + DenseTensor* dx) { + EluMKLDNNGradUseOutFunctor functor; + functor(dev_ctx, out, dout, alpha, 0, dx); +} + +} // namespace phi + +PD_REGISTER_KERNEL(relu_grad, + OneDNN, + ALL_LAYOUT, + phi::ReluGradKernel, + float, + phi::dtype::bfloat16) {} + +#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \ + PD_REGISTER_KERNEL( \ + name, OneDNN, ALL_LAYOUT, phi::func, float, phi::dtype::bfloat16) {} + +PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(exp_grad, ExpGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_swish_grad, HardSwishGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(swish_grad, SwishGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel) diff --git a/paddle/phi/kernels/onednn/activation_kernel.cc b/paddle/phi/kernels/onednn/activation_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa0af71d399c392e8e5fb4b52b5d97bdea7a03b5 --- 
/dev/null +++ b/paddle/phi/kernels/onednn/activation_kernel.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/activation_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" +#include "paddle/phi/kernels/funcs/onednn/mkldnn_reuse.h" + +namespace phi { + +#define DEFINE_ONEDNN_ACTIVATION_KERNEL(name, functor_class) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ + functor_class functor; \ + functor(dev_ctx, x, 0, 0, out); \ + } + +#define DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out) { \ + functor_class functor; \ + functor(dev_ctx, x, attr, 0, out); \ + } + +template +void EltwiseForward(const OneDNNContext& dev_ctx, + const DenseTensor& x, + float alpha, + float beta, + DenseTensor* out, + dnnl::algorithm algorithm) { + PADDLE_ENFORCE_EQ(paddle::platform::is_cpu_place(dev_ctx.GetPlace()), + true, + phi::errors::PreconditionNotMet( + "Operator DNNL eletwise_forward must use ONEDNNPlace")); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + bool 
is_inplaced = x.IsSharedBufferWith(*out); + + funcs::ActivationMKLDNNHandler handler( + algorithm, alpha, beta, mkldnn_engine, dev_ctx.GetPlace(), &x); + + auto src_memory_p = handler.AcquireSrcMemory(&x); + std::shared_ptr dst_memory_p = nullptr; + if (is_inplaced) { + dst_memory_p = src_memory_p; + dev_ctx.template Alloc(out); + } else { + dst_memory_p = handler.AcquireDstMemory(out); + } + auto activation_p = handler.AcquireForwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + activation_p->execute( + astream, {{DNNL_ARG_FROM, *src_memory_p}, {DNNL_ARG_TO, *dst_memory_p}}); + astream.wait(); + + out->set_mem_desc(dst_memory_p->get_desc()); +} + +template +struct MKLDNNActivationFunc : public funcs::BaseActivationFunctor { + void operator()(const OneDNNContext& dev_ctx, + const DenseTensor& x, + float alpha, + float beta, + DenseTensor* out) const { + EltwiseForward(dev_ctx, x, alpha, beta, out, algorithm); + } +}; + +template +using ReluMKLDNNFunctor = + MKLDNNActivationFunc; + +template +using SwishMKLDNNFunctor = + MKLDNNActivationFunc; + +template +using HardSwishMKLDNNFunctor = + MKLDNNActivationFunc; + +template +using MishMKLDNNFunctor = + MKLDNNActivationFunc; + +template +using SigmoidMKLDNNFunctor = + MKLDNNActivationFunc; + +template +using TanhMKLDNNFunctor = + MKLDNNActivationFunc; + +template +using SqrtMKLDNNFunctor = + MKLDNNActivationFunc; + +template +using EluMKLDNNFunctor = MKLDNNActivationFunc; + +template +using ExpMKLDNNFunctor = MKLDNNActivationFunc; + +template +using RoundMKLDNNFunctor = + MKLDNNActivationFunc; + +DEFINE_ONEDNN_ACTIVATION_KERNEL(Relu, ReluMKLDNNFunctor) +DEFINE_ONEDNN_ACTIVATION_KERNEL(Tanh, TanhMKLDNNFunctor) +DEFINE_ONEDNN_ACTIVATION_KERNEL(Exp, ExpMKLDNNFunctor) +DEFINE_ONEDNN_ACTIVATION_KERNEL(Sqrt, SqrtMKLDNNFunctor) +DEFINE_ONEDNN_ACTIVATION_KERNEL(Sigmoid, SigmoidMKLDNNFunctor) +// round eltwise primitive doesn't support BF16, nor does it support grad +DEFINE_ONEDNN_ACTIVATION_KERNEL(Round, 
RoundMKLDNNFunctor) + +DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, ReluMKLDNNFunctor, alpha) +DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Mish, MishMKLDNNFunctor, threshold) +DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Elu, EluMKLDNNFunctor, alpha) +DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Swish, SwishMKLDNNFunctor, beta) + +template +void HardSwishKernel(const Context& dev_ctx, + const DenseTensor& x, + float threshold, + float scale, + float offset, + DenseTensor* out) { + HardSwishMKLDNNFunctor functor; + functor(dev_ctx, x, threshold, 0, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(round, OneDNN, ALL_LAYOUT, phi::RoundKernel, float) {} + +#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ + PD_REGISTER_KERNEL( \ + name, OneDNN, ALL_LAYOUT, phi::func, float, phi::dtype::bfloat16) {} + +PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) +PD_REGISTER_ACTIVATION_KERNEL(exp, ExpKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_swish, HardSwishKernel) +PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel) +PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel) +PD_REGISTER_ACTIVATION_KERNEL(swish, SwishKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(relu, ReluKernel)