diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index f2b424d055e47602816c00ea59207e59a271a0c7..fccb2ee5a755085e4964841af7055789a1c9c17e 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -212,10 +212,10 @@ std::shared_ptr TransferLayout(const std::string& var_name, out_layout = framework::DataLayout::kNCHW; } - if (in_layout == framework::DataLayout::MKLDNN && - out_layout != framework::DataLayout::MKLDNN) { + if (in_layout == framework::DataLayout::ONEDNN && + out_layout != framework::DataLayout::ONEDNN) { auto target_layout = phi::OneDNNContext::tls().get_cur_paddle_data_layout(); - VLOG(4) << "TransDataLayoutFromMKLDNN: " << in_layout << "->" + VLOG(4) << "TransDataLayoutFromOneDNN: " << in_layout << "->" << target_layout; if (out_layout == DataLayout::kNCHW && diff --git a/paddle/fluid/framework/phi_utils_test.cc b/paddle/fluid/framework/phi_utils_test.cc index 94ab77f310f9954a0f65835ad348c115bd8f20d3..5861e27a3982cc8cc0dbbbf2af607c4cfef90463 100644 --- a/paddle/fluid/framework/phi_utils_test.cc +++ b/paddle/fluid/framework/phi_utils_test.cc @@ -75,7 +75,7 @@ TEST(PhiUtils, TransOpKernelTypeToPhiKernelKey) { auto kernel_key_mkldnn = paddle::framework::TransOpKernelTypeToPhiKernelKey(op_kernel_type_mkldnn); ASSERT_EQ(kernel_key_mkldnn.dtype(), phi::DataType::FLOAT32); - ASSERT_EQ(kernel_key_mkldnn.layout(), phi::DataLayout::MKLDNN); + ASSERT_EQ(kernel_key_mkldnn.layout(), phi::DataLayout::ONEDNN); ASSERT_EQ(kernel_key_mkldnn.backend(), phi::Backend::ONEDNN); #endif diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index fd309a7f85f603797df3c5747d2e5c36b08787f9..933ac4f12e3c4ef4a300689729fb4151f35cb82f 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -39,537 +39,24 @@ template -using MKLDNNHandlerNoCachingT = phi::funcs:: - MKLDNNHandlerNoCachingT; +using MKLDNNHandlerT = + phi::funcs::OneDNNHandlerT; template -class MKLDNNHandlerT { - public: - MKLDNNHandlerT(const MKLDNNDeviceContext& dev_ctx, - dnnl::engine engine, - platform::Place cpu_place, - const std::string& base_key) - : dev_ctx_(dev_ctx), - engine_(engine), - place_(cpu_place), - key_common_(base_key), - key_(platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, base_key)), - fwd_pd_(nullptr), - bwd_pd_(nullptr) { - platform::MKLDNNDeviceContext::tls().log_lib_version(); - } - - std::shared_ptr AcquireForwardPrimitive() { - const std::string key_p = key_ + "@fwd_p"; - auto forward_p = - std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); - if (forward_p == nullptr) { - forward_p = std::make_shared(*fwd_pd_); - dev_ctx_.SetBlob(key_p, forward_p); - } - return forward_p; - } - - std::shared_ptr AcquireBackwardPrimitive() { - const std::string key_p = key_ + "@bwd_p"; - auto backward_p = - std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); - if (backward_p == nullptr) { - backward_p = std::make_shared(*bwd_pd_); - dev_ctx_.SetBlob(key_p, backward_p); - } - return backward_p; - } - - std::shared_ptr AcquireBackwardWeightsPrimitive() { - const std::string key_p = key_ + "@bwd_w_p"; - auto backward_p = - std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); - if (backward_p == nullptr) { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, - platform::errors::Unavailable("BWD_PD should be set when " - "getting BWD prim witk key: %s .", - key_p)); - backward_p = std::make_shared(*bwd_w_pd_); - dev_ctx_.SetBlob(key_p, 
backward_p); - } - return backward_p; - } - - std::shared_ptr AcquireSrcMemory( - const framework::Tensor* input) { - const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive( - fwd_pd_->src_desc(), to_void_cast(input_data), "@src_mem_p"); - } - - template - std::shared_ptr AcquireDstMemory(framework::Tensor* output) { - T_out* ptr = - output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); - return this->AcquireMemoryFromPrimitive( - fwd_pd_->dst_desc(), ptr, "@dst_mem_p"); - } - - template - std::shared_ptr AcquireDstMemory(void) { - return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), "@dstt_mem_p"); - } - - template - std::shared_ptr AcquireDstMemory( - const framework::Tensor* output) { - const T_out* output_data = output->data(); - return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(), - to_void_cast(output_data), - "@bwd-dst_mem_p"); - } - - std::shared_ptr AcquireDiffDstMemory( - const framework::Tensor* diffdst) { - const T* ptr = diffdst->data(); - return this->AcquireMemoryFromPrimitive( - bwd_pd_->diff_dst_desc(), to_void_cast(ptr), "@diff_dst_mem_p"); - } - - std::shared_ptr AcquireDiffSrcMemory( - framework::Tensor* diffsrc) { - T* ptr = - diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); - return this->AcquireMemoryFromPrimitive( - bwd_pd_->diff_src_desc(), ptr, "@diff_src_mem_p"); - } - - // Buffer of given Tensor is used for oneDNN computation - std::shared_ptr AcquireDiffWeightsMemory( - framework::Tensor* diff_weights) { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, - platform::errors::Unavailable( - "BWD_W_PD should be set when getting BWD grad of weights.")); - T* ptr = diff_weights->mutable_data( - place_, bwd_w_pd_->diff_weights_desc().get_size()); - return this->AcquireMemoryFromPrimitive( - bwd_w_pd_->diff_weights_desc(), ptr, "@diff_wei_mem_p"); - } - - // Buffer is allocated by oneDNN to store computation results - std::shared_ptr AcquireDiffWeightsMemory(void) { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, - platform::errors::Unavailable( - "BWD_W_PD should be set when getting BWD grad of weights.")); - return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), - "@diff_wei_mem_p"); - } - - protected: - bool isCached() { - const std::string key_pd = key_ + "@fwd_pd"; - fwd_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_pd)); - - return (fwd_pd_ != nullptr); - } - - bool isBwdCached() { - const std::string key_pd = key_ + "@bwd_pd"; - bwd_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_pd)); - - if (bwd_pd_ == nullptr) { - return false; - } else { - if (std::is_same::value == - false) { - const std::string key_bw_w_pd = key_ + "@bwd_w_pd"; - bwd_w_pd_ = - std::static_pointer_cast( - dev_ctx_.GetBlob(key_bw_w_pd)); - } - - // When BWD is cached then still we need to Get FWD PD - const std::string key_fpd = key_ + "@fwd_pd"; - fwd_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_fpd)); - PADDLE_ENFORCE_NOT_NULL( - fwd_pd_, - platform::errors::Unavailable( - "Error: FWD PD should be set when BWD PD is cached.")); - return true; - } - } - - // If your primitive descriptor requires attributes, pass them as a - // first argument and paramters to descriptor constructor in the following - // arguments. Otherwise, all arguments will be forwarded to descriptor - // constructor, including the first one. - template - void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... 
args) { - // This is used when we can recreate FWD PD in BWD so - // we do not need to pass FWD to BWD - const std::string key_pd = key_ + "@fwd_pd"; - fwd_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_pd)); - if (fwd_pd_ == nullptr) { - CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); - dev_ctx_.SetBlob(key_pd, fwd_pd_); - } - } - - // Using sfinae to specialise variadic function. Workaround for not having - // if constexpr in C++ 11. - template - typename std::enable_if::type, - dnnl::primitive_attr>::value>::type - CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { - auto fwd_desc = typename TForward::desc(std::forward(args)...); - fwd_pd_ = std::make_shared( - fwd_desc, first, engine_); - } - - template - typename std::enable_if::type, - dnnl::primitive_attr>::value>::type - CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { - auto fwd_desc = typename TForward::desc(std::forward(first), - std::forward(args)...); - fwd_pd_ = - std::make_shared(fwd_desc, engine_); - } - - template - void AcquireBackwardPrimitiveDescriptor(Args&&... args) { - // fwd_pd_ is set during grad by calling - // AcquireForwardPrimitiveDescriptor - PADDLE_ENFORCE_NOT_NULL( - fwd_pd_, - platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed.", - key_ + "@fwd_pd")); - const std::string key_pd = key_ + "@bwd_pd"; - bwd_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_pd)); - if (bwd_pd_ == nullptr) { - auto bwd_desc = typename TBackward::desc(std::forward(args)...); - bwd_pd_ = std::make_shared( - bwd_desc, engine_, *fwd_pd_); - dev_ctx_.SetBlob(key_pd, bwd_pd_); - } - } - - template - void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) { - // fwd_pd_ is set during grad by calling - // AcquireForwardPrimitiveDescriptor - PADDLE_ENFORCE_NOT_NULL( - fwd_pd_, - platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed.", - key_ + "@fwd_pd")); - const std::string key_pd = key_ + "@bwd_w_pd"; - bwd_w_pd_ = - std::static_pointer_cast( - dev_ctx_.GetBlob(key_pd)); - if (bwd_w_pd_ == nullptr) { - auto bwd_desc = - typename TBackward_params::desc(std::forward(args)...); - bwd_w_pd_ = std::make_shared( - bwd_desc, engine_, *fwd_pd_); - dev_ctx_.SetBlob(key_pd, bwd_w_pd_); - } - } - - std::shared_ptr AcquireMemoryFromPrimitive( - const std::string& suffix) { - return std::static_pointer_cast( - dev_ctx_.GetBlob(key_ + suffix)); - } - - std::shared_ptr AcquireMemoryFromPrimitive( - dnnl::memory::desc md, void* ptr, const std::string& suffix) { - const auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - mem_p = std::make_shared(md, engine_, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; - } - - std::shared_ptr AcquireMemoryFromPrimitive( - dnnl::memory::desc md, const std::string& suffix) { - const auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - mem_p = std::make_shared(md, engine_); - dev_ctx_.SetBlob(local_key, mem_p); - } - return mem_p; - } - - void AcquireReorder(const std::shared_ptr& user_memory_p, - const std::shared_ptr& target_memory_p) { - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - platform::RecordEvent record_reorder("int_reorder", - platform::TracerEventType::UserDefined, - 2, - 
platform::EventRole::kUniqueOp); - reorder_p->execute( - astream, - {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); - astream.wait(); - } - - template - std::shared_ptr AcquireMemoryWithReorder( - const dnnl::memory::desc& user_md, - const dnnl::memory::desc& target_md, - void* ptr, - const std::string& suffix, - bool is_persistent = false, - std::function(const F*)> custom_reorder_func = {}, - const std::vector& scale_data = {1.0f}, - int mask = 0) { - const auto target_key = key_ + suffix + "_target"; - const auto key_reorder_p = key_ + suffix + "reorder_p"; - const auto user_key = key_ + suffix + "_user"; - - auto target_memory_p = - std::static_pointer_cast(dev_ctx_.GetBlob(target_key)); - - if (target_memory_p == nullptr) { - if (custom_reorder_func) { - auto reordered_data = - custom_reorder_func(reinterpret_cast(ptr)); - dev_ctx_.SetBlob(key_reorder_p + "-custom_reorder", reordered_data); - ptr = reinterpret_cast(reordered_data.get()); - } - auto user_memory_p = - std::make_shared(user_md, engine_, ptr); - if (user_md != target_md) { - target_memory_p = std::make_shared(target_md, engine_); - dnnl::reorder::primitive_desc reorder_pdesc; - if (is_int8()) { - dnnl::primitive_attr attr; - attr.set_output_scales(mask, scale_data); - reorder_pdesc = dnnl::reorder::primitive_desc( - *user_memory_p, *target_memory_p, attr); - } else { - reorder_pdesc = - dnnl::reorder::primitive_desc(*user_memory_p, *target_memory_p); - } - auto reorder_p = std::make_shared(reorder_pdesc); - dev_ctx_.SetBlob(key_reorder_p, reorder_p); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - platform::RecordEvent record_reorder( - "int_reorder", - platform::TracerEventType::UserDefined, - 2, - platform::EventRole::kUniqueOp); - reorder_p->execute( - astream, - {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); - astream.wait(); - } else { - target_memory_p = user_memory_p; - } - dev_ctx_.SetBlob(user_key, user_memory_p); - dev_ctx_.SetBlob(target_key, target_memory_p); - } else if (!is_persistent) { - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - auto user_memory_p = - std::static_pointer_cast(dev_ctx_.GetBlob(user_key)); - user_memory_p->set_data_handle(ptr); - - // TODO(jczaja): Here we detect if reorder is cached it means it is needed - // need to change this to get rid of keys - auto reorder_p = std::static_pointer_cast( - dev_ctx_.GetBlob(key_reorder_p)); - if (reorder_p != nullptr) { - platform::RecordEvent record_reorder( - "int_reorder", - platform::TracerEventType::UserDefined, - 2, - platform::EventRole::kUniqueOp); - reorder_p->execute( - astream, - {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); - astream.wait(); - } - } - return target_memory_p; - } - - std::shared_ptr AcquireMemory(const std::string& suffix) { - const auto local_key = key_ + suffix; - return std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - } - - const MKLDNNDeviceContext& dev_ctx_; - dnnl::engine engine_; - platform::Place place_; - std::string key_common_; - std::string key_; - std::shared_ptr fwd_pd_; - std::shared_ptr bwd_pd_; - std::shared_ptr bwd_w_pd_; -}; +using MKLDNNHandlerNoCachingT = phi::funcs:: + OneDNNHandlerNoCachingT; template -class BinaryMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { - public: - BinaryMKLDNNHandler(const dnnl::algorithm algo, - const int axis, - const dnnl::engine engine, - platform::Place cpu_place, - const Tensor* x, - const Tensor* y, - Tensor* out, - float scale_x, - 
float scale_y, - float scale_out, - const dnnl::post_ops& post_ops = dnnl::post_ops{}) - : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - const auto src_x_tz = phi::vectorize(x->dims()); - const auto src_y_tz = phi::vectorize(y->dims()); - // if output tensor(z) is nullptr then we are computing into oneDNN - // managed buffer - auto rankdiff = x->dims().size() - y->dims().size(); - const auto dst_tz = (out == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) - : phi::vectorize(out->dims()); - - auto src0_md = x->mem_desc(); - auto src1_md = y->mem_desc(); - if (rankdiff > 0) { // Second input is of smaller rank than first - std::vector dims1_ex(rankdiff, 1); - dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)), - src_y_tz.begin(), - src_y_tz.end()); - // For broadcasting for NHWC we need rotate extended shape - if (MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() == - framework::DataLayout::kNHWC) { - std::rotate(dims1_ex.begin() + 1, dims1_ex.end() - 1, dims1_ex.end()); - } - src1_md = src1_md.reshape(dims1_ex); - } else if (rankdiff < 0) { // First input is of smaller than second - std::vector dims0_ex(-rankdiff, 1); - dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)), - src_x_tz.begin(), - src_x_tz.end()); - // For broadcasting for NHWC we need rotate extended shape - if (MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() == - framework::DataLayout::kNHWC) { - std::rotate(dims0_ex.begin() + 1, dims0_ex.end() - 1, dims0_ex.end()); - } - src0_md = src0_md.reshape(dims0_ex); - } - const auto dst_md = memory::desc( - dst_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::any); - - auto attributes = - CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops); - - if (x->numel() < y->numel()) { - this->AcquireForwardPrimitiveDescriptor( - attributes, algo, src1_md, src0_md, dst_md); - } else { - this->AcquireForwardPrimitiveDescriptor( - attributes, algo, src0_md, src1_md, dst_md); - } - } - std::shared_ptr AcquireSecondSrcMemory( - const framework::Tensor* input) { - const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src1_desc(), - to_void_cast(input_data)); - } - - private: - static inline dnnl::primitive_attr CreateAttributes( - dnnl::algorithm op, - float scale_x, - float scale_y, - float scale_out, - dnnl::post_ops post_ops = dnnl::post_ops{}) { - // Scales set in attributes for inputs contibute to the output equation - // in the following way (assuming no broadcasting takes place): - // output_i = scale_0 * x_i <+ or *> scale_1 * y_i; - // Hence we have to create scales that will: - // 1. Dequantize both values, by multiplying with (1.0 / scale_x_or_y) - // 2. Quantize their result to output scale range, by multiplying with - // (scale_z) - // If we combine these two, we end up with following equation - // output = scale_out * (1/scale_x * x <* or +> 1/scale_y * y) - // Hence, to mimic such behaviour using provided interface, - // For add operation the equation is equal to: - // output = (scale_out / scale_x) * x + (scale_out / scale_y) * y - // - // For mul operation on the other hand - // output = (scale_out / scale_x) * x * (1.0 / scale_y) * y - // - float scale_0 = scale_out / scale_x; - float scale_1 = - op == dnnl::algorithm::binary_add ? 
scale_out / scale_y : 1.0 / scale_y; - dnnl::primitive_attr attributes; - attributes.set_scales( - /* input_x_id = */ DNNL_ARG_SRC_0, /* mask = */ 0, {scale_0}); - attributes.set_scales( - /* input_y_id = */ DNNL_ARG_SRC_1, /* mask = */ 0, {scale_1}); - if (post_ops.len() > 0) attributes.set_post_ops(post_ops); - return attributes; - } -}; +using ReductionMKLDNNHandler = phi::funcs::ReductionOneDNNHandler; template -class BroadcastDataMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { - public: - BroadcastDataMKLDNNHandler(const dnnl::algorithm algo, - const dnnl::engine engine, - platform::Place cpu_place, - const Tensor* x, - Tensor* out, - float scale_x, - float scale_y, - const std::vector& extended_x_dims) - : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - const auto src0_tz = phi::vectorize(out->dims()); - const auto src0_md = - dnnl::memory::desc(src0_tz, - platform::MKLDNNGetDataType(), - platform::GetPlainMKLDNNFormat(src0_tz.size())); - const auto src1_md = x->mem_desc().reshape(extended_x_dims); - - dnnl::primitive_attr attributes; - attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x}); - attributes.set_scales(DNNL_ARG_SRC_1, 0, {scale_y}); - - this->AcquireForwardPrimitiveDescriptor( - attributes, algo, src0_md, src1_md, src0_md); - } +using BroadcastDataMKLDNNHandler = phi::funcs::BroadcastDataOneDNNHandler; - template - std::shared_ptr AcquireZeroedDstMemory(framework::Tensor* out) { - T_out* ptr = out->mutable_data(this->place_, - this->fwd_pd_->dst_desc().get_size()); - memset(ptr, 0, this->fwd_pd_->dst_desc().get_size()); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); - } -}; +template +using BinaryMKLDNNHandler = phi::funcs::BinaryOneDNNHandler; static void AppendActivation(const framework::ExecutionContext& ctx, dnnl::post_ops& post_ops, // NOLINT @@ -624,34 +111,6 @@ static void AppendActivation(const framework::ExecutionContext& ctx, } } -template -class ReductionMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { - public: - ReductionMKLDNNHandler(const dnnl::algorithm algo, - const float p, - const float eps, - const dnnl::engine engine, - platform::Place cpu_place, - const Tensor* x, - const Tensor* out, - std::vector out_tz, - const dnnl::primitive_attr& attrs = NULL) - : platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { - const auto out_md = memory::desc(out_tz, - platform::MKLDNNGetDataType(), - dnnl::memory::format_tag::any); - - if (attrs) - this->AcquireForwardPrimitiveDescriptor( - attrs, algo, x->mem_desc(), out_md, p, eps); - else - this->AcquireForwardPrimitiveDescriptor( - algo, x->mem_desc(), out_md, p, eps); - } -}; - template constexpr bool IsInt8() { return std::is_same::value || std::is_same::value; @@ -1071,37 +530,5 @@ class ReorderMKLDNNHandler { dnnl::memory::data_type dtype_, dtype_dst_; dnnl::engine engine_; }; - -template -static void SetDstMemoryQuantized( - const framework::ExecutionContext& ctx, - framework::Tensor* output, - std::vector dst_tz, - const dnnl::engine& engine, - std::shared_ptr& dst_md, // NOLINT - std::shared_ptr& dst_memory, // NOLINT - MKLDNNMemoryFormat output_format) { - T* output_data = output->mutable_data(ctx.GetPlace()); - const size_t dst_dims = dst_tz.size(); - MKLDNNMemoryFormat dst_fmt; - - PADDLE_ENFORCE_LE(dst_dims, - 5, - platform::errors::InvalidArgument( - "Dst memory for quantization can not have " - "dims > 5. 
But received dst_dims is %d.", - dst_dims)); - dst_fmt = platform::MKLDNNFormatForSize(dst_dims, output_format); - - auto tmp_dst_md = - platform::MKLDNNMemDesc({dst_tz}, - paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType()), - dst_fmt); - dst_md.reset(new dnnl::memory::desc(tmp_dst_md)); - dst_memory.reset( - new dnnl::memory(*dst_md, engine, to_void_cast(output_data))); -} - } // namespace platform } // namespace paddle diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index bd3d4ebba28341154e9dcbdd62ea68047d435851..75ea1d493d9355131a9d832429ca891cf73aef7e 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -56,7 +56,7 @@ BackendSet GetTensorBackendSet(const phi::TensorBase& t) { if (HasAllocation(t) && t.place().GetType() != AllocationType::UNDEFINED) { BackendSet backend_set(phi::TransToPhiBackend(t.place())); switch (t.layout()) { - case DataLayout::MKLDNN: + case DataLayout::ONEDNN: backend_set = backend_set | BackendSet(Backend::ONEDNN); break; default: diff --git a/paddle/phi/backends/onednn/onednn_helper.h b/paddle/phi/backends/onednn/onednn_helper.h index e25eafd2e027797e7e077d3632c049cac30beb63..aeaecf7491e616e501eb2275062da99dff2192ca 100644 --- a/paddle/phi/backends/onednn/onednn_helper.h +++ b/paddle/phi/backends/onednn/onednn_helper.h @@ -14,6 +14,7 @@ #pragma once +#include #include "dnnl.hpp" // NOLINT #include "glog/logging.h" @@ -94,6 +95,106 @@ inline dnnl::memory::format_tag GetPlainOneDNNFormat(int tensor_rank) { } } +template +dnnl::memory::data_type oneDNNGetDataType() { + return dnnl::memory::data_type::undef; +} + +template <> +inline dnnl::memory::data_type oneDNNGetDataType() { + return dnnl::memory::data_type::f32; +} +template <> +inline dnnl::memory::data_type oneDNNGetDataType() { + return dnnl::memory::data_type::s32; +} +template <> +inline dnnl::memory::data_type oneDNNGetDataType() { + return dnnl::memory::data_type::s8; +} +template <> +inline dnnl::memory::data_type oneDNNGetDataType() { + return dnnl::memory::data_type::u8; +} + +template <> +inline dnnl::memory::data_type oneDNNGetDataType() { + return dnnl::memory::data_type::bf16; +} + +inline std::vector> ToOneDNNPadding( + const std::vector& paddings) { + if (paddings.size() == 6) { + int padding_front = paddings[0]; + int padding_back = paddings[1]; + int padding_top = paddings[2]; + int padding_bottom = paddings[3]; + int padding_left = paddings[4]; + int padding_right = paddings[5]; + + return {{padding_front, padding_top, padding_left}, + {padding_back, padding_bottom, padding_right}}; + } else { + int padding_top = paddings[0]; + int padding_bottom = paddings[1]; + int padding_left = paddings[2]; + int padding_right = paddings[3]; + + return {{padding_top, padding_left}, {padding_bottom, padding_right}}; + } +} + +template +inline void AppendKey(std::string* key, const T& num) { + key->append(std::to_string(num)); +} + +template <> +inline void AppendKey(std::string* key, + const dnnl::memory::format_tag& format) { + key->append(std::to_string(static_cast(format))); +} + +template <> +inline void AppendKey(std::string* key, + const dnnl::memory::data_type& data_type) { + key->append(std::to_string(static_cast(data_type))); +} + +template <> +inline void AppendKey(std::string* key, const dnnl::algorithm& algorithm) { + key->append(std::to_string(static_cast(algorithm))); +} + +template <> +inline void AppendKey(std::string* key, + const dnnl::normalization_flags& flags) { + 
key->append(std::to_string(static_cast(flags))); +} + +inline void AppendKey(std::string* key, const std::string& str) { + key->append(str); +} + +inline void AppendKey(std::string* key, const char* str) { key->append(str); } + +template +inline void AppendKey(std::string* key, const std::vector& dims) { + for (size_t i = 0; i < dims.size(); i++) { + AppendKey(key, std::to_string(dims[i])); + } +} + +template +inline std::string CreateKey(const OneDNNContext& dev_ctx, ArgTypes&&... args) { + std::string key; + key.reserve(64); + using expand_type = int[]; + expand_type{0, (AppendKey(&key, std::forward(args)), 0)...}; + key += OneDNNContext::tls().get_key_suffix(); + return key; +} + inline void MatchShapeToLayout(DenseTensor* tensor_in, DataLayout from, DataLayout to) { @@ -117,28 +218,28 @@ inline void MatchShapeToLayout(DenseTensor* tensor_in, // at last nhwC, so for dim==2 these layouts are the same and nothing should // be done. Similarly for dim==1 when you have just one possible combination. if (tensor_in->dims().size() < 3) { - VLOG(3) << "Keeping MKLDNN/NHWC/NDHWC output_shape" + VLOG(3) << "Keeping ONEDNN/NHWC/NDHWC output_shape" << print_dims(phi::vectorize(tensor_in->dims())); return; } switch (from) { - case DataLayout::MKLDNN: + case DataLayout::ONEDNN: if ((to == DataLayout::NHWC) || (to == DataLayout::NDHWC)) { auto dims = phi::vectorize(tensor_in->dims()); std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end()); tensor_in->Resize(phi::make_ddim(dims)); - VLOG(3) << "Rotating Shape from: MKLDNN to: NHWC/NDHWC output_shape" + VLOG(3) << "Rotating Shape from: ONEDNN to: NHWC/NDHWC output_shape" << print_dims(dims); } break; case DataLayout::NHWC: case DataLayout::NDHWC: - if (to == DataLayout::MKLDNN) { + if (to == DataLayout::ONEDNN) { auto dims = phi::vectorize(tensor_in->dims()); std::rotate(dims.begin() + 1, dims.end() - 1, dims.end()); tensor_in->Resize(phi::make_ddim(dims)); - VLOG(3) << "Rotating Shape from: NHWC/NDHWC to: MKLDNN output_shape" + VLOG(3) << "Rotating Shape from: NHWC/NDHWC to: ONEDNN output_shape" << print_dims(dims); } break; @@ -158,5 +259,22 @@ inline dnnl::memory::desc OneDNNMemDesc(const std::vector& dims, return dnnl::memory::desc({dims}, data_type, format); } +inline std::string ThreadIDasStr(void) { + return std::to_string( + std::hash()(std::this_thread::get_id())); +} + +inline std::string ExtendKeyWithThreadInfoIfNeeded(const OneDNNContext& dev_ctx, + const std::string& key) { + return (OneDNNContext::tls().is_tid_used_in_key() == true) + ? 
key + "-t:" + ThreadIDasStr() + : key; +} + +template +bool constexpr is_int8() { + return std::is_same::value || std::is_same::value; +} + } // namespace funcs } // namespace phi diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h index e6194368378b1f8a903910c9a1c28173b6c8ded0..4a540ec884d9359fe34191fd170408a0dc76311e 100644 --- a/paddle/phi/backends/onednn/onednn_reuse.h +++ b/paddle/phi/backends/onednn/onednn_reuse.h @@ -33,19 +33,402 @@ namespace funcs { using user_function = std::function(const float*)>; using memory = dnnl::memory; -using Place = phi::Place; -using MKLDNNMemoryFormat = dnnl::memory::format_tag; +using OneDNNMemoryFormat = dnnl::memory::format_tag; template -class MKLDNNHandlerNoCachingT { +class OneDNNHandlerT { public: - MKLDNNHandlerNoCachingT(dnnl::engine engine, Place cpu_place) + OneDNNHandlerT(const OneDNNContext& dev_ctx, + dnnl::engine engine, + Place cpu_place, + const std::string& base_key) + : dev_ctx_(dev_ctx), + engine_(engine), + place_(cpu_place), + key_common_(base_key), + key_(ExtendKeyWithThreadInfoIfNeeded(dev_ctx, base_key)), + fwd_pd_(nullptr), + bwd_pd_(nullptr) { + OneDNNContext::tls().log_lib_version(); + } + + std::shared_ptr AcquireForwardPrimitive() { + const std::string key_p = key_ + "@fwd_p"; + auto forward_p = + std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); + if (forward_p == nullptr) { + forward_p = std::make_shared(*fwd_pd_); + dev_ctx_.SetBlob(key_p, forward_p); + } + return forward_p; + } + + std::shared_ptr AcquireBackwardPrimitive() { + const std::string key_p = key_ + "@bwd_p"; + auto backward_p = + std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); + if (backward_p == nullptr) { + backward_p = std::make_shared(*bwd_pd_); + dev_ctx_.SetBlob(key_p, backward_p); + } + return backward_p; + } + + std::shared_ptr AcquireBackwardWeightsPrimitive() { + const std::string key_p = key_ + "@bwd_w_p"; + auto backward_p = + std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); + if (backward_p == nullptr) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + errors::Unavailable("BWD_PD should be set when " + "getting BWD prim witk key: %s .", + key_p)); + backward_p = std::make_shared(*bwd_w_pd_); + dev_ctx_.SetBlob(key_p, backward_p); + } + return backward_p; + } + + std::shared_ptr AcquireSrcMemory(const DenseTensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive( + fwd_pd_->src_desc(), to_void_cast(input_data), "@src_mem_p"); + } + + template + std::shared_ptr AcquireDstMemory(DenseTensor* output) { + T_out* ptr = + output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); + return this->AcquireMemoryFromPrimitive( + fwd_pd_->dst_desc(), ptr, "@dst_mem_p"); + } + + template + std::shared_ptr AcquireDstMemory(void) { + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), "@dstt_mem_p"); + } + + template + std::shared_ptr AcquireDstMemory(const DenseTensor* output) { + const T_out* output_data = output->data(); + return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(), + to_void_cast(output_data), + "@bwd-dst_mem_p"); + } + + std::shared_ptr AcquireDiffDstMemory( + const DenseTensor* diffdst) { + const T* ptr = diffdst->data(); + return this->AcquireMemoryFromPrimitive( + bwd_pd_->diff_dst_desc(), to_void_cast(ptr), "@diff_dst_mem_p"); + } + + std::shared_ptr AcquireDiffSrcMemory(DenseTensor* diffsrc) { + T* ptr = + diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); + return this->AcquireMemoryFromPrimitive( + 
bwd_pd_->diff_src_desc(), ptr, "@diff_src_mem_p"); + } + + // Buffer of given DenseTensor is used for oneDNN computation + std::shared_ptr AcquireDiffWeightsMemory( + DenseTensor* diff_weights) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + errors::Unavailable( + "BWD_W_PD should be set when getting BWD grad of weights.")); + T* ptr = diff_weights->mutable_data( + place_, bwd_w_pd_->diff_weights_desc().get_size()); + return this->AcquireMemoryFromPrimitive( + bwd_w_pd_->diff_weights_desc(), ptr, "@diff_wei_mem_p"); + } + + // Buffer is allocated by oneDNN to store computation results + std::shared_ptr AcquireDiffWeightsMemory(void) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + errors::Unavailable( + "BWD_W_PD should be set when getting BWD grad of weights.")); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), + "@diff_wei_mem_p"); + } + + protected: + bool isCached() { + const std::string key_pd = key_ + "@fwd_pd"; + fwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + + return (fwd_pd_ != nullptr); + } + + bool isBwdCached() { + const std::string key_pd = key_ + "@bwd_pd"; + bwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + + if (bwd_pd_ == nullptr) { + return false; + } else { + if (std::is_same::value == + false) { + const std::string key_bw_w_pd = key_ + "@bwd_w_pd"; + bwd_w_pd_ = + std::static_pointer_cast( + dev_ctx_.GetBlob(key_bw_w_pd)); + } + + // When BWD is cached then still we need to Get FWD PD + const std::string key_fpd = key_ + "@fwd_pd"; + fwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_fpd)); + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, + errors::Unavailable( + "Error: FWD PD should be set when BWD PD is cached.")); + return true; + } + } + + // If your primitive descriptor requires attributes, pass them as a + // first argument and paramters to descriptor constructor in the following + // arguments. Otherwise, all arguments will be forwarded to descriptor + // constructor, including the first one. + template + void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) { + // This is used when we can recreate FWD PD in BWD so + // we do not need to pass FWD to BWD + const std::string key_pd = key_ + "@fwd_pd"; + fwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + if (fwd_pd_ == nullptr) { + CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); + dev_ctx_.SetBlob(key_pd, fwd_pd_); + } + } + + // Using sfinae to specialise variadic function. Workaround for not having + // if constexpr in C++ 11. + template + typename std::enable_if::type, + dnnl::primitive_attr>::value>::type + CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { + auto fwd_desc = typename TForward::desc(std::forward(args)...); + fwd_pd_ = std::make_shared( + fwd_desc, first, engine_); + } + + template + typename std::enable_if::type, + dnnl::primitive_attr>::value>::type + CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { + auto fwd_desc = typename TForward::desc(std::forward(first), + std::forward(args)...); + fwd_pd_ = + std::make_shared(fwd_desc, engine_); + } + + template + void AcquireBackwardPrimitiveDescriptor(Args&&... 
args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, + errors::Unavailable("Get OneDNN Forward primitive %s failed.", + key_ + "@fwd_pd")); + const std::string key_pd = key_ + "@bwd_pd"; + bwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + if (bwd_pd_ == nullptr) { + auto bwd_desc = typename TBackward::desc(std::forward(args)...); + bwd_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + dev_ctx_.SetBlob(key_pd, bwd_pd_); + } + } + + template + void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, + errors::Unavailable("Get OneDNN Forward primitive %s failed.", + key_ + "@fwd_pd")); + const std::string key_pd = key_ + "@bwd_w_pd"; + bwd_w_pd_ = + std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + if (bwd_w_pd_ == nullptr) { + auto bwd_desc = + typename TBackward_params::desc(std::forward(args)...); + bwd_w_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + dev_ctx_.SetBlob(key_pd, bwd_w_pd_); + } + } + + std::shared_ptr AcquireMemoryFromPrimitive( + const std::string& suffix) { + return std::static_pointer_cast( + dev_ctx_.GetBlob(key_ + suffix)); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + dnnl::memory::desc md, void* ptr, const std::string& suffix) { + const auto local_key = key_ + suffix; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + if (mem_p == nullptr) { + mem_p = std::make_shared(md, engine_, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + } + return mem_p; + } + + std::shared_ptr AcquireMemoryFromPrimitive( + dnnl::memory::desc md, const std::string& suffix) { + const auto local_key = key_ + suffix; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + if (mem_p == nullptr) { + mem_p = std::make_shared(md, engine_); + dev_ctx_.SetBlob(local_key, mem_p); + } + return mem_p; + } + + void AcquireReorder(const std::shared_ptr& user_memory_p, + const std::shared_ptr& target_memory_p) { + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + + auto& astream = OneDNNContext::tls().get_stream(); + + paddle::platform::RecordEvent record_reorder( + "int_reorder", + paddle::platform::TracerEventType::UserDefined, + 2, + paddle::platform::EventRole::kUniqueOp); + reorder_p->execute( + astream, + {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); + astream.wait(); + } + + template + std::shared_ptr AcquireMemoryWithReorder( + const dnnl::memory::desc& user_md, + const dnnl::memory::desc& target_md, + void* ptr, + const std::string& suffix, + bool is_persistent = false, + std::function(const F*)> custom_reorder_func = {}, + const std::vector& scale_data = {1.0f}, + int mask = 0) { + const auto target_key = key_ + suffix + "_target"; + const auto key_reorder_p = key_ + suffix + "reorder_p"; + const auto user_key = key_ + suffix + "_user"; + + auto target_memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(target_key)); + + if (target_memory_p == nullptr) { + if (custom_reorder_func) { + auto reordered_data = + custom_reorder_func(reinterpret_cast(ptr)); + dev_ctx_.SetBlob(key_reorder_p + "-custom_reorder", reordered_data); + ptr = reinterpret_cast(reordered_data.get()); + } + auto user_memory_p = + std::make_shared(user_md, engine_, ptr); + if (user_md != target_md) { + target_memory_p = std::make_shared(target_md, engine_); 
+ dnnl::reorder::primitive_desc reorder_pdesc; + if (is_int8()) { + dnnl::primitive_attr attr; + attr.set_output_scales(mask, scale_data); + reorder_pdesc = dnnl::reorder::primitive_desc( + *user_memory_p, *target_memory_p, attr); + } else { + reorder_pdesc = + dnnl::reorder::primitive_desc(*user_memory_p, *target_memory_p); + } + auto reorder_p = std::make_shared(reorder_pdesc); + dev_ctx_.SetBlob(key_reorder_p, reorder_p); + + auto& astream = OneDNNContext::tls().get_stream(); + paddle::platform::RecordEvent record_reorder( + "int_reorder", + paddle::platform::TracerEventType::UserDefined, + 2, + paddle::platform::EventRole::kUniqueOp); + reorder_p->execute( + astream, + {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); + astream.wait(); + } else { + target_memory_p = user_memory_p; + } + dev_ctx_.SetBlob(user_key, user_memory_p); + dev_ctx_.SetBlob(target_key, target_memory_p); + } else if (!is_persistent) { + auto& astream = OneDNNContext::tls().get_stream(); + + auto user_memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(user_key)); + user_memory_p->set_data_handle(ptr); + + // TODO(jczaja): Here we detect if reorder is cached it means it is needed + // need to change this to get rid of keys + auto reorder_p = std::static_pointer_cast( + dev_ctx_.GetBlob(key_reorder_p)); + if (reorder_p != nullptr) { + paddle::platform::RecordEvent record_reorder( + "int_reorder", + paddle::platform::TracerEventType::UserDefined, + 2, + paddle::platform::EventRole::kUniqueOp); + reorder_p->execute( + astream, + {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); + astream.wait(); + } + } + return target_memory_p; + } + + std::shared_ptr AcquireMemory(const std::string& suffix) { + const auto local_key = key_ + suffix; + return std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + } + + const OneDNNContext& dev_ctx_; + dnnl::engine engine_; + Place place_; + std::string key_common_; + std::string key_; + std::shared_ptr fwd_pd_; + std::shared_ptr bwd_pd_; + std::shared_ptr bwd_w_pd_; +}; + +template +class OneDNNHandlerNoCachingT { + public: + OneDNNHandlerNoCachingT(dnnl::engine engine, Place cpu_place) : engine_(engine), place_(cpu_place), fwd_pd_(nullptr), bwd_pd_(nullptr) { - phi::OneDNNContext::tls().log_lib_version(); + OneDNNContext::tls().log_lib_version(); } std::shared_ptr AcquireForwardPrimitive() { @@ -57,10 +440,9 @@ class MKLDNNHandlerNoCachingT { } std::shared_ptr AcquireBackwardWeightsPrimitive() { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, - phi::errors::Unavailable("BWD_PD should be set when " - "getting BWD prim .")); + PADDLE_ENFORCE_NOT_NULL(bwd_w_pd_, + errors::Unavailable("BWD_PD should be set when " + "getting BWD prim .")); return std::make_shared(*bwd_w_pd_); } @@ -102,12 +484,12 @@ class MKLDNNHandlerNoCachingT { return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_desc(), ptr); } - // Buffer of given Tensor is used for oneDNN computation + // Buffer of given DenseTensor is used for oneDNN computation std::shared_ptr AcquireDiffWeightsMemory( DenseTensor* diff_weights) { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, - phi::errors::Unavailable( + errors::Unavailable( "BWD_W_PD should be set when getting BWD grad of weights.")); T* ptr = diff_weights->mutable_data( place_, bwd_w_pd_->diff_weights_desc().get_size()); @@ -119,7 +501,7 @@ class MKLDNNHandlerNoCachingT { std::shared_ptr AcquireDiffWeightsMemory(void) { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, - phi::errors::Unavailable( + errors::Unavailable( "BWD_W_PD should be set when getting BWD 
grad of weights.")); return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc()); } @@ -161,7 +543,7 @@ class MKLDNNHandlerNoCachingT { // AcquireForwardPrimitiveDescriptor PADDLE_ENFORCE_NOT_NULL( fwd_pd_, - phi::errors::Unavailable("Get MKLDNN Forward primitive %s failed.")); + errors::Unavailable("Get oneDNN Forward primitive %s failed.")); auto bwd_desc = typename TBackward::desc(std::forward(args)...); bwd_pd_ = std::make_shared( bwd_desc, engine_, *fwd_pd_); @@ -173,7 +555,7 @@ class MKLDNNHandlerNoCachingT { // AcquireForwardPrimitiveDescriptor PADDLE_ENFORCE_NOT_NULL( fwd_pd_, - phi::errors::Unavailable("Get MKLDNN Forward primitive %s failed.")); + errors::Unavailable("Get oneDNN Forward primitive %s failed.")); auto bwd_desc = typename TBackward_params::desc(std::forward(args)...); bwd_w_pd_ = std::make_shared( @@ -195,7 +577,7 @@ class MKLDNNHandlerNoCachingT { auto reorder_p = std::make_shared(*user_memory_p, *target_memory_p); - auto& astream = phi::OneDNNContext::tls().get_stream(); + auto& astream = OneDNNContext::tls().get_stream(); paddle::platform::RecordEvent record_reorder( "int_reorder", @@ -227,7 +609,7 @@ class MKLDNNHandlerNoCachingT { auto reorder_p = std::make_shared(*user_memory_p, *target_memory_p); - auto& astream = phi::OneDNNContext::tls().get_stream(); + auto& astream = OneDNNContext::tls().get_stream(); paddle::platform::RecordEvent record_reorder( "int_reorder", paddle::platform::TracerEventType::UserDefined, @@ -252,7 +634,7 @@ class MKLDNNHandlerNoCachingT { template class ActivationOneDNNHandler - : public MKLDNNHandlerNoCachingT { public: @@ -262,7 +644,7 @@ class ActivationOneDNNHandler const dnnl::engine engine, Place cpu_place, const DenseTensor* x) - : MKLDNNHandlerNoCachingT(engine, cpu_place) { this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, @@ -279,7 +661,7 @@ class ActivationOneDNNHandler Place cpu_place, const DenseTensor* x, const DenseTensor* dout) - : MKLDNNHandlerNoCachingT(engine, cpu_place) { this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, @@ -330,7 +712,7 @@ class ReorderOneDNNHandler { return std::make_shared(md, engine_, ptr); } - std::shared_ptr AcquireSrcMemory(const MKLDNNMemoryFormat& fmt, + std::shared_ptr AcquireSrcMemory(const OneDNNMemoryFormat& fmt, void* ptr) { auto md = dnnl::memory::desc(dims_, dtype_, fmt); return std::make_shared(md, engine_, ptr); @@ -347,7 +729,7 @@ class ReorderOneDNNHandler { } std::shared_ptr AcquireDstMemory(DenseTensor* output, - const MKLDNNMemoryFormat& fmt, + const OneDNNMemoryFormat& fmt, Place place) { auto dst_md = OneDNNMemDesc(dims_, dtype_dst_, fmt); auto dst_data = output->mutable_data(place, ptype_dst_, dst_md.get_size()); @@ -372,7 +754,7 @@ class ReorderOneDNNHandler { std::shared_ptr AcquireDstMemory( DenseTensor* output, const std::vector& dims, - const MKLDNNMemoryFormat& fmt, + const OneDNNMemoryFormat& fmt, Place place) { auto dst_md = OneDNNMemDesc(dims, dtype_dst_, fmt); auto dst_data = output->mutable_data(place, ptype_dst_, dst_md.get_size()); @@ -400,5 +782,170 @@ class ReorderOneDNNHandler { dnnl::engine engine_; }; +template +class BinaryOneDNNHandler : public OneDNNHandlerNoCachingT { + public: + BinaryOneDNNHandler(const dnnl::algorithm algo, + const int axis, + const dnnl::engine engine, + Place cpu_place, + const DenseTensor* x, + const DenseTensor* y, + DenseTensor* out, + float scale_x, + float scale_y, + float scale_out, + const dnnl::post_ops& post_ops = dnnl::post_ops{}) + : 
OneDNNHandlerNoCachingT(engine, cpu_place) { + const auto src_x_tz = vectorize(x->dims()); + const auto src_y_tz = vectorize(y->dims()); + // If output tensor(z) is nullptr then we are computing into oneDNN + // managed buffer + auto rankdiff = x->dims().size() - y->dims().size(); + const auto dst_tz = (out == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) + : vectorize(out->dims()); + + auto src0_md = x->mem_desc(); + auto src1_md = y->mem_desc(); + if (rankdiff > 0) { // Second input is of smaller rank than first + std::vector dims1_ex(rankdiff, 1); + dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)), + src_y_tz.begin(), + src_y_tz.end()); + // For broadcasting for NHWC we need to rotate the extended shape + if (OneDNNContext::tls().get_cur_paddle_data_layout() == + DataLayout::kNHWC) { + std::rotate(dims1_ex.begin() + 1, dims1_ex.end() - 1, dims1_ex.end()); + } + src1_md = src1_md.reshape(dims1_ex); + } else if (rankdiff < 0) { // First input is of smaller rank than second + std::vector dims0_ex(-rankdiff, 1); + dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)), + src_x_tz.begin(), + src_x_tz.end()); + // For broadcasting for NHWC we need to rotate the extended shape + if (OneDNNContext::tls().get_cur_paddle_data_layout() == + DataLayout::kNHWC) { + std::rotate(dims0_ex.begin() + 1, dims0_ex.end() - 1, dims0_ex.end()); + } + src0_md = src0_md.reshape(dims0_ex); + } + const auto dst_md = + memory::desc(dst_tz, oneDNNGetDataType(), OneDNNMemoryFormat::any); + + auto attributes = + CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops); + + if (x->numel() < y->numel()) { + this->AcquireForwardPrimitiveDescriptor( + attributes, algo, src1_md, src0_md, dst_md); + } else { + this->AcquireForwardPrimitiveDescriptor( + attributes, algo, src0_md, src1_md, dst_md); + } + } + std::shared_ptr AcquireSecondSrcMemory( + const DenseTensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src1_desc(), + to_void_cast(input_data)); + } + + private: + static inline dnnl::primitive_attr CreateAttributes( + dnnl::algorithm op, + float scale_x, + float scale_y, + float scale_out, + dnnl::post_ops post_ops = dnnl::post_ops{}) { + // Scales set in attributes for inputs contribute to the output equation + // in the following way (assuming no broadcasting takes place): + // output_i = scale_0 * x_i <+ or *> scale_1 * y_i; + // Hence we have to create scales that will: + // 1. Dequantize both values, by multiplying with (1.0 / scale_x_or_y) + // 2. Quantize their result to output scale range, by multiplying with + // (scale_z) + // If we combine these two, we end up with the following equation + // output = scale_out * (1/scale_x * x <* or +> 1/scale_y * y) + // Hence, to mimic such behaviour using the provided interface, + // For add operation the equation is equal to: + // output = (scale_out / scale_x) * x + (scale_out / scale_y) * y + // + // For mul operation on the other hand + // output = (scale_out / scale_x) * x * (1.0 / scale_y) * y + // + float scale_0 = scale_out / scale_x; + float scale_1 = + op == dnnl::algorithm::binary_add ?
scale_out / scale_y : 1.0 / scale_y; + dnnl::primitive_attr attributes; + attributes.set_scales( + /* input_x_id = */ DNNL_ARG_SRC_0, /* mask = */ 0, {scale_0}); + attributes.set_scales( + /* input_y_id = */ DNNL_ARG_SRC_1, /* mask = */ 0, {scale_1}); + if (post_ops.len() > 0) attributes.set_post_ops(post_ops); + return attributes; + } +}; + +template +class BroadcastDataOneDNNHandler + : public OneDNNHandlerNoCachingT { + public: + BroadcastDataOneDNNHandler(const dnnl::algorithm algo, + const dnnl::engine engine, + Place cpu_place, + const DenseTensor* x, + DenseTensor* out, + float scale_x, + float scale_y, + const std::vector& extended_x_dims) + : OneDNNHandlerNoCachingT(engine, cpu_place) { + const auto src0_tz = vectorize(out->dims()); + const auto src0_md = dnnl::memory::desc( + src0_tz, oneDNNGetDataType(), GetPlainOneDNNFormat(src0_tz.size())); + const auto src1_md = x->mem_desc().reshape(extended_x_dims); + + dnnl::primitive_attr attributes; + attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x}); + attributes.set_scales(DNNL_ARG_SRC_1, 0, {scale_y}); + + this->AcquireForwardPrimitiveDescriptor( + attributes, algo, src0_md, src1_md, src0_md); + } + + template + std::shared_ptr AcquireZeroedDstMemory(DenseTensor* out) { + T_out* ptr = out->mutable_data(this->place_, + this->fwd_pd_->dst_desc().get_size()); + memset(ptr, 0, this->fwd_pd_->dst_desc().get_size()); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); + } +}; + +template +class ReductionOneDNNHandler + : public OneDNNHandlerNoCachingT { + public: + ReductionOneDNNHandler(const dnnl::algorithm algo, + const float p, + const float eps, + const dnnl::engine engine, + Place cpu_place, + const DenseTensor* x, + const DenseTensor* out, + std::vector out_tz, + const dnnl::primitive_attr& attrs = NULL) + : OneDNNHandlerNoCachingT(engine, cpu_place) { + const auto out_md = memory::desc( + out_tz, oneDNNGetDataType(), dnnl::memory::format_tag::any); + + if (attrs) + this->AcquireForwardPrimitiveDescriptor( + attrs, algo, x->mem_desc(), out_md, p, eps); + else + this->AcquireForwardPrimitiveDescriptor( + algo, x->mem_desc(), out_md, p, eps); + } +}; } // namespace funcs } // namespace phi diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index bfdc3814823183069a983bd46f4b31179e07bc1e..6f1774fe8e46ac35bb6fabe05350c1ba307d59c0 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -32,7 +32,7 @@ namespace experimental { * more specific, we need to distinguish the calculation method. * * Such as the kernel for CPU device, it can be a native CPU kernel, - * or a kernel implemented by MKLDNN library. + * or a kernel implemented by oneDNN library. 
* * Note(chenweihang): HIP is not needed now, we can added it if needed * in the future diff --git a/paddle/phi/common/layout.h b/paddle/phi/common/layout.h index 2d74abeb84d641fd6ed23fc67a6c5d68c21e564d..b7f4abcc63a62670059ec7853ed9223c691e9200 100644 --- a/paddle/phi/common/layout.h +++ b/paddle/phi/common/layout.h @@ -40,7 +40,7 @@ enum class DataLayout { NCHW, NCDHW, NDHWC, - MKLDNN, + ONEDNN, SPARSE_COO, SPARSE_CSR, PSTRING_UNION, @@ -62,7 +62,7 @@ enum class DataLayout { kAnyLayout = ANY, kNHWC = NHWC, kNCHW = NCHW, - kMKLDNN = MKLDNN, // all layouts supported by MKLDNN internally + kMKLDNN = ONEDNN, // all layouts supported by ONEDNN internally kNDHWC = NDHWC, kNCDHW = NCDHW, }; diff --git a/paddle/phi/kernels/onednn/log_softmax_kernel.cc b/paddle/phi/kernels/onednn/log_softmax_kernel.cc index 254e975dd45ec43820c16d692cad7a229149ec7f..963bb7e0e3224d3de6ec7c4b6249b51ce2710e32 100644 --- a/paddle/phi/kernels/onednn/log_softmax_kernel.cc +++ b/paddle/phi/kernels/onednn/log_softmax_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/log_softmax_kernel.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/phi/backends/onednn/onednn_context.h" +#include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" @@ -23,16 +23,15 @@ namespace phi { template -class LogSoftmaxMKLDNNHandler - : public paddle::platform:: - MKLDNNHandlerNoCachingT { +class LogSoftmaxOneDNNHandler + : public funcs::OneDNNHandlerNoCachingT { public: - LogSoftmaxMKLDNNHandler(const dnnl::engine mkldnn_engine, + LogSoftmaxOneDNNHandler(const dnnl::engine onednn_engine, Place cpu_place, const DenseTensor& x, const int axis) - : paddle::platform::MKLDNNHandlerNoCachingT( - mkldnn_engine, cpu_place) { + : funcs::OneDNNHandlerNoCachingT( + onednn_engine, cpu_place) { this->AcquireForwardPrimitiveDescriptor( dnnl::prop_kind::forward_inference, x.mem_desc(), axis); } @@ -43,11 +42,11 @@ void LogSoftmaxKernel(const Context& dev_ctx, const DenseTensor& x, int axis, DenseTensor* out) { - const auto& mkldnn_engine = dev_ctx.GetEngine(); + const auto& onednn_engine = dev_ctx.GetEngine(); axis = axis >= 0 ? 
axis : x.dims().size() + axis; - LogSoftmaxMKLDNNHandler handler( - mkldnn_engine, dev_ctx.GetPlace(), x, axis); + LogSoftmaxOneDNNHandler handler( + onednn_engine, dev_ctx.GetPlace(), x, axis); auto src_memory_p = handler.AcquireSrcMemory(&x); auto dst_memory_p = handler.AcquireDstMemory(out); diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index 8aa3a2257e2464d390e85753371ca7669908be9f..25a986ea82fb020457dee5b7741d5bd7e70238a6 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -97,7 +97,7 @@ void TransferLayoutMKLDNN(const Context& dev_ctx, // NOTE(zhiqiu): to handle the special case in ApplyDataTransform() in // data_transfer.cc - if (!x.IsInitialized() && src_layout == DataLayout::MKLDNN && + if (!x.IsInitialized() && src_layout == DataLayout::ONEDNN && dst_layout == DataLayout::NHWC) { VLOG(4) << src_layout << "->" << dst_layout << " " << x.layout(); out->Resize(x.dims()); @@ -106,7 +106,7 @@ void TransferLayoutMKLDNN(const Context& dev_ctx, return; } - if (src_layout != DataLayout::MKLDNN && dst_layout == DataLayout::MKLDNN) { + if (src_layout != DataLayout::ONEDNN && dst_layout == DataLayout::ONEDNN) { // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Just set layout/format. No real transform occur auto out_format = funcs::OneDNNFormatForSize( @@ -121,16 +121,16 @@ void TransferLayoutMKLDNN(const Context& dev_ctx, OneDNNContext::tls().set_cur_paddle_data_layout(src_layout); } - out->set_layout(DataLayout::MKLDNN); + out->set_layout(DataLayout::ONEDNN); out->set_format(out_format); - } else if (src_layout == DataLayout::MKLDNN && - dst_layout != DataLayout::MKLDNN) { + } else if (src_layout == DataLayout::ONEDNN && + dst_layout != DataLayout::ONEDNN) { // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel // Do transform via MKLDNN lib funcs::innerTransDataLayoutFromOneDNN( src_layout, dst_layout, x, out, dev_ctx.GetPlace()); - } else if (src_layout == DataLayout::MKLDNN && - dst_layout == DataLayout::MKLDNN) { + } else if (src_layout == DataLayout::ONEDNN && + dst_layout == DataLayout::ONEDNN) { PADDLE_ENFORCE_NE( src_layout, dst_layout, diff --git a/paddle/phi/tests/common/test_data_layout.cc b/paddle/phi/tests/common/test_data_layout.cc index 3a53e25f92b2caa7cb153b63989ed7c28e612d91..90a0813d7b2381f640a3ad0b7e88e269703c2d1f 100644 --- a/paddle/phi/tests/common/test_data_layout.cc +++ b/paddle/phi/tests/common/test_data_layout.cc @@ -37,7 +37,7 @@ TEST(DataLayout, OStream) { oss << phi::DataLayout::NCHW; EXPECT_EQ(oss.str(), "NCHW"); oss.str(""); - oss << phi::DataLayout::MKLDNN; + oss << phi::DataLayout::ONEDNN; EXPECT_EQ(oss.str(), "MKLDNN"); oss.str(""); try { diff --git a/paddle/phi/tests/kernels/test_transfer_layout_dev_api.cc b/paddle/phi/tests/kernels/test_transfer_layout_dev_api.cc index 0c81ecada96e1d7889a81f0d451842c193e73053..97d5cfe65ae79ba4eea6cbb63137b43f6c6a63fd 100644 --- a/paddle/phi/tests/kernels/test_transfer_layout_dev_api.cc +++ b/paddle/phi/tests/kernels/test_transfer_layout_dev_api.cc @@ -40,7 +40,7 @@ TEST(DEV_API, transfer_layout) { DenseTensor x; MetaTensor meta_x(&x); meta_x.set_dtype(DataType::FLOAT32); - meta_x.set_layout(DataLayout::MKLDNN); + meta_x.set_layout(DataLayout::ONEDNN); meta_x.set_dims(make_ddim({n, c, h, w})); DenseTensor out;
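
For reference, a minimal sketch of how a phi kernel would consume the relocated phi::funcs::BinaryOneDNNHandler after this patch, mirroring the execute/wait pattern of the LogSoftmax kernel above. The kernel name AddRawKernel, its exact signature, and the set_mem_desc() call are assumptions for illustration; only the handler API shown in this diff is taken as given.

// Hypothetical elementwise-add kernel built on the handler that this patch
// moves to paddle/phi/backends/onednn/onednn_reuse.h.
#include "paddle/phi/backends/onednn/onednn_reuse.h"
#include "paddle/phi/core/dense_tensor.h"

namespace phi {

template <typename T, typename Context>
void AddRawKernel(const Context& dev_ctx,
                  const DenseTensor& x,
                  const DenseTensor& y,
                  int axis,
                  DenseTensor* out) {
  // Builds the dnnl::binary forward primitive descriptor; with all scales
  // equal to 1.0f, CreateAttributes() degenerates to output = x + y.
  funcs::BinaryOneDNNHandler<T> handler(dnnl::algorithm::binary_add,
                                        axis,
                                        dev_ctx.GetEngine(),
                                        dev_ctx.GetPlace(),
                                        &x,
                                        &y,
                                        out,
                                        /*scale_x=*/1.0f,
                                        /*scale_y=*/1.0f,
                                        /*scale_out=*/1.0f);

  auto src_x_memory = handler.AcquireSrcMemory(&x);
  auto src_y_memory = handler.AcquireSecondSrcMemory(&y);
  auto dst_memory = handler.AcquireDstMemory(out);
  auto binary_prim = handler.AcquireForwardPrimitive();

  // Streams now come from phi's OneDNNContext instead of
  // platform::MKLDNNDeviceContext.
  auto& astream = OneDNNContext::tls().get_stream();
  binary_prim->execute(astream,
                       {{DNNL_ARG_SRC_0, *src_x_memory},
                        {DNNL_ARG_SRC_1, *src_y_memory},
                        {DNNL_ARG_DST, *dst_memory}});
  astream.wait();

  out->set_mem_desc(dst_memory->get_desc());
}

}  // namespace phi

An int8 kernel would pass its real quantization scales instead: per the equation documented in CreateAttributes, scale_x = 2.0, scale_y = 4.0, scale_out = 8.0 yields output = (8/2) * x + (8/4) * y = 4 * x + 2 * y in quantized units.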