diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
index 24bbcbca01b4f4917a81586cb030427f4bf08d09..bf88b82fc30a18cfdf1aa37d3e0a92391c1012ac 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
@@ -20,6 +20,7 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/string/pretty_log.h"
 
 namespace paddle {
 namespace framework {
@@ -315,6 +316,12 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
   };
   gpd(graph, handler);
   AddStatis(found_conv_bias_count);
+  if ((!Has("disable_logs") || !Get<bool>("disable_logs")) &&
+      found_conv_bias_count > 0) {
+    string::PrettyLogDetail("---    fused %d %s with elementwise_add as bias",
+                            found_conv_bias_count,
+                            type());
+  }
 }
 }  // namespace ir
 }  // namespace framework
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
index bdb2bef362be43ef40518ea6cb2ed659690784a7..1762f638e7de83c149be66850c594046ae351b71 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
@@ -34,7 +34,7 @@ PD_DECLARE_KERNEL(gelu, CPU, ALL_LAYOUT);
 USE_OP_ITSELF(batch_norm);
 PD_DECLARE_KERNEL(batch_norm, OneDNN, ONEDNN);
 USE_OP_ITSELF(conv2d_transpose);
-USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN);
+PD_DECLARE_KERNEL(conv2d_transpose, OneDNN, ONEDNN);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
 USE_OP_ITSELF(gelu);
diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
deleted file mode 100644
index 63fe71bce7c35de3f608ce862ce6267920a90124..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
+++ /dev/null
@@ -1,430 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/conv_op.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = phi::DenseTensor;
-using phi::DataLayout;
-using phi::funcs::OneDNNMemDesc;
-
-inline dnnl::memory::dims GetWeightsTz(const phi::DenseTensor* filter,
-                                       const int groups) {
-  auto weights_tz = phi::vectorize(filter->dims());
-  int g = std::max(groups, 1);
-  int g_dim = (g > 1) ?
1 : 0; - phi::funcs::GetGroupConvWeightsTz(weights_tz, g); - // gIOHW -> gOIHW || IOHW -> OIHW - std::swap(weights_tz[g_dim + 0], weights_tz[g_dim + 1]); - return weights_tz; -} - -template -class ConvTransposeMKLDNNHandlerT - : public phi::funcs::OneDNNHandlerNoCachingT { - public: - ConvTransposeMKLDNNHandlerT(const framework::ExecutionContext& ctx, - const dnnl::engine mkldnn_engine, - const phi::DenseTensor* input, - const phi::DenseTensor* filter, - const phi::DenseTensor* bias, - phi::DenseTensor* output) - : phi::funcs::OneDNNHandlerNoCachingT( - mkldnn_engine, ctx.GetPlace()), - is_test_(ctx.Attr("is_test")) { - PADDLE_ENFORCE_EQ(is_test_, - true, - platform::errors::InvalidArgument( - "ConvTransposeMKLDNN works only for inference. " - "The attribute \'is_test\' value should be set to " - "True, but got is_test=False.")); - - PADDLE_ENFORCE_EQ( - input->layout(), - DataLayout::ONEDNN, - platform::errors::InvalidArgument( - "Got wrong layout = %d for Input tensor.", input->layout())); - - PADDLE_ENFORCE_EQ( - filter->layout(), - DataLayout::ONEDNN, - platform::errors::InvalidArgument( - "The filter tensor's layout should be %d, but got %d.", - DataLayout::ONEDNN, - filter->layout())); - - PADDLE_ENFORCE_EQ( - input->dims().size(), - 4, - platform::errors::InvalidArgument("Input must be with 4 dimensions, " - "i.e. NCHW. but got dimension =%d", - input->dims().size())); - PADDLE_ENFORCE_EQ( - filter->dims().size(), - 4, - platform::errors::InvalidArgument("Filter must be with 4 dimensions, " - "i.e. OIHW, but got dimension =%d", - filter->dims().size())); - - if (bias) { - PADDLE_ENFORCE_EQ( - bias->layout(), - DataLayout::ONEDNN, - platform::errors::InvalidArgument( - "The bias tensor's laytout should be %d, but got %d.", - DataLayout::ONEDNN, - bias->layout())); - - PADDLE_ENFORCE_EQ( - bias->dims().size(), - 1, - platform::errors::InvalidArgument("Bias must only have 1 dimension, " - "i.e. 
X, but got dimension = %d .", - bias->dims().size())); - } - - std::vector strides_temp = ctx.Attr>("strides"); - dnnl::memory::dims strides(begin(strides_temp), end(strides_temp)); - - std::vector paddings_temp = ctx.Attr>("paddings"); - dnnl::memory::dims paddings(begin(paddings_temp), end(paddings_temp)); - - std::vector dilations_temp = ctx.Attr>("dilations"); - dnnl::memory::dims dilations(begin(dilations_temp), end(dilations_temp)); - - int groups = ctx.Attr("groups"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - - PADDLE_ENFORCE_EQ( - strides.size(), - 2, - platform::errors::Unimplemented( - "Now we only support 2d oneDNN convolution transpose op")); - - const auto& input_dims = input->dims(); - const auto data_dims = phi::slice_ddim(input_dims, 2, input_dims.size()); - const auto& filter_dims = filter->dims(); - const auto filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - - const auto ksize = phi::vectorize(filter_data_dims); - - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, data_dims, strides, ksize); - - std::transform( - dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { - return i - 1; - }); - - const auto src_tz = phi::vectorize(input->dims()); - const auto weights_tz = GetWeightsTz(filter, groups); - const auto dst_tz = phi::vectorize(output->dims()); - const auto mkldnn_paddings = phi::funcs::ToOneDNNPadding(paddings); - - /* create memory descriptor for convolution without specified format - * ('any') which lets a primitive (convolution in this case) choose - * the memory format preferred for best performance - */ - const auto chosen_memory_format = OneDNNMemoryFormat::any; - - auto data_type = dnnl::memory::data_type::f32; - if (ctx.Attr("mkldnn_data_type") == "bfloat16" || - std::is_same::value) - data_type = dnnl::memory::data_type::bf16; - - const auto src_md = OneDNNMemDesc(src_tz, data_type, chosen_memory_format); - const auto weights_md = - OneDNNMemDesc(weights_tz, data_type, chosen_memory_format); - const auto dst_md = OneDNNMemDesc( - dst_tz, phi::funcs::OneDNNGetDataType(), chosen_memory_format); - - const dnnl::primitive_attr conv_trans_attr = CreateConvAttrs(ctx); - auto fwd_prop_kind = is_test_ ? dnnl::prop_kind::forward_inference - : dnnl::prop_kind::forward_training; - if (bias) { - std::vector bias_tz = phi::vectorize(bias->dims()); - const auto bias_md = - OneDNNMemDesc(bias_tz, data_type, OneDNNMemoryFormat::x); - this->AcquireForwardPrimitiveDescriptor( - conv_trans_attr, - fwd_prop_kind, - dnnl::algorithm::deconvolution_direct, - src_md, - weights_md, - bias_md, - dst_md, - strides, - dilations, - mkldnn_paddings[0], - mkldnn_paddings[1]); - } else { - this->AcquireForwardPrimitiveDescriptor( - conv_trans_attr, - fwd_prop_kind, - dnnl::algorithm::deconvolution_direct, - src_md, - weights_md, - dst_md, - strides, - dilations, - mkldnn_paddings[0], - mkldnn_paddings[1]); - } - } - - dnnl::primitive_attr CreateConvAttrs(const framework::ExecutionContext& ctx) { - dnnl::primitive_attr conv_attr; - dnnl::post_ops post_operations; - - const std::string fuse_activation = - ctx.Attr("fuse_activation"); - const float fuse_alpha = ctx.Attr("fuse_alpha"); - const float fuse_beta = ctx.Attr("fuse_beta"); - - // Fusion with ReLU layer is executed through the PostOps feature. Create a - // PostOps object and configure it to execute an eltwise relu operation. 
- if (fuse_activation == "relu" || fuse_activation == "leaky_relu") { - constexpr float scale = 1.0f; - post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_relu, fuse_alpha, fuse_beta); - } else if (fuse_activation == "relu6") { - constexpr float scale = 1.0f; - post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_bounded_relu, fuse_alpha, fuse_beta); - } else if (fuse_activation == "swish") { - constexpr float scale = 1.0f; - post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_swish, fuse_alpha, fuse_beta); - } - conv_attr.set_post_ops(post_operations); - return conv_attr; - } - - std::shared_ptr AcquireSrcMemoryWithReorder( - const phi::DenseTensor* input) { - const T* input_data = input->data(); - return phi::funcs::OneDNNHandlerNoCachingT:: - AcquireMemoryWithReorder(input->mem_desc(), - this->fwd_pd_->src_desc(), - phi::funcs::to_void_cast(input_data)); - } - - std::shared_ptr AcquireWeightsMemoryWithReorder( - const platform::MKLDNNDeviceContext& dev_ctx, - const std::string& key, - const phi::DenseTensor* filter, - const int& groups) { - const K* filter_data = filter->data(); - auto weights_tz = GetWeightsTz(filter, groups); - int g = std::max(groups, 1); - - auto user_src_md = OneDNNMemDesc( - weights_tz, - phi::funcs::OneDNNGetDataType(), - (g == 1) ? OneDNNMemoryFormat::iohw : OneDNNMemoryFormat::giohw); - - return this->template AcquireMemoryWithReorder( - dev_ctx, - user_src_md, - this->fwd_pd_->weights_desc(), - phi::funcs::to_void_cast(filter_data), - key, - "@weights_mem_p", - is_test_); - } - - template - std::shared_ptr AcquireMemoryWithReorder( - const platform::MKLDNNDeviceContext& dev_ctx, - const dnnl::memory::desc& user_md, - const dnnl::memory::desc& target_md, - void* ptr, - const std::string& key, - const std::string& suffix, - bool is_persistent = false, - const std::vector& scale_data = {1.0f}, - int mask = 0) { - const auto target_key = key + suffix + "_target"; - const auto key_reorder_p = key + suffix + "reorder_p"; - const auto user_key = key + suffix + "_user"; - - auto target_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(target_key)); - - if (target_memory_p == nullptr) { - auto user_memory_p = - std::make_shared(user_md, this->engine_, ptr); - if (user_md != target_md) { - target_memory_p = - std::make_shared(target_md, this->engine_); - dnnl::reorder::primitive_desc reorder_pdesc; - if (phi::funcs::is_int8()) { - dnnl::primitive_attr attr; - attr.set_output_scales(mask, scale_data); - reorder_pdesc = dnnl::reorder::primitive_desc( - *user_memory_p, *target_memory_p, attr); - } else { - reorder_pdesc = - dnnl::reorder::primitive_desc(*user_memory_p, *target_memory_p); - } - auto reorder_p = std::make_shared(reorder_pdesc); - dev_ctx.SetBlob(key_reorder_p, reorder_p); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - platform::RecordEvent record_reorder( - "int_reorder", - platform::TracerEventType::UserDefined, - 1, - platform::EventRole::kUniqueOp); - reorder_p->execute( - astream, - {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); - astream.wait(); - } else { - target_memory_p = user_memory_p; - } - dev_ctx.SetBlob(user_key, user_memory_p); - dev_ctx.SetBlob(target_key, target_memory_p); - } else if (!is_persistent) { - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - auto user_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(user_key)); - user_memory_p->set_data_handle(ptr); - - // TODO(jczaja): Here we detect if reorder is cached it 
means it is needed - // need to change this to get rid of keys - auto reorder_p = std::static_pointer_cast( - dev_ctx.GetBlob(key_reorder_p)); - if (reorder_p != nullptr) { - platform::RecordEvent record_reorder( - "int_reorder", - platform::TracerEventType::UserDefined, - 1, - platform::EventRole::kUniqueOp); - reorder_p->execute( - astream, - {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); - astream.wait(); - } - } - return target_memory_p; - } - - std::shared_ptr AcquireBiasMemoryWithReorder( - const platform::MKLDNNDeviceContext& dev_ctx, - const std::string& key, - const phi::DenseTensor* bias) { - const K* bias_data = bias->data(); - auto user_bias_md = OneDNNMemDesc(phi::vectorize(bias->dims()), - phi::funcs::OneDNNGetDataType(), - OneDNNMemoryFormat::x); - return this->AcquireMemoryWithReorder( - dev_ctx, - user_bias_md, - this->fwd_pd_->bias_desc(), - phi::funcs::to_void_cast(bias_data), - key, - "@bias_mem_p", - is_test_); - } - - private: - const bool is_test_; -}; - -template -class ConvTransposeMKLDNNOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), - true, - platform::errors::PreconditionNotMet( - "Operator DNNL ConvTranspose must use CPUPlace")); - const bool is_bfloat16 = - ctx.Attr("mkldnn_data_type") == "bfloat16"; - const bool force_fp32_output = ctx.Attr("force_fp32_output"); - if (is_bfloat16) { - if (force_fp32_output) - Execute(ctx); - else - Execute(ctx); - } else { - Execute(ctx); - } - } - - template - void Execute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - const auto* input = ctx.Input("Input"); - const auto* filter = ctx.Input("Filter"); - const auto* bias = - ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; - auto* output = ctx.Output("Output"); - ConvTransposeMKLDNNHandlerT handler( - ctx, mkldnn_engine, input, filter, bias, output); - auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); - // Caching Key for weights is needed - std::string key = platform::CreateKey(dev_ctx, - ctx.InputName("Input"), - ctx.InputName("Filter"), - (bias ? 
ctx.InputName("Bias") : "")); - key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( - dev_ctx, key, filter, ctx.Attr("groups")); - - std::shared_ptr dst_memory_p = - handler.template AcquireDstMemory(output); - auto conv_p = handler.AcquireForwardPrimitive(); - - std::unordered_map args = { - {DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_WEIGHTS, *weights_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; - - if (bias) { - auto bias_memory_p = - handler.AcquireBiasMemoryWithReorder(dev_ctx, key, bias); - args.insert({DNNL_ARG_BIAS, *bias_memory_p}); - } - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - conv_p->execute(astream, args); - astream.wait(); - output->set_mem_desc(dst_memory_p->get_desc()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_KERNEL( - conv2d_transpose, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::ConvTransposeMKLDNNOpKernel, - ops::ConvTransposeMKLDNNOpKernel); diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index d0847df43e230383c7160f4e1d583002300de87e..6aa6bd21fba1f4f21777661e9cf9d208a37592d8 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -222,6 +222,7 @@ class ExtraInfoUtils { // TODO(chenweihang): move these extra inputs into op_compat.yaml std::unordered_map> g_extra_input_names_map_ = {{"conv2d", {"Bias", "ResidualData"}}, + {"conv2d_transpose", {"Bias"}}, {"conv2d_grad", {"Bias"}}}; std::vector empty_extra_input_names_; }; diff --git a/paddle/phi/kernels/onednn/conv_transpose_kernel.cc b/paddle/phi/kernels/onednn/conv_transpose_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..fd47d5ce540f402e51975d8ab46803bed736afff --- /dev/null +++ b/paddle/phi/kernels/onednn/conv_transpose_kernel.cc @@ -0,0 +1,440 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/conv_transpose_kernel.h" + +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/backends/onednn/onednn_helper.h" +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/expect.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/data_layout_transform.h" + +namespace phi { + +inline dnnl::memory::dims GetWeightsTz(const phi::DenseTensor* filter, + const int groups) { + auto weights_tz = phi::vectorize(filter->dims()); + int g = std::max(groups, 1); + int g_dim = (g > 1) ? 
1 : 0; + funcs::GetGroupConvWeightsTz(weights_tz, g); + // gIOHW -> gOIHW || IOHW -> OIHW + std::swap(weights_tz[g_dim + 0], weights_tz[g_dim + 1]); + return weights_tz; +} + +template +class ConvTransposeOneDNNHandlerT + : public funcs::OneDNNHandlerNoCachingT { + private: + const bool is_test_; + + public: + ConvTransposeOneDNNHandlerT(const OneDNNContext& dev_ctx, + const DenseTensor* x, + const DenseTensor* filter, + const DenseTensor* bias, + const std::vector& strides_in, + const std::vector& paddings_in, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_in, + DenseTensor* out) + : funcs::OneDNNHandlerNoCachingT( + dev_ctx.GetEngine(), dev_ctx.GetPlace()), + is_test_(dev_ctx.HasDnnAttr("is_test") + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("is_test")) + : false) { + PADDLE_ENFORCE_EQ(is_test_, + true, + phi::errors::InvalidArgument( + "ConvTransposeOneDNN works only for inference. " + "The attribute \'is_test\' value should be set to " + "True, but got is_test=False.")); + + PADDLE_ENFORCE_EQ( + x->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument("Got wrong layout = %d for Input tensor.", + x->layout())); + + PADDLE_ENFORCE_EQ( + filter->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The filter tensor's layout should be %d, but got %d.", + DataLayout::ONEDNN, + filter->layout())); + + PADDLE_ENFORCE_EQ( + x->dims().size(), + 4, + phi::errors::InvalidArgument("Input must be with 4 dimensions, " + "i.e. NCHW. but got dimension =%d", + x->dims().size())); + PADDLE_ENFORCE_EQ( + filter->dims().size(), + 4, + phi::errors::InvalidArgument("Filter must be with 4 dimensions, " + "i.e. OIHW, but got dimension =%d", + filter->dims().size())); + + if (bias) { + PADDLE_ENFORCE_EQ( + bias->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The bias tensor's laytout should be %d, but got %d.", + DataLayout::ONEDNN, + bias->layout())); + + PADDLE_ENFORCE_EQ( + bias->dims().size(), + 1, + phi::errors::InvalidArgument("Bias must only have 1 dimension, " + "i.e. 
X, but got dimension = %d .", + bias->dims().size())); + } + + dnnl::memory::dims strides(begin(strides_in), end(strides_in)); + dnnl::memory::dims paddings(begin(paddings_in), end(paddings_in)); + dnnl::memory::dims dilations(begin(dilations_in), end(dilations_in)); + + PADDLE_ENFORCE_EQ( + strides.size(), + 2, + phi::errors::Unimplemented( + "Now we only support 2d oneDNN convolution transpose op")); + + const auto x_dims = x->dims(); + const auto x_data_dims = phi::slice_ddim(x_dims, 2, x_dims.size()); + const auto filter_dims = filter->dims(); + const auto filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + const auto ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, x_data_dims, strides, ksize); + + std::transform( + dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { + return i - 1; + }); + + const auto src_tz = phi::vectorize(x->dims()); + const auto weights_tz = GetWeightsTz(filter, groups); + const auto dst_tz = phi::vectorize(out->dims()); + const auto onednn_paddings = funcs::ToOneDNNPadding(paddings); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + auto chosen_memory_format = funcs::OneDNNMemoryFormat::any; + auto data_type = dnnl::memory::data_type::f32; + const bool is_BFLOAT16 = + dev_ctx.HasDnnAttr("mkldnn_data_type") + ? PADDLE_GET_CONST(std::string, + dev_ctx.GetDnnAttr("mkldnn_data_type")) == + "bfloat16" + : false; + if (is_BFLOAT16 || std::is_same::value) { + data_type = dnnl::memory::data_type::bf16; + } + + const auto src_md = + funcs::OneDNNMemDesc(src_tz, data_type, chosen_memory_format); + const auto weights_md = + funcs::OneDNNMemDesc(weights_tz, data_type, chosen_memory_format); + const auto dst_md = funcs::OneDNNMemDesc( + dst_tz, funcs::OneDNNGetDataType(), chosen_memory_format); + + auto fwd_prop_kind = is_test_ ? dnnl::prop_kind::forward_inference + : dnnl::prop_kind::forward_training; + + if (bias) { + std::vector bias_tz = phi::vectorize(bias->dims()); + const auto bias_md = funcs::OneDNNMemDesc( + bias_tz, data_type, funcs::OneDNNMemoryFormat::x); + this->AcquireForwardPrimitiveDescriptor( + fwd_prop_kind, + dnnl::algorithm::deconvolution_direct, + src_md, + weights_md, + bias_md, + dst_md, + strides, + dilations, + onednn_paddings[0], + onednn_paddings[1]); + } else { + this->AcquireForwardPrimitiveDescriptor( + fwd_prop_kind, + dnnl::algorithm::deconvolution_direct, + src_md, + weights_md, + dst_md, + strides, + dilations, + onednn_paddings[0], + onednn_paddings[1]); + } + } + + std::shared_ptr AcquireSrcMemoryWithReorder( + const phi::DenseTensor* x) { + const T* input_data = x->data(); + return funcs::OneDNNHandlerNoCachingT:: + AcquireMemoryWithReorder(x->mem_desc(), + this->fwd_pd_->src_desc(), + funcs::to_void_cast(input_data)); + } + + std::shared_ptr AcquireWeightsMemoryWithReorder( + const OneDNNContext& dev_ctx, + const std::string& key, + const phi::DenseTensor* filter, + const int& groups) { + const K* filter_data = filter->data(); + auto weights_tz = GetWeightsTz(filter, groups); + int g = std::max(groups, 1); + + auto user_src_md = + funcs::OneDNNMemDesc(weights_tz, + funcs::OneDNNGetDataType(), + (g == 1) ? 
funcs::OneDNNMemoryFormat::iohw + : funcs::OneDNNMemoryFormat::giohw); + + return this->template AcquireMemoryWithReorder( + dev_ctx, + user_src_md, + this->fwd_pd_->weights_desc(), + funcs::to_void_cast(filter_data), + key, + "@weights_mem_p", + is_test_); + } + + template + std::shared_ptr AcquireMemoryWithReorder( + const OneDNNContext& dev_ctx, + const dnnl::memory::desc& user_md, + const dnnl::memory::desc& target_md, + void* ptr, + const std::string& key, + const std::string& suffix, + bool is_persistent = false, + const std::vector& scale_data = {1.0f}, + int mask = 0) { + const auto target_key = key + suffix + "_target"; + const auto key_reorder_p = key + suffix + "reorder_p"; + const auto user_key = key + suffix + "_user"; + + auto target_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(target_key)); + + if (target_memory_p == nullptr) { + auto user_memory_p = + std::make_shared(user_md, this->engine_, ptr); + if (user_md != target_md) { + target_memory_p = + std::make_shared(target_md, this->engine_); + dnnl::reorder::primitive_desc reorder_pdesc; + if (funcs::is_int8()) { + dnnl::primitive_attr attr; + attr.set_output_scales(mask, scale_data); + reorder_pdesc = dnnl::reorder::primitive_desc( + *user_memory_p, *target_memory_p, attr); + } else { + reorder_pdesc = + dnnl::reorder::primitive_desc(*user_memory_p, *target_memory_p); + } + auto reorder_p = std::make_shared(reorder_pdesc); + dev_ctx.SetBlob(key_reorder_p, reorder_p); + + auto& astream = OneDNNContext::tls().get_stream(); + paddle::platform::RecordEvent record_reorder( + "int_reorder", + paddle::platform::TracerEventType::UserDefined, + 1, + paddle::platform::EventRole::kUniqueOp); + reorder_p->execute( + astream, + {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); + astream.wait(); + } else { + target_memory_p = user_memory_p; + } + dev_ctx.SetBlob(user_key, user_memory_p); + dev_ctx.SetBlob(target_key, target_memory_p); + } else if (!is_persistent) { + auto& astream = OneDNNContext::tls().get_stream(); + + auto user_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(user_key)); + user_memory_p->set_data_handle(ptr); + + // TODO(jczaja): Here we detect if reorder is cached it means it is needed + // need to change this to get rid of keys + auto reorder_p = std::static_pointer_cast( + dev_ctx.GetBlob(key_reorder_p)); + if (reorder_p != nullptr) { + paddle::platform::RecordEvent record_reorder( + "int_reorder", + paddle::platform::TracerEventType::UserDefined, + 1, + paddle::platform::EventRole::kUniqueOp); + reorder_p->execute( + astream, + {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); + astream.wait(); + } + } + return target_memory_p; + } + + std::shared_ptr AcquireBiasMemoryWithReorder( + const OneDNNContext& dev_ctx, + const std::string& key, + const phi::DenseTensor* bias) { + const K* bias_data = bias->data(); + auto user_bias_md = funcs::OneDNNMemDesc(phi::vectorize(bias->dims()), + funcs::OneDNNGetDataType(), + funcs::OneDNNMemoryFormat::x); + return this->AcquireMemoryWithReorder(dev_ctx, + user_bias_md, + this->fwd_pd_->bias_desc(), + funcs::to_void_cast(bias_data), + key, + "@bias_mem_p", + is_test_); + } +}; + +template +void Execute(const OneDNNContext& dev_ctx, + const DenseTensor* x, + const DenseTensor* filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + DenseTensor* out) { + const auto* bias = + dev_ctx.HasDnnInput("Bias") ? 
dev_ctx.GetDnnInput("Bias") : nullptr; + + ConvTransposeOneDNNHandlerT handler(dev_ctx, + x, + filter, + bias, + strides, + paddings, + padding_algorithm, + groups, + dilations, + out); + + auto src_memory_p = handler.AcquireSrcMemoryWithReorder(x); + // Caching Key for weights is needed + std::string key = + funcs::CreateKey(dev_ctx, + dev_ctx.GetInputsName("Input")[0], + dev_ctx.GetInputsName("Filter")[0], + (bias ? dev_ctx.GetInputsName("Bias")[0] : "")); + key = funcs::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); + auto weights_memory_p = + handler.AcquireWeightsMemoryWithReorder(dev_ctx, key, filter, groups); + + std::shared_ptr dst_memory_p = + handler.template AcquireDstMemory(out); + auto conv_p = handler.AcquireForwardPrimitive(); + + std::unordered_map args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + if (bias) { + auto bias_memory_p = + handler.AcquireBiasMemoryWithReorder(dev_ctx, key, bias); + args.insert({DNNL_ARG_BIAS, *bias_memory_p}); + } + auto& astream = OneDNNContext::tls().get_stream(); + conv_p->execute(astream, args); + astream.wait(); + out->set_mem_desc(dst_memory_p->get_desc()); +} + +template +void Conv2dTransposeKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const IntArray& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* out) { + PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType(), + AllocationType::CPU, + phi::errors::PreconditionNotMet( + "Operator oneDNN Conv must use CPUPlace")); + + const bool is_BFLOAT16 = + dev_ctx.HasDnnAttr("mkldnn_data_type") + ? PADDLE_GET_CONST(std::string, + dev_ctx.GetDnnAttr("mkldnn_data_type")) == + "bfloat16" + : false; + const bool force_fp32_output = + dev_ctx.HasDnnAttr("force_fp32_output") + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("force_fp32_output")) + : false; + const bool use_bfloat16 = (!force_fp32_output && is_BFLOAT16); + + if (use_bfloat16) { + Execute(dev_ctx, + &x, + &filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + out); + } else { + Execute(dev_ctx, + &x, + &filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d_transpose, + OneDNN, + ONEDNN, + phi::Conv2dTransposeKernel, + float, + phi::dtype::bfloat16) {}
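
Note: the tester change above only works if the migrated kernel is reachable through the phi kernel registry instead of the legacy fluid operator registry. The sketch below is not part of this patch; it assumes phi::KernelFactory::HasCompatiblePhiKernel is available, as in other phi-era tests, and shows one way such a registration could be smoke-checked.

#include "gtest/gtest.h"

#include "paddle/phi/core/kernel_factory.h"
#include "paddle/phi/core/kernel_registry.h"

// Pull in the kernel registered by conv_transpose_kernel.cc above,
// mirroring the declaration used in mkldnn_conv_bn_fuse_pass_tester.cc.
PD_DECLARE_KERNEL(conv2d_transpose, OneDNN, ONEDNN);

TEST(ConvTransposeOneDNN, IsRegisteredInPhi) {
  // The oneDNN conv2d_transpose kernel should now be visible to the phi
  // kernel factory under its phi kernel name.
  EXPECT_TRUE(phi::KernelFactory::Instance().HasCompatiblePhiKernel(
      "conv2d_transpose"));
}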