From 48abaec6d9998075ab0141c35c5411ab48f292a9 Mon Sep 17 00:00:00 2001
From: jakpiase
Date: Wed, 6 Jul 2022 16:12:31 +0200
Subject: [PATCH] Performance fix for recommender model (#43803)

* fix for binary kernels

* fixed performance for elementwise, reduce and concat

* added comment

* CI fix

* CI fix

* added formatting

* reverted one file

* Revert "reverted one file"

This reverts commit 54725e1c62318d3a18913821200e973816751019.

* Revert "added formatting"

This reverts commit b9795dd253d755a329376d7ab0542860aa7815c6.

* added enforcing oneDNN BF16 reduce kernel

* fix for eltwise and reenabled reshape kernels

* fix for binary handler

* added formatting

* reverted changes for flatten, squeeze and reshape ops
---
 .../mkldnn/elementwise_mkldnn_op.h            | 17 +++++++--
 .../operators/mkldnn/concat_mkldnn_op.cc      | 19 +++++++++-
 paddle/fluid/operators/reduce_ops/reduce_op.h | 35 ++++++++++++++++++-
 paddle/fluid/platform/mkldnn_reuse.h          |  9 +++--
 .../mkldnn/test_elementwise_add_mkldnn_op.py  |  8 +++++
 5 files changed, 82 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
index 61552e492d..7f6566460a 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
@@ -75,8 +75,8 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
         ctx.template device_context<platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
 
-    const auto* x = ctx.Input<Tensor>("X");
-    const auto* y = ctx.Input<Tensor>("Y");
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
     auto* z = ctx.Output<Tensor>("Out");
 
     float scale_x = ctx.Attr<float>("Scale_x");
@@ -96,6 +96,12 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
                                              scale_o,
                                              get_post_ops(ctx));
 
+    // oneDNN's binary is optimized for broadcasting y into x, so in the other
+    // case we have to swap the tensors to achieve optimal performance
+    if (x->numel() < y->numel()) {
+      std::swap(x, y);
+    }
+
     const auto src_x_memory = handler.AcquireSrcMemory(x);
     const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
     // (jczaja) For Inplace src and dst should be the same memory object.
@@ -159,6 +165,13 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
+    // oneDNN's binary is optimized for broadcasting y into x, so in the other
+    // case we have to swap the tensors to achieve optimal performance
+    if (x->numel() < y->numel()) {
+      std::swap(x, y);
+      std::swap(dx, dy);
+    }
+
     int axis = ctx.Attr<int>("axis");
 
     auto tz = phi::vectorize(dout->dims());
diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
index cefa4fc1b9..837d435773 100644
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -77,7 +77,24 @@ class ConcatMKLDNNHandler
     }
 
     auto dst_dims = phi::vectorize(output->dims());
-    auto dst_md = memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any);
+
+    dnnl::memory::desc dst_md;
+
+    // If concat is being used as a stack op (all source memories' dims on
+    // the concat_axis are equal to 1), it may choose a non-optimal memory
+    // format tag for the destination, because the concat primitive chooses
+    // it based on the source memory descriptors and e.g. 200x1x10 can be
+    // described as both abc and bac. Both describe the exact same physical
+    // layout, but in that scenario bac would be chosen for the destination
+    // no matter which formats are set on the inputs. In that scenario we
+    // enforce a dense format, because it is the most common one and should
+    // be the best in terms of performance
+    if (dst_dims[concat_axis] == static_cast<int64_t>(srcs_md.size())) {
+      dst_md = memory::desc(
+          dst_dims, dt, platform::GetPlainMKLDNNFormat(dst_dims.size()));
+    } else {
+      dst_md = memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any);
+    }
 
     this->AcquireForwardPrimitiveDescriptor(dst_md, concat_axis, srcs_md);
   }
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index ec3cf1908c..e9bc3905a2 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -545,6 +545,38 @@ class ReduceOp : public framework::OperatorWithKernel {
     }
   }
 
+  // oneDNN's reduction kernel is optimized only for reducing over the
+  // outermost dims, so in the case of any other type of reduction it is
+  // better to fall back to the native implementation
+  static bool HasOptimizedOneDNNKernel(const framework::ExecutionContext& ctx) {
+    // native reduce kernels don't support bf16,
+    // so the oneDNN kernel is enforced in that case
+    if (ctx.Input<framework::LoDTensor>("X")->dtype() ==
+        experimental::DataType::BFLOAT16)
+      return true;
+
+    auto reduce_dims = ctx.Attr<std::vector<int>>("dim");
+    const bool reduce_all = ctx.Attr<bool>("reduce_all");
+    int ndims = ctx.Input<framework::LoDTensor>("X")->dims().size();
+
+    if (reduce_all) {
+      return true;
+    }
+
+    for (size_t i = 0; i < reduce_dims.size(); ++i) {
+      if (reduce_dims[i] < 0) reduce_dims[i] = ndims + reduce_dims[i];
+    }
+    sort(reduce_dims.begin(), reduce_dims.end());
+    for (size_t i = 0; i < reduce_dims.size(); ++i) {
+      if (reduce_dims[reduce_dims.size() - i - 1] !=
+          static_cast<int>(ndims - i - 1)) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
@@ -554,7 +586,8 @@ class ReduceOp : public framework::OperatorWithKernel {
       return framework::OpKernelType(input_data_type, ctx.GetPlace());
 
 #ifdef PADDLE_WITH_MKLDNN
-    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type) &&
+        HasOptimizedOneDNNKernel(ctx)) {
       return framework::OpKernelType(input_data_type,
                                      ctx.GetPlace(),
                                      framework::DataLayout::kMKLDNN,
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 41a4f551ce..2f4bbfaf74 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -690,8 +690,13 @@ class BinaryMKLDNNHandler
     auto attributes =
         CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops);
-    this->AcquireForwardPrimitiveDescriptor(
-        attributes, algo, src0_md, src1_md, dst_md);
+    if (x->numel() < y->numel()) {
+      this->AcquireForwardPrimitiveDescriptor(
+          attributes, algo, src1_md, src0_md, dst_md);
+    } else {
+      this->AcquireForwardPrimitiveDescriptor(
+          attributes, algo, src0_md, src1_md, dst_md);
+    }
   }
 
   std::shared_ptr<dnnl::memory> AcquireSecondSrcMemory(
       const framework::Tensor* input) {
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
index 2ae717d64a..dc9a3862e0 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
@@ -68,6 +68,14 @@ class TestMKLDNNElementwiseAddOp5(TestMKLDNNElementwiseAddOp):
         self.out = np.add(self.x, self.y)
 
 
+class TestMKLDNNElementwiseAddOpBroadcastXintoY(TestMKLDNNElementwiseAddOp):
+
+    def init_input_output(self):
+        self.x = np.random.uniform(1, 2, [2, 50, 1]).astype(self.dtype)
+        self.y = np.random.uniform(1, 2, [2, 50, 160]).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
+
+
 class TestMKLDNNElementwiseAddOp_broadcast_3(TestMKLDNNElementwiseAddOp):
 
     def init_input_output(self):
--
GitLab
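
Reviewer note: the operand-swap rule used in both EltwiseMKLDNNKernel and
BinaryMKLDNNHandler depends only on element counts, because oneDNN's binary
primitive broadcasts src1 into src0. Below is a minimal standalone sketch of
that selection logic (plain C++, not Paddle code; the TensorDesc struct and
the dims are illustrative only):

#include <algorithm>
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Minimal stand-in for a tensor: only the shape matters here.
struct TensorDesc {
  std::vector<int64_t> dims;
  int64_t numel() const {
    return std::accumulate(dims.begin(), dims.end(), int64_t{1},
                           std::multiplies<int64_t>());
  }
};

int main() {
  TensorDesc x{{2, 50, 1}};    // smaller operand (the one to be broadcast)
  TensorDesc y{{2, 50, 160}};  // full-size operand

  // src0 should be the operand with more elements, so mirror the patch and
  // swap whenever x is the smaller one.
  const TensorDesc* src0 = &x;
  const TensorDesc* src1 = &y;
  if (src0->numel() < src1->numel()) std::swap(src0, src1);

  std::cout << "src0 numel = " << src0->numel()
            << ", src1 numel = " << src1->numel() << std::endl;  // 16000, 100
  return 0;
}

In the gradient kernel the patch swaps dx/dy together with x/y, so the
outputs stay aligned with the swapped inputs.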
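
Similarly, the reduce-op gating boils down to one question: does the
reduction cover exactly the trailing dims of the input? A standalone sketch
of that check (plain C++, not Paddle code; ReducesTrailingDims is an
illustrative name for the loop added to HasOptimizedOneDNNKernel):

#include <algorithm>
#include <cassert>
#include <vector>

// True when `reduce_dims` (possibly negative, unsorted) covers exactly the
// trailing dimensions of an `ndims`-dimensional tensor, e.g. {-1} or {2, 3}
// for a 4-D input.
bool ReducesTrailingDims(std::vector<int> reduce_dims, int ndims) {
  for (auto& d : reduce_dims) {
    if (d < 0) d += ndims;  // normalize negative axes
  }
  std::sort(reduce_dims.begin(), reduce_dims.end());
  for (size_t i = 0; i < reduce_dims.size(); ++i) {
    // walk from the back: the i-th largest reduced dim must be dim ndims-1-i
    if (reduce_dims[reduce_dims.size() - i - 1] !=
        static_cast<int>(ndims - i - 1)) {
      return false;
    }
  }
  return true;
}

int main() {
  assert(ReducesTrailingDims({-1}, 4));     // last dim -> oneDNN
  assert(ReducesTrailingDims({2, 3}, 4));   // last two dims -> oneDNN
  assert(!ReducesTrailingDims({0}, 4));     // leading dim -> native fallback
  assert(!ReducesTrailingDims({1, 3}, 4));  // non-contiguous -> native fallback
  return 0;
}

Everything else falls back to the native kernel, except BF16 inputs, for
which the native reduce kernels are unavailable and oneDNN is enforced.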