Performance fix for recommender model (#43803)

* fix for binary kernels * fixed performance for elementwise, reduce and concat * added comment * CI fix * CI fix * added formatting * reverted one file * Revert "reverted one file" This reverts commit 54725e1c62318d3a18913821200e973816751019. * Revert "added formatting" This reverts commit b9795dd253d755a329376d7ab0542860aa7815c6. * added enforcing oneDNN BF16 reduce kernel * fix for eltwise and reenabled reshape kernels * fix for binary handler * added formatting * reverted changes for flatten, squeeze and reshape ops

Performance fix for recommender model (#43803)
* fix for binary kernels * fixed performance for elementwise, reduce and concat * added comment * CI fix * CI fix * added formatting * reverted one file * Revert "reverted one file" This reverts commit 54725e1c62318d3a18913821200e973816751019. * Revert "added formatting" This reverts commit b9795dd253d755a329376d7ab0542860aa7815c6. * added enforcing oneDNN BF16 reduce kernel * fix for eltwise and reenabled reshape kernels * fix for binary handler * added formatting * reverted changes for flatten, squeeze and reshape ops
48abaec6 · jakpiase · GitHub · f39183ea · 48abaec6 · 48abaec6
5 changed files
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
@@ -75,8 +75,8 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();
-    const auto* x = ctx.Input<Tensor>("X");
+    auto* x = ctx.Input<Tensor>("X");
-    const auto* y = ctx.Input<Tensor>("Y");
+    auto* y = ctx.Input<Tensor>("Y");
    auto* z = ctx.Output<Tensor>("Out");
    float scale_x = ctx.Attr<float>("Scale_x");
@@ -96,6 +96,12 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
                                             scale_o,
                                             get_post_ops(ctx));
+    // oneDNN's binary primitive is optimized for broadcasting y into x, so
+    // when x has fewer elements than y we swap them for optimal performance
+    if (x->numel() < y->numel()) {
+      std::swap(x, y);
+    }
    const auto src_x_memory = handler.AcquireSrcMemory(x);
    const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
    // (jczaja) For Inplace src and dst should be the same memory object.
@@ -159,6 +165,13 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    // oneDNN's binary primitive is optimized for broadcasting y into x, so
+    // when x has fewer elements than y we swap the inputs (and their
+    // gradients) for optimal performance
+    if (x->numel() < y->numel()) {
+      std::swap(x, y);
+      std::swap(dx, dy);
+    }
    int axis = ctx.Attr<int>("axis");
    auto tz = phi::vectorize<int64_t>(dout->dims());

--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -77,7 +77,24 @@ class ConcatMKLDNNHandler
    }
    auto dst_dims = phi::vectorize<int64_t>(output->dims());
-    auto dst_md = memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any);
+    dnnl::memory::desc dst_md;
+    // If concat is being used as a stack op (all source memories' dims on
+    // concat_axis are equal to 1), then it may choose a non-optimal memory
+    // format tag for the destination, because the concat primitive chooses it
+    // based on the source memory descriptors. E.g. 200x1x10 can be described
+    // as both abc and bac, and both use the exact same physical layout, but in
+    // that scenario bac will be chosen for the destination no matter which
+    // formats are set on the inputs. In that scenario we enforce a dense
+    // format, because it is the most common one and should be the best in
+    // terms of performance.
+    if (dst_dims[concat_axis] == static_cast<int64_t>(srcs_md.size())) {
+      dst_md = memory::desc(
+          dst_dims, dt, platform::GetPlainMKLDNNFormat(dst_dims.size()));
+    } else {
+      dst_md = memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any);
+    }
    this->AcquireForwardPrimitiveDescriptor(dst_md, concat_axis, srcs_md);
  }

--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -545,6 +545,38 @@ class ReduceOp : public framework::OperatorWithKernel {
    }
  }
+  // Decides whether the oneDNN reduce kernel should be selected for `ctx`.
+  // oneDNN's reduction kernel is optimized only for some reduction patterns,
+  // so for any other pattern it is better to fall back to the native
+  // implementation.
+  //
+  // Returns true when:
+  //   * input "X" is BFLOAT16 (native reduce kernels don't support bf16, so
+  //     the oneDNN kernel is enforced in that case), or
+  //   * the "reduce_all" attribute is set, or
+  //   * the axes in the "dim" attribute (negative values count from the end)
+  //     are exactly the trailing axes of X: ndims-1, ndims-2, ...
+  // Returns false otherwise.
+  //
+  // NOTE(review): this check was previously described as "reducing throughout
+  // the most outer dims", while the code accepts only the highest-index
+  // (trailing) axes - confirm which wording matches oneDNN's fast path.
+  static bool HasOptimizedOneDNNKernel(const framework::ExecutionContext& ctx) {
+    const auto* input = ctx.Input<framework::LoDTensor>("X");
+    if (input->dtype() == experimental::DataType::BFLOAT16) {
+      return true;
+    }
+    auto reduce_dims = ctx.Attr<std::vector<int>>("dim");
+    const bool reduce_all = ctx.Attr<bool>("reduce_all");
+    const int ndims = input->dims().size();
+    if (reduce_all) {
+      return true;
+    }
+    // Map negative axes onto their non-negative equivalents.
+    for (auto& axis : reduce_dims) {
+      if (axis < 0) axis += ndims;
+    }
+    // Qualified call: do not rely on ADL to find std::sort.
+    std::sort(reduce_dims.begin(), reduce_dims.end());
+    // Walk the sorted axes from the largest down; the k-th largest must be
+    // ndims - 1 - k for the reduction to cover exactly the trailing axes.
+    int expected_axis = ndims - 1;
+    for (auto it = reduce_dims.rbegin(); it != reduce_dims.rend();
+         ++it, --expected_axis) {
+      if (*it != expected_axis) {
+        return false;
+      }
+    }
+    return true;
+  }
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    // choose cudnn kernel if the runtime supported.
@@ -554,7 +586,8 @@ class ReduceOp : public framework::OperatorWithKernel {
      return framework::OpKernelType(input_data_type, ctx.GetPlace());
 #ifdef PADDLE_WITH_MKLDNN
-    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type) &&
+        HasOptimizedOneDNNKernel(ctx)) {
      return framework::OpKernelType(input_data_type,
                                     ctx.GetPlace(),
                                     framework::DataLayout::kMKLDNN,

--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -690,8 +690,13 @@ class BinaryMKLDNNHandler
    auto attributes =
        CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops);
-    this->AcquireForwardPrimitiveDescriptor(
+    if (x->numel() < y->numel()) {
-        attributes, algo, src0_md, src1_md, dst_md);
+      this->AcquireForwardPrimitiveDescriptor(
+          attributes, algo, src1_md, src0_md, dst_md);
+    } else {
+      this->AcquireForwardPrimitiveDescriptor(
+          attributes, algo, src0_md, src1_md, dst_md);
+    }
  }
  std::shared_ptr<dnnl::memory> AcquireSecondSrcMemory(
      const framework::Tensor* input) {

--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
@@ -68,6 +68,14 @@ class TestMKLDNNElementwiseAddOp5(TestMKLDNNElementwiseAddOp):
        self.out = np.add(self.x, self.y)
+class TestMKLDNNElementwiseAddOpBroadcastXintoY(TestMKLDNNElementwiseAddOp):
+    """Elementwise add with X smaller than Y (X is broadcast into Y)."""
+
+    def init_input_output(self):
+        # X has a trailing dimension of 1, so it broadcasts against Y.
+        small_shape, large_shape = [2, 50, 1], [2, 50, 160]
+        self.x = np.random.uniform(1, 2, small_shape).astype(self.dtype)
+        self.y = np.random.uniform(1, 2, large_shape).astype(self.dtype)
+        self.out = self.x + self.y
 class TestMKLDNNElementwiseAddOp_broadcast_3(TestMKLDNNElementwiseAddOp):
    def init_input_output(self):