diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
index 61552e492dfa198af833cbb85e374d97bf77a601..7f6566460ab6290c623c03567cc21f3cd24b77be 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
@@ -75,8 +75,8 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
         ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
 
-    const auto* x = ctx.Input<Tensor>("X");
-    const auto* y = ctx.Input<Tensor>("Y");
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
     auto* z = ctx.Output<Tensor>("Out");
 
     float scale_x = ctx.Attr<float>("Scale_x");
@@ -96,6 +96,12 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
                                              scale_o,
                                              get_post_ops(ctx));
 
+    // oneDNN's binary is optimized for broadcasting y into x, so swap when x is
+    // smaller. NOTE(review): result-preserving only for commutative ops; verify
+    if (x->numel() < y->numel()) {
+      std::swap(x, y);
+    }
+
     const auto src_x_memory = handler.AcquireSrcMemory(x);
     const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
     // (jczaja) For Inplace src and dst should be the same memory object.
@@ -159,6 +165,13 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
+    // oneDNN's binary is optimized for broadcasting y into x, so swap x/y (and
+    // dx/dy) when x is smaller. NOTE(review): verify for non-commutative ops
+    if (x->numel() < y->numel()) {
+      std::swap(x, y);
+      std::swap(dx, dy);
+    }
+
     int axis = ctx.Attr<int>("axis");
 
     auto tz = phi::vectorize<int64_t>(dout->dims());
diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
index cefa4fc1b995bd3429d7b01448326dda06df148c..837d4357737a265db9311c99ac5e79a3064fcf3c 100644
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -77,7 +77,24 @@ class ConcatMKLDNNHandler
     }
 
     auto dst_dims = phi::vectorize<int64_t>(output->dims());
-    auto dst_md = memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any);
+
+    dnnl::memory::desc dst_md;
+
+    // If concat is used as a stack op (every source has size 1 along
+    // concat_axis), the primitive may pick a non-optimal memory format tag for
+    // the destination, because it derives the tag from the source memory
+    // descriptors: e.g. 200x1x10 can be described as both abc and bac, which
+    // share the exact same physical layout, yet bac gets chosen for the
+    // destination no matter which formats are set on the inputs. In that
+    // scenario we enforce a dense format, because it is the most common one
+    // and should be the best in terms of performance.
+    // (Detected below as dst size along concat_axis == number of sources.)
+    if (dst_dims[concat_axis] == static_cast<int64_t>(srcs_md.size())) {
+      dst_md = memory::desc(
+          dst_dims, dt, platform::GetPlainMKLDNNFormat(dst_dims.size()));
+    } else {
+      dst_md = memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any);
+    }
 
     this->AcquireForwardPrimitiveDescriptor(dst_md, concat_axis, srcs_md);
   }
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index ec3cf1908c5b5d430f510a9068d3ea7a979d680c..e9bc3905a22ee30cfb5dae3efa5a5ee53e463fbb 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -545,6 +545,38 @@ class ReduceOp : public framework::OperatorWithKernel {
     }
   }
 
+  // Returns true when the oneDNN reduce kernel should be preferred over the
+  // native one: always for bf16 input (native reduce kernels don't support
+  // bf16) and for reduce_all; otherwise only when the "dim" attribute, after
+  // normalizing negative axes, is exactly the trailing axes of X (ndims-1,
+  // ndims-2, ...). Any other reduction falls back to the native kernel.
+  static bool HasOptimizedOneDNNKernel(const framework::ExecutionContext& ctx) {
+    const auto* x = ctx.Input<framework::LoDTensor>("X");
+    if (x->dtype() == experimental::DataType::BFLOAT16) {
+      return true;
+    }
+    if (ctx.Attr<bool>("reduce_all")) {
+      return true;
+    }
+
+    const int ndims = x->dims().size();
+    auto reduce_dims = ctx.Attr<std::vector<int>>("dim");
+    for (auto& dim : reduce_dims) {
+      if (dim < 0) dim += ndims;
+    }
+    // Qualified call: do not rely on ADL to find std::sort.
+    std::sort(reduce_dims.begin(), reduce_dims.end());
+
+    // Walk the sorted axes from the back; the k-th axis from the end must be
+    // ndims-1-k. Signed arithmetic avoids the int/size_t mix.
+    int expected_axis = ndims - 1;
+    for (auto it = reduce_dims.rbegin(); it != reduce_dims.rend(); ++it) {
+      if (*it != expected_axis--) return false;
+    }
+
+    return true;
+  }
+
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
@@ -554,7 +586,8 @@ class ReduceOp : public framework::OperatorWithKernel {
       return framework::OpKernelType(input_data_type, ctx.GetPlace());
 
 #ifdef PADDLE_WITH_MKLDNN
-    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type) &&
+        HasOptimizedOneDNNKernel(ctx)) {
       return framework::OpKernelType(input_data_type,
                                      ctx.GetPlace(),
                                      framework::DataLayout::kMKLDNN,
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 41a4f551cedc1ed925a02bd52c0055e5db7d4d3c..2f4bbfaf74fcc32e816badc904e2ef1c7e4be63f 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -690,8 +690,13 @@ class BinaryMKLDNNHandler
     auto attributes =
         CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops);
 
-    this->AcquireForwardPrimitiveDescriptor(
-        attributes, algo, src0_md, src1_md, dst_md);
+    if (x->numel() < y->numel()) {
+      this->AcquireForwardPrimitiveDescriptor(
+          attributes, algo, src1_md, src0_md, dst_md);
+    } else {
+      this->AcquireForwardPrimitiveDescriptor(
+          attributes, algo, src0_md, src1_md, dst_md);
+    }
   }
   std::shared_ptr<dnnl::memory> AcquireSecondSrcMemory(
       const framework::Tensor* input) {
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
index 2ae717d64a30256e71d5e8a1ccf949c526341ec5..dc9a3862e0421fcc5ad8ef1101f566d47e08359d 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
@@ -68,6 +68,14 @@ class TestMKLDNNElementwiseAddOp5(TestMKLDNNElementwiseAddOp):
         self.out = np.add(self.x, self.y)
 
 
+class TestMKLDNNElementwiseAddOpBroadcastXintoY(TestMKLDNNElementwiseAddOp):
+    # X holds fewer elements than Y, so X is the operand that gets broadcast.
+    def init_input_output(self):
+        self.x = np.random.uniform(1, 2, (2, 50, 1)).astype(self.dtype)
+        self.y = np.random.uniform(1, 2, (2, 50, 160)).astype(self.dtype)
+        self.out = self.x + self.y
+
+
 class TestMKLDNNElementwiseAddOp_broadcast_3(TestMKLDNNElementwiseAddOp):
 
     def init_input_output(self):