diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index a09fe4b67604130e4cc3aa1385f87586a258d886..6ec73b02ade11a03f230e85a3cb54efb04f3305b 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -277,7 +277,10 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
 #ifdef PADDLE_WITH_MKLDNN
     // If broadcasting is needed, use native implementation
     auto CanMKLDNNElementwiseAddGradBeUsed = [&]() {
-      return (ctx.Input<Tensor>("X")->dims() == ctx.Input<Tensor>("Y")->dims());
+      auto dx_dims = ctx.Input<Tensor>("X")->dims();
+      auto dy_dims = ctx.Input<Tensor>("Y")->dims();
+      // No broadcast or broadcasting of data on inner dims is supported
+      return (dx_dims[dx_dims.size() - 1] == dy_dims[dy_dims.size() - 1]);
     };
 
     if (this->CanMKLDNNBeUsed(ctx, input_data_type) &&
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
index 13acd3fa636804ad6534d407143cce07bdab2930..4db4adfe9e9ac4db1803827b4fa35b381ff4c19e 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
@@ -64,14 +64,29 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
     }
 
     if (dy) {
-      auto reorder_dst_memory_p =
-          handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace());
-      auto reorder_p =
-          handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
-      platform::RecordEvent record_reorder("int_reorder",
-                                           platform::EventRole::kUniqueOp);
-      reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
-      astream.wait();
+      // Direct copy
+      if (dout->dims() == dy->dims()) {
+        auto reorder_dst_memory_p =
+            handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace());
+        auto reorder_p =
+            handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
+        platform::RecordEvent record_reorder("int_reorder",
+                                             platform::EventRole::kUniqueOp);
+        reorder_p->execute(astream, *reorder_src_memory_p,
+                           *reorder_dst_memory_p);
+        astream.wait();
+      } else {
+        // Broadcasting
+        platform::ReductionMKLDNNHandler<T> handler_sum(
+            dnnl::algorithm::reduction_sum, 0.0f, 0.0f, dev_ctx, onednn_engine,
+            ctx.GetPlace(), dout, dy,
+            ctx.InputName(framework::GradVarName("Out")));
+        auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
+        auto reduction_p = handler_sum.AcquireForwardPrimitive();
+        reduction_p->execute(astream, {{DNNL_ARG_SRC, *reorder_src_memory_p},
+                                       {DNNL_ARG_DST, *dy_memory_p}});
+        astream.wait();
+      }
     }
   }
 };
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 3e02a8672c36094e12106ec838f9ec09d9b2565f..0503c3f71a80238bbbf2241905a4f69b9f2169f9 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <algorithm>
 #include <memory>
 #include <sstream>
 #include <string>
@@ -621,6 +622,49 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::binary> {
   }
 };
 
+template <typename T>
+class ReductionMKLDNNHandler
+    : public platform::MKLDNNHandlerT<T, dnnl::reduction> {
+ public:
+  ReductionMKLDNNHandler(const dnnl::algorithm algo, const float p,
+                         const float eps, const MKLDNNDeviceContext& dev_ctx,
+                         const mkldnn::engine engine, platform::Place cpu_place,
+                         const Tensor* x, const Tensor* y,
+                         const std::string& uniq_name)
+      : platform::MKLDNNHandlerT<T, dnnl::reduction>(
+            dev_ctx, engine, cpu_place,
+            platform::CreateKey(dev_ctx, framework::vectorize(x->dims()),
+                                uniq_name,
+                                (std::to_string(static_cast<int>(algo))))) {
+    if (!this->isCached()) {
+      PADDLE_ENFORCE_EQ(
+          x->layout(), DataLayout::kMKLDNN,
+          platform::errors::InvalidArgument("Wrong layout set for X tensor."));
+      PADDLE_ENFORCE_NE(
+          x->format(), MKLDNNMemoryFormat::undef,
+          platform::errors::InvalidArgument("Wrong format set for X tensor."));
+
+      const auto src_tz = framework::vectorize(x->dims());
+      const auto dst_tz = framework::vectorize(y->dims());
+
+      // For oneDNN dimensionality should match so we need to
+      // extend Y tensor dims with values of 1 (before and after pattern)
+      int j = 0;
+      std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
+      for (size_t i = 0; i < src_tz.size(); ++i) {
+        dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
+      }
+
+      const auto src_md = dnnl::memory::desc(
+          src_tz, platform::MKLDNNGetDataType<T>(), x->format());
+      const auto dst_md = memory::desc(
+          dst_tz_ex, platform::MKLDNNGetDataType<T>(), x->format());
+
+      this->AcquireForwardPrimitiveDescriptor(algo, src_md, dst_md, p, eps);
+    }
+  }
+};
+
 template <typename T>
 class ActivationMKLDNNHandler
     : public MKLDNNHandlerT<T, mkldnn::eltwise_forward,