PaddlePaddle / Paddle
Commit 39a5424e (unverified)

[oneDNN] elementwise add bf16 grad kernel with broadcasting (#31385)

Authored by Jacek Czaja on Mar 09, 2021; committed via GitHub on Mar 09, 2021.
Parent: 5f621321
Showing 7 changed files with 131 additions and 19 deletions (+131 / -19):
paddle/fluid/operators/elementwise/elementwise_op.h (+4 / -1)
paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc (+23 / -8)
paddle/fluid/platform/mkldnn_reuse.h (+44 / -0)
python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_bf16_mkldnn_op.py (+37 / -4)
python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py (+9 / -3)
python/paddle/fluid/tests/unittests/mkldnn/test_reshape_bf16_op.py (+4 / -3)
python/paddle/fluid/tests/unittests/op_test.py (+10 / -0)
paddle/fluid/operators/elementwise/elementwise_op.h

@@ -277,7 +277,10 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
 #ifdef PADDLE_WITH_MKLDNN
     // If broadcasting is needed, use native implementation
     auto CanMKLDNNElementwiseAddGradBeUsed = [&]() {
-      return (ctx.Input<Tensor>("X")->dims() == ctx.Input<Tensor>("Y")->dims());
+      auto dx_dims = ctx.Input<Tensor>("X")->dims();
+      auto dy_dims = ctx.Input<Tensor>("Y")->dims();
+      // No broadcast or broadcasting of data on inner dims is supported
+      return (dx_dims[dx_dims.size() - 1] == dy_dims[dy_dims.size() - 1]);
     };

     if (this->CanMKLDNNBeUsed(ctx, input_data_type) &&
...
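In effect the new check relaxes the previous exact-shape requirement: the oneDNN grad kernel is used as long as the innermost dims of X and Y agree, so only outer-dim broadcasting reaches the oneDNN path. A minimal Python sketch of that predicate (the function name is ours, for illustration only):

    def can_use_onednn_add_grad(dx_dims, dy_dims):
        # No broadcast, or broadcasting only over outer dims, is supported:
        # the innermost dimensions of X and Y must be equal.
        return dx_dims[-1] == dy_dims[-1]

    can_use_onednn_add_grad((2, 3, 4, 100), (100,))  # True: broadcast over outer dims
    can_use_onednn_add_grad((2, 3, 4, 100), (4, 1))  # False: native implementation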
paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc

@@ -64,16 +64,31 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
    }

    if (dy) {
      // Direct copy
      if (dout->dims() == dy->dims()) {
        auto reorder_dst_memory_p =
            handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace());
        auto reorder_p =
            handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
        platform::RecordEvent record_reorder("int_reorder",
                                             platform::EventRole::kUniqueOp);
        reorder_p->execute(astream, *reorder_src_memory_p,
                           *reorder_dst_memory_p);
        astream.wait();
      } else {
        // Broadcasting
        platform::ReductionMKLDNNHandler<T> handler_sum(
            dnnl::algorithm::reduction_sum, 0.0f, 0.0f, dev_ctx, onednn_engine,
            ctx.GetPlace(), dout, dy,
            ctx.InputName(framework::GradVarName("Out")));
        auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
        auto reduction_p = handler_sum.AcquireForwardPrimitive();
        reduction_p->execute(astream, {{DNNL_ARG_SRC, *reorder_src_memory_p},
                                       {DNNL_ARG_DST, *dy_memory_p}});
        astream.wait();
      }
    }
  }
};

}  // namespace operators
...
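The two branches mirror what NumPy would do for d(x+y)/dy: copy dout when shapes already match, otherwise sum dout over the broadcast axes. A rough NumPy model of this grad kernel, assuming only the outer-dim broadcast that the eligibility check permits (the function name is ours):

    import numpy as np

    def elementwise_add_grad_dy(dout, y_shape):
        if dout.shape == tuple(y_shape):
            return dout.copy()  # direct copy (reorder) path
        # broadcasting path: left-pad y_shape with 1s to dout's rank, then
        # reduce dout over every axis where the padded dim differs
        padded = [1] * (dout.ndim - len(y_shape)) + list(y_shape)
        axes = tuple(i for i, d in enumerate(padded) if d != dout.shape[i])
        return dout.sum(axis=axes).reshape(y_shape)  # reduction_sum path

    dout = np.ones((2, 3, 4, 100), dtype=np.float32)
    print(elementwise_add_grad_dy(dout, [100]).shape)  # (100,), each entry 24.0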
paddle/fluid/platform/mkldnn_reuse.h

@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once

 #include <algorithm>
+#include <memory>
 #include <sstream>
 #include <string>
...

@@ -621,6 +622,49 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::binary> {
  }
};

template <typename T>
class ReductionMKLDNNHandler
    : public platform::MKLDNNHandlerT<T, dnnl::reduction> {
 public:
  ReductionMKLDNNHandler(const dnnl::algorithm algo, const float p,
                         const float eps, const MKLDNNDeviceContext& dev_ctx,
                         const mkldnn::engine engine, platform::Place cpu_place,
                         const Tensor* x, const Tensor* y,
                         const std::string& uniq_name)
      : platform::MKLDNNHandlerT<T, dnnl::reduction>(
            dev_ctx, engine, cpu_place,
            platform::CreateKey(dev_ctx, framework::vectorize(x->dims()),
                                uniq_name,
                                (std::to_string(static_cast<int>(algo))))) {
    if (!this->isCached()) {
      PADDLE_ENFORCE_EQ(
          x->layout(), DataLayout::kMKLDNN,
          platform::errors::InvalidArgument("Wrong layout set for X tensor."));
      PADDLE_ENFORCE_NE(
          x->format(), MKLDNNMemoryFormat::undef,
          platform::errors::InvalidArgument("Wrong format set for X tensor."));

      const auto src_tz = framework::vectorize(x->dims());
      const auto dst_tz = framework::vectorize(y->dims());

      // For oneDNN dimensionality should match so we need to
      // extend Y tensor dims with values of 1 (before and after pattern)
      int j = 0;
      std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
      for (size_t i = 0; i < src_tz.size(); ++i) {
        dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
      }

      const auto src_md = dnnl::memory::desc(
          src_tz, platform::MKLDNNGetDataType<T>(), x->format());
      const auto dst_md = memory::desc(
          dst_tz_ex, platform::MKLDNNGetDataType<T>(), x->format());

      this->AcquireForwardPrimitiveDescriptor(algo, src_md, dst_md, p, eps);
    }
  }
};

template <typename T>
class ActivationMKLDNNHandler
    : public MKLDNNHandlerT<T, mkldnn::eltwise_forward,
...
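The extension loop aligns Y's dims with X's by inserting 1s wherever a dim of X is not matched, since the oneDNN reduction primitive requires src and dst to have the same dimensionality. The same logic in a short, hypothetical Python form (here j is bounds-checked for safety):

    def extend_dims_with_ones(src_tz, dst_tz):
        # Consume dst dims in order; any unmatched src dim becomes 1
        # (covers 1s both before and after the matched pattern).
        j = 0
        dst_tz_ex = [1] * len(src_tz)
        for i, d in enumerate(src_tz):
            if j < len(dst_tz) and d == dst_tz[j]:
                dst_tz_ex[i] = dst_tz[j]
                j += 1
        return dst_tz_ex

    print(extend_dims_with_ones([2, 3, 4, 100], [100]))  # [1, 1, 1, 100]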
python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_bf16_mkldnn_op.py

@@ -45,13 +45,13 @@ class TestElementwiseAddBf16MklDNNOp(OpTest):
     def test_check_output(self):
         self.check_output_with_place(core.CPUPlace())

-    # elementwise_add grad is just passing upper gradients to either X or Y or both
+    # elementwise_add grad (no broadcasting) is just passing upper gradients to either X or Y or both
     def test_check_grad_normal(self):
         self.check_grad_with_place(
             core.CPUPlace(), ["X", "Y"],
             "Out",
             check_dygraph=False,
-            user_defined_grads=[self.x_bf16, self.x_bf16],
+            user_defined_grads=[self.x, self.x],
+            user_defined_grad_outputs=[self.x_bf16])

     def test_check_grad_ingore_x(self):
...

@@ -59,7 +59,7 @@ class TestElementwiseAddBf16MklDNNOp(OpTest):
             core.CPUPlace(), ["Y"],
             "Out",
             check_dygraph=False,
-            user_defined_grads=[self.y_bf16],
+            user_defined_grads=[self.y],
+            user_defined_grad_outputs=[self.y_bf16])

     def test_check_grad_ingore_y(self):
...

@@ -67,7 +67,40 @@ class TestElementwiseAddBf16MklDNNOp(OpTest):
             core.CPUPlace(), ["X"],
             "Out",
             check_dygraph=False,
-            user_defined_grads=[self.x_bf16],
+            user_defined_grads=[self.x],
+            user_defined_grad_outputs=[self.x_bf16])

+class TestElementwiseAddBroadCastingBf16MklDNNOp(TestElementwiseAddBf16MklDNNOp):
+    def generate_data(self):
+        self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(np.float32)
+        self.y = np.random.uniform(1, 2, [100]).astype(np.float32)
+        self.out = np.add(self.x, self.y)
+
+    # Compute partial sums along all axes but last one
+    def compute_reduced_gradients(self, out_grads):
+        part_sum = np.add.reduceat(out_grads, [0], axis=0)
+        part_sum = np.add.reduceat(part_sum, [0], axis=1)
+        part_sum = np.add.reduceat(part_sum, [0], axis=2)
+        return part_sum.flatten()
+
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(
+            core.CPUPlace(), ["X", "Y"],
+            "Out",
+            check_dygraph=False,
+            user_defined_grads=[
+                self.x, self.compute_reduced_gradients(self.x)
+            ],
+            user_defined_grad_outputs=[self.x_bf16])
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad_with_place(
+            core.CPUPlace(), ["Y"],
+            "Out",
+            check_dygraph=False,
+            user_defined_grads=[self.compute_reduced_gradients(self.x)],
+            user_defined_grad_outputs=[self.x_bf16])
...
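Each np.add.reduceat call with indices [0] collapses one axis to size 1, so the chain above is just a sum over every axis except the last. A quick equivalence check (ours, not part of the test):

    import numpy as np

    g = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(np.float32)
    part_sum = np.add.reduceat(g, [0], axis=0)         # shape (1, 3, 4, 100)
    part_sum = np.add.reduceat(part_sum, [0], axis=1)  # shape (1, 1, 4, 100)
    part_sum = np.add.reduceat(part_sum, [0], axis=2)  # shape (1, 1, 1, 100)
    assert np.allclose(part_sum.flatten(), g.sum(axis=(0, 1, 2)))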
python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py

@@ -17,6 +17,7 @@ import unittest
 import numpy as np
 from paddle.fluid.tests.unittests.op_test import skip_check_grad_ci
 from paddle.fluid.tests.unittests.test_elementwise_add_op import TestElementwiseAddOp
+from paddle import enable_static


 class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp):
...

@@ -51,13 +52,17 @@ class TestMKLDNNElementwiseAddOp4(TestMKLDNNElementwiseAddOp):
     def test_check_grad_normal(self):
         pass

     def test_check_grad_ingore_x(self):
         pass

     def test_check_grad_ingore_y(self):
         pass

+class TestMKLDNNElementwiseAddOp5(TestMKLDNNElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype)
+        self.y = np.random.uniform(1, 2, [100]).astype(self.dtype)
+        self.out = np.add(self.x, self.y)

 class TestMKLDNNElementwiseAddOp_broadcast_3(TestMKLDNNElementwiseAddOp):
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
...

@@ -150,4 +155,5 @@ class TestUint8Scales(TestInt8Scales):
 if __name__ == '__main__':
+    enable_static()
     unittest.main()
python/paddle/fluid/tests/unittests/mkldnn/test_reshape_bf16_op.py

@@ -50,8 +50,9 @@ class TestReshapeBf16Op(OpTest):
         self.infered_shape = (10, 2, 3, -1)

     def init_input_data(self):
-        self.input_data = convert_float_to_uint16(
-            np.random.random(self.ori_shape).astype(np.float32))
+        self.input_data_fp32 = np.random.random(self.ori_shape).astype(np.float32)
+        self.input_data = convert_float_to_uint16(self.input_data_fp32)

     def test_check_output(self):
         self.check_output_with_place(core.CPUPlace(), no_check_set=['XShape'])
...

@@ -61,7 +62,7 @@ class TestReshapeBf16Op(OpTest):
             core.CPUPlace(), ["X"],
             "Out",
             check_dygraph=False,
-            user_defined_grads=[self.inputs["X"]],
+            user_defined_grads=[self.input_data_fp32],
             user_defined_grad_outputs=[
                 self.inputs["X"].reshape(self.infered_shape)
             ])
...
python/paddle/fluid/tests/unittests/op_test.py

@@ -1452,6 +1452,16 @@ class OpTest(unittest.TestCase):
         analytic_grads = self._get_gradient(inputs_to_check, place,
                                             output_names, no_grad_set,
                                             user_defined_grad_outputs)

+        # comparison of bf16 results will happen as fp32
+        # loop over list of grads and convert bf16 to fp32
+        fp32_grads = []
+        for grad in analytic_grads:
+            if grad.dtype == np.uint16:
+                grad = convert_uint16_to_float(grad)
+                max_relative_error = 0.03
+            fp32_grads.append(grad)
+        analytic_grads = fp32_grads

         self._assert_is_close(numeric_grads, analytic_grads, inputs_to_check,
                               max_relative_error,
                               "Gradient Check On %s" % str(place))
...
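Paddle's test framework carries bf16 tensors as np.uint16 arrays holding the top 16 bits of an fp32 value, which is why bf16 grads are converted before comparison and the tolerance is loosened to 3%. A conceptual sketch of the conversion (ours; the real convert_uint16_to_float helper may differ in detail):

    import numpy as np

    def bf16_bits_to_float(bits):
        # bf16 is the high half of an IEEE-754 fp32: move the stored 16 bits
        # into the upper half of a uint32 and reinterpret as float32.
        return (bits.astype(np.uint32) << 16).view(np.float32)

    x = np.array([1.5], dtype=np.float32)
    bits = (x.view(np.uint32) >> 16).astype(np.uint16)      # pack by truncation
    assert bf16_bits_to_float(bits)[0] == np.float32(1.5)   # 1.5 is exact in bf16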