Added concat BF16/FP32 BWD OneDNN kernel (#35889)

* tmp
* added concat BF16/FP32 BWD oneDNN kernel
* minor change
* minor change
* fix for CI
* added formatting
* Reverted deleting static keyword
* added reviewers' suggestions
* reverted deleting concat bf16 test file
* fixed concat tests

Added concat BF16/FP32 BWD OneDNN kernel (#35889)
* tmp
* added concat BF16/FP32 BWD oneDNN kernel
* minor change
* minor change
* fix for CI
* added formatting
* Reverted deleting static keyword
* added reviewers' suggestions
* reverted deleting concat bf16 test file
* fixed concat tests
dc4d5719 · jakpiase · GitHub · 2cee0ea7 · dc4d5719 · dc4d5719
5 changed files
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -169,9 +169,21 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+    auto input_data_type = OperatorWithKernel::IndicateVarDataType(
-                                       ctx, framework::GradVarName("Out")),
+        ctx, framework::GradVarName("Out"));
-                                   ctx.GetPlace());
+#ifdef PADDLE_WITH_MKLDNN
+    // Pick the MKLDNN kernel type when it can be used for Out@GRAD's data type.
+    // The extra HasAttr("use_mkldnn") check is needed because test_reverse_op
+    // calls the concat_grad kernel without setting "use_mkldnn" to any value.
+    if (ctx.HasAttr("use_mkldnn") &&
+        this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
  }
  framework::OpKernelType GetKernelTypeForVar(

--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -23,6 +23,7 @@ namespace operators {
 using framework::DataLayout;
 using framework::Tensor;
+using framework::LoDTensor;
 using mkldnn::memory;
 using mkldnn::primitive;
 using mkldnn::concat;
@@ -149,6 +150,72 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    output->set_format(platform::GetMKLDNNFormat(*dst_mem));
  }
 };
+// oneDNN kernel for concat_grad: splits the gradient of Out (dOut) back into
+// one gradient tensor per concatenated input.
+//
+// Every requested X@GRAD is produced by a reorder whose source is a
+// sub-memory (slice) of dOut that starts at a running offset along the
+// concatenation axis and has the shape of that gradient tensor.
+template <typename T>
+class ConcatGradMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  // Reads inputs "X", Out@GRAD and the optional "AxisTensor", plus the "axis"
+  // attribute; writes the X@GRAD outputs.
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    const auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& onednn_engine = dev_ctx.GetEngine();
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+    auto out_var_names = ctx.OutputNames(framework::GradVarName("X"));
+    const auto x = ctx.MultiInput<LoDTensor>("X");
+    const auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto dx = ctx.MultiOutput<LoDTensor>(framework::GradVarName("X"));
+    // Each gradient tensor that is present inherits the LoD of its forward
+    // input.
+    for (size_t i = 0; i < dx.size(); ++i) {
+      if (dx[i] != nullptr) {
+        dx[i]->set_lod(x[i]->lod());
+      }
+    }
+    // "AxisTensor", when provided, takes precedence over the "axis" attribute.
+    int axis = ctx.Attr<int>("axis");
+    if (ctx.HasInput("AxisTensor")) {
+      auto* axis_tensor = ctx.Input<Tensor>("AxisTensor");
+      axis = GetDataFromTensor<int>(axis_tensor)[0];
+    }
+    auto dout_vec_dims = framework::vectorize(dout->dims());
+    axis = ComputeAxis(axis, dout_vec_dims.size());
+    // Start coordinates of the current slice inside dOut; only the entry for
+    // the concatenation axis ever changes.
+    std::vector<int64_t> offset(dout_vec_dims.size(), 0);
+    mkldnn::memory::data_type dout_type =
+        framework::ToMKLDNNDataType(dout->type());
+    platform::ReorderMKLDNNHandler reorder_handler(dout_vec_dims, dout->type(),
+                                                   dout_type, onednn_engine);
+    auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
+        dout->format(), platform::to_void_cast(dout->data<T>()));
+    for (size_t i = 0; i < dx.size(); ++i) {
+      if (out_var_names[i] != framework::kEmptyVarName &&
+          dx[i]->numel() != 0UL) {
+        auto dx_vec_dims = framework::vectorize(dx[i]->dims());
+        auto slice_mem_p = reorder_handler.AcquireSubmemory(
+            dx_vec_dims, offset, reorder_src_memory_p);
+        auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
+            dx[i], dx_vec_dims, dout->format(), ctx.GetPlace());
+        auto reorder_p =
+            reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p);
+        reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p);
+        dx[i]->set_layout(framework::DataLayout::kMKLDNN);
+        dx[i]->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p));
+      }
+      // Advance past input i's slice of dOut even when its gradient was
+      // skipped above; otherwise every later gradient would be read from the
+      // wrong position. The forward input's extent is used because dx[i] may
+      // be null for a skipped output.
+      // NOTE(review): assumes X[i] and X@GRAD[i] have identical dims - confirm
+      // against ConcatOpGrad::InferShape.
+      offset[axis] += x[i]->dims()[axis];
+    }
+    // Wait for all reorders submitted to the stream before returning.
+    astream.wait();
+  }
+};
 }  // namespace operators
 }  // namespace paddle
@@ -159,3 +226,7 @@ REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::ConcatMKLDNNOpKernel<paddle::platform::bfloat16>,
                   ops::ConcatMKLDNNOpKernel<int8_t>,
                   ops::ConcatMKLDNNOpKernel<uint8_t>);
+REGISTER_OP_KERNEL(concat_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::ConcatGradMKLDNNOpKernel<float>,
+                   ops::ConcatGradMKLDNNOpKernel<paddle::platform::bfloat16>);
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py
@@ -40,13 +40,28 @@ class TestConcatBf16Op(OpTest):
            'mkldnn_data_type': self.mkldnn_data_type
        }
+        self.sections = [self.x0.shape[self.axis]] * 2
+        self.sections[1] += self.x1.shape[self.axis]
        self.output = np.concatenate(
            (self.x0, self.x1, self.x2), axis=self.axis).astype(np.uint16)
        self.outputs = {'Out': self.output}
+    def calculate_grads(self):
+        self.dout = self.outputs['Out']
+        self.dxs = np.split(self.dout, self.sections, self.axis)
    def test_check_output(self):
        self.check_output_with_place(core.CPUPlace())
+    def test_check_grad(self):
+        self.calculate_grads()
+        self.check_grad_with_place(
+            core.CPUPlace(), ["x0", "x1", "x2"],
+            "Out",
+            user_defined_grads=[self.dxs[0], self.dxs[1], self.dxs[2]],
+            user_defined_grad_outputs=[self.dout])
 # --------------------test concat bf16 in with axis 0--------------------
    def init_test_data(self):
@@ -61,9 +76,9 @@ class TestConcatBf16Op(OpTest):
        self.axis = 0
    def init_shape(self):
-        self.x0_shape = [2, 2, 1, 2]
+        self.x0_shape = [6, 2, 4, 3]
-        self.x1_shape = [1, 2, 1, 2]
+        self.x1_shape = [7, 2, 4, 3]
-        self.x2_shape = [3, 2, 1, 2]
+        self.x2_shape = [8, 2, 4, 3]
 # --------------------test concat bf16 in with axis 1--------------------
@@ -74,9 +89,9 @@ class TestAxis1Case(TestConcatBf16Op):
        self.axis = 1
    def init_shape(self):
-        self.x0_shape = [1, 1, 5, 5]
+        self.x0_shape = [1, 4, 5, 5]
-        self.x1_shape = [1, 2, 5, 5]
+        self.x1_shape = [1, 8, 5, 5]
-        self.x2_shape = [1, 3, 5, 5]
+        self.x2_shape = [1, 6, 5, 5]
 # --------------------test concat bf16 in with axis 2--------------------

--- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py
@@ -15,78 +15,90 @@
 from __future__ import print_function
 import unittest
-from paddle.fluid.tests.unittests.test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3, TestConcatOp4
+import numpy as np
+import struct
+import paddle.fluid.core as core
-class TestMKLDNNConcatOp(TestConcatOp):
+from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16
-    def setUp(self):
+from paddle import enable_static
-        super(TestMKLDNNConcatOp, self).setUp()
-        self.attrs["use_mkldnn"] = True
-        self._cpu_only = True
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False))
-    def test_check_grad(self):
-        pass
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-class TestMKLDNNConcatOp2(TestConcatOp2):
+class TestConcatAxis0OneDNNOp(OpTest):
    def setUp(self):
-        super(TestMKLDNNConcatOp2, self).setUp()
+        self.op_type = "concat"
-        self.attrs["use_mkldnn"] = True
+        self.mkldnn_data_type = "float32"
-        self._cpu_only = True
+        self.init_axis()
+        self.init_shape()
+        self.init_test_data()
+        self.configure_datatype()
+        self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
+        self.attrs = {
+            'axis': self.axis,
+            'use_mkldnn': True,
+            'mkldnn_data_type': self.mkldnn_data_type
+        }
+        self.output = np.concatenate(
+            (self.x0, self.x1, self.x2), axis=self.axis).astype(self.dtype)
+        self.outputs = {'Out': self.output}
+    def configure_datatype(self):
+        self.mkldnn_data_type = "float32"
+        self.dtype = np.float32
    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
+        self.check_output_with_place(core.CPUPlace())
-        self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False))
    def test_check_grad(self):
-        pass
+        self.check_grad(['x0'], 'Out')
+        self.check_grad(['x1'], 'Out')
+        self.check_grad(['x2'], 'Out')
-    def init_kernel_type(self):
+    def init_test_data(self):
-        self.use_mkldnn = True
+        self.x0 = np.random.random(self.x0_shape).astype(np.float32)
+        self.x1 = np.random.random(self.x1_shape).astype(np.float32)
+        self.x2 = np.random.random(self.x2_shape).astype(np.float32)
+    def init_axis(self):
+        self.axis = 0
-class TestMKLDNNConcatOp3(TestConcatOp3):
+    def init_shape(self):
-    def setUp(self):
+        self.x0_shape = [2, 2, 1, 50]
-        super(TestMKLDNNConcatOp3, self).setUp()
+        self.x1_shape = [1, 2, 1, 50]
-        self.attrs["use_mkldnn"] = True
+        self.x2_shape = [3, 2, 1, 50]
-        self._cpu_only = True
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False))
-    def test_check_grad(self):
+class TestConcatAxis1OneDNNOp(TestConcatAxis0OneDNNOp):
-        pass
+    def init_axis(self):
+        self.axis = 1
-    def init_kernel_type(self):
+    def init_shape(self):
-        self.use_mkldnn = True
+        self.x0_shape = [1, 1, 5, 50]
+        self.x1_shape = [1, 2, 5, 50]
+        self.x2_shape = [1, 3, 5, 50]
-class TestMKLDNNConcatOp4(TestConcatOp4):
+class TestConcatAxis2OneDNNOp(TestConcatAxis0OneDNNOp):
-    def setUp(self):
+    def init_axis(self):
-        super(TestMKLDNNConcatOp4, self).setUp()
+        self.axis = 2
-        self.attrs["use_mkldnn"] = True
-        self._cpu_only = True
-    def test_check_output(self):
+    def init_shape(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
+        self.x0_shape = [2, 3, 4, 50]
-        self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False))
+        self.x1_shape = [2, 3, 5, 50]
+        self.x2_shape = [2, 3, 6, 50]
-    def test_check_grad(self):
-        pass
-    def init_kernel_type(self):
+class TestConcatAxis3OneDNNOp(TestConcatAxis0OneDNNOp):
-        self.use_mkldnn = True
+    def init_axis(self):
+        self.axis = 3
+    def init_shape(self):
+        self.x0_shape = [5, 3, 5, 5]
+        self.x1_shape = [5, 3, 5, 6]
+        self.x2_shape = [5, 3, 5, 7]
 if __name__ == '__main__':
-    from paddle import enable_static
    enable_static()
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
-from op_test import OpTest, skip_check_grad_ci
+from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard, core
 import paddle