diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index a400d27b798e37b47ce2740de8d95165f5155b1a..e6b1f6a1c18c38d94d9e3bc4807de7d8b952d60d 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -169,9 +169,21 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
-                                       ctx, framework::GradVarName("Out")),
-                                   ctx.GetPlace());
+    auto input_data_type = OperatorWithKernel::IndicateVarDataType(
+        ctx, framework::GradVarName("Out"));
+
+#ifdef PADDLE_WITH_MKLDNN
+    // An extra check that the "use_mkldnn" attribute exists is needed,
+    // because test_reverse_op calls the concat_grad kernel without setting
+    // "use_mkldnn" to any value
+    if (ctx.HasAttr("use_mkldnn") &&
+        this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 
   framework::OpKernelType GetKernelTypeForVar(
diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
index 57a56776736ff9132ded8682f3dc1c8841d30e48..4cc96a48bd26f45cdeb9626c6cdd7936a3ba9818 100644
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -23,6 +23,7 @@ namespace operators {
 
 using framework::DataLayout;
 using framework::Tensor;
+using framework::LoDTensor;
 using mkldnn::memory;
 using mkldnn::primitive;
 using mkldnn::concat;
@@ -149,6 +150,72 @@
     output->set_format(platform::GetMKLDNNFormat(*dst_mem));
   }
 };
+
+template <typename T>
+class ConcatGradMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    const auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& onednn_engine = dev_ctx.GetEngine();
+
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+
+    auto out_var_names = ctx.OutputNames(framework::GradVarName("X"));
+
+    const auto x = ctx.MultiInput<LoDTensor>("X");
+    const auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto dx = ctx.MultiOutput<LoDTensor>(framework::GradVarName("X"));
+
+    for (size_t i = 0; i < dx.size(); ++i) {
+      if (dx[i] != nullptr) {
+        dx[i]->set_lod(x[i]->lod());
+      }
+    }
+
+    int axis = ctx.Attr<int>("axis");
+    if (ctx.HasInput("AxisTensor")) {
+      auto* axis_tensor = ctx.Input<Tensor>("AxisTensor");
+      axis = GetDataFromTensor<int>(axis_tensor)[0];
+    }
+
+    auto dout_vec_dims = framework::vectorize(dout->dims());
+
+    axis = ComputeAxis(axis, dout_vec_dims.size());
+
+    std::vector<int64_t> offset(dout_vec_dims.size(), 0);
+
+    mkldnn::memory::data_type dout_type =
+        framework::ToMKLDNNDataType(dout->type());
+    platform::ReorderMKLDNNHandler reorder_handler(dout_vec_dims, dout->type(),
+                                                   dout_type, onednn_engine);
+    auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
+        dout->format(), platform::to_void_cast(dout->data<T>()));
+
+    for (size_t i = 0; i < dx.size(); ++i) {
+      if (out_var_names[i] != framework::kEmptyVarName &&
+          dx[i]->numel() != 0UL) {
+        auto dx_vec_dims = framework::vectorize(dx[i]->dims());
+        auto slice_mem_p = reorder_handler.AcquireSubmemory(
+            dx_vec_dims, offset, reorder_src_memory_p);
+
+        auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
+            dx[i], dx_vec_dims, dout->format(), ctx.GetPlace());
+        auto reorder_p =
+            reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p);
+
+        reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p);
+
+        offset[axis] += dx[i]->dims()[axis];
+
+        dx[i]->set_layout(framework::DataLayout::kMKLDNN);
+        dx[i]->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p));
+      }
+    }
+    astream.wait();
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -159,3 +226,7 @@ REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace,
                    ops::ConcatMKLDNNOpKernel<float>,
                    ops::ConcatMKLDNNOpKernel<paddle::platform::bfloat16>,
                    ops::ConcatMKLDNNOpKernel<int8_t>);
+
+REGISTER_OP_KERNEL(concat_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::ConcatGradMKLDNNOpKernel<float>,
+                   ops::ConcatGradMKLDNNOpKernel<paddle::platform::bfloat16>);
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py
index 2b7b2b36afa4fb22c3bdfbb9beb8415f2159d99d..e53afaa57be1c85a18d06719562950c29005a614 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py
@@ -40,13 +40,28 @@ class TestConcatBf16Op(OpTest):
             'mkldnn_data_type': self.mkldnn_data_type
         }
 
+        self.sections = [self.x0.shape[self.axis]] * 2
+        self.sections[1] += self.x1.shape[self.axis]
+
         self.output = np.concatenate(
             (self.x0, self.x1, self.x2), axis=self.axis).astype(np.uint16)
         self.outputs = {'Out': self.output}
 
+    def calculate_grads(self):
+        self.dout = self.outputs['Out']
+        self.dxs = np.split(self.dout, self.sections, self.axis)
+
     def test_check_output(self):
         self.check_output_with_place(core.CPUPlace())
 
+    def test_check_grad(self):
+        self.calculate_grads()
+        self.check_grad_with_place(
+            core.CPUPlace(), ["x0", "x1", "x2"],
+            "Out",
+            user_defined_grads=[self.dxs[0], self.dxs[1], self.dxs[2]],
+            user_defined_grad_outputs=[self.dout])
+
 
 # --------------------test concat bf16 in with axis 0--------------------
 
     def init_test_data(self):
@@ -61,9 +76,9 @@ class TestConcatBf16Op(OpTest):
         self.axis = 0
 
     def init_shape(self):
-        self.x0_shape = [2, 2, 1, 2]
-        self.x1_shape = [1, 2, 1, 2]
-        self.x2_shape = [3, 2, 1, 2]
+        self.x0_shape = [6, 2, 4, 3]
+        self.x1_shape = [7, 2, 4, 3]
+        self.x2_shape = [8, 2, 4, 3]
 
 
 # --------------------test concat bf16 in with axis 1--------------------
@@ -74,9 +89,9 @@ class TestAxis1Case(TestConcatBf16Op):
         self.axis = 1
 
     def init_shape(self):
-        self.x0_shape = [1, 1, 5, 5]
-        self.x1_shape = [1, 2, 5, 5]
-        self.x2_shape = [1, 3, 5, 5]
+        self.x0_shape = [1, 4, 5, 5]
+        self.x1_shape = [1, 8, 5, 5]
+        self.x2_shape = [1, 6, 5, 5]
 
 
 # --------------------test concat bf16 in with axis 2--------------------
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py
index 4900b42d3618d14c54a8a5beb2b027d7e415d047..7fc8f1d30802cdd309afcf4c5e32fc20ebddac4c 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py
@@ -15,78 +15,90 @@
 from __future__ import print_function
 
 import unittest
-from paddle.fluid.tests.unittests.test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3, TestConcatOp4
+import numpy as np
+import struct
-
-class TestMKLDNNConcatOp(TestConcatOp):
-    def setUp(self):
-        super(TestMKLDNNConcatOp, self).setUp()
-        self.attrs["use_mkldnn"] = True
-        self._cpu_only = True
-
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False))
-
-    def test_check_grad(self):
-        pass
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16
+from paddle import enable_static
 
 
-class TestMKLDNNConcatOp2(TestConcatOp2):
+class TestConcatAxis0OneDNNOp(OpTest):
     def setUp(self):
-        super(TestMKLDNNConcatOp2, self).setUp()
-        self.attrs["use_mkldnn"] = True
-        self._cpu_only = True
+        self.op_type = "concat"
+        self.mkldnn_data_type = "float32"
+        self.init_axis()
+        self.init_shape()
+        self.init_test_data()
+        self.configure_datatype()
+        self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
+        self.attrs = {
+            'axis': self.axis,
+            'use_mkldnn': True,
+            'mkldnn_data_type': self.mkldnn_data_type
+        }
+
+        self.output = np.concatenate(
+            (self.x0, self.x1, self.x2), axis=self.axis).astype(self.dtype)
+
+        self.outputs = {'Out': self.output}
+
+    def configure_datatype(self):
+        self.mkldnn_data_type = "float32"
+        self.dtype = np.float32
 
     def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False))
+        self.check_output_with_place(core.CPUPlace())
 
     def test_check_grad(self):
-        pass
+        self.check_grad(['x0'], 'Out')
+        self.check_grad(['x1'], 'Out')
+        self.check_grad(['x2'], 'Out')
 
-    def init_kernel_type(self):
-        self.use_mkldnn = True
+    def init_test_data(self):
+        self.x0 = np.random.random(self.x0_shape).astype(np.float32)
+        self.x1 = np.random.random(self.x1_shape).astype(np.float32)
+        self.x2 = np.random.random(self.x2_shape).astype(np.float32)
 
+    def init_axis(self):
+        self.axis = 0
 
-class TestMKLDNNConcatOp3(TestConcatOp3):
-    def setUp(self):
-        super(TestMKLDNNConcatOp3, self).setUp()
-        self.attrs["use_mkldnn"] = True
-        self._cpu_only = True
+    def init_shape(self):
+        self.x0_shape = [2, 2, 1, 50]
+        self.x1_shape = [1, 2, 1, 50]
+        self.x2_shape = [3, 2, 1, 50]
 
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False))
 
-    def test_check_grad(self):
-        pass
+class TestConcatAxis1OneDNNOp(TestConcatAxis0OneDNNOp):
+    def init_axis(self):
+        self.axis = 1
 
-    def init_kernel_type(self):
-        self.use_mkldnn = True
+    def init_shape(self):
+        self.x0_shape = [1, 1, 5, 50]
+        self.x1_shape = [1, 2, 5, 50]
+        self.x2_shape = [1, 3, 5, 50]
 
 
-class TestMKLDNNConcatOp4(TestConcatOp4):
-    def setUp(self):
-        super(TestMKLDNNConcatOp4, self).setUp()
-        self.attrs["use_mkldnn"] = True
-        self._cpu_only = True
+class TestConcatAxis2OneDNNOp(TestConcatAxis0OneDNNOp):
+    def init_axis(self):
+        self.axis = 2
 
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False))
+    def init_shape(self):
+        self.x0_shape = [2, 3, 4, 50]
+        self.x1_shape = [2, 3, 5, 50]
+        self.x2_shape = [2, 3, 6, 50]
 
-    def test_check_grad(self):
-        pass
 
-    def init_kernel_type(self):
-        self.use_mkldnn = True
+class TestConcatAxis3OneDNNOp(TestConcatAxis0OneDNNOp):
+    def init_axis(self):
+        self.axis = 3
+
+    def init_shape(self):
+        self.x0_shape = [5, 3, 5, 5]
+        self.x1_shape = [5, 3, 5, 6]
+        self.x2_shape = [5, 3, 5, 7]
 
 if __name__ == '__main__':
-    from paddle import enable_static
     enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py
index 10cd774ce04bec2c60cb2e672a20c5f52ae82449..5f936e577a06fd611a149a2501be7bd845cc7905 100644
--- a/python/paddle/fluid/tests/unittests/test_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
-from op_test import OpTest, skip_check_grad_ci
+from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard, core
 import paddle
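
Reviewer note, not part of the patch: the backward of concat is just a split of dout back into the input shapes along the concat axis. The oneDNN kernel above expresses this as one submemory reorder per input, advancing offset[axis] by dx[i]->dims()[axis] after each slice, and the bf16 test encodes the same arithmetic in self.sections. A minimal NumPy sketch of that bookkeeping (shapes borrowed from TestConcatAxis0OneDNNOp purely for illustration; any shapes that agree on the non-concat axes would do):

    import numpy as np

    axis = 0
    xs = [np.random.random(s).astype(np.float32)
          for s in ([2, 2, 1, 50], [1, 2, 1, 50], [3, 2, 1, 50])]

    out = np.concatenate(xs, axis=axis)
    dout = np.ones_like(out)  # upstream gradient

    # Split points are the cumulative sizes along the concat axis; np.split
    # at these indices undoes the concatenation, exactly the walk the kernel
    # performs with its running offset[axis].
    sections = np.cumsum([x.shape[axis] for x in xs])[:-1]
    dxs = np.split(dout, sections, axis=axis)

    # Each gradient slice matches its input's shape.
    assert all(dx.shape == x.shape for dx, x in zip(dxs, xs))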