From ac3dc0bbad72999d9c48f1494dee22a325809b95 Mon Sep 17 00:00:00 2001
From: joeqiao12 <45232181+joeqiao12@users.noreply.github.com>
Date: Tue, 25 Jan 2022 16:02:00 +0800
Subject: [PATCH] [MLU]add mlu kernel for split and concat (#39020)

* [MLU]add mlu kernel for concat and split op

* delete device_context DEPS
---
 paddle/fluid/operators/concat_op_mlu.cc       |  85 +++++++
 paddle/fluid/operators/split_op_mlu.cc        |  88 +++++++
 .../fluid/platform/device/mlu/CMakeLists.txt  |   2 +-
 .../tests/unittests/mlu/test_concat_op_mlu.py | 223 +++++++++++++++++
 .../tests/unittests/mlu/test_split_op_mlu.py  | 234 ++++++++++++++++++
 5 files changed, 631 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/operators/concat_op_mlu.cc
 create mode 100644 paddle/fluid/operators/split_op_mlu.cc
 create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_concat_op_mlu.py
 create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py

diff --git a/paddle/fluid/operators/concat_op_mlu.cc b/paddle/fluid/operators/concat_op_mlu.cc
new file mode 100644
index 00000000000..f7a1cae72be
--- /dev/null
+++ b/paddle/fluid/operators/concat_op_mlu.cc
@@ -0,0 +1,85 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/concat_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class ConcatMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto ins = ctx.MultiInput<framework::LoDTensor>("X");
+    framework::LoDTensor* out = ctx.Output<framework::LoDTensor>("Out");
+    PADDLE_ENFORCE_NOT_NULL(ins[0],
+                            platform::errors::NotFound(
+                                "The first input tensor is not initialized."));
+    auto axis = ctx.Attr<int>("axis");
+    auto ins_size = ins.size();
+    bool need_resize_out_dims = false;
+    if (ctx.HasInput("AxisTensor")) {
+      auto* axis_tensor = ctx.Input<framework::Tensor>("AxisTensor");
+      axis = GetDataFromTensor<int>(axis_tensor)[0];
+      need_resize_out_dims = true;
+    }
+    axis = ComputeAxis(static_cast<int64_t>(axis),
+                       static_cast<int64_t>(ins[0]->dims().size()));
+
+    if (need_resize_out_dims) {
+      const size_t n = ins.size();
+      std::vector<framework::DDim> ins_dims(n);
+      for (size_t i = 0; i < n; i++) {
+        ins_dims[i] = ins[i]->dims();
+      }
+
+      framework::DDim out_dims = ComputeAndCheckShape(true, ins_dims, axis);
+      out->Resize(out_dims);
+    }
+    const int axis_t = axis;
+    const int ins_size_t = ins_size;
+    auto place = ctx.GetPlace();
+    out->mutable_data<T>(place);
+
+    // prepare CNNL tensor descriptors and data pointers
+    // init ins tensors
+    std::vector<const void*> inputs;
+    std::vector<MLUCnnlTensorDesc> input_descs;
+    std::vector<cnnlTensorDescriptor_t> desc_vector;
+    for (size_t i = 0; i < ins_size; i++) {
+      input_descs.emplace_back(MLUCnnlTensorDesc(
+          *ins[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(ins[i]->type())));
+      desc_vector.push_back(input_descs.back().get());
+      inputs.push_back(GetBasePtr(ins[i]));
+    }
+    // init out tensors
+    MLUCnnlTensorDesc output_desc(*out, CNNL_LAYOUT_ARRAY,
+                                  ToCnnlDataType(out->type()));
+
+    // launch the CNNL concat kernel
+    MLUCnnl::Concat(ctx, ins_size_t, axis_t, desc_vector.data(), inputs.data(),
+                    output_desc.get(), GetBasePtr(out));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_MLU_KERNEL(concat, ops::ConcatMLUKernel<float>,
+                       ops::ConcatMLUKernel<paddle::platform::float16>,
+                       ops::ConcatMLUKernel<int64_t>,
+                       ops::ConcatMLUKernel<bool>, ops::ConcatMLUKernel<int>,
+                       ops::ConcatMLUKernel<uint8_t>);
diff --git a/paddle/fluid/operators/split_op_mlu.cc b/paddle/fluid/operators/split_op_mlu.cc
new file mode 100644
index 00000000000..c569c9bf091
--- /dev/null
+++ b/paddle/fluid/operators/split_op_mlu.cc
@@ -0,0 +1,88 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/split_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class SplitMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // init parameter
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
+    int num = ctx.Attr<int>("num");
+    std::vector<int> sections = ctx.Attr<std::vector<int>>("sections");
+    int axis = ctx.Attr<int>("axis");
+    auto in_dims = in->dims();
+    auto out_size = outs.size();
+    auto num_tensor = num == 0 ? out_size : num;
+
+    bool need_resize_outs_dims = false;
+    if (ctx.HasInput("AxisTensor")) {
+      auto* axis_tensor = ctx.Input<framework::Tensor>("AxisTensor");
+      axis = GetDataFromTensor<int>(axis_tensor)[0];
+      need_resize_outs_dims = true;
+    }
+    auto sections_tensor_list =
+        ctx.MultiInput<framework::Tensor>("SectionsTensorList");
+    if (sections_tensor_list.size() > 0) {
+      sections = GetDataFromTensorList<int>(sections_tensor_list);
+      need_resize_outs_dims = true;
+    }
+    if (need_resize_outs_dims) {
+      std::vector<framework::DDim> outs_dims =
+          UpdateOutsDims(true, true, in_dims, num, sections, axis, out_size);
+      for (size_t j = 0; j < outs.size(); ++j) {
+        outs[j]->Resize(outs_dims[j]);
+      }
+    }
+
+    // init out tensors
+    std::vector<void*> vct_tensor;
+    std::vector<MLUCnnlTensorDesc> output_descs;
+    std::vector<cnnlTensorDescriptor_t> desc_vector;
+    auto place = ctx.GetPlace();
+    for (size_t i = 0; i < outs.size(); i++) {
+      outs[i]->mutable_data<T>(ctx.GetPlace());
+      output_descs.emplace_back(MLUCnnlTensorDesc(
+          *outs[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(outs[i]->type())));
+      desc_vector.push_back(output_descs.back().get());
+      vct_tensor.push_back(GetBasePtr(outs[i]));
+    }
+    // init in tensors
+    MLUCnnlTensorDesc input_desc(*in, CNNL_LAYOUT_ARRAY,
+                                 ToCnnlDataType(in->type()));
+
+    // launch the CNNL split kernel
+    MLUCnnl::Split(ctx, num_tensor, axis, input_desc.get(), GetBasePtr(in),
+                   desc_vector.data(), vct_tensor.data());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_MLU_KERNEL(split, ops::SplitMLUKernel<float>,
+                       ops::SplitMLUKernel<int64_t>, ops::SplitMLUKernel<int>,
+                       ops::SplitMLUKernel<bool>,
+                       ops::SplitMLUKernel<plat::float16>);
diff --git a/paddle/fluid/platform/device/mlu/CMakeLists.txt b/paddle/fluid/platform/device/mlu/CMakeLists.txt
index 9ef4439f39b..a4584f54637 100644
--- a/paddle/fluid/platform/device/mlu/CMakeLists.txt
+++ b/paddle/fluid/platform/device/mlu/CMakeLists.txt
@@ -5,6 +5,6 @@ IF(WITH_MLU)
   cc_library(mlu_stream SRCS mlu_stream.cc DEPS boost mlu_info stream_callback_manager)
-  cc_library(mlu_device_context SRCS device_context.cc DEPS mlu_stream )
+  cc_library(mlu_device_context SRCS device_context.cc DEPS mlu_stream eigen3)
   cc_test(mlu_device_context_test SRCS device_context_test.cc DEPS mlu_device_context)
 ENDIF()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_concat_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_concat_op_mlu.py
new file mode 100644
index 00000000000..3bfa96b7001
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_concat_op_mlu.py
@@ -0,0 +1,223 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest, skip_check_grad_ci
+import paddle
+import paddle.fluid as fluid
+
+paddle.enable_static()
+SEED = 2021
+
+
+class TestConcatOp(OpTest):
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "concat"
+        self.place = paddle.device.MLUPlace(0)
+        self.init_dtype()
+        self.init_test_data()
+
+        self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
+        self.attrs = {'axis': self.axis}
+        if self.axis < 0:
+            self.actual_axis = self.axis + len(self.x0.shape)
+            self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0
+        else:
+            self.actual_axis = self.axis
+
+        self.outputs = {
+            'Out': np.concatenate(
+                (self.x0, self.x1, self.x2), axis=self.actual_axis)
+        }
+
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ['x0', 'x2'], 'Out')
+        self.check_grad_with_place(self.place, ['x1'], 'Out')
+        self.check_grad_with_place(self.place, ['x2'], 'Out')
+
+    def init_test_data(self):
+        self.x0 = np.random.random((1, 4, 50)).astype(self.dtype)
+        self.x1 = np.random.random((2, 4, 50)).astype(self.dtype)
+        self.x2 = np.random.random((3, 4, 50)).astype(self.dtype)
+        self.axis = 0
+
+
+class TestConcatOp2(TestConcatOp):
+    def init_test_data(self):
+        self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
+        self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
+        self.x2 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
+        self.axis = 1
+
+
+@skip_check_grad_ci(
+    reason="The function 'check_grad' for large inputs is too slow.")
+class TestConcatOp3(TestConcatOp):
+    def init_test_data(self):
+        self.x0 = np.random.random((1, 256, 170, 256)).astype(self.dtype)
+        self.x1 = np.random.random((1, 128, 170, 256)).astype(self.dtype)
+        self.x2 = np.random.random((1, 128, 170, 256)).astype(self.dtype)
+        self.axis = 1
+
+    def test_check_grad(self):
+        pass
+
+
+@skip_check_grad_ci(
+    reason="This test will meet fetch error when there is a null grad. The detailed information is in PR#17015."
+)
+class TestConcatOp4(TestConcatOp):
+    def init_test_data(self):
+        self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
+        self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
+        self.x2 = np.random.random((0, 3, 4, 5)).astype(self.dtype)
+        self.axis = 0
+
+    def test_check_grad(self):
+        pass
+
+
+class TestConcatOp5(TestConcatOp):
+    def init_test_data(self):
+        self.x0 = np.random.random((5, 1, 4, 5)).astype(self.dtype)
+        self.x1 = np.random.random((5, 2, 4, 5)).astype(self.dtype)
+        self.x2 = np.random.random((5, 3, 4, 5)).astype(self.dtype)
+        self.axis = -3
+
+
+#----------------Concat Fp16----------------
+def create_test_fp16(parent):
+    class TestConcatFp16(parent):
+        def init_dtype(self):
+            self.dtype = np.float16
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
+    TestConcatFp16.__name__ = cls_name
+    globals()[cls_name] = TestConcatFp16
+
+
+create_test_fp16(TestConcatOp)
+create_test_fp16(TestConcatOp2)
+create_test_fp16(TestConcatOp3)
+create_test_fp16(TestConcatOp4)
+create_test_fp16(TestConcatOp5)
+
+
+#----------------Concat Int64----------------
+def create_test_int64(parent):
+    class TestConcatInt64(parent):
+        def init_dtype(self):
+            self.dtype = np.int64
+
+        def test_check_grad(self):
+            pass
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Int64")
+    TestConcatInt64.__name__ = cls_name
+    globals()[cls_name] = TestConcatInt64
+
+
+create_test_int64(TestConcatOp)
+create_test_int64(TestConcatOp2)
+create_test_int64(TestConcatOp3)
+create_test_int64(TestConcatOp4)
+create_test_int64(TestConcatOp5)
+
+
+#----------------Concat Int32----------------
+def create_test_int32(parent):
+    class TestConcatInt32(parent):
+        def init_dtype(self):
+            self.dtype = np.int32
+
+        def test_check_grad(self):
+            pass
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Int32")
+    TestConcatInt32.__name__ = cls_name
+    globals()[cls_name] = TestConcatInt32
+
+
+create_test_int32(TestConcatOp)
+create_test_int32(TestConcatOp2)
+create_test_int32(TestConcatOp3)
+create_test_int32(TestConcatOp4)
+create_test_int32(TestConcatOp5)
+
+
+#----------------Concat AxisTensor----------------
+def create_test_AxisTensor(parent):
+    class TestConcatAxisTensor(parent):
+        def setUp(self):
+            self.op_type = "concat"
+            self.init_dtype()
+            self.init_test_data()
+
+            self.inputs = {
+                'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)],
+                'AxisTensor': np.array([self.axis]).astype("int32")
+            }
+            self.attrs = {}
+
+            if self.axis < 0:
+                self.actual_axis = self.axis + len(self.x0.shape)
+                self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0
+            else:
+                self.actual_axis = self.axis
+
+            self.outputs = {
+                'Out': np.concatenate(
+                    (self.x0, self.x1, self.x2), axis=self.actual_axis)
+            }
+
+            self.place = paddle.device.MLUPlace(0)
+            self.__class__.use_mlu = True
+
+        def init_test_data(self):
+            self.x0 = np.random.random((1, 4, 50)).astype(self.dtype)
+            self.x1 = np.random.random((2, 4, 50)).astype(self.dtype)
+            self.x2 = np.random.random((3, 4, 50)).astype(self.dtype)
+            self.axis = 0
+
+        def init_dtype(self):
+            self.dtype = np.float32
+
+    cls_name = "{0}_{1}".format(parent.__name__, "AxisTensor")
+    TestConcatAxisTensor.__name__ = cls_name
+    globals()[cls_name] = TestConcatAxisTensor
+
+
+create_test_AxisTensor(TestConcatOp)
+create_test_AxisTensor(TestConcatOp2)
+create_test_AxisTensor(TestConcatOp3)
+create_test_AxisTensor(TestConcatOp4)
+create_test_AxisTensor(TestConcatOp5)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py
new file mode 100644
index 00000000000..b8363545d22
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py
@@ -0,0 +1,234 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+
+paddle.enable_static()
+SEED = 2021
+
+
+class TestCase1(OpTest):
+    def setUp(self):
+        self.set_mlu()
+        self.set_example()
+        self.op_type = "split"
+        self.place = paddle.device.MLUPlace(0)
+        ipt = self.x.astype(self.dtype)
+        axis = self.axis if isinstance(self.axis, int) else int(self.axis[0])
+        tmp_outs = np.split(
+            ipt, axis=axis, indices_or_sections=self.num_or_sections)
+        tmp_outs = [o.astype(self.dtype) for o in tmp_outs]
+        self.outputs = {'Out': []}
+        self.outs = []
+        for i, o in enumerate(tmp_outs):
+            self.outputs["Out"].append((str(i), o))
+            self.outs.append(str(i))
+
+        self.attrs = {"axis": self.axis, "num": self.num_or_sections}
+        self.inputs = {}
+        self.inputs.update({'X': ipt.astype(self.dtype)})
+
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.__class__.op_type = "split"
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def set_example(self):
+        self.dtype = "float32"
+        self.x = np.random.random((2, 4, 6))
+        self.axis = 1
+        self.num_or_sections = 2
+
+
+class TestCase2(TestCase1):
+    def set_example(self):
+        self.dtype = "float32"
+        self.x = np.random.random((20, 4, 50))
+        self.axis = 0
+        self.num_or_sections = 4
+
+
+class TestCase4(TestCase1):
+    def set_example(self):
+        self.dtype = "float16"
+        self.x = np.random.random((4, 50, 20))
+        self.axis = 2
+        self.num_or_sections = 4
+
+
+# Test Sections
+class TestCase5(TestCase1):
+    def set_example(self):
+        super().set_example()
+        self.x = np.random.random((2, 10, 4))
+        self.axis = 1
+        self.num_or_sections = [2, 4, 8]
+
+    def setUp(self):
+        super().setUp()
+        self.attrs.update({"sections": [2, 2, 4, 2], "num": 0})
+
+
+class API_TestSplit(unittest.TestCase):
+    def test_out(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data = fluid.layers.data('data', shape=[-1, 10], dtype='float32')
+            x0, x1 = paddle.split(data, num_or_sections=(3, 7), axis=1)
+            place = fluid.MLUPlace(0)
+            exe = fluid.Executor(place)
+            input1 = np.random.random([1, 10]).astype('float32')
+            r0, r1 = exe.run(feed={"data": input1}, fetch_list=[x0, x1])
+            ex_x0, ex_x1 = np.split(input1, (3, ), axis=1)
+            self.assertTrue(np.allclose(ex_x0, r0))
+            self.assertTrue(np.allclose(ex_x1, r1))
+
+
+class API_TestSplit2(unittest.TestCase):
+    def test_out(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data = fluid.layers.data('data', shape=[-1, 10], dtype='float32')
+            x0, x1 = paddle.split(data, num_or_sections=2, axis=1)
+            place = fluid.MLUPlace(0)
+            exe = fluid.Executor(place)
+            input1 = np.random.random([1, 10]).astype('float32')
+            r0, r1 = exe.run(feed={"data": input1}, fetch_list=[x0, x1])
+            ex_x0, ex_x1 = np.split(input1, 2, axis=1)
+            self.assertTrue(np.allclose(ex_x0, r0))
+            self.assertTrue(np.allclose(ex_x1, r1))
+
+
+class API_TestDygraphSplit(unittest.TestCase):
+    def test_out1(self):
+        with fluid.dygraph.guard(paddle.MLUPlace(0)):
+            input_1 = np.random.random([4, 6, 6]).astype("int32")
+            # input is a variable which shape is [4, 6, 6]
+            input = fluid.dygraph.to_variable(input_1)
+            x0, x1, x2 = paddle.split(input, num_or_sections=3, axis=1)
+            x0_out = x0.numpy()
+            x1_out = x1.numpy()
+            x2_out = x2.numpy()
+            ex_x0, ex_x1, ex_x2 = np.split(input_1, 3, axis=1)
+            self.assertTrue(np.allclose(ex_x0, x0_out))
+            self.assertTrue(np.allclose(ex_x1, x1_out))
+            self.assertTrue(np.allclose(ex_x2, x2_out))
+
+    def test_out2(self):
+        with fluid.dygraph.guard(paddle.MLUPlace(0)):
+            input_1 = np.random.random([4, 6, 6]).astype("int32")
+            # input is a variable which shape is [4, 6, 6]
+            input = fluid.dygraph.to_variable(input_1)
+            x0, x1, x2 = paddle.split(input, num_or_sections=[1, 2, 3], axis=1)
+            x0_out = x0.numpy()
+            x1_out = x1.numpy()
+            x2_out = x2.numpy()
+            ex_x0, ex_x1, ex_x2 = np.split(input_1, (1, 3), axis=1)
+            self.assertTrue(np.allclose(ex_x0, x0_out))
+            self.assertTrue(np.allclose(ex_x1, x1_out))
+            self.assertTrue(np.allclose(ex_x2, x2_out))
+
+
+# attr(axis) is Tensor
+class TestSplitOp_AxisTensor(OpTest):
+    def setUp(self):
+        self._set_op_type()
+        self.dtype = self.get_dtype()
+        self.init_data()
+        self.inputs = {
+            'X': self.x,
+            'AxisTensor': np.array([self.axis]).astype("int32")
+        }
+        self.attrs = {'sections': self.sections, 'num': self.num}
+
+        out = np.split(self.x, self.indices_or_sections, self.axis)
+        self.outputs = {'Out': [('out%d' % i, out[i]) \
+                                for i in range(len(out))]}
+
+        self.place = paddle.device.MLUPlace(0)
+        self.__class__.use_mlu = True
+
+    def init_data(self):
+        self.x = np.random.random((4, 5, 6)).astype(self.dtype)
+        self.axis = 2
+        self.sections = []
+        self.num = 3
+        self.indices_or_sections = 3
+
+    def get_dtype(self):
+        return "float"
+
+    def _set_op_type(self):
+        self.op_type = "split"
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+
+class TestSplitOp_SectionsTensor(OpTest):
+    def setUp(self):
+        self._set_op_type()
+        self.dtype = self.get_dtype()
+        self.init_data()
+        self.inputs = {'X': self.x}
+
+        sections_tensor = []
+        for index, ele in enumerate(self.sections):
+            sections_tensor.append(("x" + str(index), np.ones(
+                (1)).astype('int32') * ele))
+
+        self.inputs['SectionsTensorList'] = sections_tensor
+
+        self.attrs = {
+            'axis': self.axis,
+            'sections': self.sections_infer,
+            'num': self.num
+        }
+
+        out = np.split(self.x, self.indices_or_sections, self.axis)
+        self.outputs = {'Out': [('out%d' % i, out[i]) \
+                                for i in range(len(out))]}
+
+        self.place = paddle.device.MLUPlace(0)
+        self.__class__.use_mlu = True
+
+    def init_data(self):
+        self.x = np.random.random((4, 5, 6)).astype(self.dtype)
+        self.axis = 1
+        self.sections = [2, 1, 2]
+        self.sections_infer = [-1, -1, -1]
+        self.num = 0
+        self.indices_or_sections = [2, 3]
+
+    def get_dtype(self):
+        return "float"
+
+    def _set_op_type(self):
+        self.op_type = "split"
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+
+if __name__ == '__main__':
+    unittest.main()
--
GitLab
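
A minimal usage sketch, not part of the patch itself: it mirrors the dygraph cases in test_split_op_mlu.py and assumes a WITH_MLU build of Paddle with at least one MLU device available, so that paddle.concat and paddle.split dispatch to the kernels added above.

import numpy as np
import paddle
import paddle.fluid as fluid

# Run eagerly on the first MLU card; concat/split below are served by the
# ConcatMLUKernel and SplitMLUKernel registered in this patch.
with fluid.dygraph.guard(paddle.MLUPlace(0)):
    x0 = fluid.dygraph.to_variable(np.random.random((1, 4, 50)).astype("float32"))
    x1 = fluid.dygraph.to_variable(np.random.random((2, 4, 50)).astype("float32"))

    out = paddle.concat([x0, x1], axis=0)                  # shape [3, 4, 50]
    parts = paddle.split(out, num_or_sections=3, axis=0)   # three [1, 4, 50] pieces
    print(out.shape, [p.shape for p in parts])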