add expand_v2/expand_as_v2 for kunlun (#37592)

* add expand_v2/expand_as_v2 for kunlun * update expand_as_v2 * update expand_as_v2 * support float16/bool * update xpu.cmake

add expand_v2/expand_as_v2 for kunlun (#37592)
* add expand_v2/expand_as_v2 for kunlun * update expand_as_v2 * update expand_as_v2 * support float16/bool * update xpu.cmake
dae4e7f2 · TTerror · GitHub · 1b00fc48 · dae4e7f2 · dae4e7f2
8 changed files
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -35,7 +35,7 @@ ELSE ()
 ENDIF()
 SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211107")
+SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211129")
 SET(XPU_XRE_URL  "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)

--- a/paddle/fluid/operators/expand_as_v2_op_xpu.cc
+++ b/paddle/fluid/operators/expand_as_v2_op_xpu.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/operators/expand_as_v2_op.h"
+namespace paddle {
+namespace operators {
+// XPU kernel for the expand_as_v2 op: broadcasts input "X" to the shape given
+// by the "target_shape" attribute by calling xpu::broadcast.
+template <typename T>
+class ExpandAsV2XPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+ public:
+  // Validates the rank constraints and then delegates to ExpandAs():
+  //   rank(X) >= 1, rank(target_shape) >= rank(X),
+  //   rank(target_shape) <= MAX_RANK_SUPPORTED.
+  // MAX_RANK_SUPPORTED is not defined in this file; presumably it comes from
+  // expand_as_v2_op.h -- TODO confirm.
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto rank = context.Input<Tensor>("X")->dims().size();
+    auto target_shape = context.Attr<std::vector<int>>("target_shape");
+    auto target_rank = target_shape.size();
+    PADDLE_ENFORCE_GE(target_rank, rank,
+                      platform::errors::InvalidArgument(
+                          "The rank (%d) of the input 'target_tensor' for "
+                          "expand_as_v2 op must be greater than or equal to "
+                          "the rank (%d) of the input 'x'.",
+                          target_rank, rank));
+    PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument(
+                                   "The rank (%d) of the input 'x' for "
+                                   "expand_as_v2 op must be positive.",
+                                   rank));
+    PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED,
+                      platform::errors::InvalidArgument(
+                          "The rank (%d) of the input 'target_tensor' for "
+                          "expand_as_v2 op must be less than or equal to %d.",
+                          target_rank, MAX_RANK_SUPPORTED));
+    ExpandAs(context);
+  }
+ protected:
+  // Performs the broadcast. Steps:
+  //   1. Left-pad X's dims with 1s so they have as many entries as
+  //      target_shape.
+  //   2. Reject any target dimension equal to 0, and any input dimension that
+  //      is neither 1 nor equal to the matching target dimension.
+  //   3. Resize and allocate "Out", then call xpu::broadcast.
+  void ExpandAs(const framework::ExecutionContext& context) const {
+    auto* in0 = context.Input<framework::Tensor>("X");
+    auto in_dims = in0->dims();
+    auto target_shape = context.Attr<std::vector<int>>("target_shape");
+    auto vec_in_dims = framework::vectorize<int>(in_dims);
+    // Unsigned subtraction: this relies on Compute() having already enforced
+    // target_shape.size() >= rank(X), otherwise the result would wrap around.
+    auto diff = target_shape.size() - vec_in_dims.size();
+    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
+      PADDLE_ENFORCE_NE(target_shape[i], 0,
+                        platform::errors::InvalidArgument(
+                            "The value of target shape cannot be zero."));
+      // Only dimensions of size 1 may be stretched; all others must match.
+      if (vec_in_dims[i] != 1) {
+        PADDLE_ENFORCE_EQ(
+            vec_in_dims[i], target_shape[i],
+            platform::errors::InvalidArgument(
+                "The value (%d) of the non-singleton dimension does not match"
+                " the corresponding value (%d) in "
+                "target tensor for expand_as_v2 op.",
+                vec_in_dims[i], target_shape[i]));
+      }
+    }
+    auto* out0 = context.Output<framework::Tensor>("Out");
+    framework::DDim out_dims = framework::make_ddim(target_shape);
+    out0->Resize(out_dims);
+    out0->mutable_data<T>(context.GetPlace());
+    // The padded input shape (same length as the output shape) is what gets
+    // passed to xpu::broadcast.
+    auto& in0_shape = vec_in_dims;
+    auto out0_shape = framework::vectorize<int>(out_dims);
+    const auto& dev_ctx =
+        context.template device_context<paddle::platform::XPUDeviceContext>();
+    int r = XPU_SUCCESS;
+    // bool tensors are broadcast through the int8_t instantiation of
+    // xpu::broadcast; every other type uses its XPUTypeTrait mapping.
+    if (std::is_same<T, bool>::value) {
+      auto in0_data = reinterpret_cast<const int8_t*>(in0->data<T>());
+      auto out0_data = reinterpret_cast<int8_t*>(out0->data<T>());
+      r = xpu::broadcast<int8_t>(dev_ctx.x_context(), in0_data, out0_data,
+                                 in0_shape, out0_shape);
+    } else {
+      auto in0_data = reinterpret_cast<const XPUType*>(in0->data<T>());
+      auto out0_data = reinterpret_cast<XPUType*>(out0->data<T>());
+      r = xpu::broadcast<XPUType>(dev_ctx.x_context(), in0_data, out0_data,
+                                  in0_shape, out0_shape);
+    }
+    // Any status other than XPU_SUCCESS is surfaced as an External error.
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU API(broadcast) return wrong "
+                                   "value[%d %s] in ExpandAsV2XPUKernel.",
+                                   r, XPUAPIErrorMsg[r]));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(expand_as_v2, ops::ExpandAsV2XPUKernel<float>,
+                       ops::ExpandAsV2XPUKernel<paddle::platform::float16>,
+                       ops::ExpandAsV2XPUKernel<bool>,
+                       ops::ExpandAsV2XPUKernel<int>,
+                       ops::ExpandAsV2XPUKernel<int64_t>);
+#endif
--- a/paddle/fluid/operators/expand_v2_op.h
+++ b/paddle/fluid/operators/expand_v2_op.h
@@ -41,6 +41,12 @@ inline std::vector<int> get_expand_shape(
      TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
      shape_data = cpu_shape_tensor.data<int>();
    }
+#endif
+#ifdef PADDLE_WITH_XPU
+    if (platform::is_xpu_place(shape_tensor->place())) {
+      TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
+      shape_data = cpu_shape_tensor.data<int>();
+    }
 #endif
    auto vec_shape =
        std::vector<int>(shape_data, shape_data + shape_tensor->numel());
@@ -65,6 +71,13 @@ inline std::vector<int> get_expand_shape(
        TensorCopySync(*tensor, platform::CPUPlace(), &temp);
        vec_epxand_shape.push_back(*temp.data<int32_t>());
      }
+#endif
+#ifdef PADDLE_WITH_XPU
+      else if (platform::is_xpu_place(tensor->place())) {  // NOLINT
+        framework::Tensor temp;
+        TensorCopySync(*tensor, platform::CPUPlace(), &temp);
+        vec_epxand_shape.push_back(*temp.data<int32_t>());
+      }
 #endif
      else {  // NOLINT
        vec_epxand_shape.push_back(*tensor->data<int32_t>());

--- a/paddle/fluid/operators/expand_v2_op_xpu.cc
+++ b/paddle/fluid/operators/expand_v2_op_xpu.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/operators/expand_v2_op.h"
+namespace paddle {
+namespace operators {
+// XPU kernel for the expand_v2 op: broadcasts input "X" to the shape returned
+// by get_expand_shape() by calling xpu::broadcast.
+//
+// Per-dimension rules, applied after X's dims are left-padded with 1s up to
+// the length of the requested shape:
+//   * a requested size of 0 is rejected;
+//   * leading dimensions that X does not have must be positive;
+//   * a positive size must equal X's size unless X's size is 1;
+//   * the only accepted negative size is -1, which keeps X's size.
+template <typename T>
+class ExpandV2XPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+ public:
+  // Validates the requested shape against X, resizes and allocates "Out",
+  // and runs the broadcast. Raises InvalidArgument on a bad shape and
+  // External when xpu::broadcast does not return XPU_SUCCESS.
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<framework::Tensor>("X");
+    auto* Out = context.Output<framework::Tensor>("Out");
+    auto in_dims = X->dims();
+    auto expand_shape = get_expand_shape(context);
+    auto vec_in_dims = framework::vectorize<int>(in_dims);
+    // Check the ranks before computing the padding width below. `diff` is an
+    // unsigned difference of two sizes, so a shape shorter than X's rank
+    // would wrap to a huge value and make insert() fail before a meaningful
+    // error could be reported.
+    auto rank = X->dims().size();
+    PADDLE_ENFORCE_GE(
+        rank, 1,
+        platform::errors::InvalidArgument(
+            "The rank of the input 'X' for expand_v2 op must be positive, "
+            "but the value received is %d.",
+            rank));
+    auto shape_size = expand_shape.size();
+    PADDLE_ENFORCE_GE(
+        shape_size, rank,
+        platform::errors::InvalidArgument(
+            "The number (%d) of elements of 'shape' for expand_v2 op must "
+            "be greater than or equal to the rank (%d) of the input 'X'.",
+            shape_size, rank));
+    auto diff = expand_shape.size() - vec_in_dims.size();
+    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+    std::vector<int> final_expand_shape(vec_in_dims.size());
+    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
+      PADDLE_ENFORCE_NE(expand_shape[i], 0,
+                        platform::errors::InvalidArgument(
+                            "The expanded size cannot be zero."));
+      if (i < diff) {  // expand_shape = [3,4,-1,-1], X = [10,2] -->
+                       // final_expand_shape = [3,4,10,2]
+        PADDLE_ENFORCE_GT(
+            expand_shape[i], 0,
+            platform::errors::InvalidArgument(
+                "The expanded size (%d) for non-existing dimensions must be "
+                "positive for expand_v2 op.",
+                expand_shape[i]));
+        final_expand_shape[i] = expand_shape[i];
+      } else if (expand_shape[i] > 0) {  // expand_shape = [3,4,10,4], X =
+                                         // [10,1] --> final_expand_shape =
+                                         // [3,4,10,4]
+        // Only dimensions of size 1 may be stretched; others must match.
+        if (vec_in_dims[i] != 1) {
+          PADDLE_ENFORCE_EQ(
+              vec_in_dims[i], expand_shape[i],
+              platform::errors::InvalidArgument(
+                  "The value (%d) of the non-singleton dimension does not match"
+                  " the corresponding value (%d) in shape for expand_v2 op.",
+                  vec_in_dims[i], expand_shape[i]));
+        }
+        final_expand_shape[i] = expand_shape[i];
+      } else {  // expand_shape = [3,4,-1,-1], X = [10,2] --> final_expand_shape
+                // = [3,4,10,2]
+        PADDLE_ENFORCE_EQ(
+            expand_shape[i], -1,
+            platform::errors::InvalidArgument(
+                "When the value in shape is negative for expand_v2 op, "
+                "only -1 is supported, but the value received is %d.",
+                expand_shape[i]));
+        final_expand_shape[i] = vec_in_dims[i];
+      }
+    }
+    framework::DDim out_dims = framework::make_ddim(final_expand_shape);
+    Out->Resize(out_dims);
+    Out->mutable_data<T>(context.GetPlace());
+    // The padded input shape (same length as the output shape) is what gets
+    // passed to xpu::broadcast.
+    auto& x_shape = vec_in_dims;
+    auto out_shape = framework::vectorize<int>(out_dims);
+    const auto& dev_ctx =
+        context.template device_context<paddle::platform::XPUDeviceContext>();
+    int r = XPU_SUCCESS;
+    // bool tensors are broadcast through the int8_t instantiation of
+    // xpu::broadcast; every other type uses its XPUTypeTrait mapping.
+    if (std::is_same<T, bool>::value) {
+      auto x_data = reinterpret_cast<const int8_t*>(X->data<T>());
+      auto out_data = reinterpret_cast<int8_t*>(Out->data<T>());
+      r = xpu::broadcast<int8_t>(dev_ctx.x_context(), x_data, out_data, x_shape,
+                                 out_shape);
+    } else {
+      auto x_data = reinterpret_cast<const XPUType*>(X->data<T>());
+      auto out_data = reinterpret_cast<XPUType*>(Out->data<T>());
+      r = xpu::broadcast<XPUType>(dev_ctx.x_context(), x_data, out_data,
+                                  x_shape, out_shape);
+    }
+    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
+                                          "XPU API(broadcast) return wrong "
+                                          "value[%d %s] in ExpandV2XPUKernel.",
+                                          r, XPUAPIErrorMsg[r]));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(expand_v2, ops::ExpandV2XPUKernel<float>,
+                       ops::ExpandV2XPUKernel<paddle::platform::float16>,
+                       ops::ExpandV2XPUKernel<bool>,
+                       ops::ExpandV2XPUKernel<int>,
+                       ops::ExpandV2XPUKernel<int64_t>);
+#endif
--- a/paddle/fluid/platform/device/xpu/xpu1_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu1_op_list.h
@@ -321,7 +321,18 @@ XPUOpMap& get_kl1_ops() {
      {"momuntem", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
      {"iou_similarity",
       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
-      {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}
+      {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"expand_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
+                                  pOpKernelType(vartype::INT64, XPUPlace()),
+                                  pOpKernelType(vartype::BOOL, XPUPlace()),
+                                  pOpKernelType(vartype::FP16, XPUPlace()),
+                                  pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"expand_as_v2",
+       XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
+                     pOpKernelType(vartype::INT64, XPUPlace()),
+                     pOpKernelType(vartype::BOOL, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace()),
+                     pOpKernelType(vartype::FP32, XPUPlace())})},
      // AddMore
  };

--- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -271,7 +271,24 @@ XPUOpMap& get_kl2_ops() {
      {"masked_select",
       XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
                     pOpKernelType(vartype::INT64, XPUPlace()),
-                     pOpKernelType(vartype::FP32, XPUPlace())})}
+                     pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"expand_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
+                                  pOpKernelType(vartype::INT64, XPUPlace()),
+                                  pOpKernelType(vartype::BOOL, XPUPlace()),
+                                  pOpKernelType(vartype::FP16, XPUPlace()),
+                                  pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"expand_as_v2",
+       XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
+                     pOpKernelType(vartype::INT64, XPUPlace()),
+                     pOpKernelType(vartype::BOOL, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace()),
+                     pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"depthwise_conv2d",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"depthwise_conv2d_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
      // AddMore
  };

--- a/python/paddle/fluid/tests/unittests/xpu/test_expand_as_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_expand_as_v2_op_xpu.py
+#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+from op_test_xpu import XPUOpTest
+import paddle
+import paddle.fluid as fluid
+paddle.enable_static()
+np.random.seed(10)
+class TestExpandAsOpRank1(XPUOpTest):
+    """expand_as_v2 on XPU: float32 input of shape (100,) -> (2, 100)."""
+    def setUp(self):
+        self.set_xpu()
+        self.op_type = "expand_as_v2"
+        self.place = paddle.XPUPlace(0)
+        source = np.random.rand(100).astype("float32")
+        # Only the shape of this reference tensor is used.
+        reference = np.random.rand(2, 100).astype("float32")
+        self.inputs = {'X': source}
+        self.attrs = {'target_shape': reference.shape}
+        # Expected result: the input repeated twice along a new leading axis.
+        self.outputs = {'Out': np.tile(source, [2, 1])}
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+    def test_check_grad(self):
+        # No gradient check is performed for this case.
+        pass
+class TestExpandAsOpRank2(XPUOpTest):
+    """expand_as_v2 on XPU: float32 (10, 12) -> (10, 12), i.e. no expansion."""
+    def setUp(self):
+        self.set_xpu()
+        self.op_type = "expand_as_v2"
+        self.place = paddle.XPUPlace(0)
+        source = np.random.rand(10, 12).astype("float32")
+        # Only the shape of this reference tensor is used.
+        reference = np.random.rand(10, 12).astype("float32")
+        self.inputs = {'X': source}
+        self.attrs = {'target_shape': reference.shape}
+        # Tiling once per axis leaves the data unchanged.
+        self.outputs = {'Out': np.tile(source, [1, 1])}
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+    def test_check_grad(self):
+        # No gradient check is performed for this case.
+        pass
+class TestExpandAsOpRank3(XPUOpTest):
+    """expand_as_v2 on XPU: float32 (2, 3, 20) -> (2, 3, 20), no expansion."""
+    def setUp(self):
+        self.set_xpu()
+        self.op_type = "expand_as_v2"
+        self.place = paddle.XPUPlace(0)
+        source = np.random.rand(2, 3, 20).astype("float32")
+        # Only the shape of this reference tensor is used.
+        reference = np.random.rand(2, 3, 20).astype("float32")
+        self.inputs = {'X': source}
+        self.attrs = {'target_shape': reference.shape}
+        # Tiling once per axis leaves the data unchanged.
+        self.outputs = {'Out': np.tile(source, [1, 1, 1])}
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+    def test_check_grad(self):
+        # No gradient check is performed for this case.
+        pass
+class TestExpandAsOpRank4(XPUOpTest):
+    """expand_as_v2 on XPU: float32 (1, 1, 7, 16) -> (4, 6, 7, 16)."""
+    def setUp(self):
+        self.set_xpu()
+        self.op_type = "expand_as_v2"
+        self.place = paddle.XPUPlace(0)
+        source = np.random.rand(1, 1, 7, 16).astype("float32")
+        # Only the shape of this reference tensor is used.
+        reference = np.random.rand(4, 6, 7, 16).astype("float32")
+        self.inputs = {'X': source}
+        self.attrs = {'target_shape': reference.shape}
+        # The two leading singleton axes are repeated 4 and 6 times.
+        self.outputs = {'Out': np.tile(source, [4, 6, 1, 1])}
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+    def test_check_grad(self):
+        # No gradient check is performed for this case.
+        pass
+class TestExpandAsOpRank5(XPUOpTest):
+    """expand_as_v2 on XPU: int32 (1, 1, 7, 16) -> (4, 6, 7, 16)."""
+    def setUp(self):
+        self.set_xpu()
+        self.place = paddle.XPUPlace(0)
+        self.op_type = "expand_as_v2"
+        # np.random.rand() draws from [0, 1), so casting it to an integer
+        # type gives an all-zero tensor. Draw real integers instead so that a
+        # wrong broadcast cannot pass by comparing zeros with zeros.
+        x = np.random.randint(0, 100, size=(1, 1, 7, 16)).astype("int32")
+        target_shape = (4, 6, 7, 16)
+        self.inputs = {'X': x}
+        self.attrs = {'target_shape': target_shape}
+        bcast_dims = [4, 6, 1, 1]
+        output = np.tile(self.inputs['X'], bcast_dims)
+        self.outputs = {'Out': output}
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+    def test_check_grad(self):
+        # No gradient check is performed for this case.
+        pass
+class TestExpandAsOpRank6(XPUOpTest):
+    """expand_as_v2 on XPU: int64 (1, 1, 7, 16) -> (4, 6, 7, 16)."""
+    def setUp(self):
+        self.set_xpu()
+        self.place = paddle.XPUPlace(0)
+        self.op_type = "expand_as_v2"
+        # np.random.rand() draws from [0, 1), so casting it to an integer
+        # type gives an all-zero tensor. Draw real integers instead so that a
+        # wrong broadcast cannot pass by comparing zeros with zeros.
+        x = np.random.randint(0, 100, size=(1, 1, 7, 16)).astype("int64")
+        target_shape = (4, 6, 7, 16)
+        self.inputs = {'X': x}
+        self.attrs = {'target_shape': target_shape}
+        bcast_dims = [4, 6, 1, 1]
+        output = np.tile(self.inputs['X'], bcast_dims)
+        self.outputs = {'Out': output}
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+    def test_check_grad(self):
+        # No gradient check is performed for this case.
+        pass
+class TestExpandAsOpRank6BOOL(XPUOpTest):
+    """expand_as_v2 on XPU: bool (1, 1, 7, 16) -> (4, 6, 7, 16)."""
+    def setUp(self):
+        self.set_xpu()
+        self.place = paddle.XPUPlace(0)
+        self.op_type = "expand_as_v2"
+        # Casting np.random.rand() to bool maps every non-zero sample to True,
+        # which yields an (almost surely) constant tensor. Draw 0/1 values so
+        # the input holds a real mix of False and True.
+        x = np.random.randint(0, 2, size=(1, 1, 7, 16)).astype("bool")
+        target_shape = (4, 6, 7, 16)
+        self.inputs = {'X': x}
+        self.attrs = {'target_shape': target_shape}
+        bcast_dims = [4, 6, 1, 1]
+        output = np.tile(self.inputs['X'], bcast_dims)
+        self.outputs = {'Out': output}
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+    def test_check_grad(self):
+        # No gradient check is performed for this case.
+        pass
+class TestExpandAsOpRank6FP16(XPUOpTest):
+    """expand_as_v2 on XPU: float16 (1, 1, 7, 16) -> (4, 6, 7, 16)."""
+    def setUp(self):
+        self.set_xpu()
+        self.op_type = "expand_as_v2"
+        self.place = paddle.XPUPlace(0)
+        source = np.random.rand(1, 1, 7, 16).astype("float16")
+        # Only the shape of this reference tensor is used.
+        reference = np.random.rand(4, 6, 7, 16).astype("float16")
+        self.inputs = {'X': source}
+        self.attrs = {'target_shape': reference.shape}
+        # The two leading singleton axes are repeated 4 and 6 times.
+        self.outputs = {'Out': np.tile(source, [4, 6, 1, 1])}
+    def set_xpu(self):
+        cls = self.__class__
+        cls.use_xpu = True
+        cls.no_need_check_grad = True
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+    def test_check_grad(self):
+        # No gradient check is performed for this case.
+        pass
+# Test python API
+class TestExpandAsV2API(unittest.TestCase):
+    """Runs paddle.expand_as through the static-graph Python API on XPU."""
+    def test_api(self):
+        # Build inside a private Program pair so the ops created here do not
+        # accumulate in the process-wide default program.
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input1 = np.random.random([12, 14]).astype("float32")
+            input2 = np.random.random([2, 12, 14]).astype("float32")
+            x = fluid.layers.data(
+                name='x',
+                shape=[12, 14],
+                append_batch_size=False,
+                dtype="float32")
+            y = fluid.layers.data(
+                name='target_tensor',
+                shape=[2, 12, 14],
+                append_batch_size=False,
+                dtype="float32")
+            out_1 = paddle.expand_as(x, y=y)
+            exe = fluid.Executor(place=fluid.XPUPlace(0))
+            res_1 = exe.run(fluid.default_main_program(),
+                            feed={"x": input1,
+                                  "target_tensor": input2},
+                            fetch_list=[out_1])
+            # Use a unittest assertion: a bare `assert` is removed when Python
+            # runs with -O, which would silently disable the check.
+            self.assertTrue(
+                np.array_equal(res_1[0], np.tile(input1, (2, 1, 1))))
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/xpu/test_expand_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_expand_v2_op_xpu.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import unittest
+import sys
+import numpy as np
+sys.path.append("..")
+from op_test import OpTest
+from op_test_xpu import XPUOpTest
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+import paddle
+paddle.enable_static()
+np.random.seed(10)
+# The XPU expand_v2 kernel is registered for X: float32, float16, bool, int32, int64
+# Situation 1: shape is a list(without tensor)
+class TestExpandV2XPUOpRank1(XPUOpTest):
+    """Base expand_v2 XPU case where 'shape' is a plain attribute.
+
+    Subclasses override init_data() to supply ori_shape (input shape),
+    shape (the 'shape' attribute) and expand_times (np.tile repetitions used
+    to compute the expected output).
+    """
+    def setUp(self):
+        self.set_xpu()
+        self.op_type = "expand_v2"
+        self.place = paddle.XPUPlace(0)
+        self.dtype = np.float32
+        self.init_data()
+        data = np.random.random(self.ori_shape).astype(self.dtype)
+        self.inputs = {'X': data}
+        self.attrs = {'shape': self.shape}
+        self.outputs = {'Out': np.tile(data, self.expand_times)}
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+    def init_data(self):
+        # Identity case: a 100-element vector expanded to the same shape.
+        self.ori_shape = [100]
+        self.shape = [100]
+        self.expand_times = [1]
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+    def test_check_grad(self):
+        # No gradient check is performed for this case.
+        pass
+class TestExpandV2OpRank2_DimExpanding(TestExpandV2XPUOpRank1):
+    """Input [120] expanded to [2, 120]: the shape adds a leading dimension."""
+    def init_data(self):
+        self.ori_shape = [120]
+        self.shape = [2, 120]
+        self.expand_times = [2, 1]
+class TestExpandV2OpRank2(TestExpandV2XPUOpRank1):
+    """Input [1, 140] expanded to [12, 140]: a size-1 axis is repeated."""
+    def init_data(self):
+        self.ori_shape = [1, 140]
+        self.shape = [12, 140]
+        self.expand_times = [12, 1]
+class TestExpandV2OpRank3_Corner(TestExpandV2XPUOpRank1):
+    """Input (2, 10, 5) with an identical shape: the output equals the input."""
+    def init_data(self):
+        self.ori_shape = (2, 10, 5)
+        self.shape = (2, 10, 5)
+        self.expand_times = (1, 1, 1)
+class TestExpandV2OpRank4(TestExpandV2XPUOpRank1):
+    """Every shape entry is -1; the expected output equals the input."""
+    def init_data(self):
+        self.ori_shape = (2, 4, 5, 7)
+        self.shape = (-1, -1, -1, -1)
+        self.expand_times = (1, 1, 1, 1)
+class TestExpandV2OpRank5(TestExpandV2XPUOpRank1):
+    """Mix of explicit sizes and -1: only the size-1 third axis grows to 4."""
+    def init_data(self):
+        self.ori_shape = (2, 4, 1, 15)
+        self.shape = (2, -1, 4, -1)
+        self.expand_times = (1, 1, 4, 1)
+class TestExpandV2OpRank6(TestExpandV2XPUOpRank1):
+    """Rank-3 input (4, 1, 30) expanded to a rank-4 shape containing a -1.
+
+    Expected output shape is (2, 4, 4, 30): a new leading axis of 2, and the
+    size-1 axis repeated 4 times.
+    """
+    def init_data(self):
+        self.ori_shape = (4, 1, 30)
+        self.shape = (2, -1, 4, 30)
+        self.expand_times = (2, 1, 4, 1)
+# Situation 2: shape is a list(with tensor)
+class TestExpandV2OpXPURank1_tensor_attr(XPUOpTest):
+    """Base expand_v2 XPU case where the shape is a list of 1-element tensors.
+
+    Each entry of expand_shape is fed as its own int32 tensor through the
+    'expand_shapes_tensor' input, while infer_expand_shape is passed as the
+    'shape' attribute. Subclasses override init_data().
+    """
+    def setUp(self):
+        self.set_xpu()
+        self.op_type = "expand_v2"
+        self.place = paddle.XPUPlace(0)
+        self.init_data()
+        self.dtype = np.float32
+        # One named (name, int32 array of length 1) pair per requested size.
+        shape_tensors = [("x" + str(pos), np.ones((1)).astype('int32') * size)
+                         for pos, size in enumerate(self.expand_shape)]
+        data = np.random.random(self.ori_shape).astype(self.dtype)
+        self.inputs = {
+            'X': data,
+            'expand_shapes_tensor': shape_tensors,
+        }
+        self.attrs = {"shape": self.infer_expand_shape}
+        self.outputs = {'Out': np.tile(data, self.expand_times)}
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+    def init_data(self):
+        self.ori_shape = [100]
+        self.expand_times = [1]
+        self.expand_shape = [100]
+        self.infer_expand_shape = [-1]
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+    def test_check_grad(self):
+        # No gradient check is performed for this case.
+        pass
+class TestExpandV2OpRank2_Corner_tensor_attr(
+        TestExpandV2OpXPURank1_tensor_attr):
+    """Rank-2 input (12, 14) with shape tensors [12, 14]: no expansion."""
+    def init_data(self):
+        self.ori_shape = [12, 14]
+        self.expand_times = [1, 1]
+        self.expand_shape = [12, 14]
+        self.infer_expand_shape = [12, -1]
+# Situation 3: shape is a tensor
+class TestExpandV2XPUOpRank1_tensor(XPUOpTest):
+    """expand_v2 XPU case where the whole shape is one int32 'Shape' tensor."""
+    def setUp(self):
+        self.set_xpu()
+        self.op_type = "expand_v2"
+        self.place = paddle.XPUPlace(0)
+        self.init_data()
+        self.dtype = np.float32
+        data = np.random.random(self.ori_shape).astype(self.dtype)
+        shape_tensor = np.array(self.expand_shape).astype("int32")
+        self.inputs = {
+            'X': data,
+            'Shape': shape_tensor,
+        }
+        # The shape is supplied only through the 'Shape' input.
+        self.attrs = {}
+        self.outputs = {'Out': np.tile(data, self.expand_times)}
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+    def init_data(self):
+        self.ori_shape = [100]
+        self.expand_times = [2, 1]
+        self.expand_shape = [2, 100]
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+    def test_check_grad(self):
+        # No gradient check is performed for this case.
+        pass
+# Situation 4: input x is int32
+# skip grad check for int32
+class TestExpandV2OpInteger(XPUOpTest):
+    """expand_v2 on XPU with an integer (2, 4, 20) input and identical shape.
+
+    setUp() calls init_type() to choose the input dtype (int32 here).
+    """
+    def init_type(self):
+        self.dtype = 'int32'
+    def setUp(self):
+        self.set_xpu()
+        self.init_type()
+        self.op_type = "expand_v2"
+        self.place = paddle.XPUPlace(0)
+        data = np.random.randint(10, size=(2, 4, 20)).astype(self.dtype)
+        self.inputs = {'X': data}
+        self.attrs = {'shape': [2, 4, 20]}
+        # The shape matches the input, so the expected output is the input.
+        self.outputs = {'Out': np.tile(data, (1, 1, 1))}
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+    def test_check_grad(self):
+        # No gradient check is performed for this case.
+        pass
+class TesstExpandV2OpInt64(TestExpandV2OpInteger):
+    """Runs the inherited expand_v2 case with int64 input."""
+    # The hook must be named init_type(): that is the method
+    # TestExpandV2OpInteger.setUp() calls. A method with any other name is
+    # never invoked and the test would keep running with int32.
+    def init_type(self):
+        self.dtype = 'int64'
+class TesstExpandV2OpBool(TestExpandV2OpInteger):
+    """Runs the inherited expand_v2 case with bool input."""
+    # The hook must be named init_type(): that is the method
+    # TestExpandV2OpInteger.setUp() calls. A method with any other name is
+    # never invoked and the test would keep running with int32.
+    def init_type(self):
+        self.dtype = 'bool'
+class TesstExpandV2OpFP16(TestExpandV2OpInteger):
+    """Runs the inherited expand_v2 case with float16 input."""
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+        # Same flag the expand_as_v2 float16 XPU test sets; presumably the
+        # test base class needs it for float16 cases that skip the gradient
+        # check -- TODO confirm against op_test.
+        self.__class__.no_need_check_grad = True
+    # The hook must be named init_type(): that is the method
+    # TestExpandV2OpInteger.setUp() calls. A method with any other name is
+    # never invoked and the test would keep running with int32.
+    def init_type(self):
+        self.dtype = 'float16'
+# Test python API
+class TestExpandV2API(unittest.TestCase):
+    """Runs paddle.expand through the static-graph Python API on XPU.
+
+    Covers three ways of giving the shape: a list of ints, a list mixing a
+    tensor with an int, and a single shape tensor fed at run time.
+    """
+    def test_static(self):
+        main_prog = fluid.Program()
+        startup_prog = fluid.Program()
+        with fluid.program_guard(main_prog, startup_prog):
+            x_np = np.random.random([12, 14]).astype("float32")
+            x = fluid.layers.data(
+                name='x',
+                shape=[12, 14],
+                append_batch_size=False,
+                dtype="float32")
+            positive_2 = fluid.layers.fill_constant([1], "int32", 12)
+            expand_shape = fluid.layers.data(
+                name="expand_shape",
+                shape=[2],
+                append_batch_size=False,
+                dtype="int32")
+            out_1 = paddle.expand(x, shape=[12, 14])
+            out_2 = paddle.expand(x, shape=[positive_2, 14])
+            out_3 = paddle.expand(x, shape=expand_shape)
+            # The returned gradient variables are not used, but the call is
+            # kept because it still runs against the program built here.
+            fluid.backward.calc_gradient(out_2, x)
+            exe = fluid.Executor(place=paddle.XPUPlace(0))
+            feed_dict = {
+                "x": x_np,
+                "expand_shape": np.array([12, 14]).astype("int32"),
+            }
+            res_1, res_2, res_3 = exe.run(fluid.default_main_program(),
+                                          feed=feed_dict,
+                                          fetch_list=[out_1, out_2, out_3])
+            # Every requested shape equals the input shape, so each result
+            # must match the input itself.
+            for fetched in (res_1, res_2, res_3):
+                assert np.array_equal(fetched, np.tile(x_np, (1, 1)))
+if __name__ == "__main__":
+    unittest.main()