Unverified commit a5aa4dc7, authored by T taixiurong, committed by GitHub

add xpu elementwise ops (#29031)

Parent e9acd9c9
......@@ -27,7 +27,7 @@ template <typename DeviceContext, typename T>
class ElementwiseAddXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
XPUElementwise<T, XPUAddFunctor<T>>(ctx);
XPUElementwise<T>(ctx, xpu::add<T>);
}
};
......@@ -36,161 +36,7 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
ElemwiseGradKernel<T>::Compute(ctx);
using Tensor = framework::Tensor;
auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto dx_dims = dout->dims();
auto dy_dims_untrimed = dout->dims();
T *dx_data = NULL;
T *dy_data = NULL;
int axis = ctx.Attr<int>("axis");
PADDLE_ENFORCE_GE(dx_dims.size(), dy_dims_untrimed.size(),
platform::errors::InvalidArgument(
"Rank of first input must >= rank of second input."));
if (dx != nullptr) {
dx->mutable_data<T>(ctx.GetPlace());
dx_dims = dx->dims();
dx_data = dx->data<T>();
}
if (dy != nullptr) {
dy->mutable_data<T>(ctx.GetPlace());
dy_dims_untrimed = dy->dims();
dy_data = dy->data<T>();
}
int pre, n, post, is_common_broadcast;
if (dx_dims == dy_dims_untrimed) {
pre = post = 1;
n = dout->numel();
} else {
axis = (axis == -1 ? dx_dims.size() - dy_dims_untrimed.size() : axis);
PADDLE_ENFORCE_EQ(axis >= 0 && axis < dx_dims.size(), true,
platform::errors::InvalidArgument(
"Axis should be in range [0, dx_dims)"));
auto dy_dims = trim_trailing_singular_dims(dy_dims_untrimed);
axis = (dy_dims.size() == 0) ? dx_dims.size() : axis;
get_mid_dims(dx_dims, dy_dims, axis, &pre, &n, &post,
&is_common_broadcast);
}
int len = pre * n * post;
auto &dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>();
if (post == 1) {
int r = xpu::matrix_vector_add_grad(
dev_ctx.x_context(), dout->data<T>(), dout->data<T>(),
dout->data<T>(), dout->data<T>(), dx_data, dy_data, pre, n);
if (r == xpu::Error_t::INVALID_PARAM) {
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::InvalidArgument(
"XPU kernel error of ElementWiseAddOp, error "
"message: INVALID_PARAM, "
"please check your input & output."));
} else if (r == xpu::Error_t::RUNTIME_ERROR) {
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::Unavailable(
"XPU kernel error of ElementWiseAddOp, error "
"message: RUNTIME_ERROR, "
"please check whether Baidu Kunlun card is "
"properly installed."));
} else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) {
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::ResourceExhausted(
"XPU kernel error of ElementWiseAddOp, error message: "
"NO_ENOUGH_WORKSPACE, XPU has no enough memory."));
}
return;
}
if (dx == nullptr) {
PADDLE_ENFORCE_EQ(
xpu_malloc(reinterpret_cast<void **>(&dx_data), len * sizeof(float)),
XPU_SUCCESS,
platform::errors::ResourceExhausted("XPU has no enough memory"));
}
if (dy == nullptr) {
PADDLE_ENFORCE_EQ(
xpu_malloc(reinterpret_cast<void **>(&dy_data), len * sizeof(float)),
XPU_SUCCESS,
platform::errors::ResourceExhausted("XPU has no enough memory"));
} else {
if (len != n) {
PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void **>(&dy_data),
len * sizeof(float)),
XPU_SUCCESS, platform::errors::ResourceExhausted(
"XPU has no enough memory"));
}
}
int r = xpu::elementwise_add_grad(
dev_ctx.x_context(), dout->data<T>() /*x*/, dout->data<T>() /*y*/,
dout->data<T>() /*out*/, dout->data<T>(), dx_data, dy_data, len);
if (r == xpu::Error_t::INVALID_PARAM) {
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::InvalidArgument(
"XPU kernel error of ElementWiseAddOp, error "
"message: INVALID_PARAM, "
"please check your input & output."));
} else if (r == xpu::Error_t::RUNTIME_ERROR) {
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::Unavailable(
"XPU kernel error of ElementWiseAddOp, error message: "
"RUNTIME_ERROR, "
"please check whether Baidu Kunlun card is properly installed."));
} else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) {
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::ResourceExhausted(
"XPU kernel error of ElementWiseAddOp, error message: "
"NO_ENOUGH_WORKSPACE, XPU has no enough memory."));
}
if ((dy != nullptr) && (len != n)) {
r = xpu::reduce_ew(dev_ctx.x_context(), dy_data, dy->data<T>(), pre, n,
post, xpu::ElementwiseOp::ASSIGN);
if (r == xpu::Error_t::INVALID_PARAM) {
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::InvalidArgument(
"XPU kernel error of ElementWiseAddOp, error "
"message: INVALID_PARAM, "
"please check your input & output."));
} else if (r == xpu::Error_t::RUNTIME_ERROR) {
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::Unavailable(
"XPU kernel error of ElementWiseAddOp, error "
"message: RUNTIME_ERROR, "
"please check whether Baidu Kunlun card is "
"properly installed."));
} else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) {
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::ResourceExhausted(
"XPU kernel error of ElementWiseAddOp, error message: "
"NO_ENOUGH_WORKSPACE, XPU has no enough memory."));
}
dev_ctx.Wait();
xpu_free(dy_data);
}
if ((dx == nullptr || dy == nullptr) && !(dy != nullptr && len != n)) {
dev_ctx.Wait();
}
if (dx == nullptr) {
xpu_free(dx_data);
}
if (dy == nullptr) {
xpu_free(dy_data);
}
XPUElementwiseGrad<T>(ctx, xpu::add_grad<T>, false);
}
};
......
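For reference, the hand-written gradient removed above decomposed the broadcast into a (pre, n, post) triple via get_mid_dims and reduced dY over the pre and post dimensions. A minimal NumPy sketch of that reduction (illustration only, not the XPU API; the helper name and shapes below are made up):

```python
import numpy as np

# Broadcast add-grad reference: dX is simply dOut, while dY is dOut summed over
# the broadcast dimensions described by (pre, n, post).
def add_grad_reference(dout, y_shape, pre, n, post):
    dx = dout.copy()                                   # same shape as X / Out
    dy = dout.reshape(pre, n, post).sum(axis=(0, 2)).reshape(y_shape)
    return dx, dy

dout = np.ones((2, 100, 3), dtype=np.float32)          # X/Out shape (2, 100, 3), Y shape (100,)
dx, dy = add_grad_reference(dout, (100,), pre=2, n=100, post=3)
assert dx.shape == (2, 100, 3) and dy.shape == (100,)
assert np.allclose(dy, 6.0)                            # each Y element accumulates pre * post = 6 terms
```

The replacement call XPUElementwiseGrad<T>(ctx, xpu::add_grad<T>, false) moves this bookkeeping out of the kernel body and into a shared helper.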
......@@ -19,18 +19,19 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
struct XPUDivFunctor {
int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) {
return xpu::elementwise_div(ctx, x, y, z, len);
template <typename DeviceContext, typename T>
class ElementwiseDivXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
XPUElementwise<T>(ctx, xpu::div<T>);
}
};
template <typename DeviceContext, typename T>
class ElementwiseDivXPUKernel : public framework::OpKernel<T> {
class ElementwiseDivGradXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
XPUElementwise<T, XPUDivFunctor<T>>(ctx);
XPUElementwiseGrad<T>(ctx, xpu::div_grad<T>, true);
}
};
......@@ -40,4 +41,7 @@ namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
elementwise_div,
ops::ElementwiseDivXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(elementwise_div_grad,
ops::ElementwiseDivGradXPUKernel<
paddle::platform::XPUDeviceContext, float>);
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwiseFloordivXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
XPUElementwise<T>(ctx, xpu::floordiv<T>);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(elementwise_floordiv,
ops::ElementwiseFloordivXPUKernel<
paddle::platform::XPUDeviceContext, float>);
#endif
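The floordiv kernel is registered for float only. As a quick NumPy reference (matching the np.floor_divide expectations in the Python test later in this diff), float floor division rounds the quotient toward negative infinity:

```python
import numpy as np

x = np.array([7.0, -7.0, 7.5], dtype=np.float32)
y = np.array([2.0,  2.0, 2.5], dtype=np.float32)
print(np.floor_divide(x, y))   # [ 3. -4.  3.]  i.e. floor(x / y), still float32
```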
......@@ -20,18 +20,19 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
struct XPUMaxFunctor {
int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) {
return xpu::elementwise_max(ctx, x, y, z, len);
template <typename DeviceContext, typename T>
class ElementwiseMaxXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
XPUElementwise<T>(ctx, xpu::max<T>);
}
};
template <typename DeviceContext, typename T>
class ElementwiseMaxXPUKernel : public framework::OpKernel<T> {
class ElementwiseMaxGradXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
XPUElementwise<T, XPUMaxFunctor<T>>(ctx);
XPUElementwiseGrad<T>(ctx, xpu::max_grad<T>, true);
}
};
......@@ -42,4 +43,7 @@ namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
elementwise_max,
ops::ElementwiseMaxXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(elementwise_max_grad,
ops::ElementwiseMaxGradXPUKernel<
paddle::platform::XPUDeviceContext, float>);
#endif
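A NumPy sketch of the gradient routing that xpu::max_grad is assumed to perform, consistent with Paddle's CPU reference kernels (dX = dOut * (X > Y), dY = dOut * (X <= Y)); this is also why the Python tests later in this diff keep X and Y apart, since max() is not differentiable where the operands are equal:

```python
import numpy as np

# Assumed max-grad semantics: the upstream gradient flows to whichever operand
# produced the maximum (ties go to Y, matching the CPU functors).
def max_grad_ref(x, y, dout):
    dx = dout * (x > y)
    dy = dout * (x <= y)
    return dx, dy

x = np.array([1.0, 5.0, 3.0], dtype=np.float32)
y = np.array([2.0, 4.0, 3.0], dtype=np.float32)
dx, dy = max_grad_ref(x, y, np.ones_like(x))
print(dx, dy)                  # [0. 1. 0.] [1. 0. 1.]
```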
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/elementwise/elementwise_max_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwiseMinXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
XPUElementwise<T>(ctx, xpu::min<T>);
}
};
template <typename DeviceContext, typename T>
class ElementwiseMinGradXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
XPUElementwiseGrad<T>(ctx, xpu::min_grad<T>, true);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
elementwise_min,
ops::ElementwiseMinXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(elementwise_min_grad,
ops::ElementwiseMinGradXPUKernel<
paddle::platform::XPUDeviceContext, float>);
#endif
......@@ -22,10 +22,18 @@ template <typename DeviceContext, typename T>
class ElementwiseMulXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
XPUElementwise<T, XPUMulFunctor<T>>(ctx);
XPUElementwise<T>(ctx, xpu::mul<T>);
}
};
DEFINE_XPU_GRAD_KERNEL(Mul, mul, true);
// DEFINE_XPU_GRAD_KERNEL(Mul, mul, true);
template <typename DeviceContext, typename T>
class ElementwiseMulGradXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
XPUElementwiseGrad<T>(ctx, xpu::mul_grad<T>, true);
}
};
} // namespace operators
} // namespace paddle
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
#include "xpu/refactor/math.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwisePowXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
XPUElementwise<T>(ctx, xpu::pow<float>);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
elementwise_pow,
ops::ElementwisePowXPUKernel<paddle::platform::XPUDeviceContext, float>);
#endif
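A one-line NumPy equivalent of the elementwise pow computation, for reference (note the kernel instantiates xpu::pow<float> and is registered for float only):

```python
import numpy as np

x = np.array([2.0, 3.0, 4.0], dtype=np.float32)
y = np.array([2.0, 0.5, 3.0], dtype=np.float32)
print(np.power(x, y))          # [ 4.        1.7320508 64.       ]
```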
......@@ -16,25 +16,28 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
#include "xpu/refactor/math.h"
namespace paddle {
namespace operators {
template <typename T>
struct XPUSubFunctor {
int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) {
return xpu::elementwise_sub(ctx, x, y, z, len);
template <typename DeviceContext, typename T>
class ElementwiseSubXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
XPUElementwise<T>(ctx, xpu::sub<float>);
}
};
template <typename DeviceContext, typename T>
class ElementwiseSubXPUKernel : public framework::OpKernel<T> {
class ElementwiseSubGradXPUKernel : public ElemwiseGradKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
XPUElementwise<T, XPUSubFunctor<T>>(ctx);
ElemwiseGradKernel<T>::Compute(ctx);
XPUElementwiseGrad<T>(ctx, xpu::sub_grad<float>, false);
}
};
DEFINE_XPU_GRAD_KERNEL(Sub, sub, false);
} // namespace operators
} // namespace paddle
......
......@@ -19,6 +19,9 @@ limitations under the License. */
#include <unordered_map>
#include <vector>
#include "xpu/refactor/math.h"
#include "xpu/refactor/nn.h"
namespace paddle {
namespace operators {
......@@ -41,11 +44,13 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel<T> {
loss->mutable_data<T>(context.GetPlace());
const int n = SizeToAxis(axis, logits->dims());
const int d = SizeFromAxis(axis, logits->dims());
std::vector<int> logits_dims = framework::vectorize<int>(logits->dims());
// softmax
auto& dev_ctx =
context.template device_context<platform::XPUDeviceContext>();
int r = xpu::softmax2d_forward(dev_ctx.x_context(), logits->data<float>(),
softmax->data<float>(), n, d);
int r = xpu::softmax(dev_ctx.x_context(), logits->data<float>(),
softmax->data<float>(), logits_dims, axis);
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU kernel error. Softmax2d_forward "
......@@ -55,44 +60,35 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel<T> {
auto ignore_index = context.Attr<int>("ignore_index");
const bool soft_label = context.Attr<bool>("soft_label");
if (soft_label) {
PADDLE_THROW(platform::errors::InvalidArgument(
"XPU only support soft_label == false for now!"));
r = xpu::soft_cross_entropy<float>(
dev_ctx.x_context(), softmax->data<float>(), labels->data<float>(),
loss->data<float>(), n, d);
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU kernel error. soft_cross_entropy "
"execution not succeed, error code=%d",
r));
} else {
auto* p_labels = labels->data<int64_t>();
int64_t* labels_int64_host =
reinterpret_cast<int64_t*>(std::malloc(n * sizeof(int64_t)));
int* labels_int32_host =
reinterpret_cast<int*>(std::malloc(n * sizeof(int)));
int* labels_int32_device = NULL;
int ret = xpu_malloc(reinterpret_cast<void**>(&labels_int32_device),
n * sizeof(int));
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check "
"where Baidu Kunlun Card is properly installed.",
ret));
dev_ctx.Wait();
memory::Copy(platform::CPUPlace(), labels_int64_host,
BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()),
p_labels, n * sizeof(int64_t));
for (int i = 0; i < n; ++i) {
labels_int32_host[i] = labels_int64_host[i];
}
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()),
labels_int32_device, platform::CPUPlace(), labels_int32_host,
n * sizeof(int));
int r = xpu::cross_entropy_forward(
dev_ctx.x_context(), n, d, softmax->data<float>(),
labels_int32_device, loss->data<float>(), nullptr, ignore_index);
Tensor labels_int32;
labels_int32.mutable_data<int32_t>(context.GetPlace(), labels->numel());
r = xpu::cast_v2<int64_t, int32_t>(
dev_ctx.x_context(), labels->data<int64_t>(),
labels_int32.data<int32_t>(), labels->numel());
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU kernel error. cast_v2 "
"execution not succeed, error code=%d",
r));
r = xpu::hard_cross_entropy<float, int32_t>(
dev_ctx.x_context(), softmax->data<float>(),
labels_int32.data<int32_t>(), loss->data<float>(), nullptr, n, d,
ignore_index);
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU kernel error. Cross_entropy_forward "
platform::errors::External("XPU kernel error. hard_cross_entropy "
"execution not succeed, error code=%d",
r));
dev_ctx.Wait();
std::free(labels_int32_host);
std::free(labels_int64_host);
xpu_free(labels_int32_device);
}
}
};
......
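The hard-label branch now casts the int64 labels to int32 on-device (cast_v2) and calls hard_cross_entropy on the softmax output; the new soft-label branch is the usual -sum(label * log(softmax)) per row. A NumPy reference for what the hard-label call is assumed to compute, with n rows, d classes and ignore_index as in the surrounding code:

```python
import numpy as np

# Assumed hard-label cross entropy: loss_i = -log(softmax[i, label_i]),
# with rows whose label equals ignore_index contributing zero.
def hard_cross_entropy_ref(softmax, labels, ignore_index=-100):
    n = softmax.shape[0]
    keep = labels != ignore_index
    safe = np.where(keep, labels, 0)                  # never index with ignore_index
    loss = -np.log(softmax[np.arange(n), safe]) * keep
    return loss.astype(np.float32).reshape(n, 1)

probs = np.full((4, 3), 1.0 / 3, dtype=np.float32)    # uniform softmax output
labels = np.array([0, 1, 2, 1], dtype=np.int64)
print(hard_cross_entropy_ref(probs, labels))           # every row ~ log(3) ~ 1.0986
```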
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.fluid as fluid
paddle.enable_static()
class TestXPUElementwiseOpBase(object):
def setUp(self, op_type):
self.op_type = op_type
self.attrs = {'use_xpu': True}
self.is_common_broadcast = False
self.is_x_size_less_than_y = False
self.grad_implemented = False
self.y_grad_implemented = True
self.dtype = np.float32
self.__class__.op_type = self.op_type
self.__class__.use_xpu = True
self.__class__.dtype = self.dtype
def net(self, place):
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.layers.data(
name='X', shape=self.inputs['X'].shape, dtype=self.dtype)
y = fluid.layers.data(
name='Y', shape=self.inputs['Y'].shape, dtype=self.dtype)
op = getattr(fluid.layers, self.op_type)
z = op(x, y)
exe = fluid.Executor(place)
z_value = exe.run(feed=self.inputs, fetch_list=[z.name])
def test_check_output(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
if not self.is_common_broadcast and not self.is_x_size_less_than_y:
self.check_output_with_place(place, atol=1e-3)
else:
with self.assertRaises(BaseException):
self.net(place)
def _check_grad_xpu_helper(self,
inputs_to_check,
output_names,
no_grad_set=None,
max_relative_error=0.01):
if self.grad_implemented and not self.is_common_broadcast \
and not self.is_x_size_less_than_y:
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place,
inputs_to_check,
output_names,
no_grad_set=no_grad_set,
max_relative_error=max_relative_error)
def test_check_grad_normal(self):
self._check_grad_xpu_helper(['X', 'Y'], 'Out')
def test_check_grad_ingore_x(self):
self._check_grad_xpu_helper(['Y'], 'Out', set("X"))
def test_check_grad_ingore_y(self):
if self.y_grad_implemented:
self._check_grad_xpu_helper(['X'], 'Out', set("Y"))
def init_axis(self):
self.axis = -1
def make_input(self, x_shape=[13, 17], y_shape=[13, 17]):
self.inputs = {
'X': np.random.uniform(0.1, 1, x_shape).astype(self.dtype),
'Y': np.random.uniform(0.1, 1, y_shape).astype(self.dtype)
}
def reshape_input(self, x_shape=None, y_shape=None):
if x_shape is None:
x = self.inputs['X']
else:
x = self.inputs['X'].reshape(x_shape)
if y_shape is None:
y = self.inputs['Y']
else:
y = self.inputs['Y'].reshape(y_shape)
return x, y
def make_output(self, x_shape=None, y_shape=None):
pass
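Most of the broadcast tests below drive the elementwise `axis` attribute. A small NumPy sketch of the alignment rule that the reshape_input/make_output helpers above mirror: Y's dims line up with X's dims starting at `axis`, and axis == -1 means align to the trailing dims (the helper name below is illustrative only):

```python
import numpy as np

def broadcast_y_like(x, y, axis=-1):
    # Reshape Y so its dims sit at positions [axis, axis + y.ndim) of X's shape.
    axis = x.ndim - y.ndim if axis == -1 else axis
    shape = [1] * axis + list(y.shape) + [1] * (x.ndim - axis - y.ndim)
    return y.reshape(shape)

x = np.random.rand(2, 3, 4, 5).astype(np.float32)
y = np.random.rand(3, 4).astype(np.float32)
out = x + broadcast_y_like(x, y, axis=1)   # Y is viewed as (1, 3, 4, 1)
assert out.shape == (2, 3, 4, 5)
```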
......@@ -13,18 +13,21 @@
# limitations under the License.
from __future__ import print_function
import numpy as np
import sys
sys.path.append("..")
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
from op_test import OpTest, skip_check_grad_ci
from op_test_xpu import XPUOpTest
import unittest
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
paddle.enable_static()
class TestElementwiseAddOp(OpTest):
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOp(XPUOpTest):
def init_kernel_type(self):
self.use_mkldnn = False
......@@ -34,6 +37,7 @@ class TestElementwiseAddOp(OpTest):
self.init_input_output()
self.init_kernel_type()
self.init_axis()
self.use_xpu = True
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
......@@ -43,80 +47,33 @@ class TestElementwiseAddOp(OpTest):
self.outputs = {'Out': self.out}
def test_check_output(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
self.check_output(check_dygraph=(self.use_mkldnn == False))
def test_check_grad_normal(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
if self.dtype == np.float16:
return
self.check_grad(
['X', 'Y'], 'Out', check_dygraph=(self.use_mkldnn == False))
def test_check_grad_ingore_x(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
if self.dtype == np.float16:
return
self.check_grad(
['Y'],
'Out',
no_grad_set=set("X"),
check_dygraph=(self.use_mkldnn == False))
def test_check_grad_ingore_y(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
if self.dtype == np.float16:
return
self.check_grad(
['X'],
'Out',
no_grad_set=set('Y'),
check_dygraph=(self.use_mkldnn == False))
def init_input_output(self):
self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.out = np.add(self.x, self.y)
def init_dtype(self):
self.dtype = np.float64
def init_axis(self):
self.axis = -1
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUElementwiseAddOp(OpTest):
def setUp(self):
self.op_type = "elementwise_add"
self.init_dtype()
self.init_input_output()
self.init_axis()
self.inputs = {'X': self.x, 'Y': self.y}
self.attrs = {'axis': self.axis, 'use_mkldnn': False, 'use_xpu': True}
self.outputs = {'Out': self.out}
def test_check_output(self):
if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
def test_check_grad_normal(self):
if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(place, ['X', 'Y'], 'Out')
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.006)
def test_check_grad_ingore_x(self):
if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(place, ['Y'], 'Out')
self.check_grad_with_place(
place, ['Y'],
'Out',
no_grad_set=set("X"),
max_relative_error=0.006)
def test_check_grad_ingore_y(self):
if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(place, ['X'], 'Out')
self.check_grad_with_place(
place, ['X'],
'Out',
no_grad_set=set("Y"),
max_relative_error=0.006)
def init_input_output(self):
self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
......@@ -130,6 +87,8 @@ class TestXPUElementwiseAddOp(OpTest):
self.axis = -1
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
@skip_check_grad_ci(
reason="[skip shape check] Use y_shape(1) to test broadcast.")
class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
......@@ -139,6 +98,8 @@ class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
self.out = self.x + self.y
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
@skip_check_grad_ci(
reason="[skip shape check] Use y_shape(1,1) to test broadcast.")
class TestElementwiseAddOp_scalar2(TestElementwiseAddOp):
......@@ -148,6 +109,8 @@ class TestElementwiseAddOp_scalar2(TestElementwiseAddOp):
self.out = self.x + self.y
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOp_Vector(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.random((100, )).astype(self.dtype)
......@@ -155,6 +118,8 @@ class TestElementwiseAddOp_Vector(TestElementwiseAddOp):
self.out = np.add(self.x, self.y)
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(100, 2, 3).astype(self.dtype)
......@@ -165,6 +130,8 @@ class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp):
self.axis = 0
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 100, 3).astype(self.dtype)
......@@ -175,6 +142,8 @@ class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp):
self.axis = 1
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 100).astype(self.dtype)
......@@ -182,6 +151,8 @@ class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp):
self.out = self.x + self.y.reshape(1, 1, 100)
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
......@@ -192,6 +163,8 @@ class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp):
self.axis = 1
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(100, 2, 3, 4).astype(self.dtype)
......@@ -202,6 +175,8 @@ class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp):
self.axis = 0
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(10, 3, 12).astype(self.dtype)
......@@ -209,6 +184,8 @@ class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp):
self.out = self.x + self.y
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype)
......@@ -216,6 +193,8 @@ class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp):
self.out = self.x + self.y
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype)
......@@ -223,6 +202,8 @@ class TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp):
self.out = self.x + self.y
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 10, 12).astype(self.dtype)
......@@ -233,6 +214,8 @@ class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp):
self.axis = 1
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
@skip_check_grad_ci(
reason="[skip shape check] Use y_shape(1) to test broadcast.")
class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp):
......@@ -245,6 +228,8 @@ class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp):
self.axis = 1
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(100, 2, 3).astype(self.dtype)
......@@ -255,6 +240,8 @@ class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
self.axis = -1
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 100).astype(self.dtype)
......@@ -265,6 +252,8 @@ class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp):
self.axis = -1
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype)
......@@ -275,6 +264,8 @@ class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp):
self.axis = -1
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(10, 12).astype(self.dtype)
......@@ -285,14 +276,16 @@ class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp):
self.axis = 2
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseAddOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# the input of elementwise_add must be Variable.
x1 = fluid.create_lod_tensor(
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.XPUPlace(0))
y1 = fluid.create_lod_tensor(
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.XPUPlace(0))
self.assertRaises(TypeError, fluid.layers.elementwise_add, x1, y1)
# the input dtype of elementwise_add must be float16 or float32 or float64 or int32 or int64
......@@ -302,6 +295,8 @@ class TestElementwiseAddOpError(unittest.TestCase):
self.assertRaises(TypeError, fluid.layers.elementwise_add, x2, y2)
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestAddOp(unittest.TestCase):
def test_name(self):
with fluid.program_guard(fluid.Program()):
......@@ -324,7 +319,7 @@ class TestAddOp(unittest.TestCase):
y = fluid.data(name="y", shape=[3], dtype='float32')
z = paddle.add(x, y)
place = fluid.CPUPlace()
place = fluid.XPUPlace(0)
exe = fluid.Executor(place)
z_value = exe.run(feed=gen_data(), fetch_list=[z.name])
z_expected = np.array([3., 8., 6.])
......@@ -332,8 +327,8 @@ class TestAddOp(unittest.TestCase):
def test_dygraph(self):
with fluid.dygraph.guard():
np_x = np.array([2, 3, 4]).astype('float64')
np_y = np.array([1, 5, 2]).astype('float64')
np_x = np.array([2, 3, 4]).astype('float32')
np_y = np.array([1, 5, 2]).astype('float32')
x = fluid.dygraph.to_variable(np_x)
y = fluid.dygraph.to_variable(np_y)
z = paddle.add(x, y)
......
......@@ -17,121 +17,233 @@ import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from op_test import OpTest, skip_check_grad_ci
from elementwise import TestXPUElementwiseOpBase
from op_test_xpu import XPUOpTest
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUElementwiseDivOp(OpTest, TestXPUElementwiseOpBase):
class ElementwiseDivOp(XPUOpTest):
def setUp(self):
TestXPUElementwiseOpBase.setUp(self, "elementwise_div")
self.make_input()
self.make_output()
def make_output(self, x_shape=None, y_shape=None):
x, y = self.reshape_input(x_shape, y_shape)
self.outputs = {'Out': np.divide(x, y)}
self.op_type = "elementwise_div"
self.dtype = np.float32
self.init_dtype()
self.use_xpu = True
""" Warning
CPU gradient check error!
'X': np.random.random((32,84)).astype("float32"),
'Y': np.random.random((32,84)).astype("float32")
"""
self.inputs = {
'X': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype),
'Y': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
}
self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
def test_check_output(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
def test_check_grad_normal(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.05)
def test_check_grad_ingore_x(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'],
'Out',
max_relative_error=0.05,
no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X'],
'Out',
max_relative_error=0.05,
no_grad_set=set('Y'))
def init_dtype(self):
pass
@skip_check_grad_ci(
reason="[skip shape check] Use y_shape(1) to test broadcast.")
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseDivOp_scalar(ElementwiseDivOp):
def setUp(self):
self.op_type = "elementwise_div"
self.inputs = {
'X': np.random.uniform(0.1, 1, [20, 3, 4]).astype(np.float32),
'Y': np.random.uniform(0.1, 1, [1]).astype(np.float32)
}
self.outputs = {'Out': self.inputs['X'] / self.inputs['Y']}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseDivOp_scalar(TestXPUElementwiseDivOp):
class TestElementwiseDivOp_Vector(ElementwiseDivOp):
def setUp(self):
super(TestElementwiseDivOp_scalar, self).setUp()
self.grad_implemented = False
self.make_input([20, 3, 4], [1])
self.make_output()
self.op_type = "elementwise_div"
self.inputs = {
'X': np.random.uniform(0.1, 1, [100]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [100]).astype("float32")
}
self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseDivOp_Vector(TestXPUElementwiseDivOp):
class TestElementwiseDivOp_broadcast_0(ElementwiseDivOp):
def setUp(self):
super(TestElementwiseDivOp_Vector, self).setUp()
self.make_input([100, ], [100, ])
self.make_output()
self.op_type = "elementwise_div"
self.inputs = {
'X': np.random.uniform(0.1, 1, [100, 3, 4]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [100]).astype("float32")
}
self.attrs = {'axis': 0}
self.outputs = {
'Out':
np.divide(self.inputs['X'], self.inputs['Y'].reshape(100, 1, 1))
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseDivOp_broadcast_0(TestXPUElementwiseDivOp):
class TestElementwiseDivOp_broadcast_1(ElementwiseDivOp):
def setUp(self):
super(TestElementwiseDivOp_broadcast_0, self).setUp()
self.attrs['axis'] = 0
self.make_input([100, 3, 4], [100, ])
self.make_output(y_shape=[100, 1, 1])
self.op_type = "elementwise_div"
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 100, 4]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [100]).astype("float32")
}
self.attrs = {'axis': 1}
self.outputs = {
'Out':
np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 100, 1))
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseDivOp_broadcast_1(TestXPUElementwiseDivOp):
class TestElementwiseDivOp_broadcast_2(ElementwiseDivOp):
def setUp(self):
super(TestElementwiseDivOp_broadcast_1, self).setUp()
self.attrs['axis'] = 1
self.make_input([2, 100, 4], [100, ])
self.make_output(y_shape=[1, 100, 1])
self.op_type = "elementwise_div"
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 3, 100]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [100]).astype("float32")
}
self.outputs = {
'Out':
np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 100))
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseDivOp_broadcast_2(TestXPUElementwiseDivOp):
class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp):
def setUp(self):
super(TestElementwiseDivOp_broadcast_2, self).setUp()
self.make_input([2, 3, 100], [100, ])
self.make_output(y_shape=[1, 1, 100])
self.op_type = "elementwise_div"
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 10, 12, 5]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [10, 12]).astype("float32")
}
self.attrs = {'axis': 1}
self.outputs = {
'Out':
np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 10, 12, 1))
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseDivOp_broadcast_3(TestXPUElementwiseDivOp):
class TestElementwiseDivOp_broadcast_4(ElementwiseDivOp):
def setUp(self):
super(TestElementwiseDivOp_broadcast_3, self).setUp()
self.attrs['axis'] = 1
self.make_input([2, 10, 12, 5], [10, 12])
self.make_output(y_shape=[1, 10, 12, 1])
self.op_type = "elementwise_div"
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 3, 50]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [2, 1, 50]).astype("float32")
}
self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseDivOp_broadcast_4(TestXPUElementwiseDivOp):
class TestElementwiseDivOp_broadcast_5(ElementwiseDivOp):
def setUp(self):
super(TestElementwiseDivOp_broadcast_4, self).setUp()
self.is_common_broadcast = True
self.make_input([2, 3, 50], [2, 1, 50])
self.make_output()
self.op_type = "elementwise_div"
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 3, 4, 20]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [2, 3, 1, 20]).astype("float32")
}
self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseDivOp_broadcast_5(TestXPUElementwiseDivOp):
class TestElementwiseDivOp_commonuse_1(ElementwiseDivOp):
def setUp(self):
super(TestElementwiseDivOp_broadcast_5, self).setUp()
self.is_common_broadcast = True
self.make_input([2, 3, 4, 20], [2, 3, 1, 20])
self.make_output()
self.op_type = "elementwise_div"
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 3, 100]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [1, 1, 100]).astype("float32"),
}
self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseDivOp_commonuse_1(TestXPUElementwiseDivOp):
class TestElementwiseDivOp_commonuse_2(ElementwiseDivOp):
def setUp(self):
super(TestElementwiseDivOp_commonuse_1, self).setUp()
self.is_common_broadcast = True
self.make_input([2, 3, 100], [1, 1, 100])
self.make_output()
self.op_type = "elementwise_div"
self.inputs = {
'X': np.random.uniform(0.1, 1, [30, 3, 1, 5]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [30, 1, 4, 1]).astype("float32"),
}
self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseDivOp_xsize_lessthan_ysize(TestXPUElementwiseDivOp):
class TestElementwiseDivOp_xsize_lessthan_ysize(ElementwiseDivOp):
def setUp(self):
super(TestElementwiseDivOp_xsize_lessthan_ysize, self).setUp()
self.is_x_size_less_than_y = True
self.attrs['axis'] = 2
self.make_input([10, 12], [2, 3, 10, 12])
self.make_output(x_shape=[1, 1, 10, 12])
self.op_type = "elementwise_div"
self.inputs = {
'X': np.random.uniform(0.1, 1, [10, 12]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [2, 3, 10, 12]).astype("float32"),
}
self.attrs = {'axis': 2}
self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseDivBroadcast(unittest.TestCase):
def test_shape_with_batch_sizes(self):
with fluid.program_guard(fluid.Program()):
x_var = fluid.data(
name='x', dtype='float32', shape=[None, 3, None, None])
one = 2.
out = one / x_var
exe = fluid.Executor(fluid.XPUPlace(0))
x = np.random.uniform(0.1, 0.6, (1, 3, 32, 32)).astype("float32")
out_result, = exe.run(feed={'x': x}, fetch_list=[out])
self.assertEqual((out_result == (2 / x)).all(), True)
if __name__ == '__main__':
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("..")
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from op_test import OpTest, skip_check_grad_ci
from op_test_xpu import XPUOpTest
paddle.enable_static()
import random
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseModOp(XPUOpTest):
def init_kernel_type(self):
self.use_mkldnn = False
def setUp(self):
self.op_type = "elementwise_floordiv"
self.dtype = np.float32
self.axis = -1
self.init_dtype()
self.init_input_output()
self.init_kernel_type()
self.init_axis()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
}
self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
self.outputs = {'Out': self.out}
def test_check_output(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
def init_input_output(self):
self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype)
self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
self.out = np.floor_divide(self.x, self.y)
def init_dtype(self):
pass
def init_axis(self):
pass
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseModOp_scalar(TestElementwiseModOp):
def init_input_output(self):
scale_x = random.randint(0, 100000000)
scale_y = random.randint(1, 100000000)
self.x = (np.random.rand(2, 3, 4) * scale_x).astype(self.dtype)
self.y = (np.random.rand(1) * scale_y + 1).astype(self.dtype)
self.out = np.floor_divide(self.x, self.y)
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseModOpInverse(TestElementwiseModOp):
def init_input_output(self):
self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype)
self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
self.out = np.floor_divide(self.x, self.y)
if __name__ == '__main__':
unittest.main()
......@@ -16,113 +16,163 @@ sys.path.append("..")
import unittest
import numpy as np
from op_test import OpTest, skip_check_grad_ci
from op_test_xpu import XPUOpTest
import paddle
from elementwise import TestXPUElementwiseOpBase
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUElementwiseOp(OpTest, TestXPUElementwiseOpBase):
class TestElementwiseOp(XPUOpTest):
def setUp(self):
TestXPUElementwiseOpBase.setUp(self, "elementwise_max")
self.make_input()
self.make_output()
def make_input(self, x_shape=[13, 17], y_shape=[13, 17], idx_list=None):
x = np.random.random(x_shape).astype(self.dtype)
sgn = np.random.choice([-1, 1], y_shape).astype(self.dtype)
if idx_list is None:
y = x + sgn * np.random.uniform(0.1, 1, y_shape).astype(self.dtype)
else:
x_temp = x
for idx in idx_list:
x_temp = np.take(x_temp, [0], axis=idx)
sgn = sgn.reshape(x_temp.shape)
y = x_temp + sgn * np.random.uniform(0.1, 1, x_temp.shape)
y = y.reshape(y_shape).astype(self.dtype)
self.use_xpu = True
self.op_type = "elementwise_max"
# If x and y have the same value, the max() is not differentiable.
# So we generate test data by the following method
# to avoid them being too close to each other.
x = np.random.uniform(0.1, 1, [13, 17]).astype("float32")
sgn = np.random.choice([-1, 1], [13, 17]).astype("float32")
y = x + sgn * np.random.uniform(0.1, 1, [13, 17]).astype("float32")
self.inputs = {'X': x, 'Y': y}
def make_output(self, x_shape=None, y_shape=None):
x, y = self.reshape_input(x_shape, y_shape)
self.outputs = {'Out': np.maximum(x, y)}
self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
def test_check_output(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
def test_check_grad_normal(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(place, ['X', 'Y'], 'Out')
def test_check_grad_ingore_x(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'],
'Out',
max_relative_error=0.006,
no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X'],
'Out',
max_relative_error=0.006,
no_grad_set=set('Y'))
@skip_check_grad_ci(
reason="[skip shape check] Use y_shape(1) to test broadcast.")
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMaxOp_scalar(TestXPUElementwiseOp):
class TestElementwiseMaxOp_scalar(TestElementwiseOp):
def setUp(self):
super(TestElementwiseMaxOp_scalar, self).setUp()
self.make_input([2, 3, 20], [1])
self.make_output()
self.grad_implemented = False
self.op_type = "elementwise_max"
x = np.random.random_integers(-5, 5, [2, 3, 20]).astype("float32")
y = np.array([0.5]).astype("float32")
self.inputs = {'X': x, 'Y': y}
self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMaxOp_Vector(TestXPUElementwiseOp):
class TestElementwiseMaxOp_Vector(TestElementwiseOp):
def setUp(self):
super(TestElementwiseMaxOp_Vector, self).setUp()
self.make_input([100, ], [100, ])
self.make_output()
self.op_type = "elementwise_max"
x = np.random.random((100, )).astype("float32")
sgn = np.random.choice([-1, 1], (100, )).astype("float32")
y = x + sgn * np.random.uniform(0.1, 1, (100, )).astype("float32")
self.inputs = {'X': x, 'Y': y}
self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMaxOp_broadcast_0(TestXPUElementwiseOp):
class TestElementwiseMaxOp_broadcast_0(TestElementwiseOp):
def setUp(self):
super(TestElementwiseMaxOp_broadcast_0, self).setUp()
self.attrs['axis'] = 0
self.make_input([100, 5, 2], [100, ], [1, 2])
self.make_output(y_shape=[100, 1, 1])
self.op_type = "elementwise_max"
x = np.random.uniform(0.5, 1, (100, 5, 2)).astype(np.float32)
sgn = np.random.choice([-1, 1], (100, )).astype(np.float32)
y = x[:, 0, 0] + sgn * \
np.random.uniform(1, 2, (100, )).astype(np.float32)
self.inputs = {'X': x, 'Y': y}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMaxOp_broadcast_1(TestXPUElementwiseOp):
def setUp(self):
super(TestElementwiseMaxOp_broadcast_1, self).setUp()
self.attrs['axis'] = 1
self.make_input([2, 100, 3], [100, ], [0, 2])
self.make_output(y_shape=[1, 100, 1])
self.attrs = {'axis': 0}
self.outputs = {
'Out':
np.maximum(self.inputs['X'], self.inputs['Y'].reshape(100, 1, 1))
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMaxOp_broadcast_2(TestXPUElementwiseOp):
class TestElementwiseMaxOp_broadcast_1(TestElementwiseOp):
def setUp(self):
super(TestElementwiseMaxOp_broadcast_2, self).setUp()
self.make_input([1, 3, 100], [100, ], [0, 1])
self.make_output(y_shape=[1, 1, 100])
self.op_type = "elementwise_max"
x = np.random.uniform(0.5, 1, (2, 100, 3)).astype(np.float32)
sgn = np.random.choice([-1, 1], (100, )).astype(np.float32)
y = x[0, :, 0] + sgn * \
np.random.uniform(1, 2, (100, )).astype(np.float32)
self.inputs = {'X': x, 'Y': y}
self.attrs = {'axis': 1}
self.outputs = {
'Out':
np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 100, 1))
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMaxOp_broadcast_3(TestXPUElementwiseOp):
class TestElementwiseMaxOp_broadcast_2(TestElementwiseOp):
def setUp(self):
super(TestElementwiseMaxOp_broadcast_3, self).setUp()
self.attrs['axis'] = 1
self.make_input([2, 50, 2, 1], [50, 2], [0, 3])
self.make_output(y_shape=[1, 50, 2, 1])
self.op_type = "elementwise_max"
x = np.random.uniform(0.5, 1, (1, 3, 100)).astype(np.float32)
sgn = np.random.choice([-1, 1], (100, )).astype(np.float32)
y = x[0, 0, :] + sgn * \
np.random.uniform(1, 2, (100, )).astype(np.float32)
self.inputs = {'X': x, 'Y': y}
self.outputs = {
'Out':
np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 100))
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMaxOp_broadcast_4(TestXPUElementwiseOp):
class TestElementwiseMaxOp_broadcast_3(TestElementwiseOp):
def setUp(self):
super(TestElementwiseMaxOp_broadcast_4, self).setUp()
self.make_input([2, 3, 4, 5], [2, 3, 1, 5])
self.make_output()
self.op_type = "elementwise_max"
x = np.random.uniform(0.5, 1, (2, 50, 2, 1)).astype(np.float32)
sgn = np.random.choice([-1, 1], (50, 2)).astype(np.float32)
y = x[0, :, :, 0] + sgn * \
np.random.uniform(1, 2, (50, 2)).astype(np.float32)
self.inputs = {'X': x, 'Y': y}
self.attrs = {'axis': 1}
self.outputs = {
'Out':
np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 50, 2, 1))
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMaxOp_broadcast_5(TestXPUElementwiseOp):
class TestElementwiseMaxOp_broadcast_4(TestElementwiseOp):
def setUp(self):
super(TestElementwiseMaxOp_broadcast_5, self).setUp()
self.make_input([2, 3, 100], [1, 1, 100])
self.make_output()
self.op_type = "elementwise_max"
x = np.random.uniform(0.5, 1, (2, 3, 4, 5)).astype(np.float32)
sgn = np.random.choice([-1, 1], (2, 3, 1, 5)).astype(np.float32)
y = x + sgn * \
np.random.uniform(1, 2, (2, 3, 1, 5)).astype(np.float32)
self.inputs = {'X': x, 'Y': y}
self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
if __name__ == '__main__':
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("..")
import unittest
import numpy as np
from op_test import OpTest, skip_check_grad_ci
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
import paddle
from op_test_xpu import XPUOpTest
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseOp(XPUOpTest):
def setUp(self):
self.op_type = "elementwise_min"
# If x and y have the same value, the min() is not differentiable.
# So we generate test data by the following method
# to avoid them being too close to each other.
x = np.random.uniform(0.1, 1, [13, 17]).astype("float32")
sgn = np.random.choice([-1, 1], [13, 17]).astype("float32")
y = x + sgn * np.random.uniform(0.1, 1, [13, 17]).astype("float32")
self.inputs = {'X': x, 'Y': y}
self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])}
def test_check_output(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
def test_check_grad_normal(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(place, ['X', 'Y'], 'Out')
def test_check_grad_ingore_x(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'],
'Out',
max_relative_error=0.005,
no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X'],
'Out',
max_relative_error=0.005,
no_grad_set=set('Y'))
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
@skip_check_grad_ci(
reason="[skip shape check] Use y_shape(1) to test broadcast.")
class TestElementwiseMinOp_scalar(TestElementwiseOp):
def setUp(self):
self.op_type = "elementwise_min"
x = np.random.random_integers(-5, 5, [10, 3, 4]).astype("float32")
y = np.array([0.5]).astype("float32")
self.inputs = {'X': x, 'Y': y}
self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMinOp_Vector(TestElementwiseOp):
def setUp(self):
self.op_type = "elementwise_min"
x = np.random.random((100, )).astype("float32")
sgn = np.random.choice([-1, 1], (100, )).astype("float32")
y = x + sgn * np.random.uniform(0.1, 1, (100, )).astype("float32")
self.inputs = {'X': x, 'Y': y}
self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMinOp_broadcast_0(TestElementwiseOp):
def setUp(self):
self.op_type = "elementwise_min"
x = np.random.uniform(0.5, 1, (100, 3, 2)).astype(np.float32)
sgn = np.random.choice([-1, 1], (100, )).astype(np.float32)
y = x[:, 0, 0] + sgn * \
np.random.uniform(1, 2, (100, )).astype(np.float32)
self.inputs = {'X': x, 'Y': y}
self.attrs = {'axis': 0}
self.outputs = {
'Out':
np.minimum(self.inputs['X'], self.inputs['Y'].reshape(100, 1, 1))
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMinOp_broadcast_1(TestElementwiseOp):
def setUp(self):
self.op_type = "elementwise_min"
x = np.random.uniform(0.5, 1, (2, 100, 3)).astype(np.float32)
sgn = np.random.choice([-1, 1], (100, )).astype(np.float32)
y = x[0, :, 0] + sgn * \
np.random.uniform(1, 2, (100, )).astype(np.float32)
self.inputs = {'X': x, 'Y': y}
self.attrs = {'axis': 1}
self.outputs = {
'Out':
np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 100, 1))
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMinOp_broadcast_2(TestElementwiseOp):
def setUp(self):
self.op_type = "elementwise_min"
x = np.random.uniform(0.5, 1, (2, 3, 100)).astype(np.float32)
sgn = np.random.choice([-1, 1], (100, )).astype(np.float32)
y = x[0, 0, :] + sgn * \
np.random.uniform(1, 2, (100, )).astype(np.float32)
self.inputs = {'X': x, 'Y': y}
self.outputs = {
'Out':
np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 100))
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMinOp_broadcast_3(TestElementwiseOp):
def setUp(self):
self.op_type = "elementwise_min"
x = np.random.uniform(0.5, 1, (2, 25, 4, 1)).astype(np.float32)
sgn = np.random.choice([-1, 1], (25, 4)).astype(np.float32)
y = x[0, :, :, 0] + sgn * \
np.random.uniform(1, 2, (25, 4)).astype(np.float32)
self.inputs = {'X': x, 'Y': y}
self.attrs = {'axis': 1}
self.outputs = {
'Out':
np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 25, 4, 1))
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMinOp_broadcast_4(TestElementwiseOp):
def setUp(self):
self.op_type = "elementwise_min"
x = np.random.uniform(0.5, 1, (2, 10, 2, 5)).astype(np.float32)
sgn = np.random.choice([-1, 1], (2, 10, 1, 5)).astype(np.float32)
y = x + sgn * \
np.random.uniform(1, 2, (2, 10, 1, 5)).astype(np.float32)
self.inputs = {'X': x, 'Y': y}
self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])}
if __name__ == '__main__':
unittest.main()
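The broadcast cases above all encode the same convention: the `axis` attribute says where Y's dimensions line up against X, and the expected output is built by reshaping Y with trailing singleton dimensions. A minimal NumPy-only sketch of that convention (`broadcast_min` is a hypothetical helper for illustration, not part of the test suite or the op implementation):

import numpy as np

def broadcast_min(x, y, axis=-1):
    # Align y's shape to x starting at `axis`, padding with size-1 dims
    # on the right so plain NumPy broadcasting reproduces the op's output.
    if axis == -1:
        axis = x.ndim - y.ndim
    shape = [1] * x.ndim
    shape[axis:axis + y.ndim] = y.shape
    return np.minimum(x, y.reshape(shape))

x = np.random.uniform(0.5, 1, (100, 3, 2)).astype(np.float32)
y = np.random.uniform(1, 2, (100,)).astype(np.float32)
# Matches the reference output used by TestElementwiseMinOp_broadcast_0.
assert np.array_equal(broadcast_min(x, y, axis=0),
                      np.minimum(x, y.reshape(100, 1, 1)))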
......@@ -19,58 +19,111 @@ from op_test import OpTest, skip_check_grad_ci
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
import paddle
from elementwise import TestXPUElementwiseOpBase
from op_test_xpu import XPUOpTest
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUElementwiseMulOp(OpTest, TestXPUElementwiseOpBase):
class ElementwiseMulOp(XPUOpTest):
def init_kernel_type(self):
self.use_mkldnn = False
def setUp(self):
TestXPUElementwiseOpBase.setUp(self, "elementwise_mul")
self.use_xpu = True
self.op_type = "elementwise_mul"
self.dtype = np.float32
self.axis = -1
self.init_dtype()
self.init_input_output()
self.init_kernel_type()
self.init_axis()
self.attrs['axis'] = self.axis
self.attrs['use_mkldnn'] = self.use_mkldnn
self.grad_implemented = True
self.make_input()
self.make_output()
def make_output(self, x_shape=None, y_shape=None):
x, y = self.reshape_input(x_shape, y_shape)
self.outputs = {'Out': np.multiply(x, y)}
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
}
self.outputs = {'Out': self.out}
self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
def test_check_output(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
def test_check_grad_normal(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'],
'Out',
check_dygraph=(self.use_mkldnn == False))
def test_check_grad_ignore_x(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'],
'Out',
no_grad_set=set("X"),
check_dygraph=(self.use_mkldnn == False))
def test_check_grad_ignore_y(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X'],
'Out',
no_grad_set=set('Y'),
check_dygraph=(self.use_mkldnn == False))
def init_input_output(self):
self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.out = np.multiply(self.x, self.y)
def init_dtype(self):
pass
def init_axis(self):
pass
@skip_check_grad_ci(
reason="[skip shape check] Use y_shape(1) to test broadcast.")
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUElementwiseMulOp_scalar(TestXPUElementwiseMulOp):
class TestElementwiseMulOp_scalar(ElementwiseMulOp):
def setUp(self):
super(TestXPUElementwiseMulOp_scalar, self).setUp()
self.make_input((10, 3, 4), (1, ))
self.make_output()
self.grad_implemented = False
self.op_type = "elementwise_mul"
self.inputs = {
'X': np.random.rand(10, 3, 4).astype(np.float32),
'Y': np.random.rand(1).astype(np.float32)
}
self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
self.init_kernel_type()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUElementwiseMulOp_Vector(TestXPUElementwiseMulOp):
class TestElementwiseMulOp_Vector(ElementwiseMulOp):
def setUp(self):
super(TestXPUElementwiseMulOp_Vector, self).setUp()
self.make_input((100, ), (100, ))
self.make_output()
self.op_type = "elementwise_mul"
self.inputs = {
'X': np.random.random((100, )).astype("float32"),
'Y': np.random.random((100, )).astype("float32")
}
self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}
self.init_kernel_type()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUElementwiseMulOp_broadcast_0(TestXPUElementwiseMulOp):
def setUp(self):
super(TestXPUElementwiseMulOp_broadcast_0, self).setUp()
self.make_input((100, 2, 3), (100, ))
self.make_output(y_shape=(100, 1, 1))
self.y_grad_implemented = False
class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
def init_input_output(self):
self.x = np.random.rand(100, 2, 3).astype(self.dtype)
self.y = np.random.rand(100).astype(self.dtype)
self.out = self.x * self.y.reshape(100, 1, 1)
def init_axis(self):
self.axis = 0
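For the broadcast cases, the gradients that check_grad_with_place verifies follow the product rule plus a reduction over the broadcast axes. A NumPy-only sketch of the reference values for this broadcast_0 shape (illustrative only, not the XPU kernel implementation):

import numpy as np

x = np.random.rand(100, 2, 3).astype(np.float32)
y = np.random.rand(100).astype(np.float32)
dout = np.random.rand(100, 2, 3).astype(np.float32)

# dX = dOut * Y broadcast back to X's shape.
dx = dout * y.reshape(100, 1, 1)
# dY = (dOut * X) reduced over the axes Y was broadcast along.
dy = (dout * x).sum(axis=(1, 2))
assert dx.shape == x.shape and dy.shape == y.shape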
......@@ -78,75 +131,140 @@ class TestXPUElementwiseMulOp_broadcast_0(TestXPUElementwiseMulOp):
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMulOp_broadcast_1(TestXPUElementwiseMulOp):
class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):
def setUp(self):
self.op_type = "elementwise_mul"
self.inputs = {
'X': np.random.rand(2, 100, 3).astype(np.float32),
'Y': np.random.rand(100).astype(np.float32)
}
self.attrs = {'axis': 1}
self.outputs = {
'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 100, 1)
}
self.init_kernel_type()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp):
def setUp(self):
super(TestElementwiseMulOp_broadcast_1, self).setUp()
self.attrs['axis'] = 1
self.y_grad_implemented = False
self.make_input((2, 100, 3), (100, ))
self.make_output(y_shape=(1, 100, 1))
self.op_type = "elementwise_mul"
self.inputs = {
'X': np.random.rand(2, 3, 100).astype(np.float32),
'Y': np.random.rand(100).astype(np.float32)
}
self.outputs = {
'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 1, 100)
}
self.init_kernel_type()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMulOp_broadcast_2(TestXPUElementwiseMulOp):
class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
def setUp(self):
super(TestElementwiseMulOp_broadcast_2, self).setUp()
self.y_grad_implemented = False
self.make_input((2, 3, 100), (100, ))
self.make_output(y_shape=(1, 1, 100))
self.op_type = "elementwise_mul"
self.inputs = {
'X': np.random.rand(2, 10, 12, 3).astype(np.float32),
'Y': np.random.rand(10, 12).astype(np.float32)
}
self.attrs = {'axis': 1}
self.outputs = {
'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 10, 12, 1)
}
self.init_kernel_type()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMulOp_broadcast_3(TestXPUElementwiseMulOp):
class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp):
def setUp(self):
super(TestElementwiseMulOp_broadcast_3, self).setUp()
self.attrs['axis'] = 1
self.y_grad_implemented = False
self.make_input((2, 10, 12, 3), (10, 12))
self.make_output(y_shape=(1, 10, 12, 1))
self.op_type = "elementwise_mul"
self.inputs = {
'X': np.random.rand(10, 2, 11).astype(np.float32),
'Y': np.random.rand(10, 1, 11).astype(np.float32)
}
self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
self.init_kernel_type()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMulOp_broadcast_4(TestXPUElementwiseMulOp):
class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp):
def setUp(self):
super(TestElementwiseMulOp_broadcast_4, self).setUp()
self.is_common_broadcast = True
self.make_input((10, 2, 11), (10, 1, 11))
self.make_output()
self.op_type = "elementwise_mul"
self.inputs = {
'X': np.random.rand(10, 4, 2, 3).astype(np.float32),
'Y': np.random.rand(10, 4, 1, 3).astype(np.float32)
}
self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
self.init_kernel_type()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMulOp_broadcast_5(TestXPUElementwiseMulOp):
class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp):
def setUp(self):
super(TestElementwiseMulOp_broadcast_5, self).setUp()
self.is_common_broadcast = True
self.make_input((10, 4, 2, 3), (10, 4, 1, 3))
self.make_output()
self.op_type = "elementwise_mul"
self.inputs = {
'X': np.random.rand(2, 3, 100).astype(np.float32),
'Y': np.random.rand(1, 1, 100).astype(np.float32)
}
self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
self.init_kernel_type()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUElementwiseMulOp_commonuse_1(TestXPUElementwiseMulOp):
class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp):
def setUp(self):
super(TestXPUElementwiseMulOp_commonuse_1, self).setUp()
self.is_common_broadcast = True
self.make_input((2, 3, 100), (1, 1, 100))
self.make_output()
self.op_type = "elementwise_mul"
self.inputs = {
'X': np.random.rand(30, 3, 1, 5).astype(np.float32),
'Y': np.random.rand(30, 1, 4, 1).astype(np.float32)
}
self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
self.init_kernel_type()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUElementwiseMulOp_xsize_lessthan_ysize(TestXPUElementwiseMulOp):
class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp):
def setUp(self):
super(TestXPUElementwiseMulOp_xsize_lessthan_ysize, self).setUp()
self.attrs['axis'] = 2
self.is_x_size_less_than_y = True
self.make_input((10, 10), (2, 2, 10, 10))
self.make_output(x_shape=(1, 1, 10, 10))
self.op_type = "elementwise_mul"
self.inputs = {
'X': np.random.rand(10, 10).astype(np.float32),
'Y': np.random.rand(2, 2, 10, 10).astype(np.float32)
}
self.attrs = {'axis': 2}
self.outputs = {
'Out': self.inputs['X'].reshape(1, 1, 10, 10) * self.inputs['Y']
}
self.init_kernel_type()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseMulOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# the input of elementwise_mul must be a Variable.
x1 = fluid.create_lod_tensor(
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.XPUPlace(0))
y1 = fluid.create_lod_tensor(
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.XPUPlace(0))
self.assertRaises(TypeError, fluid.layers.elementwise_mul, x1, y1)
# the input dtype of elementwise_mul must be float32
x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="uint8")
y2 = fluid.layers.data(name='y2', shape=[3, 4, 5, 6], dtype="uint8")
self.assertRaises(TypeError, fluid.layers.elementwise_mul, x2, y2)
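For contrast with the rejected inputs above, a minimal sketch of an accepted call, assuming the fluid static-graph API already used in this file (float32 Variables created with fluid.data):

import paddle
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard

paddle.enable_static()
with program_guard(Program(), Program()):
    # Variables with a floating-point dtype are accepted; raw LoDTensors and
    # uint8 Variables (the cases asserted above) raise TypeError.
    x = fluid.data(name='mul_x', shape=[3, 4, 5, 6], dtype='float32')
    y = fluid.data(name='mul_y', shape=[3, 4, 5, 6], dtype='float32')
    out = fluid.layers.elementwise_mul(x, y)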
if __name__ == '__main__':
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("..")
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from op_test import OpTest, skip_check_grad_ci
from op_test_xpu import XPUOpTest
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwisePowOp(XPUOpTest):
def setUp(self):
self.op_type = "elementwise_pow"
self.inputs = {
'X': np.random.uniform(1, 2, [20, 5]).astype("float32"),
'Y': np.random.uniform(1, 2, [20, 5]).astype("float32")
}
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
def test_check_output(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
def test_check_grad_normal(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(place, ['X', 'Y'], 'Out')
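The pow tests sample X from (1, 2) because the analytic gradients involve log(X): for Out = X**Y, dX = dOut * Y * X**(Y-1) and dY = dOut * X**Y * log(X), so X must stay positive for the numeric gradient check to behave. A quick NumPy finite-difference sanity check of those formulas (illustration only):

import numpy as np

x = np.random.uniform(1, 2, [4, 3])
y = np.random.uniform(1, 2, [4, 3])
eps = 1e-6
# Central differences approximate the partial derivatives of x**y.
num_dx = (np.power(x + eps, y) - np.power(x - eps, y)) / (2 * eps)
num_dy = (np.power(x, y + eps) - np.power(x, y - eps)) / (2 * eps)
assert np.allclose(num_dx, y * np.power(x, y - 1), rtol=1e-4)
assert np.allclose(num_dy, np.power(x, y) * np.log(x), rtol=1e-4)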
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwisePowOp_big_shape_1(TestElementwisePowOp):
def setUp(self):
self.op_type = "elementwise_pow"
self.inputs = {
'X': np.random.uniform(1, 2, [10, 10]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [10, 10]).astype("float32")
}
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwisePowOp_big_shape_2(TestElementwisePowOp):
def setUp(self):
self.op_type = "elementwise_pow"
self.inputs = {
'X': np.random.uniform(1, 2, [10, 10]).astype("float32"),
'Y': np.random.uniform(0.2, 2, [10, 10]).astype("float32")
}
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
@skip_check_grad_ci(
reason="[skip shape check] Use y_shape(1) to test broadcast.")
class TestElementwisePowOp_scalar(TestElementwisePowOp):
def setUp(self):
self.op_type = "elementwise_pow"
self.inputs = {
'X': np.random.uniform(0.1, 1, [3, 3, 4]).astype(np.float32),
'Y': np.random.uniform(0.1, 1, [1]).astype(np.float32)
}
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwisePowOp_tensor(TestElementwisePowOp):
def setUp(self):
self.op_type = "elementwise_pow"
self.inputs = {
'X': np.random.uniform(0.1, 1, [100]).astype("float32"),
'Y': np.random.uniform(1, 3, [100]).astype("float32")
}
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwisePowOp_broadcast_0(TestElementwisePowOp):
def setUp(self):
self.op_type = "elementwise_pow"
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 1, 100]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [100]).astype("float32")
}
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwisePowOp_broadcast_1(TestElementwisePowOp):
def setUp(self):
self.op_type = "elementwise_pow"
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 100, 1]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [100]).astype("float32")
}
self.attrs = {'axis': 1}
self.outputs = {
'Out': np.power(self.inputs['X'], self.inputs['Y'].reshape(100, 1))
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwisePowOp_broadcast_2(TestElementwisePowOp):
def setUp(self):
self.op_type = "elementwise_pow"
self.inputs = {
'X': np.random.uniform(0.1, 1, [100, 3, 1]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [100]).astype("float32")
}
self.attrs = {'axis': 0}
self.outputs = {
'Out':
np.power(self.inputs['X'], self.inputs['Y'].reshape(100, 1, 1))
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwisePowOp_broadcast_3(TestElementwisePowOp):
def setUp(self):
self.op_type = "elementwise_pow"
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 20, 5, 1]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [20, 5]).astype("float32")
}
self.attrs = {'axis': 1}
self.outputs = {
'Out': np.power(self.inputs['X'], self.inputs['Y'].reshape(1, 20, 5,
1))
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwisePowOp_broadcast_4(TestElementwisePowOp):
def setUp(self):
self.op_type = "elementwise_pow"
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 10, 3, 5]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [2, 10, 1, 5]).astype("float32")
}
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwisePowOpInt(OpTest):
def setUp(self):
self.op_type = "elementwise_pow"
self.inputs = {'X': np.asarray([1, 3, 6]), 'Y': np.asarray([1, 1, 1])}
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
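# Integer inputs: only the forward result is checked here; gradients are not
# tested for integer tensors (the pow gradient w.r.t. Y involves log(X)).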
def test_check_output(self):
self.check_output()
if __name__ == '__main__':
unittest.main()
......@@ -11,117 +11,198 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import sys
sys.path.append("..")
from op_test import OpTest, skip_check_grad_ci
import paddle
from elementwise import TestXPUElementwiseOpBase
from op_test import OpTest, skip_check_grad_ci
from op_test_xpu import XPUOpTest
import unittest
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUElementwiseSubOp(OpTest, TestXPUElementwiseOpBase):
class TestElementwiseOp(OpTest):
def setUp(self):
TestXPUElementwiseOpBase.setUp(self, "elementwise_sub")
self.make_input()
self.make_output()
self.grad_implemented = True
self.use_xpu = True
self.op_type = "elementwise_sub"
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float32"),
'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float32")
}
self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
def test_check_output(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place, atol=1e-3)
def test_check_grad_normal(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(place, ['X', 'Y'], 'Out')
def test_check_grad_ignore_x(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'],
'Out',
max_relative_error=0.005,
no_grad_set=set("X"))
def test_check_grad_ignore_y(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X'],
'Out',
max_relative_error=0.005,
no_grad_set=set('Y'))
def make_output(self, x_shape=None, y_shape=None):
x, y = self.reshape_input(x_shape, y_shape)
self.outputs = {'Out': x - y}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
@skip_check_grad_ci(
reason="[skip shape check] Use y_shape(1) to test broadcast.")
class TestElementwiseSubOp_scalar(TestElementwiseOp):
def setUp(self):
self.op_type = "elementwise_sub"
self.inputs = {
'X': np.random.rand(10, 3, 4).astype(np.float32),
'Y': np.random.rand(1).astype(np.float32)
}
self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseSubOp_scalar(TestXPUElementwiseSubOp):
class TestElementwiseSubOp_Vector(TestElementwiseOp):
def setUp(self):
super(TestElementwiseSubOp_scalar, self).setUp()
self.grad_implemented = False
self.make_input((10, 3, 4), (1, ))
self.make_output()
self.op_type = "elementwise_sub"
self.inputs = {
'X': np.random.random((100, )).astype("float32"),
'Y': np.random.random((100, )).astype("float32")
}
self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseSubOp_Vector(TestXPUElementwiseSubOp):
class TestElementwiseSubOp_broadcast_0(TestElementwiseOp):
def setUp(self):
super(TestElementwiseSubOp_Vector, self).setUp()
self.make_input((100, ), (100, ))
self.make_output()
self.op_type = "elementwise_sub"
self.inputs = {
'X': np.random.rand(100, 3, 2).astype(np.float32),
'Y': np.random.rand(100).astype(np.float32)
}
self.attrs = {'axis': 0}
self.outputs = {
'Out': self.inputs['X'] - self.inputs['Y'].reshape(100, 1, 1)
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseSubOp_broadcast_0(TestXPUElementwiseSubOp):
class TestElementwiseSubOp_broadcast_1(TestElementwiseOp):
def setUp(self):
super(TestElementwiseSubOp_broadcast_0, self).setUp()
self.attrs['axis'] = 0
self.make_input((100, 3, 2), (100, ))
self.make_output(y_shape=(100, 1, 1))
self.op_type = "elementwise_sub"
self.inputs = {
'X': np.random.rand(2, 100, 3).astype(np.float32),
'Y': np.random.rand(100).astype(np.float32)
}
self.attrs = {'axis': 1}
self.outputs = {
'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 100, 1)
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseSubOp_broadcast_1(TestXPUElementwiseSubOp):
class TestElementwiseSubOp_broadcast_2(TestElementwiseOp):
def setUp(self):
super(TestElementwiseSubOp_broadcast_1, self).setUp()
self.attrs['axis'] = 1
self.make_input((2, 100, 3), (100, ))
self.make_output(y_shape=(1, 100, 1))
self.op_type = "elementwise_sub"
self.inputs = {
'X': np.random.rand(2, 3, 100).astype(np.float32),
'Y': np.random.rand(100).astype(np.float32)
}
self.outputs = {
'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 1, 100)
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseSubOp_broadcast_2(TestXPUElementwiseSubOp):
class TestElementwiseSubOp_broadcast_3(TestElementwiseOp):
def setUp(self):
super(TestElementwiseSubOp_broadcast_2, self).setUp()
self.make_input((2, 3, 100), (100, ))
self.make_output(y_shape=(1, 1, 100))
self.op_type = "elementwise_sub"
self.inputs = {
'X': np.random.rand(2, 10, 12, 3).astype(np.float32),
'Y': np.random.rand(10, 12).astype(np.float32)
}
self.attrs = {'axis': 1}
self.outputs = {
'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 10, 12, 1)
}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseSubOp_broadcast_3(TestXPUElementwiseSubOp):
class TestElementwiseSubOp_broadcast_4(TestElementwiseOp):
def setUp(self):
super(TestElementwiseSubOp_broadcast_3, self).setUp()
self.attrs['axis'] = 1
self.make_input((2, 10, 12, 3), (10, 12))
self.make_output(y_shape=(1, 10, 12, 1))
self.op_type = "elementwise_sub"
self.inputs = {
'X': np.random.rand(2, 5, 3, 12).astype(np.float32),
'Y': np.random.rand(2, 5, 1, 12).astype(np.float32)
}
self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseSubOp_broadcast_4(TestXPUElementwiseSubOp):
class TestElementwiseSubOp_commonuse_1(TestElementwiseOp):
def setUp(self):
super(TestElementwiseSubOp_broadcast_4, self).setUp()
self.is_common_broadcast = True
self.make_input((2, 5, 3, 12), (2, 5, 1, 12))
self.make_output()
self.op_type = "elementwise_sub"
self.inputs = {
'X': np.random.rand(2, 3, 100).astype(np.float32),
'Y': np.random.rand(1, 1, 100).astype(np.float32)
}
self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseSubOp_commonuse_1(TestXPUElementwiseSubOp):
class TestElementwiseSubOp_commonuse_2(TestElementwiseOp):
def setUp(self):
super(TestElementwiseSubOp_commonuse_1, self).setUp()
self.is_common_broadcast = True
self.make_input((2, 3, 100), (1, 1, 100))
self.make_output()
self.op_type = "elementwise_sub"
self.inputs = {
'X': np.random.rand(10, 3, 1, 4).astype(np.float32),
'Y': np.random.rand(10, 1, 12, 1).astype(np.float32)
}
self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestElementwiseSubOp_xsize_lessthan_ysize(TestXPUElementwiseSubOp):
class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseOp):
def setUp(self):
super(TestElementwiseSubOp_xsize_lessthan_ysize, self).setUp()
self.attrs['axis'] = 2
self.is_x_size_less_than_y = True
self.make_input((10, 12), (2, 3, 10, 12))
self.make_output(x_shape=(1, 1, 10, 12))
self.op_type = "elementwise_sub"
self.inputs = {
'X': np.random.rand(10, 12).astype(np.float32),
'Y': np.random.rand(2, 3, 10, 12).astype(np.float32)
}
self.attrs = {'axis': 2}
self.outputs = {
'Out': self.inputs['X'].reshape(1, 1, 10, 12) - self.inputs['Y']
}
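Across the broadcast cases in this file, the backward pass that the base class's grad checks exercise reduces dOut over whichever axes were broadcast: dX is dOut (reduced over X's broadcast axes) and dY is -dOut (reduced over Y's broadcast axes). A NumPy sketch for the broadcast_0 shapes above (X: (100, 3, 2), Y: (100,)), as an illustration rather than the kernel code:

import numpy as np

dout = np.random.rand(100, 3, 2).astype(np.float32)
dx = dout                       # same shape as X, no reduction needed
dy = -dout.sum(axis=(1, 2))     # reduce over the axes Y was broadcast along
assert dy.shape == (100,)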
if __name__ == '__main__':
......
......@@ -13,16 +13,15 @@
# limitations under the License.
from __future__ import print_function
from test_softmax_op import stable_softmax
from op_test import OpTest
import paddle.fluid.core as core
import paddle
import unittest
import numpy as np
import sys
sys.path.append("..")
import paddle
import paddle.fluid.core as core
from op_test import OpTest
from test_softmax_op import stable_softmax
def cross_entropy(softmax, label, soft_label, axis, ignore_index=-1):
......@@ -54,10 +53,11 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = False
self.soft_label = False
self.dtype = np.float64
self.dtype = np.float32
self.axis = -1
self.ignore_index = -1
self.shape = [41, 37]
self.use_xpu = True
def setUp(self):
self.initParams()
......@@ -103,7 +103,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
paddle.enable_static()
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ["Logits"], "Loss", max_relative_error=0.1)
place, ["Logits"], "Loss", max_relative_error=0.2)
class TestXPUSoftmaxWithCrossEntropyOp(TestSoftmaxWithCrossEntropyOp):
......@@ -115,6 +115,7 @@ class TestXPUSoftmaxWithCrossEntropyOp(TestSoftmaxWithCrossEntropyOp):
self.axis = -1
self.ignore_index = -1
self.dtype = np.float32
self.use_xpu = True
def test_check_output(self):
if paddle.is_compiled_with_xpu():
......@@ -127,7 +128,7 @@ class TestXPUSoftmaxWithCrossEntropyOp(TestSoftmaxWithCrossEntropyOp):
paddle.enable_static()
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ["Logits"], "Loss", max_relative_error=0.1)
place, ["Logits"], "Loss", max_relative_error=0.2)
class TestXPUSoftmaxWithCrossEntropyOp2(TestXPUSoftmaxWithCrossEntropyOp):
......@@ -139,10 +140,11 @@ class TestXPUSoftmaxWithCrossEntropyOp2(TestXPUSoftmaxWithCrossEntropyOp):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = True
self.dtype = np.float64
self.dtype = np.float32
self.axis = -1
self.ignore_index = -1
self.shape = [41, 37]
self.use_xpu = True
def test_check_output(self):
if paddle.is_compiled_with_xpu():
......@@ -155,7 +157,7 @@ class TestXPUSoftmaxWithCrossEntropyOp2(TestXPUSoftmaxWithCrossEntropyOp):
paddle.enable_static()
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ["Logits"], "Loss", max_relative_error=0.1)
place, ["Logits"], "Loss", max_relative_error=0.2)
class TestXPUSoftmaxWithCrossEntropyOp3(TestXPUSoftmaxWithCrossEntropyOp):
......@@ -170,55 +172,56 @@ class TestXPUSoftmaxWithCrossEntropyOp3(TestXPUSoftmaxWithCrossEntropyOp):
self.shape = [41, 37]
self.ignore_index = 5
self.axis = -1
self.dtype = np.float64
class TestXPUSoftmaxWithCrossEntropyOpAxis1(TestXPUSoftmaxWithCrossEntropyOp):
"""
Test softmax with cross entropy operator with discrete one-hot labels.
Given axis != -1
"""
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.dtype = np.float64
self.axis = 0
self.ignore_index = -1
self.shape = [3, 5, 7, 11]
class TestXPUSoftmaxWithCrossEntropyOpAxis2(TestXPUSoftmaxWithCrossEntropyOp):
"""
Test softmax with cross entropy operator with discrete one-hot labels.
Given axis != -1
"""
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.dtype = np.float64
self.axis = 1
self.ignore_index = -1
self.shape = [3, 5, 7, 11]
self.dtype = np.float32
class TestXPUSoftmaxWithCrossEntropyOpAxis3(TestXPUSoftmaxWithCrossEntropyOp):
"""
Test softmax with cross entropy operator with discrete one-hot labels.
Given axis != -1
"""
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.dtype = np.float64
self.axis = 2
self.ignore_index = -1
self.shape = [3, 5, 7, 11]
# XPU only supports axis = rank - 1
# class TestXPUSoftmaxWithCrossEntropyOpAxis1(TestXPUSoftmaxWithCrossEntropyOp):
# """
# Test softmax with cross entropy operator with discrete one-hot labels.
# Given axis != -1
# """
# def initParams(self):
# self.op_type = "softmax_with_cross_entropy"
# self.numeric_stable_mode = True
# self.soft_label = False
# self.dtype = np.float32
# self.axis = 0
# self.ignore_index = -1
# self.shape = [3, 5, 7, 11]
# XPU only supports axis = rank - 1
# class TestXPUSoftmaxWithCrossEntropyOpAxis2(TestXPUSoftmaxWithCrossEntropyOp):
# """
# Test softmax with cross entropy operator with discrete one-hot labels.
# Given axis != -1
# """
# def initParams(self):
# self.op_type = "softmax_with_cross_entropy"
# self.numeric_stable_mode = True
# self.soft_label = False
# self.dtype = np.float32
# self.axis = 1
# self.ignore_index = -1
# self.shape = [3, 5, 7, 11]
# XPU only supports axis = rank - 1
# class TestXPUSoftmaxWithCrossEntropyOpAxis3(TestXPUSoftmaxWithCrossEntropyOp):
# """
# Test softmax with cross entropy operator with discrete one-hot labels.
# Given axis != -1
# """
# def initParams(self):
# self.op_type = "softmax_with_cross_entropy"
# self.numeric_stable_mode = True
# self.soft_label = False
# self.dtype = np.float32
# self.axis = 2
# self.ignore_index = -1
# self.shape = [3, 5, 7, 11]
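The cases above are commented out because the XPU kernel only supports the softmax axis being the last dimension. A NumPy-only sketch of the usual workaround, moving the class axis to the last position and back (`softmax_last_axis_only` is a hypothetical helper, not part of this file or the Paddle API):

import numpy as np

def softmax_last_axis_only(logits, axis):
    moved = np.moveaxis(logits, axis, -1)           # class axis becomes last
    e = np.exp(moved - moved.max(axis=-1, keepdims=True))
    probs = e / e.sum(axis=-1, keepdims=True)
    return np.moveaxis(probs, -1, axis)             # restore original layout

x = np.random.rand(3, 5, 7, 11).astype(np.float32)
p = softmax_last_axis_only(x, axis=1)
assert np.allclose(p.sum(axis=1), 1.0, atol=1e-6)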
class TestXPUSoftmaxWithCrossEntropyOpAxis4(TestXPUSoftmaxWithCrossEntropyOp):
......@@ -231,7 +234,7 @@ class TestXPUSoftmaxWithCrossEntropyOpAxis4(TestXPUSoftmaxWithCrossEntropyOp):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.dtype = np.float64
self.dtype = np.float32
self.axis = 3
self.ignore_index = -1
self.shape = [3, 5, 7, 11]
......@@ -248,46 +251,47 @@ class TestXPUSoftmaxWithCrossEntropyOpAxisDimEqualOne(
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.dtype = np.float64
self.dtype = np.float32
self.axis = -1
self.ignore_index = -1
self.shape = [3, 5, 7, 1]
class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis1(
TestXPUSoftmaxWithCrossEntropyOp):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = True
self.shape = [3, 5, 7, 11]
self.axis = 0
self.ignore_index = -1
self.dtype = np.float64
class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis2(
TestXPUSoftmaxWithCrossEntropyOp2):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = True
self.shape = [3, 5, 7, 11]
self.axis = 1
self.ignore_index = -1
self.dtype = np.float64
class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis3(
TestXPUSoftmaxWithCrossEntropyOp2):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = True
self.shape = [3, 5, 7, 11]
self.axis = 2
self.ignore_index = -1
self.dtype = np.float64
# XPU only supports axis = rank - 1
# class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis1(
# TestXPUSoftmaxWithCrossEntropyOp):
# def initParams(self):
# self.op_type = "softmax_with_cross_entropy"
# self.numeric_stable_mode = True
# self.soft_label = True
# self.shape = [3, 5, 7, 11]
# self.axis = 0
# self.ignore_index = -1
# self.dtype = np.float32
# XPU only supports axis = rank - 1
# class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis2(
# TestXPUSoftmaxWithCrossEntropyOp2):
# def initParams(self):
# self.op_type = "softmax_with_cross_entropy"
# self.numeric_stable_mode = True
# self.soft_label = True
# self.shape = [3, 5, 7, 11]
# self.axis = 1
# self.ignore_index = -1
# self.dtype = np.float32
# XPU only supports axis = rank - 1
# class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis3(
# TestXPUSoftmaxWithCrossEntropyOp2):
# def initParams(self):
# self.op_type = "softmax_with_cross_entropy"
# self.numeric_stable_mode = True
# self.soft_label = True
# self.shape = [3, 5, 7, 11]
# self.axis = 2
# self.ignore_index = -1
# self.dtype = np.float32
class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis4(
......@@ -299,43 +303,44 @@ class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis4(
self.shape = [3, 5, 7, 11]
self.axis = 3
self.ignore_index = -1
self.dtype = np.float64
class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis1(
TestXPUSoftmaxWithCrossEntropyOp3):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.shape = [3, 5, 7, 11]
self.ignore_index = 1
self.axis = 0
self.dtype = np.float64
class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis2(
TestXPUSoftmaxWithCrossEntropyOp3):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.shape = [3, 5, 7, 11]
self.ignore_index = 0
self.axis = 1
self.dtype = np.float64
self.dtype = np.float32
class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis3(
TestXPUSoftmaxWithCrossEntropyOp3):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.shape = [3, 5, 7, 11]
self.ignore_index = 3
self.axis = 2
self.dtype = np.float64
# XPU only supports axis = rank - 1
# class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis1(
# TestXPUSoftmaxWithCrossEntropyOp3):
# def initParams(self):
# self.op_type = "softmax_with_cross_entropy"
# self.numeric_stable_mode = True
# self.soft_label = False
# self.shape = [3, 5, 7, 11]
# self.ignore_index = 1
# self.axis = 0
# self.dtype = np.float32
# XPU only supports axis = rank - 1
# class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis2(
# TestXPUSoftmaxWithCrossEntropyOp3):
# def initParams(self):
# self.op_type = "softmax_with_cross_entropy"
# self.numeric_stable_mode = True
# self.soft_label = False
# self.shape = [3, 5, 7, 11]
# self.ignore_index = 0
# self.axis = 1
# self.dtype = np.float32
# XPU only supports axis = rank - 1
# class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis3(
# TestXPUSoftmaxWithCrossEntropyOp3):
# def initParams(self):
# self.op_type = "softmax_with_cross_entropy"
# self.numeric_stable_mode = True
# self.soft_label = False
# self.shape = [3, 5, 7, 11]
# self.ignore_index = 3
# self.axis = 2
# self.dtype = np.float32
class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis4(
......@@ -347,7 +352,7 @@ class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis4(
self.shape = [3, 5, 7, 11]
self.ignore_index = 3
self.axis = 3
self.dtype = np.float64
self.dtype = np.float32
class TestXPUSoftmaxWithCrossEntropyOpBoundary0(
......@@ -364,7 +369,7 @@ class TestXPUSoftmaxWithCrossEntropyOpBoundary0(
self.shape = [3, 5, 7, 11]
self.axis = -1
self.ignore_index = -1
self.dtype = np.float64
self.dtype = np.float32
self.logits = np.full(self.shape, -500.0).astype(self.dtype)
......@@ -382,7 +387,7 @@ class TestXPUSoftmaxWithCrossEntropyOpBoundary1(
self.shape = [3, 5, 7, 11]
self.axis = -1
self.ignore_index = -1
self.dtype = np.float64
self.dtype = np.float32
self.logits = np.full(self.shape, 1000.0).astype(self.dtype)
self.logits[:, :, 0, :] = -1000.0
......