From a5aa4dc7a92b894050efbd46eb78ab5f938434dd Mon Sep 17 00:00:00 2001 From: taixiurong Date: Wed, 25 Nov 2020 15:37:46 +0800 Subject: [PATCH] add xpu elementwise ops (#29031) --- .../elementwise/elementwise_add_op_xpu.cc | 158 +----- .../elementwise/elementwise_div_op_xpu.cc | 16 +- .../elementwise_floordiv_op_xpu.cc | 37 ++ .../elementwise/elementwise_max_op_xpu.cc | 16 +- .../elementwise/elementwise_min_op_xpu.cc | 49 ++ .../elementwise/elementwise_mul_op_xpu.cc | 12 +- .../elementwise/elementwise_pow_op_xpu.cc | 40 ++ .../elementwise/elementwise_sub_op_xpu.cc | 17 +- .../operators/elementwise/elementwise_xpu.h | 471 ++++++++++-------- .../softmax_with_cross_entropy_op_xpu.cc | 66 ++- .../fluid/tests/unittests/xpu/elementwise.py | 100 ---- .../xpu/test_elementwise_add_op_xpu.py | 139 +++--- .../xpu/test_elementwise_div_op_xpu.py | 228 ++++++--- .../xpu/test_elementwise_floordiv_op_xpu.py | 87 ++++ .../xpu/test_elementwise_max_op_xpu.py | 180 ++++--- .../xpu/test_elementwise_min_op_xpu.py | 180 +++++++ .../xpu/test_elementwise_mul_op_xpu.py | 246 ++++++--- .../xpu/test_elementwise_pow_op_xpu.py | 182 +++++++ .../xpu/test_elementwise_sub_op_xpu.py | 191 +++++-- .../test_softmax_with_cross_entropy_op_xpu.py | 267 +++++----- 20 files changed, 1716 insertions(+), 966 deletions(-) create mode 100644 paddle/fluid/operators/elementwise/elementwise_floordiv_op_xpu.cc create mode 100644 paddle/fluid/operators/elementwise/elementwise_min_op_xpu.cc create mode 100644 paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc delete mode 100644 python/paddle/fluid/tests/unittests/xpu/elementwise.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_elementwise_floordiv_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_elementwise_min_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_elementwise_pow_op_xpu.py diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc index ad4a16c6e0..625e66d5f3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc @@ -27,7 +27,7 @@ template class ElementwiseAddXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - XPUElementwise>(ctx); + XPUElementwise(ctx, xpu::add); } }; @@ -36,161 +36,7 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto *dy = ctx.Output(framework::GradVarName("Y")); - - auto dx_dims = dout->dims(); - auto dy_dims_untrimed = dout->dims(); - T *dx_data = NULL; - T *dy_data = NULL; - - int axis = ctx.Attr("axis"); - PADDLE_ENFORCE_GE(dx_dims.size(), dy_dims_untrimed.size(), - platform::errors::InvalidArgument( - "Rank of first input must >= rank of second input.")); - - if (dx != nullptr) { - dx->mutable_data(ctx.GetPlace()); - dx_dims = dx->dims(); - dx_data = dx->data(); - } - - if (dy != nullptr) { - dy->mutable_data(ctx.GetPlace()); - dy_dims_untrimed = dy->dims(); - dy_data = dy->data(); - } - - int pre, n, post, is_common_broadcast; - if (dx_dims == dy_dims_untrimed) { - pre = post = 1; - n = dout->numel(); - } else { - axis = (axis == -1 ? 
dx_dims.size() - dy_dims_untrimed.size() : axis); - PADDLE_ENFORCE_EQ(axis >= 0 && axis < dx_dims.size(), true, - platform::errors::InvalidArgument( - "Axis should be in range [0, dx_dims)")); - auto dy_dims = trim_trailing_singular_dims(dy_dims_untrimed); - axis = (dy_dims.size() == 0) ? dx_dims.size() : axis; - get_mid_dims(dx_dims, dy_dims, axis, &pre, &n, &post, - &is_common_broadcast); - } - int len = pre * n * post; - - auto &dev_ctx = - ctx.template device_context(); - if (post == 1) { - int r = xpu::matrix_vector_add_grad( - dev_ctx.x_context(), dout->data(), dout->data(), - dout->data(), dout->data(), dx_data, dy_data, pre, n); - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of ElementWiseAddOp, error " - "message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of ElementWiseAddOp, error " - "message: RUNTIME_ERROR, " - "please check whether Baidu Kunlun card is " - "properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of ElementWiseAddOp, error message: " - "NO_ENOUGH_WORKSPACE, XPU has no enough memory.")); - } - return; - } - - if (dx == nullptr) { - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&dx_data), len * sizeof(float)), - XPU_SUCCESS, - platform::errors::ResourceExhausted("XPU has no enough memory")); - } - - if (dy == nullptr) { - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&dy_data), len * sizeof(float)), - XPU_SUCCESS, - platform::errors::ResourceExhausted("XPU has no enough memory")); - } else { - if (len != n) { - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&dy_data), - len * sizeof(float)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); - } - } - - int r = xpu::elementwise_add_grad( - dev_ctx.x_context(), dout->data() /*x*/, dout->data() /*y*/, - dout->data() /*out*/, dout->data(), dx_data, dy_data, len); - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of ElementWiseAddOp, error " - "message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of ElementWiseAddOp, error message: " - "RUNTIME_ERROR, " - "please check whether Baidu Kunlun card is properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of ElementWiseAddOp, error message: " - "NO_ENOUGH_WORKSPACE, XPU has no enough memory.")); - } - - if ((dy != nullptr) && (len != n)) { - r = xpu::reduce_ew(dev_ctx.x_context(), dy_data, dy->data(), pre, n, - post, xpu::ElementwiseOp::ASSIGN); - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of ElementWiseAddOp, error " - "message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of ElementWiseAddOp, error " - 
"message: RUNTIME_ERROR, " - "please check whether Baidu Kunlun card is " - "properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of ElementWiseAddOp, error message: " - "NO_ENOUGH_WORKSPACE, XPU has no enough memory.")); - } - dev_ctx.Wait(); - xpu_free(dy_data); - } - - if ((dx == nullptr || dy == nullptr) && !(dy != nullptr && len != n)) { - dev_ctx.Wait(); - } - - if (dx == nullptr) { - xpu_free(dx_data); - } - if (dy == nullptr) { - xpu_free(dy_data); - } + XPUElementwiseGrad(ctx, xpu::add_grad, false); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_div_op_xpu.cc index 6cc4276680..4f254a5307 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op_xpu.cc @@ -19,18 +19,19 @@ limitations under the License. */ namespace paddle { namespace operators { -template -struct XPUDivFunctor { - int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) { - return xpu::elementwise_div(ctx, x, y, z, len); +template +class ElementwiseDivXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUElementwise(ctx, xpu::div); } }; template -class ElementwiseDivXPUKernel : public framework::OpKernel { +class ElementwiseDivGradXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - XPUElementwise>(ctx); + XPUElementwiseGrad(ctx, xpu::div_grad, true); } }; @@ -40,4 +41,7 @@ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( elementwise_div, ops::ElementwiseDivXPUKernel); +REGISTER_OP_XPU_KERNEL(elementwise_div_grad, + ops::ElementwiseDivGradXPUKernel< + paddle::platform::XPUDeviceContext, float>); #endif diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_xpu.cc new file mode 100644 index 0000000000..32ae3a6f2c --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_xpu.cc @@ -0,0 +1,37 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" +namespace paddle { +namespace operators { + +template +class ElementwiseFloordivXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUElementwise(ctx, xpu::floordiv); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(elementwise_floordiv, + ops::ElementwiseFloordivXPUKernel< + paddle::platform::XPUDeviceContext, float>); + +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_max_op_xpu.cc index 232cfa0239..411ddb2660 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op_xpu.cc @@ -20,18 +20,19 @@ limitations under the License. */ namespace paddle { namespace operators { -template -struct XPUMaxFunctor { - int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) { - return xpu::elementwise_max(ctx, x, y, z, len); +template +class ElementwiseMaxXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUElementwise(ctx, xpu::max); } }; template -class ElementwiseMaxXPUKernel : public framework::OpKernel { +class ElementwiseMaxGradXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - XPUElementwise>(ctx); + XPUElementwiseGrad(ctx, xpu::max_grad, true); } }; @@ -42,4 +43,7 @@ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( elementwise_max, ops::ElementwiseMaxXPUKernel); +REGISTER_OP_XPU_KERNEL(elementwise_max_grad, + ops::ElementwiseMaxGradXPUKernel< + paddle::platform::XPUDeviceContext, float>); #endif diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_xpu.cc new file mode 100644 index 0000000000..0b1e131226 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_min_op_xpu.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/elementwise/elementwise_max_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" +namespace paddle { +namespace operators { + +template +class ElementwiseMinXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUElementwise(ctx, xpu::min); + } +}; + +template +class ElementwiseMinGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUElementwiseGrad(ctx, xpu::min_grad, true); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + elementwise_min, + ops::ElementwiseMinXPUKernel); +REGISTER_OP_XPU_KERNEL(elementwise_min_grad, + ops::ElementwiseMinGradXPUKernel< + paddle::platform::XPUDeviceContext, float>); +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_xpu.cc index d9a6ca844a..02c6900c7c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_xpu.cc @@ -22,10 +22,18 @@ template class ElementwiseMulXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - XPUElementwise>(ctx); + XPUElementwise(ctx, xpu::mul); } }; -DEFINE_XPU_GRAD_KERNEL(Mul, mul, true); +// DEFINE_XPU_GRAD_KERNEL(Mul, mul, true); +template +class ElementwiseMulGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUElementwiseGrad(ctx, xpu::mul_grad, true); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc new file mode 100644 index 0000000000..31b6ef9abc --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" +#include "xpu/refactor/math.h" + +namespace paddle { +namespace operators { + +template +class ElementwisePowXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUElementwise(ctx, xpu::pow); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + elementwise_pow, + ops::ElementwisePowXPUKernel); + +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc index 4e205fe492..bef3a4904f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc @@ -16,25 +16,28 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" +#include "xpu/refactor/math.h" + namespace paddle { namespace operators { -template -struct XPUSubFunctor { - int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) { - return xpu::elementwise_sub(ctx, x, y, z, len); +template +class ElementwiseSubXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUElementwise(ctx, xpu::sub); } }; template -class ElementwiseSubXPUKernel : public framework::OpKernel { +class ElementwiseSubGradXPUKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - XPUElementwise>(ctx); + ElemwiseGradKernel::Compute(ctx); + XPUElementwiseGrad(ctx, xpu::sub_grad, false); } }; -DEFINE_XPU_GRAD_KERNEL(Sub, sub, false); } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_xpu.h b/paddle/fluid/operators/elementwise/elementwise_xpu.h index 53f2cd2dcc..fdf5aeeba5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_xpu.h +++ b/paddle/fluid/operators/elementwise/elementwise_xpu.h @@ -13,175 +13,76 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifdef PADDLE_WITH_XPU +#include #include -#include +#include +#include +#include #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/place.h" - -inline std::string get_xpu_error_message(int error_type) { - static std::unordered_map xpu_error_map = { - {baidu::xpu::api::INVALID_PARAM, "Parameter is invalid."}, - {baidu::xpu::api::RUNTIME_ERROR, - "Please check whether Baidu Kunlun Card " - "is properly installed."}, - {baidu::xpu::api::NO_ENOUGH_WORKSPACE, - "There is not enough memory in Baidu" - " Kunlun Card."}}; - if (xpu_error_map.find(error_type) == xpu_error_map.end()) { - return "Unknown error type!"; - } - return xpu_error_map[error_type]; -} - -#define XPU_MALLOC(addr, num_bytes) \ - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(addr), num_bytes), \ - XPU_SUCCESS, \ - platform::errors::ResourceExhausted( \ - "\n\nOut of memory error on XPU, Cannot" \ - "allocate %s memory on XPU. 
\n\nPlease " \ - "check whether there is any other process " \ - "using XPU.\n", \ - string::HumanReadableSize(num_bytes))) - -#define DEFINE_XPU_GRAD_KERNEL(kernel_type, kernel_name, use_x_y_data) \ - template \ - class Elementwise##kernel_type##GradXPUKernel \ - : public ElemwiseGradKernel { \ - public: \ - void Compute(const framework::ExecutionContext& ctx) const override { \ - ElemwiseGradKernel::Compute(ctx); \ - using Tensor = framework::Tensor; \ - auto* dout = ctx.Input(framework::GradVarName("Out")); \ - auto* dx = ctx.Output(framework::GradVarName("X")); \ - auto* dy = ctx.Output(framework::GradVarName("Y")); \ - auto dx_dims = dout->dims(); \ - auto dy_dims_untrimed = dout->dims(); \ - T* dx_data = NULL; \ - T* dy_data = NULL; \ - const T* y_data = nullptr; \ - const T* x_data = nullptr; \ - T* y_broadcast = nullptr; \ - if (use_x_y_data) { \ - auto* x = ctx.Input("X"); \ - auto* y = ctx.Input("Y"); \ - y_data = y->data(); \ - x_data = x->data(); \ - } else { \ - x_data = dout->data(); \ - y_data = dout->data(); \ - } \ - int axis = ctx.Attr("axis"); \ - PADDLE_ENFORCE_GE( \ - dx_dims.size(), dy_dims_untrimed.size(), \ - platform::errors::InvalidArgument( \ - "Rank of first input must >= rank of second input.")); \ - if (dx != nullptr) { \ - dx->mutable_data(ctx.GetPlace()); \ - dx_dims = dx->dims(); \ - dx_data = dx->data(); \ - } \ - if (dy != nullptr) { \ - dy->mutable_data(ctx.GetPlace()); \ - dy_dims_untrimed = dy->dims(); \ - dy_data = dy->data(); \ - } \ - int pre, n, post, is_run_common_broadcast; \ - if (dx_dims == dy_dims_untrimed) { \ - pre = post = 1; \ - n = dout->numel(); \ - } else { \ - axis = (axis == -1 ? dx_dims.size() - dy_dims_untrimed.size() : axis); \ - PADDLE_ENFORCE_EQ(axis >= 0 && axis < dx_dims.size(), true, \ - platform::errors::InvalidArgument( \ - "Axis should be in range [0, dx_dims)")); \ - auto dy_dims = trim_trailing_singular_dims(dy_dims_untrimed); \ - axis = (dy_dims.size() == 0) ? dx_dims.size() : axis; \ - get_mid_dims(dx_dims, dy_dims, axis, &pre, &n, &post, \ - &is_run_common_broadcast); \ - } \ - int len = pre * n * post; \ - auto& dev_ctx = \ - ctx.template device_context(); \ - if (dx == nullptr) { \ - XPU_MALLOC(&dx_data, len * sizeof(float)); \ - } \ - if (dy == nullptr) { \ - XPU_MALLOC(&dy_data, len * sizeof(float)); \ - } else { \ - if (len != n) { \ - XPU_MALLOC(&dy_data, len * sizeof(float)); \ - } \ - } \ - if (use_x_y_data) { \ - if (len != n) { \ - XPU_MALLOC(&y_broadcast, len * sizeof(float)); \ - int res = \ - xpu::broadcast_ew(dev_ctx.x_context(), y_data, y_broadcast, pre, \ - n, post, xpu::ElementwiseOp::ASSIGN); \ - PADDLE_ENFORCE_EQ( \ - res, xpu::Error_t::SUCCESS, \ - platform::errors::External("XPU kernel error occur! %s", \ - get_xpu_error_message(res))); \ - y_data = y_broadcast; \ - } \ - } \ - int res = xpu::elementwise_##kernel_name##_grad( \ - dev_ctx.x_context(), x_data, y_data, dout->data() /*out*/, \ - dout->data(), dx_data, dy_data, len); \ - PADDLE_ENFORCE_EQ( \ - res, xpu::Error_t::SUCCESS, \ - platform::errors::External("XPU kernel error occur! %s", \ - get_xpu_error_message(res))); \ - if ((dy != nullptr) && (len != n)) { \ - int res = xpu::reduce_ew(dev_ctx.x_context(), dy_data, dy->data(), \ - pre, n, post, xpu::ElementwiseOp::ASSIGN); \ - PADDLE_ENFORCE_EQ( \ - res, xpu::Error_t::SUCCESS, \ - platform::errors::External("XPU kernel error occur! 
%s", \ - get_xpu_error_message(res))); \ - dev_ctx.Wait(); \ - xpu_free(dy_data); \ - } \ - if ((len != n || dx == nullptr || dy == nullptr) && \ - !(dy != nullptr && len != n)) { \ - dev_ctx.Wait(); \ - } \ - if (dx == nullptr) { \ - xpu_free(dx_data); \ - } \ - if (dy == nullptr) { \ - xpu_free(dy_data); \ - } \ - if (use_x_y_data) { \ - if (len != n) { \ - xpu_free(y_broadcast); \ - } \ - } \ - } \ - } +#include "xpu/refactor/math.h" namespace paddle { namespace operators { -template -struct XPUAddFunctor { - int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) { - return xpu::elementwise_add(ctx, x, y, z, len); +static std::pair, std::vector> XPUDimsToBroadcastVector( + const framework::DDim& x, const framework::DDim& y) { + std::vector x_v; + std::vector y_v; + int y_size = y.size(); + for (int i = 0; i < y_size; ++i) { + if (x[i] == y[i]) { + x_v.push_back(y[i]); + y_v.push_back(y[i]); + continue; + } + x_v.push_back(1); + x_v.push_back(x[i]); + y_v.push_back(y[i] / x[i]); + y_v.push_back(x[i]); } -}; + return std::make_pair(x_v, y_v); +} -template -struct XPUMulFunctor { - int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) { - return xpu::elementwise_mul(ctx, x, y, z, len); +static std::pair, std::vector> XPUReducesAxisVector( + const framework::DDim& x, const framework::DDim& y) { + std::vector x_vector; + std::vector axis_v; + PADDLE_ENFORCE_GT( + x.size(), 0, platform::errors::OutOfRange("x size is less 1, x shape is ", + x.to_str())); + PADDLE_ENFORCE_GT( + y.size(), 0, platform::errors::OutOfRange("y size is less 1, y shape is ", + y.to_str())); + + int y_nums = framework::product(y); + x_vector = framework::vectorize(x); + if (y_nums == 1) { + for (int i = 0; i < x.size(); ++i) { + axis_v.push_back(i); + } + return std::make_pair(x_vector, axis_v); + } + int yidx = 0; + for (size_t i = 0; i < x_vector.size(); ++i) { + if (y[yidx] == 1) { + axis_v.push_back(i); + yidx++; + continue; + } + if (x_vector[i] != y[yidx]) { + axis_v.push_back(i); + continue; + } + yidx++; } -}; + return std::make_pair(x_vector, axis_v); +} -template -void XPUElementwise(const framework::ExecutionContext& ctx) { - PADDLE_ENFORCE_EQ(platform::is_xpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on XPU device.")); +template +void XPUElementwise( + const framework::ExecutionContext& ctx, + std::function func) { auto x_var = ctx.InputVar("X"); PADDLE_ENFORCE_NE(x_var, nullptr, platform::errors::InvalidArgument( "Cannot get input Variable X")); @@ -194,74 +95,226 @@ void XPUElementwise(const framework::ExecutionContext& ctx) { auto* y = ctx.Input("Y"); auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); - - int axis = ctx.Attr("axis"); auto x_dims = x.dims(); - auto y_dims_untrimed = y->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(), - platform::errors::InvalidArgument( - "Rank of first input must >= rank of second input.")); - axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); - PADDLE_ENFORCE_EQ( - axis >= 0 && axis < x_dims.size(), true, - platform::errors::InvalidArgument("Axis should be in range [0, x_dims)")); - auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); - axis = (y_dims.size() == 0) ? 
x_dims.size() : axis; - int pre, n, post, is_common_broadcast; - get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post, &is_common_broadcast); + auto y_dims = y->dims(); + int max_dim = std::max(x_dims.size(), y_dims.size()); + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_NE(is_common_broadcast, 1, - platform::errors::Unimplemented( - "X's shape should be equal to Y's shape.")); + PADDLE_ENFORCE_GE( + axis, 0, + platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, max_dim, + platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, axis)); - int len = pre * n * post; + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, + axis); + framework::DDim out_dim = framework::make_ddim(out_dims_array); const T* x_data = x.data(); const T* y_data = y->data(); T* z_data = z->data(); - T* y_broadcast = nullptr; + bool need_wait = false; + framework::Tensor x_broadcast_tensor; + framework::Tensor y_broadcast_tensor; + auto& dev_ctx = + ctx.template device_context(); + int ret = xpu::SUCCESS; + // begin broadcast now + if (x.numel() != z->numel()) { + // broadcast x + std::pair, std::vector> bcast_v = + XPUDimsToBroadcastVector(framework::make_ddim(x_dims_array), out_dim); + + ret = xpu::broadcast( + dev_ctx.x_context(), x_data, + x_broadcast_tensor.mutable_data(ctx.GetPlace(), z->numel()), + bcast_v.first, bcast_v.second); + PADDLE_ENFORCE_EQ( + ret, xpu::SUCCESS, + platform::errors::External( + "XPU kernel broadcast occur error in XPUElementwise error code %d", + ret)); + need_wait = true; + x_data = x_broadcast_tensor.data(); + } + if (y->numel() != z->numel()) { + // broadcast y + std::vector bcast_x_v; + std::vector bcast_y_v; + std::pair, std::vector> bcast_v = + XPUDimsToBroadcastVector(framework::make_ddim(y_dims_array), out_dim); + ret = xpu::broadcast( + dev_ctx.x_context(), y_data, + y_broadcast_tensor.mutable_data(ctx.GetPlace(), z->numel()), + bcast_v.first, bcast_v.second); + PADDLE_ENFORCE_EQ( + ret, xpu::SUCCESS, + platform::errors::External( + "XPU kernel broadcast occur error in XPUElementwise error code %d", + ret)); + need_wait = true; + y_data = y_broadcast_tensor.data(); + } + int len = z->numel(); + ret = func(dev_ctx.x_context(), x_data, y_data, z_data, len); + PADDLE_ENFORCE_EQ( + ret, xpu::SUCCESS, + platform::errors::External( + "XPU kernel Elementwise occur error in XPUElementwise error code ", + ret)); + + if (need_wait && dev_ctx.x_context()->xpu_stream) { + dev_ctx.Wait(); + } +} + +template +void XPUElementwiseGrad(const framework::ExecutionContext& ctx, + std::function + func, + bool use_x_y_data) { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dz = ctx.Input(framework::GradVarName("Out")); + auto* z = dz; + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + const framework::DDim& x_dims = x->dims(); + const framework::DDim& y_dims = y->dims(); + int max_dim = std::max(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, 0, + platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, max_dim, + platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, axis)); + + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, + axis); + framework::DDim out_dim = framework::make_ddim(out_dims_array); + + int len = framework::product(out_dim); + + framework::Tensor x_broadcast_tensor; + framework::Tensor y_broadcast_tensor; + + framework::Tensor dx_local_tensor; + framework::Tensor dy_local_tensor; + + bool need_wait = false; + const T* x_data = use_x_y_data ? x->data() : z->data(); + const T* y_data = use_x_y_data ? y->data() : z->data(); + + const T* z_data = z->data(); + const T* dz_data = (const T*)dz->data(); + + bool dx_need_reduce = (dx != nullptr) && (dx->numel() != len); + bool dy_need_reduce = (dy != nullptr) && (dy->numel() != len); + + T* dx_data = ((dx == nullptr) || dx_need_reduce) + ? (dx_local_tensor.mutable_data(ctx.GetPlace(), len)) + : (dx->mutable_data(ctx.GetPlace())); + + T* dy_data = ((dy == nullptr) || dy_need_reduce) + ? (dy_local_tensor.mutable_data(ctx.GetPlace(), len)) + : (dy->mutable_data(ctx.GetPlace())); + + int ret = xpu::SUCCESS; auto& dev_ctx = ctx.template device_context(); - if (post == 1) { - if (std::is_same>::value) { - int res = xpu::matrix_vector_add(dev_ctx.x_context(), x_data, y_data, - z_data, pre, n); - PADDLE_ENFORCE_EQ(res, xpu::Error_t::SUCCESS, - platform::errors::External("XPU kernel error occur! %s", - get_xpu_error_message(res))); - return; - } - if (std::is_same>::value) { - int res = xpu::matrix_vector_mul(dev_ctx.x_context(), x_data, y_data, - z_data, pre, n); - PADDLE_ENFORCE_EQ(res, xpu::Error_t::SUCCESS, - platform::errors::External("XPU kernel error occur! %s", - get_xpu_error_message(res))); - return; - } + if (use_x_y_data && x->numel() != len) { + std::vector bcast_x_v; + std::vector bcast_y_v; + std::pair, std::vector> bcast_v = + XPUDimsToBroadcastVector(framework::make_ddim(x_dims_array), out_dim); + ret = xpu::broadcast( + dev_ctx.x_context(), x_data, + x_broadcast_tensor.mutable_data(ctx.GetPlace(), len), bcast_v.first, + bcast_v.second); + PADDLE_ENFORCE_EQ(ret, xpu::SUCCESS, + platform::errors::External( + "XPU kernel broadcast error occur! %d", ret)); + need_wait = true; + x_data = x_broadcast_tensor.data(); + } + + if (use_x_y_data && y->numel() != len) { + // broadcast y + std::vector bcast_x_v; + std::vector bcast_y_v; + std::pair, std::vector> bcast_v = + XPUDimsToBroadcastVector(framework::make_ddim(y_dims_array), out_dim); + ret = xpu::broadcast( + dev_ctx.x_context(), y_data, + y_broadcast_tensor.mutable_data(ctx.GetPlace(), len), bcast_v.first, + bcast_v.second); + PADDLE_ENFORCE_EQ(ret, xpu::SUCCESS, + platform::errors::External( + "XPU kernel broadcast error occur! %d", ret)); + need_wait = true; + y_data = y_broadcast_tensor.data(); } - if (pre != 1 || post != 1) { - XPU_MALLOC(&y_broadcast, len * sizeof(T)); - int res = xpu::broadcast_ew(dev_ctx.x_context(), y_data, y_broadcast, pre, - n, post, xpu::ElementwiseOp::ASSIGN); - PADDLE_ENFORCE_EQ(res, xpu::Error_t::SUCCESS, - platform::errors::External("XPU kernel error occur! 
%s", - get_xpu_error_message(res))); - y_data = y_broadcast; + ret = func(dev_ctx.x_context(), x_data, y_data, z_data, dz_data, dx_data, + dy_data, len); + PADDLE_ENFORCE_EQ(ret, xpu::SUCCESS, platform::errors::External( + "XPU kernel binary occur error in " + "XPUElementwiseGrad, error code %d", + ret)); + + if (dx_need_reduce) { + const framework::DDim& dx_dims = dx->dims(); + std::pair, std::vector> reduce_v = + XPUReducesAxisVector(out_dim, dx_dims); + ret = xpu::reduce_sum(dev_ctx.x_context(), dx_data, + dx->mutable_data(ctx.GetPlace()), reduce_v.first, + reduce_v.second); + PADDLE_ENFORCE_EQ( + ret, xpu::SUCCESS, + platform::errors::External("XPU kernel reduce_sum occur error in " + "XPUElementwiseGrad, error code %d", + ret)); + need_wait = true; } - Functor functor; - int res = functor(dev_ctx.x_context(), x_data, y_data, z_data, len); - PADDLE_ENFORCE_EQ(res, xpu::Error_t::SUCCESS, - platform::errors::External("XPU kernel error occur! %s", - get_xpu_error_message(res))); + if (dy_need_reduce) { + const framework::DDim& dy_dims = dy->dims(); + std::pair, std::vector> reduce_v = + XPUReducesAxisVector(out_dim, dy_dims); + ret = xpu::reduce_sum(dev_ctx.x_context(), dy_data, + dy->mutable_data(ctx.GetPlace()), reduce_v.first, + reduce_v.second); + PADDLE_ENFORCE_EQ( + ret, xpu::SUCCESS, + platform::errors::External("XPU kernel reduce_sum occur error in " + "XPUElementwiseGrad, error code %d", + ret)); + need_wait = true; + } - if (pre != 1 || post != 1) { + if (need_wait && dev_ctx.x_context()->xpu_stream) { dev_ctx.Wait(); - xpu_free(y_broadcast); } } diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc index f4f6eb9cdc..368a12057c 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc @@ -19,6 +19,9 @@ limitations under the License. */ #include #include +#include "xpu/refactor/math.h" +#include "xpu/refactor/nn.h" + namespace paddle { namespace operators { @@ -41,11 +44,13 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel { loss->mutable_data(context.GetPlace()); const int n = SizeToAxis(axis, logits->dims()); const int d = SizeFromAxis(axis, logits->dims()); + std::vector logits_dims = framework::vectorize(logits->dims()); // softmax auto& dev_ctx = context.template device_context(); - int r = xpu::softmax2d_forward(dev_ctx.x_context(), logits->data(), - softmax->data(), n, d); + int r = xpu::softmax(dev_ctx.x_context(), logits->data(), + softmax->data(), logits_dims, axis); + PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, platform::errors::External("XPU kernel error. Softmax2d_forward " @@ -55,44 +60,35 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel { auto ignore_index = context.Attr("ignore_index"); const bool soft_label = context.Attr("soft_label"); if (soft_label) { - PADDLE_THROW(platform::errors::InvalidArgument( - "XPU only support soft_label == false for now!")); + r = xpu::soft_cross_entropy( + dev_ctx.x_context(), softmax->data(), labels->data(), + loss->data(), n, d); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("XPU kernel error. 
soft_cross_entropy " + "execution not succeed, error code=%d", + r)); } else { - auto* p_labels = labels->data(); - int64_t* labels_int64_host = - reinterpret_cast(std::malloc(n * sizeof(int64_t))); - int* labels_int32_host = - reinterpret_cast(std::malloc(n * sizeof(int))); - int* labels_int32_device = NULL; - int ret = xpu_malloc(reinterpret_cast(&labels_int32_device), - n * sizeof(int)); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check " - "where Baidu Kunlun Card is properly installed.", - ret)); - dev_ctx.Wait(); - memory::Copy(platform::CPUPlace(), labels_int64_host, - BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), - p_labels, n * sizeof(int64_t)); - for (int i = 0; i < n; ++i) { - labels_int32_host[i] = labels_int64_host[i]; - } - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), - labels_int32_device, platform::CPUPlace(), labels_int32_host, - n * sizeof(int)); - int r = xpu::cross_entropy_forward( - dev_ctx.x_context(), n, d, softmax->data(), - labels_int32_device, loss->data(), nullptr, ignore_index); + Tensor labels_int32; + labels_int32.mutable_data(context.GetPlace(), labels->numel()); + r = xpu::cast_v2( + dev_ctx.x_context(), labels->data(), + labels_int32.data(), labels->numel()); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("XPU kernel error. cast_v2 " + "execution not succeed, error code=%d", + r)); + + r = xpu::hard_cross_entropy( + dev_ctx.x_context(), softmax->data(), + labels_int32.data(), loss->data(), nullptr, n, d, + ignore_index); PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU kernel error. Cross_entropy_forward " + platform::errors::External("XPU kernel error. hard_cross_entropy " "execution not succeed, error code=%d", r)); - dev_ctx.Wait(); - std::free(labels_int32_host); - std::free(labels_int64_host); - xpu_free(labels_int32_device); } } }; diff --git a/python/paddle/fluid/tests/unittests/xpu/elementwise.py b/python/paddle/fluid/tests/unittests/xpu/elementwise.py deleted file mode 100644 index f4f2ddb19c..0000000000 --- a/python/paddle/fluid/tests/unittests/xpu/elementwise.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import numpy as np -import paddle -import paddle.fluid as fluid -paddle.enable_static() - - -class TestXPUElementwiseOpBase(object): - def setUp(self, op_type): - self.op_type = op_type - self.attrs = {'use_xpu': True} - self.is_common_broadcast = False - self.is_x_size_less_than_y = False - self.grad_implemented = False - self.y_grad_implemented = True - self.dtype = np.float32 - self.__class__.op_type = self.op_type - self.__class__.use_xpu = True - self.__class__.dtype = self.dtype - - def net(self, place): - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.layers.data( - name='X', shape=self.inputs['X'].shape, dtype=self.dtype) - y = fluid.layers.data( - name='Y', shape=self.inputs['Y'].shape, dtype=self.dtype) - op = getattr(fluid.layers, self.op_type) - z = op(x, y) - exe = fluid.Executor(place) - z_value = exe.run(feed=self.inputs, fetch_list=[z.name]) - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - if not self.is_common_broadcast and not self.is_x_size_less_than_y: - self.check_output_with_place(place, atol=1e-3) - else: - with self.assertRaises(BaseException): - self.net(place) - - def _check_grad_xpu_helper(self, - inputs_to_check, - output_names, - no_grad_set=None, - max_relative_error=0.01): - if self.grad_implemented and not self.is_common_broadcast \ - and not self.is_x_size_less_than_y: - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, - inputs_to_check, - output_names, - no_grad_set=no_grad_set, - max_relative_error=max_relative_error) - - def test_check_grad_normal(self): - self._check_grad_xpu_helper(['X', 'Y'], 'Out') - - def test_check_grad_ingore_x(self): - self._check_grad_xpu_helper(['Y'], 'Out', set("X")) - - def test_check_grad_ingore_y(self): - if self.y_grad_implemented: - self._check_grad_xpu_helper(['X'], 'Out', set("Y")) - - def init_axis(self): - self.axis = -1 - - def make_input(self, x_shape=[13, 17], y_shape=[13, 17]): - self.inputs = { - 'X': np.random.uniform(0.1, 1, x_shape).astype(self.dtype), - 'Y': np.random.uniform(0.1, 1, y_shape).astype(self.dtype) - } - - def reshape_input(self, x_shape=None, y_shape=None): - if x_shape is None: - x = self.inputs['X'] - else: - x = self.inputs['X'].reshape(x_shape) - if y_shape is None: - y = self.inputs['Y'] - else: - y = self.inputs['Y'].reshape(y_shape) - return x, y - - def make_output(self, x_shape=None, y_shape=None): - pass diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py index 9c6e7d21c1..c4905a229b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py @@ -13,18 +13,21 @@ # limitations under the License. 
from __future__ import print_function +import numpy as np import sys sys.path.append("..") -import unittest -import numpy as np import paddle -import paddle.fluid.core as core from op_test import OpTest, skip_check_grad_ci +from op_test_xpu import XPUOpTest +import unittest import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard +paddle.enable_static() -class TestElementwiseAddOp(OpTest): +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseAddOp(XPUOpTest): def init_kernel_type(self): self.use_mkldnn = False @@ -34,6 +37,7 @@ class TestElementwiseAddOp(OpTest): self.init_input_output() self.init_kernel_type() self.init_axis() + self.use_xpu = True self.inputs = { 'X': OpTest.np_dtype_to_fluid_dtype(self.x), @@ -43,80 +47,33 @@ class TestElementwiseAddOp(OpTest): self.outputs = {'Out': self.out} def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output(check_dygraph=(self.use_mkldnn == False)) - - def test_check_grad_normal(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - if self.dtype == np.float16: - return - self.check_grad( - ['X', 'Y'], 'Out', check_dygraph=(self.use_mkldnn == False)) - - def test_check_grad_ingore_x(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - if self.dtype == np.float16: - return - self.check_grad( - ['Y'], - 'Out', - no_grad_set=set("X"), - check_dygraph=(self.use_mkldnn == False)) - - def test_check_grad_ingore_y(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - if self.dtype == np.float16: - return - self.check_grad( - ['X'], - 'Out', - no_grad_set=set('Y'), - check_dygraph=(self.use_mkldnn == False)) - - def init_input_output(self): - self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) - self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) - self.out = np.add(self.x, self.y) - - def init_dtype(self): - self.dtype = np.float64 - - def init_axis(self): - self.axis = -1 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPUElementwiseAddOp(OpTest): - def setUp(self): - self.op_type = "elementwise_add" - self.init_dtype() - self.init_input_output() - self.init_axis() - - self.inputs = {'X': self.x, 'Y': self.y} - self.attrs = {'axis': self.axis, 'use_mkldnn': False, 'use_xpu': True} - self.outputs = {'Out': self.out} - - def test_check_output(self): - if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + if paddle.is_compiled_with_xpu(): place = paddle.XPUPlace(0) self.check_output_with_place(place) def test_check_grad_normal(self): - if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + if paddle.is_compiled_with_xpu(): place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X', 'Y'], 'Out') + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.006) def test_check_grad_ingore_x(self): - if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + if paddle.is_compiled_with_xpu(): place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['Y'], 'Out') + self.check_grad_with_place( + place, ['Y'], + 'Out', + no_grad_set=set("X"), + max_relative_error=0.006) def test_check_grad_ingore_y(self): - if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + if paddle.is_compiled_with_xpu(): place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') + self.check_grad_with_place( + place, ['X'], + 'Out', + 
no_grad_set=set("Y"), + max_relative_error=0.006) def init_input_output(self): self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) @@ -130,6 +87,8 @@ class TestXPUElementwiseAddOp(OpTest): self.axis = -1 +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast.") class TestElementwiseAddOp_scalar(TestElementwiseAddOp): @@ -139,6 +98,8 @@ class TestElementwiseAddOp_scalar(TestElementwiseAddOp): self.out = self.x + self.y +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1,1) to test broadcast.") class TestElementwiseAddOp_scalar2(TestElementwiseAddOp): @@ -148,6 +109,8 @@ class TestElementwiseAddOp_scalar2(TestElementwiseAddOp): self.out = self.x + self.y +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestElementwiseAddOp_Vector(TestElementwiseAddOp): def init_input_output(self): self.x = np.random.random((100, )).astype(self.dtype) @@ -155,6 +118,8 @@ class TestElementwiseAddOp_Vector(TestElementwiseAddOp): self.out = np.add(self.x, self.y) +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp): def init_input_output(self): self.x = np.random.rand(100, 2, 3).astype(self.dtype) @@ -165,6 +130,8 @@ class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp): self.axis = 0 +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp): def init_input_output(self): self.x = np.random.rand(2, 100, 3).astype(self.dtype) @@ -175,6 +142,8 @@ class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp): self.axis = 1 +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp): def init_input_output(self): self.x = np.random.rand(2, 3, 100).astype(self.dtype) @@ -182,6 +151,8 @@ class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp): self.out = self.x + self.y.reshape(1, 1, 100) +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp): def init_input_output(self): self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype) @@ -192,6 +163,8 @@ class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp): self.axis = 1 +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp): def init_input_output(self): self.x = np.random.rand(100, 2, 3, 4).astype(self.dtype) @@ -202,6 +175,8 @@ class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp): self.axis = 0 +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp): def init_input_output(self): self.x = np.random.rand(10, 3, 12).astype(self.dtype) @@ -209,6 +184,8 @@ class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp): self.out = self.x + self.y +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp): def init_input_output(self): self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype) @@ -216,6 +193,8 @@ class 
TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp): self.out = self.x + self.y +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp): def init_input_output(self): self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype) @@ -223,6 +202,8 @@ class TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp): self.out = self.x + self.y +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp): def init_input_output(self): self.x = np.random.rand(2, 10, 12).astype(self.dtype) @@ -233,6 +214,8 @@ class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp): self.axis = 1 +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast.") class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp): @@ -245,6 +228,8 @@ class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp): self.axis = 1 +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp): def init_input_output(self): self.x = np.random.rand(100, 2, 3).astype(self.dtype) @@ -255,6 +240,8 @@ class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp): self.axis = -1 +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp): def init_input_output(self): self.x = np.random.rand(2, 3, 100).astype(self.dtype) @@ -265,6 +252,8 @@ class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp): self.axis = -1 +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp): def init_input_output(self): self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype) @@ -275,6 +264,8 @@ class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp): self.axis = -1 +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp): def init_input_output(self): self.x = np.random.rand(10, 12).astype(self.dtype) @@ -285,14 +276,16 @@ class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp): self.axis = 2 +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestElementwiseAddOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): # the input of elementwise_add must be Variable. 
x1 = fluid.create_lod_tensor( - np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.XPUPlace(0)) y1 = fluid.create_lod_tensor( - np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.XPUPlace(0)) self.assertRaises(TypeError, fluid.layers.elementwise_add, x1, y1) # the input dtype of elementwise_add must be float16 or float32 or float64 or int32 or int64 @@ -302,6 +295,8 @@ class TestElementwiseAddOpError(unittest.TestCase): self.assertRaises(TypeError, fluid.layers.elementwise_add, x2, y2) +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestAddOp(unittest.TestCase): def test_name(self): with fluid.program_guard(fluid.Program()): @@ -324,7 +319,7 @@ class TestAddOp(unittest.TestCase): y = fluid.data(name="y", shape=[3], dtype='float32') z = paddle.add(x, y) - place = fluid.CPUPlace() + place = fluid.XPUPlace(0) exe = fluid.Executor(place) z_value = exe.run(feed=gen_data(), fetch_list=[z.name]) z_expected = np.array([3., 8., 6.]) @@ -332,8 +327,8 @@ class TestAddOp(unittest.TestCase): def test_dygraph(self): with fluid.dygraph.guard(): - np_x = np.array([2, 3, 4]).astype('float64') - np_y = np.array([1, 5, 2]).astype('float64') + np_x = np.array([2, 3, 4]).astype('float32') + np_y = np.array([1, 5, 2]).astype('float32') x = fluid.dygraph.to_variable(np_x) y = fluid.dygraph.to_variable(np_y) z = paddle.add(x, y) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py index cb6e412cb0..0fd35d7a45 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py @@ -17,121 +17,233 @@ import unittest import numpy as np import paddle import paddle.fluid as fluid +import paddle.fluid.core as core from op_test import OpTest, skip_check_grad_ci -from elementwise import TestXPUElementwiseOpBase +from op_test_xpu import XPUOpTest paddle.enable_static() @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestXPUElementwiseDivOp(OpTest, TestXPUElementwiseOpBase): +class ElementwiseDivOp(XPUOpTest): def setUp(self): - TestXPUElementwiseOpBase.setUp(self, "elementwise_div") - self.make_input() - self.make_output() - - def make_output(self, x_shape=None, y_shape=None): - x, y = self.reshape_input(x_shape, y_shape) - self.outputs = {'Out': np.divide(x, y)} + self.op_type = "elementwise_div" + self.dtype = np.float32 + self.init_dtype() + self.use_xpu = True + """ Warning + CPU gradient check error! 
+ 'X': np.random.random((32,84)).astype("float32"), + 'Y': np.random.random((32,84)).astype("float32") + """ + self.inputs = { + 'X': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype), + 'Y': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + } + self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad_normal(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.05) + + def test_check_grad_ingore_x(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['Y'], + 'Out', + max_relative_error=0.05, + no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], + 'Out', + max_relative_error=0.05, + no_grad_set=set('Y')) + + def init_dtype(self): + pass + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseDivOp_scalar(ElementwiseDivOp): + def setUp(self): + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [20, 3, 4]).astype(np.float32), + 'Y': np.random.uniform(0.1, 1, [1]).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] / self.inputs['Y']} @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseDivOp_scalar(TestXPUElementwiseDivOp): +class TestElementwiseDivOp_Vector(ElementwiseDivOp): def setUp(self): - super(TestElementwiseDivOp_scalar, self).setUp() - self.grad_implemented = False - self.make_input([20, 3, 4], [1]) - self.make_output() + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [100]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [100]).astype("float32") + } + self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseDivOp_Vector(TestXPUElementwiseDivOp): +class TestElementwiseDivOp_broadcast_0(ElementwiseDivOp): def setUp(self): - super(TestElementwiseDivOp_Vector, self).setUp() - self.make_input([100, ], [100, ]) - self.make_output() + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [100, 3, 4]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [100]).astype("float32") + } + + self.attrs = {'axis': 0} + self.outputs = { + 'Out': + np.divide(self.inputs['X'], self.inputs['Y'].reshape(100, 1, 1)) + } @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseDivOp_broadcast_0(TestXPUElementwiseDivOp): +class TestElementwiseDivOp_broadcast_1(ElementwiseDivOp): def setUp(self): - super(TestElementwiseDivOp_broadcast_0, self).setUp() - self.attrs['axis'] = 0 - self.make_input([100, 3, 4], [100, ]) - self.make_output(y_shape=[100, 1, 1]) + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 100, 4]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [100]).astype("float32") + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': + np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 100, 1)) + } @unittest.skipIf(not 
paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseDivOp_broadcast_1(TestXPUElementwiseDivOp): +class TestElementwiseDivOp_broadcast_2(ElementwiseDivOp): def setUp(self): - super(TestElementwiseDivOp_broadcast_1, self).setUp() - self.attrs['axis'] = 1 - self.make_input([2, 100, 4], [100, ]) - self.make_output(y_shape=[1, 100, 1]) + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 100]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [100]).astype("float32") + } + + self.outputs = { + 'Out': + np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 100)) + } @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseDivOp_broadcast_2(TestXPUElementwiseDivOp): +class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp): def setUp(self): - super(TestElementwiseDivOp_broadcast_2, self).setUp() - self.make_input([2, 3, 100], [100, ]) - self.make_output(y_shape=[1, 1, 100]) + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 10, 12, 5]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [10, 12]).astype("float32") + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': + np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 10, 12, 1)) + } @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseDivOp_broadcast_3(TestXPUElementwiseDivOp): +class TestElementwiseDivOp_broadcast_4(ElementwiseDivOp): def setUp(self): - super(TestElementwiseDivOp_broadcast_3, self).setUp() - self.attrs['axis'] = 1 - self.make_input([2, 10, 12, 5], [10, 12]) - self.make_output(y_shape=[1, 10, 12, 1]) + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 50]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 1, 50]).astype("float32") + } + self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseDivOp_broadcast_4(TestXPUElementwiseDivOp): +class TestElementwiseDivOp_broadcast_5(ElementwiseDivOp): def setUp(self): - super(TestElementwiseDivOp_broadcast_4, self).setUp() - self.is_common_broadcast = True - self.make_input([2, 3, 50], [2, 1, 50]) - self.make_output() + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 4, 20]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 3, 1, 20]).astype("float32") + } + self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseDivOp_broadcast_5(TestXPUElementwiseDivOp): +class TestElementwiseDivOp_commonuse_1(ElementwiseDivOp): def setUp(self): - super(TestElementwiseDivOp_broadcast_5, self).setUp() - self.is_common_broadcast = True - self.make_input([2, 3, 4, 20], [2, 3, 1, 20]) - self.make_output() + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 100]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [1, 1, 100]).astype("float32"), + } + self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseDivOp_commonuse_1(TestXPUElementwiseDivOp): +class TestElementwiseDivOp_commonuse_2(ElementwiseDivOp): def setUp(self): - super(TestElementwiseDivOp_commonuse_1, self).setUp() - 
self.is_common_broadcast = True - self.make_input([2, 3, 100], [1, 1, 100]) - self.make_output() + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [30, 3, 1, 5]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [30, 1, 4, 1]).astype("float32"), + } + self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseDivOp_xsize_lessthan_ysize(TestXPUElementwiseDivOp): +class TestElementwiseDivOp_xsize_lessthan_ysize(ElementwiseDivOp): def setUp(self): - super(TestElementwiseDivOp_xsize_lessthan_ysize, self).setUp() - self.is_x_size_less_than_y = True - self.attrs['axis'] = 2 - self.make_input([10, 12], [2, 3, 10, 12]) - self.make_output(x_shape=[1, 1, 10, 12]) + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [10, 12]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 3, 10, 12]).astype("float32"), + } + + self.attrs = {'axis': 2} + + self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseDivBroadcast(unittest.TestCase): + def test_shape_with_batch_sizes(self): + with fluid.program_guard(fluid.Program()): + x_var = fluid.data( + name='x', dtype='float32', shape=[None, 3, None, None]) + one = 2. + out = one / x_var + exe = fluid.Executor(fluid.XPUPlace(0)) + x = np.random.uniform(0.1, 0.6, (1, 3, 32, 32)).astype("float32") + out_result, = exe.run(feed={'x': x}, fetch_list=[out]) + self.assertEqual((out_result == (2 / x)).all(), True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_floordiv_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_floordiv_op_xpu.py new file mode 100644 index 0000000000..cc8ec3cac2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_floordiv_op_xpu.py @@ -0,0 +1,87 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import sys
+sys.path.append("..")
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from op_test import OpTest, skip_check_grad_ci
+from op_test_xpu import XPUOpTest
+paddle.enable_static()
+import random
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestElementwiseModOp(XPUOpTest):
+    def init_kernel_type(self):
+        self.use_mkldnn = False
+
+    def setUp(self):
+        self.op_type = "elementwise_floordiv"
+        self.dtype = np.float32
+        self.axis = -1
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
+        }
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
+        self.outputs = {'Out': self.out}
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype)
+        self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
+        self.out = np.floor_divide(self.x, self.y)
+
+    def init_dtype(self):
+        pass
+
+    def init_axis(self):
+        pass
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestElementwiseModOp_scalar(TestElementwiseModOp):
+    def init_input_output(self):
+        scale_x = random.randint(0, 100000000)
+        scale_y = random.randint(1, 100000000)
+        self.x = (np.random.rand(2, 3, 4) * scale_x).astype(self.dtype)
+        self.y = (np.random.rand(1) * scale_y + 1).astype(self.dtype)
+        self.out = np.floor_divide(self.x, self.y)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestElementwiseModOpInverse(TestElementwiseModOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype)
+        self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
+        self.out = np.floor_divide(self.x, self.y)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py
index 340c5895c1..dbe575d406 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py
@@ -16,113 +16,163 @@ sys.path.append("..")
 import unittest
 import numpy as np
 from op_test import OpTest, skip_check_grad_ci
+from op_test_xpu import XPUOpTest
 import paddle
-from elementwise import TestXPUElementwiseOpBase
 paddle.enable_static()
 
 
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
-class TestXPUElementwiseOp(OpTest, TestXPUElementwiseOpBase):
+class TestElementwiseOp(XPUOpTest):
     def setUp(self):
-        TestXPUElementwiseOpBase.setUp(self, "elementwise_max")
-        self.make_input()
-        self.make_output()
-
-    def make_input(self, x_shape=[13, 17], y_shape=[13, 17], idx_list=None):
-        x = np.random.random(x_shape).astype(self.dtype)
-        sgn = np.random.choice([-1, 1], y_shape).astype(self.dtype)
-        if idx_list is None:
-            y = x + sgn * np.random.uniform(0.1, 1, y_shape).astype(self.dtype)
-        else:
-            x_temp = x
-            for idx in idx_list:
-                x_temp = np.take(x_temp, [0], axis=idx)
-            sgn = sgn.reshape(x_temp.shape)
-            y = x_temp + sgn * np.random.uniform(0.1, 1, x_temp.shape)
-            y = y.reshape(y_shape).astype(self.dtype)
-
+        self.use_xpu 
= True + self.op_type = "elementwise_max" + # If x and y have the same value, the max() is not differentiable. + # So we generate test data by the following method + # to avoid them being too close to each other. + x = np.random.uniform(0.1, 1, [13, 17]).astype("float32") + sgn = np.random.choice([-1, 1], [13, 17]).astype("float32") + y = x + sgn * np.random.uniform(0.1, 1, [13, 17]).astype("float32") self.inputs = {'X': x, 'Y': y} - - def make_output(self, x_shape=None, y_shape=None): - x, y = self.reshape_input(x_shape, y_shape) - self.outputs = {'Out': np.maximum(x, y)} - - + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad_normal(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['Y'], + 'Out', + max_relative_error=0.006, + no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], + 'Out', + max_relative_error=0.006, + no_grad_set=set('Y')) + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseMaxOp_scalar(TestXPUElementwiseOp): +class TestElementwiseMaxOp_scalar(TestElementwiseOp): def setUp(self): - super(TestElementwiseMaxOp_scalar, self).setUp() - self.make_input([2, 3, 20], [1]) - self.make_output() - self.grad_implemented = False + self.op_type = "elementwise_max" + x = np.random.random_integers(-5, 5, [2, 3, 20]).astype("float32") + y = np.array([0.5]).astype("float32") + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseMaxOp_Vector(TestXPUElementwiseOp): +class TestElementwiseMaxOp_Vector(TestElementwiseOp): def setUp(self): - super(TestElementwiseMaxOp_Vector, self).setUp() - self.make_input([100, ], [100, ]) - self.make_output() + self.op_type = "elementwise_max" + x = np.random.random((100, )).astype("float32") + sgn = np.random.choice([-1, 1], (100, )).astype("float32") + y = x + sgn * np.random.uniform(0.1, 1, (100, )).astype("float32") + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseMaxOp_broadcast_0(TestXPUElementwiseOp): +class TestElementwiseMaxOp_broadcast_0(TestElementwiseOp): def setUp(self): - super(TestElementwiseMaxOp_broadcast_0, self).setUp() - self.attrs['axis'] = 0 - self.make_input([100, 5, 2], [100, ], [1, 2]) - self.make_output(y_shape=[100, 1, 1]) + self.op_type = "elementwise_max" + x = np.random.uniform(0.5, 1, (100, 5, 2)).astype(np.float32) + sgn = np.random.choice([-1, 1], (100, )).astype(np.float32) + y = x[:, 0, 0] + sgn * \ + np.random.uniform(1, 2, (100, )).astype(np.float32) + self.inputs = {'X': x, 'Y': y} - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestElementwiseMaxOp_broadcast_1(TestXPUElementwiseOp): - def setUp(self): 
- super(TestElementwiseMaxOp_broadcast_1, self).setUp() - self.attrs['axis'] = 1 - self.make_input([2, 100, 3], [100, ], [0, 2]) - self.make_output(y_shape=[1, 100, 1]) + self.attrs = {'axis': 0} + self.outputs = { + 'Out': + np.maximum(self.inputs['X'], self.inputs['Y'].reshape(100, 1, 1)) + } @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseMaxOp_broadcast_2(TestXPUElementwiseOp): +class TestElementwiseMaxOp_broadcast_1(TestElementwiseOp): def setUp(self): - super(TestElementwiseMaxOp_broadcast_2, self).setUp() - self.make_input([1, 3, 100], [100, ], [0, 1]) - self.make_output(y_shape=[1, 1, 100]) + self.op_type = "elementwise_max" + x = np.random.uniform(0.5, 1, (2, 100, 3)).astype(np.float32) + sgn = np.random.choice([-1, 1], (100, )).astype(np.float32) + y = x[0, :, 0] + sgn * \ + np.random.uniform(1, 2, (100, )).astype(np.float32) + self.inputs = {'X': x, 'Y': y} + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': + np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 100, 1)) + } @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseMaxOp_broadcast_3(TestXPUElementwiseOp): +class TestElementwiseMaxOp_broadcast_2(TestElementwiseOp): def setUp(self): - super(TestElementwiseMaxOp_broadcast_3, self).setUp() - self.attrs['axis'] = 1 - self.make_input([2, 50, 2, 1], [50, 2], [0, 3]) - self.make_output(y_shape=[1, 50, 2, 1]) + self.op_type = "elementwise_max" + x = np.random.uniform(0.5, 1, (1, 3, 100)).astype(np.float32) + sgn = np.random.choice([-1, 1], (100, )).astype(np.float32) + y = x[0, 0, :] + sgn * \ + np.random.uniform(1, 2, (100, )).astype(np.float32) + self.inputs = {'X': x, 'Y': y} + + self.outputs = { + 'Out': + np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 100)) + } @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseMaxOp_broadcast_4(TestXPUElementwiseOp): +class TestElementwiseMaxOp_broadcast_3(TestElementwiseOp): def setUp(self): - super(TestElementwiseMaxOp_broadcast_4, self).setUp() - self.make_input([2, 3, 4, 5], [2, 3, 1, 5]) - self.make_output() + self.op_type = "elementwise_max" + x = np.random.uniform(0.5, 1, (2, 50, 2, 1)).astype(np.float32) + sgn = np.random.choice([-1, 1], (50, 2)).astype(np.float32) + y = x[0, :, :, 0] + sgn * \ + np.random.uniform(1, 2, (50, 2)).astype(np.float32) + self.inputs = {'X': x, 'Y': y} + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': + np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 50, 2, 1)) + } @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseMaxOp_broadcast_5(TestXPUElementwiseOp): +class TestElementwiseMaxOp_broadcast_4(TestElementwiseOp): def setUp(self): - super(TestElementwiseMaxOp_broadcast_5, self).setUp() - self.make_input([2, 3, 100], [1, 1, 100]) - self.make_output() + self.op_type = "elementwise_max" + x = np.random.uniform(0.5, 1, (2, 3, 4, 5)).astype(np.float32) + sgn = np.random.choice([-1, 1], (2, 3, 1, 5)).astype(np.float32) + y = x + sgn * \ + np.random.uniform(1, 2, (2, 3, 1, 5)).astype(np.float32) + self.inputs = {'X': x, 'Y': y} + + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_min_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_min_op_xpu.py new file mode 100644 index 0000000000..ebe2004c3f --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_min_op_xpu.py @@ -0,0 +1,180 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +sys.path.append("..") +import unittest +import numpy as np +from op_test import OpTest, skip_check_grad_ci +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +import paddle +from op_test_xpu import XPUOpTest +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseOp(XPUOpTest): + def setUp(self): + self.op_type = "elementwise_min" + # If x and y have the same value, the min() is not differentiable. + # So we generate test data by the following method + # to avoid them being too close to each other. + x = np.random.uniform(0.1, 1, [13, 17]).astype("float32") + sgn = np.random.choice([-1, 1], [13, 17]).astype("float32") + y = x + sgn * np.random.uniform(0.1, 1, [13, 17]).astype("float32") + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad_normal(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['Y'], + 'Out', + max_relative_error=0.005, + no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], + 'Out', + max_relative_error=0.005, + no_grad_set=set('Y')) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseMinOp_scalar(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_min" + x = np.random.random_integers(-5, 5, [10, 3, 4]).astype("float32") + y = np.array([0.5]).astype("float32") + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMinOp_Vector(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_min" + x = np.random.random((100, )).astype("float32") + sgn = np.random.choice([-1, 1], (100, )).astype("float32") + y = x + sgn * np.random.uniform(0.1, 1, (100, )).astype("float32") + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMinOp_broadcast_0(TestElementwiseOp): + def setUp(self): + self.op_type = 
"elementwise_min" + x = np.random.uniform(0.5, 1, (100, 3, 2)).astype(np.float32) + sgn = np.random.choice([-1, 1], (100, )).astype(np.float32) + y = x[:, 0, 0] + sgn * \ + np.random.uniform(1, 2, (100, )).astype(np.float32) + self.inputs = {'X': x, 'Y': y} + + self.attrs = {'axis': 0} + self.outputs = { + 'Out': + np.minimum(self.inputs['X'], self.inputs['Y'].reshape(100, 1, 1)) + } + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMinOp_broadcast_1(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_min" + x = np.random.uniform(0.5, 1, (2, 100, 3)).astype(np.float32) + sgn = np.random.choice([-1, 1], (100, )).astype(np.float32) + y = x[0, :, 0] + sgn * \ + np.random.uniform(1, 2, (100, )).astype(np.float32) + self.inputs = {'X': x, 'Y': y} + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': + np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 100, 1)) + } + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMinOp_broadcast_2(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_min" + x = np.random.uniform(0.5, 1, (2, 3, 100)).astype(np.float32) + sgn = np.random.choice([-1, 1], (100, )).astype(np.float32) + y = x[0, 0, :] + sgn * \ + np.random.uniform(1, 2, (100, )).astype(np.float32) + self.inputs = {'X': x, 'Y': y} + + self.outputs = { + 'Out': + np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 100)) + } + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMinOp_broadcast_3(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_min" + x = np.random.uniform(0.5, 1, (2, 25, 4, 1)).astype(np.float32) + sgn = np.random.choice([-1, 1], (25, 4)).astype(np.float32) + y = x[0, :, :, 0] + sgn * \ + np.random.uniform(1, 2, (25, 4)).astype(np.float32) + self.inputs = {'X': x, 'Y': y} + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': + np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 25, 4, 1)) + } + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMinOp_broadcast_4(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_min" + x = np.random.uniform(0.5, 1, (2, 10, 2, 5)).astype(np.float32) + sgn = np.random.choice([-1, 1], (2, 10, 1, 5)).astype(np.float32) + y = x + sgn * \ + np.random.uniform(1, 2, (2, 10, 1, 5)).astype(np.float32) + self.inputs = {'X': x, 'Y': y} + + self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py index 3fa9c6d84e..39fd07cb7a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py @@ -19,58 +19,111 @@ from op_test import OpTest, skip_check_grad_ci import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard import paddle -from elementwise import TestXPUElementwiseOpBase +from op_test_xpu import XPUOpTest paddle.enable_static() @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestXPUElementwiseMulOp(OpTest, TestXPUElementwiseOpBase): +class ElementwiseMulOp(XPUOpTest): def init_kernel_type(self): self.use_mkldnn = False def setUp(self): - 
TestXPUElementwiseOpBase.setUp(self, "elementwise_mul") + self.use_xpu = True + self.op_type = "elementwise_mul" + self.dtype = np.float32 + self.axis = -1 + self.init_dtype() + self.init_input_output() self.init_kernel_type() self.init_axis() - self.attrs['axis'] = self.axis - self.attrs['use_mkldnn'] = self.use_mkldnn - self.grad_implemented = True - self.make_input() - self.make_output() - def make_output(self, x_shape=None, y_shape=None): - x, y = self.reshape_input(x_shape, y_shape) - self.outputs = {'Out': np.multiply(x, y)} + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.outputs = {'Out': self.out} + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + def test_check_grad_normal(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], + 'Out', + check_dygraph=(self.use_mkldnn == False)) + + def test_check_grad_ingore_x(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['Y'], + 'Out', + no_grad_set=set("X"), + check_dygraph=(self.use_mkldnn == False)) + + def test_check_grad_ingore_y(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], + 'Out', + no_grad_set=set('Y'), + check_dygraph=(self.use_mkldnn == False)) + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.multiply(self.x, self.y) + + def init_dtype(self): + pass + + def init_axis(self): + pass + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestXPUElementwiseMulOp_scalar(TestXPUElementwiseMulOp): +class TestElementwiseMulOp_scalar(ElementwiseMulOp): def setUp(self): - super(TestXPUElementwiseMulOp_scalar, self).setUp() - self.make_input((10, 3, 4), (1, )) - self.make_output() - self.grad_implemented = False + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 3, 4).astype(np.float32), + 'Y': np.random.rand(1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + self.init_kernel_type() @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestXPUElementwiseMulOp_Vector(TestXPUElementwiseMulOp): +class TestElementwiseMulOp_Vector(ElementwiseMulOp): def setUp(self): - super(TestXPUElementwiseMulOp_Vector, self).setUp() - self.make_input((100, ), (100, )) - self.make_output() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.random((100, )).astype("float32"), + 'Y': np.random.random((100, )).astype("float32") + } + self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])} + self.init_kernel_type() @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestXPUElementwiseMulOp_broadcast_0(TestXPUElementwiseMulOp): - def setUp(self): - super(TestXPUElementwiseMulOp_broadcast_0, self).setUp() - self.make_input((100, 2, 3), (100, )) - self.make_output(y_shape=(100, 1, 1)) - self.y_grad_implemented = False +class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp): + def init_input_output(self): + self.x = 
np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x * self.y.reshape(100, 1, 1) def init_axis(self): self.axis = 0 @@ -78,75 +131,140 @@ class TestXPUElementwiseMulOp_broadcast_0(TestXPUElementwiseMulOp): @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseMulOp_broadcast_1(TestXPUElementwiseMulOp): +class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 100, 3).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 100, 1) + } + self.init_kernel_type() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp): def setUp(self): - super(TestElementwiseMulOp_broadcast_1, self).setUp() - self.attrs['axis'] = 1 - self.y_grad_implemented = False - self.make_input((2, 100, 3), (100, )) - self.make_output(y_shape=(1, 100, 1)) + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 1, 100) + } + self.init_kernel_type() @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseMulOp_broadcast_2(TestXPUElementwiseMulOp): +class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): def setUp(self): - super(TestElementwiseMulOp_broadcast_2, self).setUp() - self.y_grad_implemented = False - self.make_input((2, 3, 100), (100, )) - self.make_output(y_shape=(1, 1, 100)) + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 10, 12, 3).astype(np.float32), + 'Y': np.random.rand(10, 12).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 10, 12, 1) + } + self.init_kernel_type() @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseMulOp_broadcast_3(TestXPUElementwiseMulOp): +class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp): def setUp(self): - super(TestElementwiseMulOp_broadcast_3, self).setUp() - self.attrs['axis'] = 1 - self.y_grad_implemented = False - self.make_input((2, 10, 12, 3), (10, 12)) - self.make_output(y_shape=(1, 10, 12, 1)) + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 2, 11).astype(np.float32), + 'Y': np.random.rand(10, 1, 11).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + self.init_kernel_type() @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseMulOp_broadcast_4(TestXPUElementwiseMulOp): +class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp): def setUp(self): - super(TestElementwiseMulOp_broadcast_4, self).setUp() - self.is_common_broadcast = True - self.make_input((10, 2, 11), (10, 1, 11)) - self.make_output() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 4, 2, 3).astype(np.float32), + 'Y': np.random.rand(10, 4, 1, 3).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + self.init_kernel_type() @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class 
TestElementwiseMulOp_broadcast_5(TestXPUElementwiseMulOp): +class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp): def setUp(self): - super(TestElementwiseMulOp_broadcast_5, self).setUp() - self.is_common_broadcast = True - self.make_input((10, 4, 2, 3), (10, 4, 1, 3)) - self.make_output() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(1, 1, 100).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + self.init_kernel_type() @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestXPUElementwiseMulOp_commonuse_1(TestXPUElementwiseMulOp): +class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp): def setUp(self): - super(TestXPUElementwiseMulOp_commonuse_1, self).setUp() - self.is_common_broadcast = True - self.make_input((2, 3, 100), (1, 1, 100)) - self.make_output() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(30, 3, 1, 5).astype(np.float32), + 'Y': np.random.rand(30, 1, 4, 1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + self.init_kernel_type() @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestXPUElementwiseMulOp_xsize_lessthan_ysize(TestXPUElementwiseMulOp): +class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp): def setUp(self): - super(TestXPUElementwiseMulOp_xsize_lessthan_ysize, self).setUp() - self.attrs['axis'] = 2 - self.is_x_size_less_than_y = True - self.make_input((10, 10), (2, 2, 10, 10)) - self.make_output(x_shape=(1, 1, 10, 10)) + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 10).astype(np.float32), + 'Y': np.random.rand(2, 2, 10, 10).astype(np.float32) + } + + self.attrs = {'axis': 2} + + self.outputs = { + 'Out': self.inputs['X'].reshape(1, 1, 10, 10) * self.inputs['Y'] + } + self.init_kernel_type() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMulOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # the input of elementwise_mul must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.XPUPlace(0)) + y1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.XPUPlace(0)) + self.assertRaises(TypeError, fluid.layers.elementwise_mul, x1, y1) + + # the input dtype of elementwise_mul must be float32 + x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="uint8") + y2 = fluid.layers.data(name='y2', shape=[3, 4, 5, 6], dtype="uint8") + self.assertRaises(TypeError, fluid.layers.elementwise_mul, x2, y2) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_pow_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_pow_op_xpu.py new file mode 100644 index 0000000000..cbad376119 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_pow_op_xpu.py @@ -0,0 +1,182 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("..")
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from op_test import OpTest, skip_check_grad_ci
+from op_test_xpu import XPUOpTest
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestElementwisePowOp(XPUOpTest):
+    def setUp(self):
+        self.op_type = "elementwise_pow"
+        self.inputs = {
+            'X': np.random.uniform(1, 2, [20, 5]).astype("float32"),
+            'Y': np.random.uniform(1, 2, [20, 5]).astype("float32")
+        }
+        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+    def test_check_grad_normal(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X', 'Y'], 'Out')
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestElementwisePowOp_big_shape_1(TestElementwisePowOp):
+    def setUp(self):
+        self.op_type = "elementwise_pow"
+        self.inputs = {
+            'X': np.random.uniform(1, 2, [10, 10]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [10, 10]).astype("float32")
+        }
+        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestElementwisePowOp_big_shape_2(TestElementwisePowOp):
+    def setUp(self):
+        self.op_type = "elementwise_pow"
+        self.inputs = {
+            'X': np.random.uniform(1, 2, [10, 10]).astype("float32"),
+            'Y': np.random.uniform(0.2, 2, [10, 10]).astype("float32")
+        }
+        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+@skip_check_grad_ci(
+    reason="[skip shape check] Use y_shape(1) to test broadcast.")
+class TestElementwisePowOp_scalar(TestElementwisePowOp):
+    def setUp(self):
+        self.op_type = "elementwise_pow"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [3, 3, 4]).astype(np.float32),
+            'Y': np.random.uniform(0.1, 1, [1]).astype(np.float32)
+        }
+        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestElementwisePowOp_tensor(TestElementwisePowOp):
+    def setUp(self):
+        self.op_type = "elementwise_pow"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [100]).astype("float32"),
+            'Y': np.random.uniform(1, 3, [100]).astype("float32")
+        }
+        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestElementwisePowOp_broadcast_0(TestElementwisePowOp):
+    def setUp(self):
+        self.op_type = "elementwise_pow"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 1, 100]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [100]).astype("float32")
+        }
+        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
+
+ +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwisePowOp_broadcast_1(TestElementwisePowOp): + def setUp(self): + self.op_type = "elementwise_pow" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 100, 1]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [100]).astype("float32") + } + self.attrs = {'axis': 1} + self.outputs = { + 'Out': np.power(self.inputs['X'], self.inputs['Y'].reshape(100, 1)) + } + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwisePowOp_broadcast_2(TestElementwisePowOp): + def setUp(self): + self.op_type = "elementwise_pow" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [100, 3, 1]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [100]).astype("float32") + } + self.attrs = {'axis': 0} + self.outputs = { + 'Out': + np.power(self.inputs['X'], self.inputs['Y'].reshape(100, 1, 1)) + } + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwisePowOp_broadcast_3(TestElementwisePowOp): + def setUp(self): + self.op_type = "elementwise_pow" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 20, 5, 1]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [20, 5]).astype("float32") + } + self.attrs = {'axis': 1} + self.outputs = { + 'Out': np.power(self.inputs['X'], self.inputs['Y'].reshape(1, 20, 5, + 1)) + } + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwisePowOp_broadcast_4(TestElementwisePowOp): + def setUp(self): + self.op_type = "elementwise_pow" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 10, 3, 5]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 10, 1, 5]).astype("float32") + } + self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwisePowOpInt(OpTest): + def setUp(self): + self.op_type = "elementwise_pow" + self.inputs = {'X': np.asarray([1, 3, 6]), 'Y': np.asarray([1, 1, 1])} + self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py index 22aa07be95..3bc9fa067a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py @@ -11,117 +11,198 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import unittest + import numpy as np import sys sys.path.append("..") -from op_test import OpTest, skip_check_grad_ci import paddle -from elementwise import TestXPUElementwiseOpBase +from op_test import OpTest, skip_check_grad_ci +from op_test_xpu import XPUOpTest +import unittest paddle.enable_static() @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestXPUElementwiseSubOp(OpTest, TestXPUElementwiseOpBase): +class TestElementwiseOp(OpTest): def setUp(self): - TestXPUElementwiseOpBase.setUp(self, "elementwise_sub") - self.make_input() - self.make_output() - self.grad_implemented = True + self.use_xpu = True + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float32") + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=1e-3) + + def test_check_grad_normal(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['Y'], + 'Out', + max_relative_error=0.005, + no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], + 'Out', + max_relative_error=0.005, + no_grad_set=set('Y')) - def make_output(self, x_shape=None, y_shape=None): - x, y = self.reshape_input(x_shape, y_shape) - self.outputs = {'Out': x - y} + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseSubOp_scalar(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(10, 3, 4).astype(np.float32), + 'Y': np.random.rand(1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseSubOp_scalar(TestXPUElementwiseSubOp): +class TestElementwiseSubOp_Vector(TestElementwiseOp): def setUp(self): - super(TestElementwiseSubOp_scalar, self).setUp() - self.grad_implemented = False - self.make_input((10, 3, 4), (1, )) - self.make_output() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.random((100, )).astype("float32"), + 'Y': np.random.random((100, )).astype("float32") + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseSubOp_Vector(TestXPUElementwiseSubOp): +class TestElementwiseSubOp_broadcast_0(TestElementwiseOp): def setUp(self): - super(TestElementwiseSubOp_Vector, self).setUp() - self.make_input((100, ), (100, )) - self.make_output() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(100, 3, 2).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + + self.attrs = {'axis': 0} + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(100, 1, 1) + } @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class 
TestElementwiseSubOp_broadcast_0(TestXPUElementwiseSubOp): +class TestElementwiseSubOp_broadcast_1(TestElementwiseOp): def setUp(self): - super(TestElementwiseSubOp_broadcast_0, self).setUp() - self.attrs['axis'] = 0 - self.make_input((100, 3, 2), (100, )) - self.make_output(y_shape=(100, 1, 1)) + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 100, 3).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 100, 1) + } @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseSubOp_broadcast_1(TestXPUElementwiseSubOp): +class TestElementwiseSubOp_broadcast_2(TestElementwiseOp): def setUp(self): - super(TestElementwiseSubOp_broadcast_1, self).setUp() - self.attrs['axis'] = 1 - self.make_input((2, 100, 3), (100, )) - self.make_output(y_shape=(1, 100, 1)) + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 1, 100) + } @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseSubOp_broadcast_2(TestXPUElementwiseSubOp): +class TestElementwiseSubOp_broadcast_3(TestElementwiseOp): def setUp(self): - super(TestElementwiseSubOp_broadcast_2, self).setUp() - self.make_input((2, 3, 100), (100, )) - self.make_output(y_shape=(1, 1, 100)) + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 10, 12, 3).astype(np.float32), + 'Y': np.random.rand(10, 12).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 10, 12, 1) + } @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseSubOp_broadcast_3(TestXPUElementwiseSubOp): +class TestElementwiseSubOp_broadcast_4(TestElementwiseOp): def setUp(self): - super(TestElementwiseSubOp_broadcast_3, self).setUp() - self.attrs['axis'] = 1 - self.make_input((2, 10, 12, 3), (10, 12)) - self.make_output(y_shape=(1, 10, 12, 1)) + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 5, 3, 12).astype(np.float32), + 'Y': np.random.rand(2, 5, 1, 12).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseSubOp_broadcast_4(TestXPUElementwiseSubOp): +class TestElementwiseSubOp_commonuse_1(TestElementwiseOp): def setUp(self): - super(TestElementwiseSubOp_broadcast_4, self).setUp() - self.is_common_broadcast = True - self.make_input((2, 5, 3, 12), (2, 5, 1, 12)) - self.make_output() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(1, 1, 100).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseSubOp_commonuse_1(TestXPUElementwiseSubOp): +class TestElementwiseSubOp_commonuse_2(TestElementwiseOp): def setUp(self): - super(TestElementwiseSubOp_commonuse_1, self).setUp() - self.is_common_broadcast = True - self.make_input((2, 3, 100), (1, 1, 100)) - self.make_output() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(10, 3, 1, 
4).astype(np.float32), + 'Y': np.random.rand(10, 1, 12, 1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestElementwiseSubOp_xsize_lessthan_ysize(TestXPUElementwiseSubOp): +class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseOp): def setUp(self): - super(TestElementwiseSubOp_xsize_lessthan_ysize, self).setUp() - self.attrs['axis'] = 2 - self.is_x_size_less_than_y = True - self.make_input((10, 12), (2, 3, 10, 12)) - self.make_output(x_shape=(1, 1, 10, 12)) + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(10, 12).astype(np.float32), + 'Y': np.random.rand(2, 3, 10, 12).astype(np.float32) + } + + self.attrs = {'axis': 2} + + self.outputs = { + 'Out': self.inputs['X'].reshape(1, 1, 10, 12) - self.inputs['Y'] + } if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py index 80e83e030f..5a8985315e 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py @@ -13,16 +13,15 @@ # limitations under the License. from __future__ import print_function +from test_softmax_op import stable_softmax +from op_test import OpTest +import paddle.fluid.core as core +import paddle import unittest import numpy as np import sys sys.path.append("..") -import paddle -import paddle.fluid.core as core - -from op_test import OpTest -from test_softmax_op import stable_softmax def cross_entropy(softmax, label, soft_label, axis, ignore_index=-1): @@ -54,10 +53,11 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): self.op_type = "softmax_with_cross_entropy" self.numeric_stable_mode = False self.soft_label = False - self.dtype = np.float64 + self.dtype = np.float32 self.axis = -1 self.ignore_index = -1 self.shape = [41, 37] + self.use_xpu = True def setUp(self): self.initParams() @@ -103,7 +103,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): paddle.enable_static() place = paddle.XPUPlace(0) self.check_grad_with_place( - place, ["Logits"], "Loss", max_relative_error=0.1) + place, ["Logits"], "Loss", max_relative_error=0.2) class TestXPUSoftmaxWithCrossEntropyOp(TestSoftmaxWithCrossEntropyOp): @@ -115,6 +115,7 @@ class TestXPUSoftmaxWithCrossEntropyOp(TestSoftmaxWithCrossEntropyOp): self.axis = -1 self.ignore_index = -1 self.dtype = np.float32 + self.use_xpu = True def test_check_output(self): if paddle.is_compiled_with_xpu(): @@ -127,7 +128,7 @@ class TestXPUSoftmaxWithCrossEntropyOp(TestSoftmaxWithCrossEntropyOp): paddle.enable_static() place = paddle.XPUPlace(0) self.check_grad_with_place( - place, ["Logits"], "Loss", max_relative_error=0.1) + place, ["Logits"], "Loss", max_relative_error=0.2) class TestXPUSoftmaxWithCrossEntropyOp2(TestXPUSoftmaxWithCrossEntropyOp): @@ -139,10 +140,11 @@ class TestXPUSoftmaxWithCrossEntropyOp2(TestXPUSoftmaxWithCrossEntropyOp): self.op_type = "softmax_with_cross_entropy" self.numeric_stable_mode = True self.soft_label = True - self.dtype = np.float64 + self.dtype = np.float32 self.axis = -1 self.ignore_index = -1 self.shape = [41, 37] + self.use_xpu = True def test_check_output(self): if paddle.is_compiled_with_xpu(): @@ -155,7 +157,7 @@ class TestXPUSoftmaxWithCrossEntropyOp2(TestXPUSoftmaxWithCrossEntropyOp): paddle.enable_static() place = 
paddle.XPUPlace(0) self.check_grad_with_place( - place, ["Logits"], "Loss", max_relative_error=0.1) + place, ["Logits"], "Loss", max_relative_error=0.2) class TestXPUSoftmaxWithCrossEntropyOp3(TestXPUSoftmaxWithCrossEntropyOp): @@ -170,55 +172,56 @@ class TestXPUSoftmaxWithCrossEntropyOp3(TestXPUSoftmaxWithCrossEntropyOp): self.shape = [41, 37] self.ignore_index = 5 self.axis = -1 - self.dtype = np.float64 - - -class TestXPUSoftmaxWithCrossEntropyOpAxis1(TestXPUSoftmaxWithCrossEntropyOp): - """ - Test softmax with cross entropy operator with discreate one-hot labels. - Given axis != -1 - """ - - def initParams(self): - self.op_type = "softmax_with_cross_entropy" - self.numeric_stable_mode = True - self.soft_label = False - self.dtype = np.float64 - self.axis = 0 - self.ignore_index = -1 - self.shape = [3, 5, 7, 11] - - -class TestXPUSoftmaxWithCrossEntropyOpAxis2(TestXPUSoftmaxWithCrossEntropyOp): - """ - Test softmax with cross entropy operator with discreate one-hot labels. - Given axis != -1 - """ - - def initParams(self): - self.op_type = "softmax_with_cross_entropy" - self.numeric_stable_mode = True - self.soft_label = False - self.dtype = np.float64 - self.axis = 1 - self.ignore_index = -1 - self.shape = [3, 5, 7, 11] - + self.dtype = np.float32 -class TestXPUSoftmaxWithCrossEntropyOpAxis3(TestXPUSoftmaxWithCrossEntropyOp): - """ - Test softmax with cross entropy operator with discreate one-hot labels. - Given axis != -1 - """ - def initParams(self): - self.op_type = "softmax_with_cross_entropy" - self.numeric_stable_mode = True - self.soft_label = False - self.dtype = np.float64 - self.axis = 2 - self.ignore_index = -1 - self.shape = [3, 5, 7, 11] +# xpu only support axis = rank -1 +# class TestXPUSoftmaxWithCrossEntropyOpAxis1(TestXPUSoftmaxWithCrossEntropyOp): +# """ +# Test softmax with cross entropy operator with discreate one-hot labels. +# Given axis != -1 +# """ + +# def initParams(self): +# self.op_type = "softmax_with_cross_entropy" +# self.numeric_stable_mode = True +# self.soft_label = False +# self.dtype = np.float32 +# self.axis = 0 +# self.ignore_index = -1 +# self.shape = [3, 5, 7, 11] + +# xpu only support axis = rank -1 +# class TestXPUSoftmaxWithCrossEntropyOpAxis2(TestXPUSoftmaxWithCrossEntropyOp): +# """ +# Test softmax with cross entropy operator with discreate one-hot labels. +# Given axis != -1 +# """ + +# def initParams(self): +# self.op_type = "softmax_with_cross_entropy" +# self.numeric_stable_mode = True +# self.soft_label = False +# self.dtype = np.float32 +# self.axis = 1 +# self.ignore_index = -1 +# self.shape = [3, 5, 7, 11] + +# xpu only support axis = rank -1 +# class TestXPUSoftmaxWithCrossEntropyOpAxis3(TestXPUSoftmaxWithCrossEntropyOp): +# """ +# Test softmax with cross entropy operator with discreate one-hot labels. 
+# Given axis != -1 +# """ + +# def initParams(self): +# self.op_type = "softmax_with_cross_entropy" +# self.numeric_stable_mode = True +# self.soft_label = False +# self.dtype = np.float32 +# self.axis = 2 +# self.ignore_index = -1 +# self.shape = [3, 5, 7, 11] class TestXPUSoftmaxWithCrossEntropyOpAxis4(TestXPUSoftmaxWithCrossEntropyOp): @@ -231,7 +234,7 @@ class TestXPUSoftmaxWithCrossEntropyOpAxis4(TestXPUSoftmaxWithCrossEntropyOp): self.op_type = "softmax_with_cross_entropy" self.numeric_stable_mode = True self.soft_label = False - self.dtype = np.float64 + self.dtype = np.float32 self.axis = 3 self.ignore_index = -1 self.shape = [3, 5, 7, 11] @@ -248,46 +251,47 @@ class TestXPUSoftmaxWithCrossEntropyOpAxisDimEqualOne( self.op_type = "softmax_with_cross_entropy" self.numeric_stable_mode = True self.soft_label = False - self.dtype = np.float64 + self.dtype = np.float32 self.axis = -1 self.ignore_index = -1 self.shape = [3, 5, 7, 1] -class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis1( - TestXPUSoftmaxWithCrossEntropyOp): - def initParams(self): - self.op_type = "softmax_with_cross_entropy" - self.numeric_stable_mode = True - self.soft_label = True - self.shape = [3, 5, 7, 11] - self.axis = 0 - self.ignore_index = -1 - self.dtype = np.float64 - - -class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis2( - TestXPUSoftmaxWithCrossEntropyOp2): - def initParams(self): - self.op_type = "softmax_with_cross_entropy" - self.numeric_stable_mode = True - self.soft_label = True - self.shape = [3, 5, 7, 11] - self.axis = 1 - self.ignore_index = -1 - self.dtype = np.float64 - - -class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis3( - TestXPUSoftmaxWithCrossEntropyOp2): - def initParams(self): - self.op_type = "softmax_with_cross_entropy" - self.numeric_stable_mode = True - self.soft_label = True - self.shape = [3, 5, 7, 11] - self.axis = 2 - self.ignore_index = -1 - self.dtype = np.float64 +# xpu only support axis = rank -1 +# class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis1( +# TestXPUSoftmaxWithCrossEntropyOp): +# def initParams(self): +# self.op_type = "softmax_with_cross_entropy" +# self.numeric_stable_mode = True +# self.soft_label = True +# self.shape = [3, 5, 7, 11] +# self.axis = 0 +# self.ignore_index = -1 +# self.dtype = np.float32 + +# xpu only support axis = rank -1 +# class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis2( +# TestXPUSoftmaxWithCrossEntropyOp2): +# def initParams(self): +# self.op_type = "softmax_with_cross_entropy" +# self.numeric_stable_mode = True +# self.soft_label = True +# self.shape = [3, 5, 7, 11] +# self.axis = 1 +# self.ignore_index = -1 +# self.dtype = np.float32 + +# xpu only support axis = rank -1 +# class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis3( +# TestXPUSoftmaxWithCrossEntropyOp2): +# def initParams(self): +# self.op_type = "softmax_with_cross_entropy" +# self.numeric_stable_mode = True +# self.soft_label = True +# self.shape = [3, 5, 7, 11] +# self.axis = 2 +# self.ignore_index = -1 +# self.dtype = np.float32 class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis4( @@ -299,43 +303,44 @@ class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis4( self.shape = [3, 5, 7, 11] self.axis = 3 self.ignore_index = -1 - self.dtype = np.float64 - - -class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis1( - TestXPUSoftmaxWithCrossEntropyOp3): - def initParams(self): - self.op_type = "softmax_with_cross_entropy" - self.numeric_stable_mode = True - self.soft_label = False - self.shape = [3, 5, 7, 11] - self.ignore_index = 1 - self.axis = 0 - self.dtype = 
np.float64 - - -class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis2( - TestXPUSoftmaxWithCrossEntropyOp3): - def initParams(self): - self.op_type = "softmax_with_cross_entropy" - self.numeric_stable_mode = True - self.soft_label = False - self.shape = [3, 5, 7, 11] - self.ignore_index = 0 - self.axis = 1 - self.dtype = np.float64 + self.dtype = np.float32 -class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis3( - TestXPUSoftmaxWithCrossEntropyOp3): - def initParams(self): - self.op_type = "softmax_with_cross_entropy" - self.numeric_stable_mode = True - self.soft_label = False - self.shape = [3, 5, 7, 11] - self.ignore_index = 3 - self.axis = 2 - self.dtype = np.float64 +# xpu only support axis = rank -1 +# class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis1( +# TestXPUSoftmaxWithCrossEntropyOp3): +# def initParams(self): +# self.op_type = "softmax_with_cross_entropy" +# self.numeric_stable_mode = True +# self.soft_label = False +# self.shape = [3, 5, 7, 11] +# self.ignore_index = 1 +# self.axis = 0 +# self.dtype = np.float32 + +# xpu only support axis = rank -1 +# class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis2( +# TestXPUSoftmaxWithCrossEntropyOp3): +# def initParams(self): +# self.op_type = "softmax_with_cross_entropy" +# self.numeric_stable_mode = True +# self.soft_label = False +# self.shape = [3, 5, 7, 11] +# self.ignore_index = 0 +# self.axis = 1 +# self.dtype = np.float32 + +# xpu only support axis = rank -1 +# class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis3( +# TestXPUSoftmaxWithCrossEntropyOp3): +# def initParams(self): +# self.op_type = "softmax_with_cross_entropy" +# self.numeric_stable_mode = True +# self.soft_label = False +# self.shape = [3, 5, 7, 11] +# self.ignore_index = 3 +# self.axis = 2 +# self.dtype = np.float32 class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis4( @@ -347,7 +352,7 @@ class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis4( self.shape = [3, 5, 7, 11] self.ignore_index = 3 self.axis = 3 - self.dtype = np.float64 + self.dtype = np.float32 class TestXPUSoftmaxWithCrossEntropyOpBoundary0( @@ -364,7 +369,7 @@ class TestXPUSoftmaxWithCrossEntropyOpBoundary0( self.shape = [3, 5, 7, 11] self.axis = -1 self.ignore_index = -1 - self.dtype = np.float64 + self.dtype = np.float32 self.logits = np.full(self.shape, -500.0).astype(self.dtype) @@ -382,7 +387,7 @@ class TestXPUSoftmaxWithCrossEntropyOpBoundary1( self.shape = [3, 5, 7, 11] self.axis = -1 self.ignore_index = -1 - self.dtype = np.float64 + self.dtype = np.float32 self.logits = np.full(self.shape, 1000.0).astype(self.dtype) self.logits[:, :, 0, :] = -1000.0 -- GitLab
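
Note on the broadcast_* cases added above: they all encode the same convention, namely that Y is lined up against X's dimensions starting at the axis attribute (axis == -1 meaning x.ndim - y.ndim) and padded with trailing singleton dimensions before the elementwise op is applied, which is why the expected outputs are written as, for example, np.divide(X, Y.reshape(100, 1, 1)) when axis=0. The sketch below is illustrative only and is not part of the patch; the helper name expected_elementwise is made up, and it only covers the usual case where X has at least as many dimensions as Y.

    import numpy as np

    def expected_elementwise(op, x, y, axis=-1):
        # Align y with x's dimensions starting at `axis`, then pad y with
        # trailing singleton dimensions so NumPy broadcasting matches the op.
        if axis == -1:
            axis = x.ndim - y.ndim
        shape = [1] * x.ndim
        shape[axis:axis + y.ndim] = y.shape
        return op(x, y.reshape(shape))

    # Mirrors TestElementwiseDivOp_broadcast_1: axis=1 -> Y.reshape(1, 100, 1)
    x = np.random.uniform(0.1, 1, (2, 100, 4)).astype("float32")
    y = np.random.uniform(0.1, 1, (100, )).astype("float32")
    out = expected_elementwise(np.divide, x, y, axis=1)
    assert out.shape == (2, 100, 4)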