add some ops to train ssd on kunlun (#36407)

* add some ops to train ssd on kunlun
* add some ops to train ssd on kunlun
* add some ops to train ssd on kunlun
* update cast op unittest
* update cast op unittest
* update cast op unittest
* update xpu cmake
* update cast unittest

add some ops to train ssd on kunlun (#36407)
* add some ops to train ssd on kunlun
* add some ops to train ssd on kunlun
* add some ops to train ssd on kunlun
* update cast op unittest
* update cast op unittest
* update cast op unittest
* update xpu cmake
* update cast unittest
50778ad6 · TTerror · GitHub · cdb9bfa3 · 50778ad6 · 50778ad6
12 changed files
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -35,7 +35,7 @@ ELSE ()
 ENDIF()
 SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210921")
+SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211020")
 SET(XPU_XRE_URL  "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)

--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -299,7 +299,7 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
    endif()
-    if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
+    if (WITH_XPU AND ${pybind_flag} EQUAL 0 AND ${xpu_cc_srcs_len} GREATER 0)
        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
    endif()

--- a/paddle/fluid/operators/cast_op_xpu.cc
+++ b/paddle/fluid/operators/cast_op_xpu.cc
@@ -23,6 +23,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
+using var_type = framework::proto::VarType;
+namespace plat = paddle::platform;
 template <typename DeviceContext, typename InT>
 class CastXPUKernel : public framework::OpKernel<InT> {
  using XPUInTDType = typename XPUTypeTrait<InT>::Type;
@@ -31,53 +34,49 @@ class CastXPUKernel : public framework::OpKernel<InT> {
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<framework::Tensor>("X");
    auto* out = context.Output<framework::Tensor>("Out");
-    auto in_type = static_cast<framework::proto::VarType::Type>(
+    auto in_type = static_cast<var_type::Type>(context.Attr<int>("in_dtype"));
-        context.Attr<int>("in_dtype"));
+    auto out_type = static_cast<var_type::Type>(context.Attr<int>("out_dtype"));
-    auto out_type = static_cast<framework::proto::VarType::Type>(
-        context.Attr<int>("out_dtype"));
    auto* in_data = in->data<InT>();
    auto numel = in->numel();
    auto& dev_ctx = context.template device_context<DeviceContext>();
    int r = -1;
-    if (out_type == framework::proto::VarType::FP32) {
+    switch (out_type) {
-      auto* out_data = out->mutable_data<float>(context.GetPlace());
+      case var_type::FP32:
-      r = xpu::cast_v2<XPUInTDType, float>(
+        r = xpu::cast_v2<XPUInTDType, float>(
-          dev_ctx.x_context(), reinterpret_cast<const XPUInTDType*>(in_data),
+            dev_ctx.x_context(), reinterpret_cast<const XPUInTDType*>(in_data),
-          out_data, numel);
+            out->mutable_data<float>(context.GetPlace()), numel);
-    } else if (out_type == framework::proto::VarType::INT32) {
+        break;
-      auto* out_data = out->mutable_data<int>(context.GetPlace());
+      case var_type::FP16:
-      r = xpu::cast_v2<XPUInTDType, int32_t>(
+        r = xpu::cast_v2<XPUInTDType, float16>(
-          dev_ctx.x_context(), reinterpret_cast<const XPUInTDType*>(in_data),
+            dev_ctx.x_context(), reinterpret_cast<const XPUInTDType*>(in_data),
-          out_data, numel);
+            reinterpret_cast<float16*>(
-    } else if (out_type == framework::proto::VarType::INT64) {
+                out->mutable_data<plat::float16>(context.GetPlace())),
-      auto* out_data = out->mutable_data<int64_t>(context.GetPlace());
+            numel);
-      r = xpu::cast_v2<XPUInTDType, int64_t>(
+        break;
-          dev_ctx.x_context(), reinterpret_cast<const XPUInTDType*>(in_data),
+      case var_type::INT64:
-          out_data, numel);
+        r = xpu::cast_v2<XPUInTDType, int64_t>(
-    } else if ((out_type == framework::proto::VarType::BOOL) &&
+            dev_ctx.x_context(), reinterpret_cast<const XPUInTDType*>(in_data),
-               (in_type == framework::proto::VarType::FP32)) {
+            out->mutable_data<int64_t>(context.GetPlace()), numel);
-      auto* out_data = out->mutable_data<bool>(context.GetPlace());
+        break;
-      r = xpu::cast_v2<float, int8_t>(
+      case var_type::INT32:
-          dev_ctx.x_context(), (const float*)in_data,
+        r = xpu::cast_v2<XPUInTDType, int32_t>(
-          reinterpret_cast<int8_t*>(out_data), numel);
+            dev_ctx.x_context(), reinterpret_cast<const XPUInTDType*>(in_data),
-    } else if (out_type == framework::proto::VarType::FP16) {
+            out->mutable_data<int>(context.GetPlace()), numel);
-      auto* out_data =
+        break;
-          out->mutable_data<paddle::platform::float16>(context.GetPlace());
+      case var_type::BOOL:
-      r = xpu::cast_v2<XPUInTDType, float16>(
+        r = xpu::cast_v2<XPUInTDType, bool>(
-          dev_ctx.x_context(), reinterpret_cast<const XPUInTDType*>(in_data),
+            dev_ctx.x_context(), reinterpret_cast<const XPUInTDType*>(in_data),
-          reinterpret_cast<float16*>(out_data), numel);
+            out->mutable_data<bool>(context.GetPlace()), numel);
+        break;
-    } else {
+      default:
-      PADDLE_THROW(platform::errors::Unavailable("Not supported cast %d -> %d",
+        PADDLE_THROW(platform::errors::Unavailable(
-                                                 in_type, out_type));
+            "Not supported cast %d -> %d", in_type, out_type));
    }
    PADDLE_ENFORCE_EQ(
        r, XPU_SUCCESS,
-        platform::errors::External(
+        platform::errors::External("XPU CAST API return wrong value[%d %s].", r,
-            "XPU API return wrong value[%d], please check whether "
+                                   XPUAPIErrorMsg[r]));
-            "Baidu Kunlun Card is properly installed.",
-            r));
  }
 };
@@ -90,5 +89,6 @@ REGISTER_OP_XPU_KERNEL(
    ops::CastXPUKernel<paddle::platform::XPUDeviceContext, float>,
    ops::CastXPUKernel<paddle::platform::XPUDeviceContext,
                       paddle::platform::float16>,
-    ops::CastXPUKernel<paddle::platform::XPUDeviceContext, int64_t>);
+    ops::CastXPUKernel<paddle::platform::XPUDeviceContext, int64_t>,
+    ops::CastXPUKernel<paddle::platform::XPUDeviceContext, bool>);
 #endif
--- a/paddle/fluid/operators/clip_op_xpu.cc
+++ b/paddle/fluid/operators/clip_op_xpu.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/operators/clip_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+// XPU kernel for the `clip` op: passes input "X", the resolved lower and
+// upper bounds, and output "Out" to xpu::clip_v2.
+//
+// Each bound defaults to the float attribute ("min" / "max"). When the
+// optional tensor input ("Min" / "Max") is fed, its first element is used
+// instead.
+template <typename DeviceContext, typename T>
+class ClipXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+    // Resolves one clip bound as T, so that both bounds share the same type
+    // and the same lookup logic.
+    auto read_bound = [&ctx](const char* input_name,
+                             const char* attr_name) -> T {
+      auto bound = static_cast<T>(ctx.Attr<float>(attr_name));
+      if (ctx.HasInput(input_name)) {
+        Tensor bound_cpu;
+        auto* bound_t = ctx.Input<Tensor>(input_name);
+        auto* bound_data = bound_t->data<T>();
+        // Element 0 is read by host code below, so a bound placed on the XPU
+        // is first copied into a CPU tensor.
+        if (platform::is_xpu_place(bound_t->place())) {
+          TensorCopySync(*bound_t, platform::CPUPlace(), &bound_cpu);
+          bound_data = bound_cpu.data<T>();
+        }
+        bound = bound_data[0];
+      }
+      return bound;
+    };
+    const T max = read_bound("Max", "max");
+    const T min = read_bound("Min", "min");
+    using XPUDataType = typename XPUTypeTrait<T>::Type;
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto x_data = reinterpret_cast<const XPUDataType*>(x->data<T>());
+    auto out_data = reinterpret_cast<XPUDataType*>(out->data<T>());
+    int r = xpu::clip_v2(dev_ctx.x_context(), x_data, out_data, x->numel(), min,
+                         max);
+    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
+                                          "XPU API(clip_v2) return wrong "
+                                          "value[%d %s]",
+                                          r, XPUAPIErrorMsg[r]));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_XPU_KERNEL(clip, ops::ClipXPUKernel<plat::XPUDeviceContext, float>);
+#endif
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
@@ -22,3 +22,9 @@ endif()
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n")
 file(APPEND ${pybind_file} "USE_OP(logical_and);\nUSE_OP(logical_or);\nUSE_OP(logical_xor);\nUSE_OP(logical_not);\n")
 file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n")
+if(WITH_XPU)
+  file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(equal, XPU);\nUSE_OP_DEVICE_KERNEL(not_equal, XPU);\n")
+  file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(less_than, XPU);\nUSE_OP_DEVICE_KERNEL(less_equal, XPU);\n")
+  file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(greater_than, XPU);\nUSE_OP_DEVICE_KERNEL(greater_equal, XPU);\n")
+endif()
--- a/paddle/fluid/operators/controlflow/compare_op_xpu.cc
+++ b/paddle/fluid/operators/controlflow/compare_op_xpu.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/operators/controlflow/compare_op.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+namespace paddle {
+namespace operators {
+template <typename T, typename XPUType>
+void XPUCompare(
+    const framework::ExecutionContext& ctx,
+    std::function<int(xpu::Context*, const XPUType*, const XPUType*, bool*,
+                      const std::vector<int>&, const std::vector<int>&)>
+        func) {
+  auto* x = ctx.Input<framework::Tensor>("X");
+  auto* y = ctx.Input<framework::Tensor>("Y");
+  auto* z = ctx.Output<framework::Tensor>("Out");
+  auto x_shape = framework::vectorize<int>(x->dims());
+  auto y_shape = framework::vectorize<int>(y->dims());
+  auto x_data = reinterpret_cast<const XPUType*>(x->data<T>());
+  auto y_data = reinterpret_cast<const XPUType*>(y->data<T>());
+  auto z_data = z->mutable_data<bool>(ctx.GetPlace());
+  auto& dev_ctx =
+      ctx.template device_context<paddle::platform::XPUDeviceContext>();
+  int ret = func(dev_ctx.x_context(), x_data, y_data, z_data, x_shape, y_shape);
+  PADDLE_ENFORCE_EQ(
+      ret, xpu::SUCCESS,
+      platform::errors::External(
+          "XPU kernel compare op occur error[%d %s] in XPUCompare.", ret,
+          XPUAPIErrorMsg[ret]));
+}
+template <typename DeviceContext, typename T>
+class EqualXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    XPUCompare<T, XPUType>(ctx, xpu::broadcast_equal<XPUType>);
+  }
+};
+template <typename DeviceContext, typename T>
+class NotEqualXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    XPUCompare<T, XPUType>(ctx, xpu::broadcast_not_equal<XPUType>);
+  }
+};
+template <typename DeviceContext, typename T>
+class LessThanXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    XPUCompare<T, XPUType>(ctx, xpu::broadcast_less_than<XPUType>);
+  }
+};
+template <typename DeviceContext, typename T>
+class LessEqualXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    XPUCompare<T, XPUType>(ctx, xpu::broadcast_less_equal<XPUType>);
+  }
+};
+template <typename DeviceContext, typename T>
+class GreaterThanXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    XPUCompare<T, XPUType>(ctx, xpu::broadcast_greater_than<XPUType>);
+  }
+};
+template <typename DeviceContext, typename T>
+class GreaterEqualXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    XPUCompare<T, XPUType>(ctx, xpu::broadcast_greater_equal<XPUType>);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_XPU_KERNEL(equal,
+                       ops::EqualXPUKernel<plat::XPUDeviceContext, float>,
+                       ops::EqualXPUKernel<plat::XPUDeviceContext, int>,
+                       ops::EqualXPUKernel<plat::XPUDeviceContext, int64_t>);
+REGISTER_OP_XPU_KERNEL(not_equal,
+                       ops::NotEqualXPUKernel<plat::XPUDeviceContext, float>,
+                       ops::NotEqualXPUKernel<plat::XPUDeviceContext, int>,
+                       ops::NotEqualXPUKernel<plat::XPUDeviceContext, int64_t>);
+REGISTER_OP_XPU_KERNEL(less_than,
+                       ops::LessThanXPUKernel<plat::XPUDeviceContext, float>,
+                       ops::LessThanXPUKernel<plat::XPUDeviceContext, int>,
+                       ops::LessThanXPUKernel<plat::XPUDeviceContext, int64_t>);
+REGISTER_OP_XPU_KERNEL(
+    less_equal, ops::LessEqualXPUKernel<plat::XPUDeviceContext, float>,
+    ops::LessEqualXPUKernel<plat::XPUDeviceContext, int>,
+    ops::LessEqualXPUKernel<plat::XPUDeviceContext, int64_t>);
+REGISTER_OP_XPU_KERNEL(
+    greater_than, ops::GreaterThanXPUKernel<plat::XPUDeviceContext, float>,
+    ops::GreaterThanXPUKernel<plat::XPUDeviceContext, int>,
+    ops::GreaterThanXPUKernel<plat::XPUDeviceContext, int64_t>);
+REGISTER_OP_XPU_KERNEL(
+    greater_equal, ops::GreaterEqualXPUKernel<plat::XPUDeviceContext, float>,
+    ops::GreaterEqualXPUKernel<plat::XPUDeviceContext, int>,
+    ops::GreaterEqualXPUKernel<plat::XPUDeviceContext, int64_t>);
+#endif
--- a/paddle/fluid/operators/stack_op_xpu.cc
+++ b/paddle/fluid/operators/stack_op_xpu.cc
@@ -66,5 +66,7 @@ namespace plat = paddle::platform;
 namespace ops = paddle::operators;
 REGISTER_OP_XPU_KERNEL(stack,
+                       ops::StackXPUKernel<plat::XPUDeviceContext, int64_t>,
+                       ops::StackXPUKernel<plat::XPUDeviceContext, int>,
                       ops::StackXPUKernel<plat::XPUDeviceContext, float>);
 #endif
--- a/paddle/fluid/platform/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/xpu/xpu2_op_list.h
@@ -119,6 +119,35 @@ XPUOpMap& get_kl2_ops() {
      {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                                   pOpKernelType(vartype::FP16, XPUPlace()),
                                   pOpKernelType(vartype::INT32, XPUPlace())})},
+      {"equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
+                              pOpKernelType(vartype::INT32, XPUPlace()),
+                              pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
+                                  pOpKernelType(vartype::INT32, XPUPlace()),
+                                  pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"less_than", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
+                                  pOpKernelType(vartype::INT32, XPUPlace()),
+                                  pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"less_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
+                                   pOpKernelType(vartype::INT32, XPUPlace()),
+                                   pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"greater_than",
+       XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
+                     pOpKernelType(vartype::INT32, XPUPlace()),
+                     pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"greater_equal",
+       XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
+                     pOpKernelType(vartype::INT32, XPUPlace()),
+                     pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                              pOpKernelType(vartype::INT64, XPUPlace()),
+                              pOpKernelType(vartype::INT32, XPUPlace())})},
+      {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                             pOpKernelType(vartype::FP16, XPUPlace()),
+                             pOpKernelType(vartype::BOOL, XPUPlace()),
+                             pOpKernelType(vartype::INT64, XPUPlace()),
+                             pOpKernelType(vartype::INT32, XPUPlace())})},
      {"fill_any_like",
       XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                     pOpKernelType(vartype::INT32, XPUPlace()),

--- a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py
@@ -16,71 +16,48 @@ from __future__ import print_function
 import sys
 sys.path.append("..")
-import op_test
 import unittest
+import op_test
 import numpy as np
 import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
+typeid_dict = {
-class TestCastOp1(op_test.OpTest):
+    'int32': int(core.VarDesc.VarType.INT32),
-    def setUp(self):
+    'int64': int(core.VarDesc.VarType.INT64),
-        ipt = np.random.random(size=[10, 10])
+    'float32': int(core.VarDesc.VarType.FP32),
-        self.inputs = {'X': ipt.astype('float32')}
+    'float16': int(core.VarDesc.VarType.FP16),
-        self.outputs = {'Out': ipt.astype('float32')}
+    'bool': int(core.VarDesc.VarType.BOOL),
-        self.attrs = {
+}
-            'in_dtype': int(core.VarDesc.VarType.FP32),
-            'out_dtype': int(core.VarDesc.VarType.FP32)
-        }
+def create_test_class(in_typename, out_typename):
-        self.op_type = 'cast'
+    class Cls(op_test.OpTest):
+        def setUp(self):
-    def test_check_output(self):
+            ipt = np.random.random(size=[10, 10])
-        if paddle.is_compiled_with_xpu():
+            self.inputs = {'X': ipt.astype(in_typename)}
-            place = paddle.XPUPlace(0)
+            self.outputs = {'Out': ipt.astype(in_typename).astype(out_typename)}
-            self.check_output_with_place(place)
+            self.attrs = {
+                'in_dtype': typeid_dict[in_typename],
-    def test_grad(self):
+                'out_dtype': typeid_dict[out_typename],
-        if paddle.is_compiled_with_xpu():
+            }
-            place = paddle.XPUPlace(0)
+            self.op_type = 'cast'
-            self.check_grad_with_place(place, ['X'], ['Out'])
+        def test_check_output(self):
+            if paddle.is_compiled_with_xpu():
-class TestCastOp2(op_test.OpTest):
+                place = paddle.XPUPlace(0)
-    def setUp(self):
+                self.check_output_with_place(place)
-        ipt = np.random.random(size=[10, 10])
-        self.inputs = {'X': ipt.astype('float32')}
+    cls_name = "cast_{0}_{1}".format(in_typename, out_typename)
-        self.outputs = {'Out': ipt.astype('float16')}
+    Cls.__name__ = cls_name
-        self.attrs = {
+    globals()[cls_name] = Cls
-            'in_dtype': int(core.VarDesc.VarType.FP32),
-            'out_dtype': int(core.VarDesc.VarType.FP16)
-        }
+for in_type in {'float16', 'float32', 'int32', 'int64', 'bool'}:
-        self.op_type = 'cast'
+    for out_type in {'float16', 'float32', 'int32', 'int64'}:
+        create_test_class(in_type, out_type)
-    def test_check_output(self):
-        #self.check_output(atol=1e-3)
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place, atol=1e-3)
-class TestCastOp3(op_test.OpTest):
-    def setUp(self):
-        ipt = np.random.random(size=[10, 10])
-        self.inputs = {'X': ipt.astype('float16')}
-        self.outputs = {'Out': ipt.astype('float32')}
-        self.attrs = {
-            'in_dtype': int(core.VarDesc.VarType.FP16),
-            'out_dtype': int(core.VarDesc.VarType.FP32)
-        }
-        self.op_type = 'cast'
-    def test_check_output(self):
-        #self.check_output(atol=1e-3)
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place, atol=1e-3)
 class TestCastOpError(unittest.TestCase):

--- a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import sys
+sys.path.append("..")
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from op_test_xpu import OpTest, XPUOpTest
+import paddle
+from paddle.fluid import Program, program_guard
+class TestClipOp(XPUOpTest):
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+        self.place = paddle.XPUPlace(0)
+    def setUp(self):
+        self.set_xpu()
+        self.max_relative_error = 0.006
+        self.inputs = {}
+        self.initTestCase()
+        self.op_type = "clip"
+        self.attrs = {}
+        self.attrs['min'] = self.min
+        self.attrs['max'] = self.max
+        if 'Min' in self.inputs:
+            min_v = self.inputs['Min']
+        else:
+            min_v = self.attrs['min']
+        if 'Max' in self.inputs:
+            max_v = self.inputs['Max']
+        else:
+            max_v = self.attrs['max']
+        input = np.random.random(self.shape).astype("float32")
+        input[np.abs(input - min_v) < self.max_relative_error] = 0.5
+        input[np.abs(input - max_v) < self.max_relative_error] = 0.5
+        self.inputs['X'] = input
+        self.outputs = {'Out': np.clip(self.inputs['X'], min_v, max_v)}
+    def test_check_output(self):
+        paddle.enable_static()
+        self.check_output_with_place(self.place)
+        paddle.disable_static()
+    def test_check_grad_normal(self):
+        paddle.enable_static()
+        self.check_grad_with_place(self.place, ['X'], 'Out')
+        paddle.disable_static()
+    def initTestCase(self):
+        self.shape = (4, 10, 10)
+        self.max = 0.8
+        self.min = 0.3
+        self.inputs['Max'] = np.array([0.8]).astype('float32')
+        self.inputs['Min'] = np.array([0.1]).astype('float32')
+class TestCase1(TestClipOp):
+    def initTestCase(self):
+        self.shape = (8, 16, 8)
+        self.max = 0.7
+        self.min = 0.0
+class TestCase2(TestClipOp):
+    def initTestCase(self):
+        self.shape = (8, 16)
+        self.max = 1.0
+        self.min = 0.0
+class TestCase3(TestClipOp):
+    def initTestCase(self):
+        self.shape = (4, 8, 16)
+        self.max = 0.7
+        self.min = 0.2
+class TestCase4(TestClipOp):
+    def initTestCase(self):
+        self.shape = (4, 8, 8)
+        self.max = 0.7
+        self.min = 0.2
+        self.inputs['Max'] = np.array([0.8]).astype('float32')
+        self.inputs['Min'] = np.array([0.3]).astype('float32')
+class TestCase5(TestClipOp):
+    def initTestCase(self):
+        self.shape = (4, 8, 16)
+        self.max = 0.5
+        self.min = 0.5
+class TestClipOpError(unittest.TestCase):
+    def test_errors(self):
+        paddle.enable_static()
+        with program_guard(Program(), Program()):
+            input_data = np.random.random((2, 4)).astype("float32")
+            def test_Variable():
+                fluid.layers.clip(x=input_data, min=-1.0, max=1.0)
+            self.assertRaises(TypeError, test_Variable)
+            def test_dtype():
+                x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32')
+                fluid.layers.clip(x=x2, min=-1.0, max=1.0)
+            self.assertRaises(TypeError, test_dtype)
+        paddle.disable_static()
+class TestClipAPI(unittest.TestCase):
+    def _executed_api(self, x, min=None, max=None):
+        return paddle.clip(x, min, max)
+    def test_clip(self):
+        paddle.enable_static()
+        data_shape = [1, 9, 9, 4]
+        data = np.random.random(data_shape).astype('float32')
+        images = fluid.data(name='image', shape=data_shape, dtype='float32')
+        min = fluid.data(name='min', shape=[1], dtype='float32')
+        max = fluid.data(name='max', shape=[1], dtype='float32')
+        place = fluid.XPUPlace(0) if fluid.core.is_compiled_with_xpu(
+        ) else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        out_1 = self._executed_api(images, min=min, max=max)
+        out_2 = self._executed_api(images, min=0.2, max=0.9)
+        out_3 = self._executed_api(images, min=0.3)
+        out_4 = self._executed_api(images, max=0.7)
+        out_5 = self._executed_api(images, min=min)
+        out_6 = self._executed_api(images, max=max)
+        out_7 = self._executed_api(images, max=-1.)
+        out_8 = self._executed_api(images)
+        res1, res2, res3, res4, res5, res6, res7, res8 = exe.run(
+            fluid.default_main_program(),
+            feed={
+                "image": data,
+                "min": np.array([0.2]).astype('float32'),
+                "max": np.array([0.8]).astype('float32')
+            },
+            fetch_list=[
+                out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
+            ])
+        self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8)))
+        self.assertTrue(np.allclose(res2, data.clip(0.2, 0.9)))
+        self.assertTrue(np.allclose(res3, data.clip(min=0.3)))
+        self.assertTrue(np.allclose(res4, data.clip(max=0.7)))
+        self.assertTrue(np.allclose(res5, data.clip(min=0.2)))
+        self.assertTrue(np.allclose(res6, data.clip(max=0.8)))
+        self.assertTrue(np.allclose(res7, data.clip(max=-1)))
+        self.assertTrue(np.allclose(res8, data))
+        paddle.disable_static()
+    def test_clip_dygraph(self):
+        paddle.disable_static()
+        place = fluid.XPUPlace(0) if fluid.core.is_compiled_with_xpu(
+        ) else fluid.CPUPlace()
+        paddle.disable_static(place)
+        data_shape = [1, 9, 9, 4]
+        data = np.random.random(data_shape).astype('float32')
+        images = paddle.to_tensor(data, dtype='float32')
+        v_min = paddle.to_tensor(np.array([0.2], dtype=np.float32))
+        v_max = paddle.to_tensor(np.array([0.8], dtype=np.float32))
+        out_1 = self._executed_api(images, min=0.2, max=0.8)
+        images = paddle.to_tensor(data, dtype='float32')
+        out_2 = self._executed_api(images, min=0.2, max=0.9)
+        images = paddle.to_tensor(data, dtype='float32')
+        out_3 = self._executed_api(images, min=v_min, max=v_max)
+        self.assertTrue(np.allclose(out_1.numpy(), data.clip(0.2, 0.8)))
+        self.assertTrue(np.allclose(out_2.numpy(), data.clip(0.2, 0.9)))
+        self.assertTrue(np.allclose(out_3.numpy(), data.clip(0.2, 0.8)))
+    def test_errors(self):
+        paddle.enable_static()
+        x1 = fluid.data(name='x1', shape=[1], dtype="int16")
+        x2 = fluid.data(name='x2', shape=[1], dtype="int8")
+        self.assertRaises(TypeError, paddle.clip, x=x1, min=0.2, max=0.8)
+        self.assertRaises(TypeError, paddle.clip, x=x2, min=0.2, max=0.8)
+        paddle.disable_static()
+class TestInplaceClipAPI(TestClipAPI):
+    def _executed_api(self, x, min=None, max=None):
+        return x.clip_(min, max)
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import sys
+sys.path.append("..")
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from op_test_xpu import OpTest, XPUOpTest
+import paddle
+from paddle.fluid import Program, program_guard
+def create_test_class(op_type, typename, callback):
+    """Create an operator-level test case for one compare op / dtype pair.
+    The generated class is renamed to "<op_type>_<typename>" and stored in
+    this module's globals() under that name.
+    Args:
+        op_type: operator name; also the attribute looked up on fluid.layers.
+        typename: numpy dtype name used for the random (10, 7) inputs.
+        callback: numpy reference; callback(a, b) yields the expected 'Out'.
+    """
+    class Cls(OpTest):
+        def setUp(self):
+            # Random operands of the requested dtype; the expected output is
+            # computed by the numpy reference callback.
+            a = np.random.random(size=(10, 7)).astype(typename)
+            b = np.random.random(size=(10, 7)).astype(typename)
+            c = callback(a, b)
+            self.inputs = {'X': a, 'Y': b}
+            self.outputs = {'Out': c}
+            self.op_type = op_type
+            self.use_xpu = True
+            self.attrs = {'use_xpu': True}
+        def test_check_output(self):
+            """Check the operator output on XPU device 0."""
+            paddle.enable_static()
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+        def test_errors(self):
+            """Invalid keyword values and mismatched dtypes must raise TypeError."""
+            paddle.enable_static()
+            with program_guard(Program(), Program()):
+                x = fluid.layers.data(name='x', shape=[2], dtype='int32')
+                y = fluid.layers.data(name='y', shape=[2], dtype='int32')
+                a = fluid.layers.data(name='a', shape=[2], dtype='int16')
+                if self.op_type == "less_than":
+                    self.assertRaises(
+                        TypeError,
+                        fluid.layers.less_than,
+                        x=x,
+                        y=y,
+                        force_cpu=1)
+                # Plain attribute lookup; no need to eval a formatted string.
+                op = getattr(fluid.layers, self.op_type)
+                self.assertRaises(TypeError, op, x=x, y=y, cond=1)
+                self.assertRaises(TypeError, op, x=x, y=a)
+                self.assertRaises(TypeError, op, x=a, y=y)
+    cls_name = "{0}_{1}".format(op_type, typename)
+    Cls.__name__ = cls_name
+    globals()[cls_name] = Cls
+# Register one operator-level test class per (compare op, dtype) pair.
+# Only float32, int32 and int64 are generated, so the former
+# "float64 on ROCm" fallback could never trigger and has been dropped.
+# A tuple (rather than a set) keeps the registration order deterministic.
+for _type_name in ('float32', 'int32', 'int64'):
+    create_test_class('less_than', _type_name, lambda _a, _b: _a < _b)
+    create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b)
+    create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b)
+    create_test_class('greater_equal', _type_name, lambda _a, _b: _a >= _b)
+    create_test_class('equal', _type_name, lambda _a, _b: _a == _b)
+    create_test_class('not_equal', _type_name, lambda _a, _b: _a != _b)
+def create_paddle_case(op_type, callback):
+    """Create API-level tests for `paddle.<op_type>` and register them.
+    The generated class is renamed to "TestCase_<op_type>" and stored in this
+    module's globals() under that name.
+    Args:
+        op_type: name of the compare function looked up on the paddle module.
+        callback: numpy reference; callback(x, y) yields the expected result.
+    """
+    class PaddleCls(unittest.TestCase):
+        def setUp(self):
+            self.op_type = op_type
+            self.input_x = np.array([1, 2, 3, 4]).astype(np.int64)
+            self.input_y = np.array([1, 3, 2, 4]).astype(np.int64)
+            self.real_result = callback(self.input_x, self.input_y)
+            # Run on XPU 0 when Paddle was built with XPU support, else on CPU.
+            self.place = fluid.XPUPlace(0) if fluid.core.is_compiled_with_xpu(
+            ) else fluid.CPUPlace()
+        def test_api(self):
+            """Static graph: two int64 tensors of the same shape."""
+            paddle.enable_static()
+            with program_guard(Program(), Program()):
+                x = fluid.data(name='x', shape=[4], dtype='int64')
+                y = fluid.data(name='y', shape=[4], dtype='int64')
+                op = getattr(paddle, self.op_type)
+                out = op(x, y)
+                exe = fluid.Executor(self.place)
+                res, = exe.run(feed={"x": self.input_x,
+                                     "y": self.input_y},
+                               fetch_list=[out])
+            self.assertTrue((res == self.real_result).all())
+        def test_api_float(self):
+            """Static graph, `equal` only: y is fed the Python float 1.0."""
+            if self.op_type == "equal":
+                paddle.enable_static()
+                with program_guard(Program(), Program()):
+                    x = fluid.data(name='x', shape=[4], dtype='int64')
+                    y = fluid.data(name='y', shape=[1], dtype='int64')
+                    op = getattr(paddle, self.op_type)
+                    out = op(x, y)
+                    exe = fluid.Executor(self.place)
+                    res, = exe.run(feed={"x": self.input_x,
+                                         "y": 1.0},
+                                   fetch_list=[out])
+                self.real_result = np.array([1, 0, 0, 0]).astype(np.int64)
+                self.assertTrue((res == self.real_result).all())
+        def test_dynamic_api(self):
+            """Dynamic graph: two tensors built from the setUp inputs."""
+            paddle.disable_static()
+            x = paddle.to_tensor(self.input_x)
+            y = paddle.to_tensor(self.input_y)
+            op = getattr(paddle, self.op_type)
+            out = op(x, y)
+            self.assertTrue((out.numpy() == self.real_result).all())
+            paddle.enable_static()
+        def test_dynamic_api_int(self):
+            """Dynamic graph, `equal` only: tensor compared with the int 1."""
+            if self.op_type == "equal":
+                paddle.disable_static()
+                x = paddle.to_tensor(self.input_x)
+                op = getattr(paddle, self.op_type)
+                out = op(x, 1)
+                self.real_result = np.array([1, 0, 0, 0]).astype(np.int64)
+                self.assertTrue((out.numpy() == self.real_result).all())
+                paddle.enable_static()
+        def test_dynamic_api_float(self):
+            """Dynamic graph, `equal` only: tensor compared with the float 1.0."""
+            if self.op_type == "equal":
+                paddle.disable_static()
+                x = paddle.to_tensor(self.input_x)
+                op = getattr(paddle, self.op_type)
+                out = op(x, 1.0)
+                self.real_result = np.array([1, 0, 0, 0]).astype(np.int64)
+                self.assertTrue((out.numpy() == self.real_result).all())
+                paddle.enable_static()
+        def test_assert(self):
+            """Dynamic graph, `equal` only: a str operand must raise TypeError."""
+            # The other scalar-operand tests in this class also cover `equal` only.
+            if self.op_type != "equal":
+                return
+            paddle.disable_static()
+            try:
+                x = paddle.to_tensor(self.input_x)
+                op = getattr(paddle, self.op_type)
+                # Hand the op and its arguments straight to assertRaises so the
+                # TypeError has to come from the op itself. The previous helper
+                # declared a `self` parameter but was invoked without arguments,
+                # so the expected TypeError came from the call, not from the op.
+                self.assertRaises(TypeError, op, x, "1.0")
+            finally:
+                # Restore static mode even when the assertion fails.
+                paddle.enable_static()
+        def test_dynamic_api_bool(self):
+            """Dynamic graph, `equal` only: tensor compared with the bool True."""
+            if self.op_type == "equal":
+                paddle.disable_static()
+                x = paddle.to_tensor(self.input_x)
+                op = getattr(paddle, self.op_type)
+                out = op(x, True)
+                self.real_result = np.array([1, 0, 0, 0]).astype(np.int64)
+                self.assertTrue((out.numpy() == self.real_result).all())
+                paddle.enable_static()
+        def test_broadcast_api_1(self):
+            """Static graph: x of shape [1, 2, 1, 3] against y of shape [1, 2, 3]."""
+            paddle.enable_static()
+            with program_guard(Program(), Program()):
+                x = paddle.static.data(
+                    name='x', shape=[1, 2, 1, 3], dtype='int32')
+                y = paddle.static.data(name='y', shape=[1, 2, 3], dtype='int32')
+                op = getattr(paddle, self.op_type)
+                out = op(x, y)
+                exe = paddle.static.Executor(self.place)
+                input_x = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32)
+                input_y = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32)
+                real_result = callback(input_x, input_y)
+                res, = exe.run(feed={"x": input_x,
+                                     "y": input_y},
+                               fetch_list=[out])
+            self.assertTrue((res == real_result).all())
+        def test_broadcast_api_2(self):
+            """Static graph: x of shape [1, 2, 3] against y of shape [1, 2, 1, 3]."""
+            paddle.enable_static()
+            with program_guard(Program(), Program()):
+                x = paddle.static.data(name='x', shape=[1, 2, 3], dtype='int32')
+                y = paddle.static.data(
+                    name='y', shape=[1, 2, 1, 3], dtype='int32')
+                op = getattr(paddle, self.op_type)
+                out = op(x, y)
+                exe = paddle.static.Executor(self.place)
+                input_x = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32)
+                input_y = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32)
+                real_result = callback(input_x, input_y)
+                res, = exe.run(feed={"x": input_x,
+                                     "y": input_y},
+                               fetch_list=[out])
+            self.assertTrue((res == real_result).all())
+        def test_broadcast_api_3(self):
+            """Static graph: x of shape [5] against y of shape [3, 1]."""
+            paddle.enable_static()
+            with program_guard(Program(), Program()):
+                x = paddle.static.data(name='x', shape=[5], dtype='int32')
+                y = paddle.static.data(name='y', shape=[3, 1], dtype='int32')
+                op = getattr(paddle, self.op_type)
+                out = op(x, y)
+                exe = paddle.static.Executor(self.place)
+                input_x = np.arange(0, 5).reshape((5)).astype(np.int32)
+                input_y = np.array([5, 3, 2]).reshape((3, 1)).astype(np.int32)
+                real_result = callback(input_x, input_y)
+                res, = exe.run(feed={"x": input_x,
+                                     "y": input_y},
+                               fetch_list=[out])
+            self.assertTrue((res == real_result).all())
+        def test_bool_api_4(self):
+            """Static graph: bool operands declared with identical shapes."""
+            paddle.enable_static()
+            with program_guard(Program(), Program()):
+                x = paddle.static.data(name='x', shape=[3, 1], dtype='bool')
+                y = paddle.static.data(name='y', shape=[3, 1], dtype='bool')
+                op = getattr(paddle, self.op_type)
+                out = op(x, y)
+                exe = paddle.static.Executor(self.place)
+                # np.bool_ is the numpy scalar type; the bare `np.bool` alias
+                # is no longer available in current NumPy releases.
+                input_x = np.array([True, False, True]).astype(np.bool_)
+                input_y = np.array([True, True, False]).astype(np.bool_)
+                real_result = callback(input_x, input_y)
+                res, = exe.run(feed={"x": input_x,
+                                     "y": input_y},
+                               fetch_list=[out])
+            self.assertTrue((res == real_result).all())
+        def test_bool_broadcast_api_4(self):
+            """Static graph: bool x declared [3, 1] against bool y declared [1]."""
+            paddle.enable_static()
+            with program_guard(Program(), Program()):
+                x = paddle.static.data(name='x', shape=[3, 1], dtype='bool')
+                y = paddle.static.data(name='y', shape=[1], dtype='bool')
+                op = getattr(paddle, self.op_type)
+                out = op(x, y)
+                exe = paddle.static.Executor(self.place)
+                input_x = np.array([True, False, True]).astype(np.bool_)
+                input_y = np.array([True]).astype(np.bool_)
+                real_result = callback(input_x, input_y)
+                res, = exe.run(feed={"x": input_x,
+                                     "y": input_y},
+                               fetch_list=[out])
+            self.assertTrue((res == real_result).all())
+        def test_attr_name(self):
+            """The `name` argument must show up in the output variable's name."""
+            paddle.enable_static()
+            with program_guard(Program(), Program()):
+                x = fluid.layers.data(name='x', shape=[4], dtype='int32')
+                y = fluid.layers.data(name='y', shape=[4], dtype='int32')
+                op = getattr(paddle, self.op_type)
+                out = op(x=x, y=y, name="name_%s" % (self.op_type))
+            self.assertIn("name_%s" % (self.op_type), out.name)
+    cls_name = "TestCase_{}".format(op_type)
+    PaddleCls.__name__ = cls_name
+    globals()[cls_name] = PaddleCls
+# Register the API-level test class for every compare op, pairing each op
+# name with the numpy expression that serves as its reference result.
+for _op_name, _np_reference in (
+        ('less_than', lambda _a, _b: _a < _b),
+        ('less_equal', lambda _a, _b: _a <= _b),
+        ('greater_than', lambda _a, _b: _a > _b),
+        ('greater_equal', lambda _a, _b: _a >= _b),
+        ('equal', lambda _a, _b: _a == _b),
+        ('not_equal', lambda _a, _b: _a != _b), ):
+    create_paddle_case(_op_name, _np_reference)
+# Run every test class registered above when the file is executed as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py
@@ -97,5 +97,27 @@ class TestStackOp6(TestStackOpBase):
        self.axis = 3
+class TestStackOpint64(TestStackOpBase):
+    """Stack test configuration that sets `dtype` to 'int64'.
+    NOTE(review): how TestStackOpBase consumes these attributes, and the
+    order in which it calls the two hooks, is not visible here - confirm.
+    """
+    def initParameters(self):
+        # Override: this case sets num_inputs to 16.
+        self.num_inputs = 16
+    def initDefaultParameters(self):
+        # Defaults for this case; the assignments are independent of each other.
+        self.dtype = 'int64'
+        self.axis = 0
+        self.input_dim = (5, 6, 7)
+        self.num_inputs = 4
+class TestStackOpint(TestStackOpBase):
+    """Stack test configuration that sets `dtype` to 32-bit integers.
+    NOTE(review): how TestStackOpBase consumes these attributes is not
+    visible here - confirm that 'int32' is accepted by the base class.
+    """
+    def initDefaultParameters(self):
+        self.num_inputs = 4
+        self.input_dim = (5, 6, 7)
+        self.axis = 0
+        # Spell the width out: numpy's bare 'int' is platform dependent and is
+        # 64-bit on Linux, which would merely repeat TestStackOpint64.
+        self.dtype = 'int32'
+    def initParameters(self):
+        # Override: this case sets num_inputs to 16.
+        self.num_inputs = 16
 if __name__ == '__main__':
    unittest.main()