Unverified commit ceec1e21, authored by zhangyk0314, committed via GitHub

Add exp, abs_grad, reciprocal and reciprocal_grad operators for XPU and update xpu2_op_list.h, test=kunlun (#38570)
Parent commit: 1fa6900e
@@ -36,7 +36,7 @@ ENDIF()
 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211226")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211228")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
...
@@ -98,29 +98,29 @@ void xpu_activation_backward(
 }

 template <typename T>
-struct XPUReluFunctor : public BaseActivationFunctor<T> {
+struct XPUAbsFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::relu<XPUType>);
+        ctx, xpu::abs<XPUType>);
   }
 };

 template <typename T>
-struct XPUSigmoidFunctor : public BaseActivationFunctor<T> {
+struct XPUAbsGradFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::sigmoid<XPUType>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::abs_grad<XPUType>);
   }
 };

 template <typename T>
-struct XPUTanhFunctor : public BaseActivationFunctor<T> {
+struct XPUExpFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::tanh<XPUType>);
+        ctx, xpu::exp<XPUType>);
   }
 };
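Each of these functors delegates to the shared xpu_activation_forward / xpu_activation_backward helpers, whose bodies lie outside this diff. For orientation only, the forward helper presumably looks something like the sketch below: it resolves the X and Out tensors from the execution context, reinterprets the buffers as the XPU element type, and hands them to whichever XPU API routine was passed in. The exact signature and error message are assumptions, not part of this change.

// Sketch only -- reconstructed from the call sites above, not from this diff.
template <typename DeviceContext, typename T, typename XPUT>
void xpu_activation_forward(
    const framework::ExecutionContext &ctx,
    std::function<int(xpu::Context *, const XPUT *, XPUT *, int)> func) {
  const auto *x = ctx.Input<Tensor>("X");
  auto *y = ctx.Output<Tensor>("Out");
  const XPUT *x_data = reinterpret_cast<const XPUT *>(x->data<T>());
  XPUT *y_data = reinterpret_cast<XPUT *>(y->mutable_data<T>(ctx.GetPlace()));

  // Hand the raw buffers to the chosen XPU routine, e.g. xpu::abs<XPUT>.
  auto xpu_context = ctx.device_context<DeviceContext>().x_context();
  int r = func(xpu_context, x_data, y_data, x->numel());
  PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
                    platform::errors::External(
                        "XPU activation op returned wrong value[%d %s].", r,
                        XPUAPIErrorMsg[r]));
}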
@@ -134,119 +134,83 @@ struct XPULogFunctor : public BaseActivationFunctor<T> {
 };

 template <typename T>
-struct XPUSquareFunctor : public BaseActivationFunctor<T> {
+struct XPUReciprocalFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::square<XPUType>);
+        ctx, xpu::reciprocal<XPUType>);
   }
 };

 template <typename T>
-struct XPUSqrtFunctor : public BaseActivationFunctor<T> {
+struct XPUReciprocalGradFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::sqrt<XPUType>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::reciprocal_grad<XPUType>);
   }
 };

 template <typename T>
-struct XPUAbsFunctor : public BaseActivationFunctor<T> {
+struct XPUReluFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::abs<XPUType>);
+        ctx, xpu::relu<XPUType>);
   }
 };

 template <typename T>
-struct XPUPowFunctor : public BaseActivationFunctor<T> {
+struct XPUReluGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    const auto *x = ctx.Input<Tensor>("X");
-    auto *y = ctx.Output<Tensor>("Out");
-    auto pow_factor = ctx.Attr<float>("factor");
-    const T *x_data = x->data<T>();
-    T *y_data = y->mutable_data<T>(ctx.GetPlace());
-    T *factor_data = nullptr;
-    auto xpu_context =
-        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
-    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void **>(&factor_data),
-                                 x->numel() * sizeof(T)),
-                      XPU_SUCCESS, platform::errors::ResourceExhausted(
-                                       "XPU has no enough memory"));
-    int r = xpu::constant<T>(xpu_context, factor_data, x->numel(), pow_factor);
-    PADDLE_ENFORCE_EQ(
-        r, xpu::Error_t::SUCCESS,
-        platform::errors::External("XPU constant op return"
-                                   " wrong value[%d %s] in pow op.",
-                                   r, XPUAPIErrorMsg[r]));
-    r = xpu::pow(xpu_context, x_data, factor_data, y_data, x->numel());
-    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                      platform::errors::External("XPU pow op return"
-                                                 " wrong value[%d %s].",
-                                                 r, XPUAPIErrorMsg[r]));
-    if (xpu_context->xpu_stream != nullptr) {
-      xpu_wait(xpu_context->xpu_stream);
-    }
-    xpu_free(factor_data);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::relu_grad<XPUType>);
   }
 };

 template <typename T>
-struct XPUHardSwishFunctor : public BaseActivationFunctor<T> {
+struct XPUSigmoidFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    float threshold = ctx.Attr<float>("threshold");
-    float scale = ctx.Attr<float>("scale");
-    float offset = ctx.Attr<float>("offset");
-    PADDLE_ENFORCE_EQ(threshold, 6.0f,
-                      platform::errors::External(
-                          "Not support threshold [%f] in XPU", threshold));
-    PADDLE_ENFORCE_EQ(scale, 6.0f, platform::errors::External(
-                                       "Not support scale [%f] in XPU", scale));
-    PADDLE_ENFORCE_EQ(
-        offset, 3.0f,
-        platform::errors::External("Not support offset [%f] in XPU", offset));
     xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::hard_swish<XPUType>);
+        ctx, xpu::sigmoid<XPUType>);
   }
 };

 template <typename T>
-struct XPUReluGradFunctor : public BaseActivationFunctor<T> {
+struct XPUSigmoidGradFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::relu_grad<XPUType>);
+        ctx, xpu::sigmoid_grad<XPUType>);
   }
 };

 template <typename T>
-struct XPUTanhGradFunctor : public BaseActivationFunctor<T> {
+struct XPUSqrtFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::tanh_grad<XPUType>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::sqrt<XPUType>);
   }
 };

 template <typename T>
-struct XPUSigmoidGradFunctor : public BaseActivationFunctor<T> {
+struct XPUSqrtGradFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::sigmoid_grad<XPUType>);
+        ctx, xpu::sqrt_grad<XPUType>);
   }
 };

 template <typename T>
-struct XPUSqrtGradFunctor : public BaseActivationFunctor<T> {
+struct XPUSquareFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::sqrt_grad<XPUType>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::square<XPUType>);
   }
 };
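The *_grad functors above go through xpu_activation_backward, whose definition is likewise not part of this diff (only its signature fragment is visible in a hunk header). A plausible sketch, with the parameter order and error handling assumed rather than taken from the source:

// Sketch only -- the real helper is defined earlier in this source file.
template <typename DeviceContext, typename T, typename XPUT>
void xpu_activation_backward(
    const framework::ExecutionContext &ctx,
    std::function<int(xpu::Context *, const XPUT *, const XPUT *,
                      const XPUT *, XPUT *, int)>
        func) {
  // Grad kernels consume X, Out and dOut and produce dX.
  const auto *x = ctx.Input<Tensor>("X");
  const auto *y = ctx.Input<Tensor>("Out");
  const auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
  auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
  const XPUT *x_data = reinterpret_cast<const XPUT *>(x->data<T>());
  const XPUT *y_data = reinterpret_cast<const XPUT *>(y->data<T>());
  const XPUT *dout_data = reinterpret_cast<const XPUT *>(dOut->data<T>());
  XPUT *dx_data =
      reinterpret_cast<XPUT *>(dX->mutable_data<T>(ctx.GetPlace()));

  auto xpu_context = ctx.device_context<DeviceContext>().x_context();
  int r = func(xpu_context, x_data, y_data, dout_data, dx_data, dX->numel());
  PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
                    platform::errors::External(
                        "XPU activation grad op returned wrong value[%d %s].",
                        r, XPUAPIErrorMsg[r]));
}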
@@ -259,6 +223,44 @@ struct XPUSquareGradFunctor : public BaseActivationFunctor<T> {
   }
 };

+template <typename T>
+struct XPUTanhFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  void operator()(const framework::ExecutionContext &ctx) const {
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::tanh<XPUType>);
+  }
+};
+
+template <typename T>
+struct XPUTanhGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  void operator()(const framework::ExecutionContext &ctx) const {
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::tanh_grad<XPUType>);
+  }
+};
+
+template <typename T>
+struct XPUHardSwishFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  void operator()(const framework::ExecutionContext &ctx) const {
+    float threshold = ctx.Attr<float>("threshold");
+    float scale = ctx.Attr<float>("scale");
+    float offset = ctx.Attr<float>("offset");
+    PADDLE_ENFORCE_EQ(threshold, 6.0f,
+                      platform::errors::External(
+                          "Not support threshold [%f] in XPU", threshold));
+    PADDLE_ENFORCE_EQ(scale, 6.0f, platform::errors::External(
+                                       "Not support scale [%f] in XPU", scale));
+    PADDLE_ENFORCE_EQ(
+        offset, 3.0f,
+        platform::errors::External("Not support offset [%f] in XPU", offset));
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::hard_swish<XPUType>);
+  }
+};
+
 template <typename T>
 struct XPUHardSwishGradFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;

@@ -328,6 +330,40 @@ struct XPULeakyReluGradFunctor : public BaseActivationFunctor<T> {
   }
 };

+template <typename T>
+struct XPUPowFunctor : public BaseActivationFunctor<T> {
+  void operator()(const framework::ExecutionContext &ctx) const {
+    const auto *x = ctx.Input<Tensor>("X");
+    auto *y = ctx.Output<Tensor>("Out");
+    auto pow_factor = ctx.Attr<float>("factor");
+    const T *x_data = x->data<T>();
+    T *y_data = y->mutable_data<T>(ctx.GetPlace());
+    T *factor_data = nullptr;
+    auto xpu_context =
+        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
+    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void **>(&factor_data),
+                                 x->numel() * sizeof(T)),
+                      XPU_SUCCESS, platform::errors::ResourceExhausted(
+                                       "XPU has no enough memory"));
+    int r = xpu::constant<T>(xpu_context, factor_data, x->numel(), pow_factor);
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External("XPU constant op return"
+                                   " wrong value[%d %s] in pow op.",
+                                   r, XPUAPIErrorMsg[r]));
+    r = xpu::pow(xpu_context, x_data, factor_data, y_data, x->numel());
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External("XPU pow op return wrong value[%d %s].", r,
+                                   XPUAPIErrorMsg[r]));
+    if (xpu_context->xpu_stream != nullptr) {
+      xpu_wait(xpu_context->xpu_stream);
+    }
+    xpu_free(factor_data);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
@@ -340,15 +376,18 @@ namespace ops = paddle::operators;
       act_type##_grad,                                                   \
       ops::XPUActivationGradKernel<ops::grad_functor<float>>);

+REGISTER_ACTIVATION_XPU_KERNEL(abs, XPUAbsFunctor, XPUAbsGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, XPUHardSwishFunctor,
+                               XPUHardSwishGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, XPULeakyReluFunctor,
+                               XPULeakyReluGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, XPUReciprocalFunctor,
+                               XPUReciprocalGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor,
                                XPUSigmoidGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor)
-REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, XPUHardSwishFunctor,
-                               XPUHardSwishGradFunctor)
-REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, XPULeakyReluFunctor,
-                               XPULeakyReluGradFunctor)

 REGISTER_OP_XPU_KERNEL(
     tanh, ops::XPUActivationKernel<ops::XPUTanhFunctor<float>>,
@@ -358,11 +397,11 @@ REGISTER_OP_XPU_KERNEL(
     ops::XPUActivationGradKernel<
         ops::XPUTanhGradFunctor<paddle::platform::float16>>);

+REGISTER_OP_XPU_KERNEL(exp,
+                       ops::XPUActivationKernel<ops::XPUExpFunctor<float>>);
 REGISTER_OP_XPU_KERNEL(log,
                        ops::XPUActivationKernel<ops::XPULogFunctor<float>>);
 REGISTER_OP_XPU_KERNEL(pow,
                        ops::XPUActivationKernel<ops::XPUPowFunctor<float>>);
-REGISTER_OP_XPU_KERNEL(abs,
-                       ops::XPUActivationKernel<ops::XPUAbsFunctor<float>>);

 #endif  // PADDLE_WITH_XPU
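The REGISTER_ACTIVATION_XPU_KERNEL helper used above is only partially visible in this hunk (its trailing lines appear as context). Judging from that tail and from the explicit REGISTER_OP_XPU_KERNEL calls, it plausibly pairs one forward and one backward registration per activation, roughly as sketched below; treat the exact body as an assumption rather than the real macro.

// Hypothetical expansion -- the real macro body is not shown in this diff.
#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, functor, grad_functor)  \
  REGISTER_OP_XPU_KERNEL(act_type,                                       \
                         ops::XPUActivationKernel<ops::functor<float>>); \
  REGISTER_OP_XPU_KERNEL(                                                \
      act_type##_grad,                                                   \
      ops::XPUActivationGradKernel<ops::grad_functor<float>>);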
@@ -29,6 +29,9 @@ using XPUOpMap = std::unordered_map<std::string, XPUKernelSet>;
 XPUOpMap& get_kl2_ops() {
   // KL1支持的op,通过op_name, data_type, place来索引
   static XPUOpMap s_xpu2_kernels{
+      {"abs", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"abs_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                                 pOpKernelType(vartype::FP16, XPUPlace())})},
       {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
@@ -106,6 +109,7 @@ XPUOpMap& get_kl2_ops() {
       {"equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                               pOpKernelType(vartype::INT32, XPUPlace()),
                               pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"expand_as_v2",
        XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
                      pOpKernelType(vartype::INT64, XPUPlace()),
@@ -185,6 +189,9 @@ XPUOpMap& get_kl2_ops() {
        XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                      pOpKernelType(vartype::INT32, XPUPlace()),
                      pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"hard_swish_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
       {"iou_similarity",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"label_smooth",
@@ -227,6 +234,10 @@ XPUOpMap& get_kl2_ops() {
       {"momentum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                             pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"nearest_interp_v2",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"nearest_interp_v2_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                                   pOpKernelType(vartype::INT32, XPUPlace()),
                                   pOpKernelType(vartype::FP32, XPUPlace())})},
@@ -239,6 +250,10 @@ XPUOpMap& get_kl2_ops() {
       {"prior_box", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                               pOpKernelType(vartype::INT64, XPUPlace())})},
+      {"reciprocal", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"reciprocal_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
       {"reduce_max_grad",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
@@ -273,6 +288,9 @@ XPUOpMap& get_kl2_ops() {
                      pOpKernelType(vartype::FP32, XPUPlace())})},
       {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                               pOpKernelType(vartype::INT64, XPUPlace())})},
+      {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"sigmoid_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                                    pOpKernelType(vartype::FP16, XPUPlace()),
                                    pOpKernelType(vartype::INT32, XPUPlace())})},
...
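Every entry in xpu2_op_list.h follows the same shape: the op name maps to an XPUKernelSet listing the (data type, place) pairs the XPU2 kernel accepts. For instance, a hypothetical op supporting both FP32 and FP16 would be declared like this (the op name is invented purely for illustration and is not part of this commit):

// Illustrative only -- "my_op" does not exist in this change.
{"my_op", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                        pOpKernelType(vartype::FP16, XPUPlace())})},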
@@ -154,6 +154,11 @@ class TestXPUAbs(TestXPUActivation):
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
         self.outputs = {'Out': out}

+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X'], 'Out')
+

 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
@@ -334,6 +339,25 @@ def leaky_relu(x, alpha):
     return y_ref.astype(x.dtype)

+class TestXPUReciprocal(TestXPUActivation):
+    def setUp(self):
+        self.op_type = "reciprocal"
+        self.init_dtype()
+
+        np.random.seed(1024)
+        x = np.random.uniform(1, 2, [1111, 1117]).astype(self.dtype)
+        out = np.reciprocal(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+        self.attrs = {'use_xpu': True}
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X'], 'Out')
+
+
 if __name__ == "__main__":
     paddle.enable_static()
     unittest.main()