From ceec1e21df709006bb4a1c73083a034499eea634 Mon Sep 17 00:00:00 2001
From: zhangyk0314 <48021248+zhangyk0314@users.noreply.github.com>
Date: Thu, 30 Dec 2021 19:08:09 +0800
Subject: [PATCH] Add exp, abs_grad, reciprocal, reciprocal_grad operator for
 XPU and update xpu2_op_list.h,test=kunlun (#38570)

---
 cmake/external/xpu.cmake                     |   2 +-
 paddle/fluid/operators/activation_op_xpu.cc  | 183 +++++++++++-------
 .../fluid/platform/device/xpu/xpu2_op_list.h |  18 ++
 .../unittests/xpu/test_activation_op_xpu.py  |  24 +++
 4 files changed, 154 insertions(+), 73 deletions(-)

diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 9041feb10c8..588ba0bfe86 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()
 
 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211226")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211228")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc
index fe85eb26705..60188ee53ef 100644
--- a/paddle/fluid/operators/activation_op_xpu.cc
+++ b/paddle/fluid/operators/activation_op_xpu.cc
@@ -98,29 +98,29 @@ void xpu_activation_backward(
 }
 
 template <typename T>
-struct XPUReluFunctor : public BaseActivationFunctor<T> {
+struct XPUAbsFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::relu<XPUType>);
+        ctx, xpu::abs<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUSigmoidFunctor : public BaseActivationFunctor<T> {
+struct XPUAbsGradFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::sigmoid<XPUType>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::abs_grad<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUTanhFunctor : public BaseActivationFunctor<T> {
+struct XPUExpFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::tanh<XPUType>);
+        ctx, xpu::exp<XPUType>);
   }
 };
 
@@ -134,119 +134,83 @@ struct XPULogFunctor : public BaseActivationFunctor<T> {
 };
 
 template <typename T>
-struct XPUSquareFunctor : public BaseActivationFunctor<T> {
+struct XPUReciprocalFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::square<XPUType>);
+        ctx, xpu::reciprocal<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUSqrtFunctor : public BaseActivationFunctor<T> {
+struct XPUReciprocalGradFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::sqrt<XPUType>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::reciprocal_grad<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUAbsFunctor : public BaseActivationFunctor<T> {
+struct XPUReluFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::abs<XPUType>);
+        ctx, xpu::relu<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUPowFunctor : public BaseActivationFunctor<T> {
+struct XPUReluGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    const auto *x = ctx.Input<Tensor>("X");
-    auto *y = ctx.Output<Tensor>("Out");
-    auto pow_factor = ctx.Attr<float>("factor");
-    const T *x_data = x->data<T>();
-    T *y_data = y->mutable_data<T>(ctx.GetPlace());
-    T *factor_data = nullptr;
-
-    auto xpu_context =
-        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
-    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void **>(&factor_data),
-                                 x->numel() * sizeof(T)),
-                      XPU_SUCCESS, platform::errors::ResourceExhausted(
-                                       "XPU has no enough memory"));
-    int r = xpu::constant<T>(xpu_context, factor_data, x->numel(), pow_factor);
-    PADDLE_ENFORCE_EQ(
-        r, xpu::Error_t::SUCCESS,
-        platform::errors::External("XPU constant op return"
-                                   " wrong value[%d %s] in pow op.",
-                                   r, XPUAPIErrorMsg[r]));
-    r = xpu::pow(xpu_context, x_data, factor_data, y_data, x->numel());
-    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                      platform::errors::External("XPU pow op return"
-                                                 " wrong value[%d %s].",
-                                                 r, XPUAPIErrorMsg[r]));
-    if (xpu_context->xpu_stream != nullptr) {
-      xpu_wait(xpu_context->xpu_stream);
-    }
-    xpu_free(factor_data);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::relu_grad<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUHardSwishFunctor : public BaseActivationFunctor<T> {
+struct XPUSigmoidFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    float threshold = ctx.Attr<float>("threshold");
-    float scale = ctx.Attr<float>("scale");
-    float offset = ctx.Attr<float>("offset");
-    PADDLE_ENFORCE_EQ(threshold, 6.0f,
-                      platform::errors::External(
-                          "Not support threshold [%f] in XPU", threshold));
-    PADDLE_ENFORCE_EQ(scale, 6.0f, platform::errors::External(
-                                       "Not support scale [%f] in XPU", scale));
-    PADDLE_ENFORCE_EQ(
-        offset, 3.0f,
-        platform::errors::External("Not support offset [%f] in XPU", offset));
     xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::hard_swish<XPUType>);
+        ctx, xpu::sigmoid<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUReluGradFunctor : public BaseActivationFunctor<T> {
+struct XPUSigmoidGradFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::relu_grad<XPUType>);
+        ctx, xpu::sigmoid_grad<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUTanhGradFunctor : public BaseActivationFunctor<T> {
+struct XPUSqrtFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::tanh_grad<XPUType>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::sqrt<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUSigmoidGradFunctor : public BaseActivationFunctor<T> {
+struct XPUSqrtGradFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::sigmoid_grad<XPUType>);
+        ctx, xpu::sqrt_grad<XPUType>);
   }
 };
 
 template <typename T>
-struct XPUSqrtGradFunctor : public BaseActivationFunctor<T> {
+struct XPUSquareFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
-        ctx, xpu::sqrt_grad<XPUType>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::square<XPUType>);
   }
 };
 
@@ -259,6 +223,44 @@ struct XPUSquareGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+template <typename T>
+struct XPUTanhFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  void operator()(const framework::ExecutionContext &ctx) const {
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::tanh<XPUType>);
+  }
+};
+
+template <typename T>
+struct XPUTanhGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  void operator()(const framework::ExecutionContext &ctx) const {
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::tanh_grad<XPUType>);
+  }
+};
+
+template <typename T>
+struct XPUHardSwishFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  void operator()(const framework::ExecutionContext &ctx) const {
+    float threshold = ctx.Attr<float>("threshold");
+    float scale = ctx.Attr<float>("scale");
+    float offset = ctx.Attr<float>("offset");
+    PADDLE_ENFORCE_EQ(threshold, 6.0f,
+                      platform::errors::External(
+                          "Not support threshold [%f] in XPU", threshold));
+    PADDLE_ENFORCE_EQ(scale, 6.0f, platform::errors::External(
+                                       "Not support scale [%f] in XPU", scale));
+    PADDLE_ENFORCE_EQ(
+        offset, 3.0f,
+        platform::errors::External("Not support offset [%f] in XPU", offset));
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::hard_swish<XPUType>);
+  }
+};
+
 template <typename T>
 struct XPUHardSwishGradFunctor : public BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
@@ -328,6 +330,40 @@ struct XPULeakyReluGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+template <typename T>
+struct XPUPowFunctor : public BaseActivationFunctor<T> {
+  void operator()(const framework::ExecutionContext &ctx) const {
+    const auto *x = ctx.Input<Tensor>("X");
+    auto *y = ctx.Output<Tensor>("Out");
+    auto pow_factor = ctx.Attr<float>("factor");
+    const T *x_data = x->data<T>();
+    T *y_data = y->mutable_data<T>(ctx.GetPlace());
+    T *factor_data = nullptr;
+
+    auto xpu_context =
+        ctx.device_context<paddle::platform::XPUDeviceContext>().x_context();
+    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void **>(&factor_data),
+                                 x->numel() * sizeof(T)),
+                      XPU_SUCCESS, platform::errors::ResourceExhausted(
+                                       "XPU has no enough memory"));
+    int r = xpu::constant<T>(xpu_context, factor_data, x->numel(), pow_factor);
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External("XPU constant op return"
+                                   " wrong value[%d %s] in pow op.",
+                                   r, XPUAPIErrorMsg[r]));
+    r = xpu::pow(xpu_context, x_data, factor_data, y_data, x->numel());
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External("XPU pow op return wrong value[%d %s].", r,
+                                   XPUAPIErrorMsg[r]));
+    if (xpu_context->xpu_stream != nullptr) {
+      xpu_wait(xpu_context->xpu_stream);
+    }
+    xpu_free(factor_data);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -340,15 +376,18 @@ namespace ops = paddle::operators;
       act_type##_grad,                                                  \
       ops::XPUActivationGradKernel<ops::grad_functor<float>>);
 
+REGISTER_ACTIVATION_XPU_KERNEL(abs, XPUAbsFunctor, XPUAbsGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, XPUHardSwishFunctor,
+                               XPUHardSwishGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, XPULeakyReluFunctor,
+                               XPULeakyReluGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, XPUReciprocalFunctor,
+                               XPUReciprocalGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor,
                                XPUSigmoidGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor)
-REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, XPUHardSwishFunctor,
-                               XPUHardSwishGradFunctor)
-REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, XPULeakyReluFunctor,
-                               XPULeakyReluGradFunctor)
 
 REGISTER_OP_XPU_KERNEL(
     tanh, ops::XPUActivationKernel<ops::XPUTanhFunctor<float>>,
     ops::XPUActivationKernel<ops::XPUTanhFunctor<paddle::platform::float16>>);
@@ -358,11 +397,11 @@ REGISTER_OP_XPU_KERNEL(
     ops::XPUActivationGradKernel<
         ops::XPUTanhGradFunctor<paddle::platform::float16>>);
 
+REGISTER_OP_XPU_KERNEL(exp,
+                       ops::XPUActivationKernel<ops::XPUExpFunctor<float>>);
 REGISTER_OP_XPU_KERNEL(log,
                        ops::XPUActivationKernel<ops::XPULogFunctor<float>>);
 REGISTER_OP_XPU_KERNEL(pow,
                        ops::XPUActivationKernel<ops::XPUPowFunctor<float>>);
-REGISTER_OP_XPU_KERNEL(abs,
-                       ops::XPUActivationKernel<ops::XPUAbsFunctor<float>>);
 
 #endif  // PADDLE_WITH_XPU
diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
index c5a140a7681..b4ad88ce6ab 100644
--- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -29,6 +29,9 @@ using XPUOpMap = std::unordered_map<std::string, XPUKernelSet>;
 XPUOpMap& get_kl2_ops() {
   // KL1支持的op,通过op_name, data_type, place来索引
   static XPUOpMap s_xpu2_kernels{
+      {"abs", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"abs_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                                 pOpKernelType(vartype::FP16, XPUPlace())})},
       {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
@@ -106,6 +109,7 @@ XPUOpMap& get_kl2_ops() {
       {"equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                               pOpKernelType(vartype::INT32, XPUPlace()),
                               pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"expand_as_v2",
        XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
                      pOpKernelType(vartype::INT64, XPUPlace()),
@@ -185,6 +189,9 @@ XPUOpMap& get_kl2_ops() {
        XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                      pOpKernelType(vartype::INT32, XPUPlace()),
                      pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"hard_swish_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
       {"iou_similarity",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"label_smooth",
@@ -227,6 +234,10 @@ XPUOpMap& get_kl2_ops() {
       {"momentum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                             pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"nearest_interp_v2",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"nearest_interp_v2_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                                   pOpKernelType(vartype::INT32, XPUPlace()),
                                   pOpKernelType(vartype::FP32, XPUPlace())})},
@@ -239,6 +250,10 @@ XPUOpMap& get_kl2_ops() {
       {"prior_box", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                               pOpKernelType(vartype::INT64, XPUPlace())})},
+      {"reciprocal", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"reciprocal_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
       {"reduce_max_grad",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
@@ -273,6 +288,9 @@ XPUOpMap& get_kl2_ops() {
                      pOpKernelType(vartype::FP32, XPUPlace())})},
       {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                               pOpKernelType(vartype::INT64, XPUPlace())})},
+      {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"sigmoid_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                                    pOpKernelType(vartype::FP16, XPUPlace()),
                                    pOpKernelType(vartype::INT32, XPUPlace())})},
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
index c2c69be45bf..ce82b20eca4 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
@@ -154,6 +154,11 @@ class TestXPUAbs(TestXPUActivation):
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
         self.outputs = {'Out': out}
 
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X'], 'Out')
+
 
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
@@ -334,6 +339,25 @@ def leaky_relu(x, alpha):
     return y_ref.astype(x.dtype)
 
 
+class TestXPUReciprocal(TestXPUActivation):
+    def setUp(self):
+        self.op_type = "reciprocal"
+        self.init_dtype()
+
+        np.random.seed(1024)
+        x = np.random.uniform(1, 2, [1111, 1117]).astype(self.dtype)
+        out = np.reciprocal(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+        self.attrs = {'use_xpu': True}
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X'], 'Out')
+
+
 if __name__ == "__main__":
     paddle.enable_static()
     unittest.main()
-- 
GitLab