From a3d56a9c1f575504ba88b8f3ab2466d55b22e652 Mon Sep 17 00:00:00 2001
From: Lijunhui <1578034415@qq.com>
Date: Sun, 1 May 2022 20:32:46 +0800
Subject: [PATCH] [KP] Complete registry of elementwise ops on XPU with KP
 (#42056)

---
 .../new_executor/standalone_executor_test.cc  |  3 +-
 .../operators/reduce_ops/reduce_amax_op.cu    |  1 +
 .../operators/reduce_ops/reduce_amin_op.cu    |  1 +
 paddle/fluid/operators/reduce_ops/reduce_op.h | 13 ++++--
 .../platform/device/xpu/xpu_op_kpfirst_list.h |  4 ++
 paddle/phi/kernels/elementwise_kernel.cc      |  8 ++--
 .../phi/kernels/funcs/elementwise_functor.h   |  7 ++++
 .../phi/kernels/kps/elementwise_add_kernel.cu |  1 +
 .../kernels/kps/elementwise_divide_kernel.cu  |  1 +
 paddle/phi/kernels/kps/elementwise_kernel.cu  | 41 +++++++++++++++++++
 .../kps/elementwise_multiply_kernel.cu        |  1 +
 .../kps/elementwise_subtract_kernel.cu        |  1 +
 paddle/phi/kernels/kps/logical_kernel.cu      |  6 +--
 .../primitive/functor_primitives_xpu2.h       |  9 ++--
 14 files changed, 82 insertions(+), 15 deletions(-)
 mode change 100755 => 100644 paddle/phi/kernels/primitive/functor_primitives_xpu2.h

diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
index e03277fb317..23bd777fae1 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
@@ -74,11 +74,12 @@ PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT);
 PD_DECLARE_KERNEL(multiply, KPS, ALL_LAYOUT);
 PD_DECLARE_KERNEL(multiply_grad, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(divide, KPS, ALL_LAYOUT);
-PD_DECLARE_KERNEL(maximum, GPU, ALL_LAYOUT);
 #ifdef PADDLE_WITH_XPU_KP
 PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(maximum, GPU, ALL_LAYOUT);
 #else
 PD_DECLARE_KERNEL(max_raw, KPS, ALL_LAYOUT);
+PD_DECLARE_KERNEL(maximum, KPS, ALL_LAYOUT);
 #endif
 PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(mean_grad, GPU, ALL_LAYOUT);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu
index 16c7a4794bb..b3385915341 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu
@@ -11,6 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op.h"
 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu
index f9f015804e1..037dab396c7 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu
@@ -11,6 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op.h"
 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index ff1ddb4175f..76641698ead 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -29,7 +29,7 @@ limitations under the License. */
 #include "paddle/phi/api/lib/utils/tensor_utils.h"
 #include "paddle/phi/kernels/cpu/reduce.h"
 
-#if defined(__HIPCC__) || defined(__NVCC__)
+#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__)
 #include "paddle/phi/kernels/gpu/reduce.h"
 #include "paddle/phi/kernels/gpu/reduce_grad.h"
 #endif
@@ -613,7 +613,7 @@ If reduce_all is true, just reduce along all dimensions and output a scalar.
   virtual std::string GetOpType() const = 0;
 };
 
-#if defined(__HIPCC__) || defined(__NVCC__)
+#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__)
 template <typename T, template <typename> class ReduceOp,
           template <typename, typename> class TransformOp>
 class ReduceCudaKernel : public framework::OpKernel<T> {
@@ -626,9 +626,12 @@ class ReduceCudaKernel : public framework::OpKernel<T> {
     auto pt_out_dtype = paddle::framework::TransToPhiDataType(
         static_cast<framework::proto::VarType::Type>(out_dtype));
     std::vector<int> dims = context.Attr<std::vector<int>>("dim");
-
+#ifdef PADDLE_WITH_XPU_KP
+    auto& dev_ctx =
+        context.template device_context<paddle::platform::XPUDeviceContext>();
+#else
     auto& dev_ctx = context.cuda_device_context();
-
+#endif
     if (out_dtype >= 0) {
       output->mutable_data(dev_ctx.GetPlace(), pt_out_dtype);
     } else {
@@ -642,6 +645,7 @@ class ReduceCudaKernel : public framework::OpKernel<T> {
   }
 };
 
+#ifndef PADDLE_WITH_XPU_KP
 template <typename T, template <typename, typename> class TransformOp>
 class ReduceCudaGradKernel : public framework::OpKernel<T> {
  public:
@@ -686,6 +690,7 @@ class ReduceCudaGradKernel : public framework::OpKernel<T> {
   }
 };
 #endif
+#endif
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
index 99a1eb97de5..43c9e63ac19 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
@@ -42,6 +42,8 @@ XPUOpMap& get_kp_ops() {
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"elementwise_floordiv",
        XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})},
+      {"elementwise_pow",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       // activation op
       {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
@@ -105,6 +107,8 @@ XPUOpMap& get_kp_ops() {
       {"reduce_prod", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"reduce_all", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})},
       {"reduce_any", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})},
+      {"reduce_amax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"reduce_amin", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
   };
 
   return s_xpu_kp_kernels;
diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc
index 4cee24d2f80..9d608cd86a6 100644
--- a/paddle/phi/kernels/elementwise_kernel.cc
+++ b/paddle/phi/kernels/elementwise_kernel.cc
@@ -103,7 +103,7 @@ PD_REGISTER_KERNEL(elementwise_pow,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
 PD_REGISTER_KERNEL(maximum,
-                   GPU,
+                   KPS,
                    ALL_LAYOUT,
                    phi::MaximumKernel,
                    float,
@@ -113,7 +113,7 @@ PD_REGISTER_KERNEL(maximum,
                    phi::dtype::float16,
                    phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(minimum,
-                   GPU,
+                   KPS,
                    ALL_LAYOUT,
                    phi::MinimumKernel,
                    float,
@@ -125,9 +125,9 @@ PD_REGISTER_KERNEL(minimum,
 PD_REGISTER_KERNEL(
     modulo, GPU, ALL_LAYOUT, phi::ModuloKernel, float, double, int, int64_t) {}
 PD_REGISTER_KERNEL(
-    floor_divide, GPU, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
+    floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
 PD_REGISTER_KERNEL(elementwise_pow,
-                   GPU,
+                   KPS,
                    ALL_LAYOUT,
                    phi::ElementwisePowKernel,
                    float,
diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h
index 8d9dd657867..4c2b6ef896e 100644
--- a/paddle/phi/kernels/funcs/elementwise_functor.h
+++ b/paddle/phi/kernels/funcs/elementwise_functor.h
@@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/hostdevice.h"
+#if defined(__xpu__)
+#include <xpu/runtime.h>
+#include "xpu/kernel/math_xpu2.h"  //pow()
+#endif
 
 namespace phi {
 namespace funcs {
@@ -573,6 +577,9 @@ struct ElementwisePowFunctor {
       return std::llrint(
           std::pow(static_cast<double>(a), static_cast<double>(b)));
     }
+#endif
+#ifdef PADDLE_WITH_XPU_KP
+    return pow(a, b);
 #endif
     return std::pow(a, b);
   }
diff --git a/paddle/phi/kernels/kps/elementwise_add_kernel.cu b/paddle/phi/kernels/kps/elementwise_add_kernel.cu
index b5532c61431..8f7d45771d9 100644
--- a/paddle/phi/kernels/kps/elementwise_add_kernel.cu
+++ b/paddle/phi/kernels/kps/elementwise_add_kernel.cu
@@ -36,6 +36,7 @@ void AddKernel(const Context& dev_ctx,
 }  // namespace phi
 
 #ifdef PADDLE_WITH_XPU_KP
+PD_REGISTER_KERNEL(add, KPS, ALL_LAYOUT, phi::AddKernel, float) {}
 PD_REGISTER_KERNEL(add_raw, KPS, ALL_LAYOUT, phi::AddRawKernel, float) {}
 #else
 
diff --git a/paddle/phi/kernels/kps/elementwise_divide_kernel.cu b/paddle/phi/kernels/kps/elementwise_divide_kernel.cu
index 852babe29db..827c478de97 100644
--- a/paddle/phi/kernels/kps/elementwise_divide_kernel.cu
+++ b/paddle/phi/kernels/kps/elementwise_divide_kernel.cu
@@ -37,6 +37,7 @@ void DivideKernel(const Context& dev_ctx,
 }  // namespace phi
 
 #ifdef PADDLE_WITH_XPU_KP
+PD_REGISTER_KERNEL(divide, KPS, ALL_LAYOUT, phi::DivideKernel, float) {}
 PD_REGISTER_KERNEL(divide_raw, KPS, ALL_LAYOUT, phi::DivideRawKernel, float) {}
 #else
 
diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu
index 5ccd3b1a482..821fda52ab1 100644
--- a/paddle/phi/kernels/kps/elementwise_kernel.cu
+++ b/paddle/phi/kernels/kps/elementwise_kernel.cu
@@ -24,24 +24,65 @@ namespace phi {
 
 // Create the definition of Maximum
 DEFINE_CUDA_ELEMENTWISE_OP(Maximum)
+// Forwards to MaximumRawKernel with the axis argument fixed at -1.
+template <typename T, typename Context>
+void MaximumKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   DenseTensor* out) {
+  MaximumRawKernel<T>(dev_ctx, x, y, /*axis=*/-1, out);
+}
 // Create the definition of Minimum
 DEFINE_CUDA_ELEMENTWISE_OP(Minimum)
+// Forwards to MinimumRawKernel with the axis argument fixed at -1.
+template <typename T, typename Context>
+void MinimumKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   DenseTensor* out) {
+  MinimumRawKernel<T>(dev_ctx, x, y, /*axis=*/-1, out);
+}
 // Create the definition of Modulo
 DEFINE_CUDA_ELEMENTWISE_OP(Modulo)
 // Create the definition of FloorDivide
 DEFINE_CUDA_ELEMENTWISE_OP(FloorDivide)
+// Forwards to FloorDivideRawKernel with the axis argument fixed at -1.
+template <typename T, typename Context>
+void FloorDivideKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& y,
+                       DenseTensor* out) {
+  FloorDivideRawKernel<T>(dev_ctx, x, y, /*axis=*/-1, out);
+}
 // Create the definition of Pow
 DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow)
+// Forwards to ElementwisePowRawKernel with the axis argument fixed at -1.
+template <typename T, typename Context>
+void ElementwisePowKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& y,
+                          DenseTensor* out) {
+  ElementwisePowRawKernel<T>(dev_ctx, x, y, /*axis=*/-1, out);
+}
 
 }  // namespace phi
 
 #ifdef PADDLE_WITH_XPU_KP
+PD_REGISTER_KERNEL(maximum, KPS, ALL_LAYOUT, phi::MaximumKernel, float) {}
 PD_REGISTER_KERNEL(maximum_raw, KPS, ALL_LAYOUT, phi::MaximumRawKernel, float) {
 }
+PD_REGISTER_KERNEL(minimum, KPS, ALL_LAYOUT, phi::MinimumKernel, float) {}
 PD_REGISTER_KERNEL(minimum_raw, KPS, ALL_LAYOUT, phi::MinimumRawKernel, float) {
 }
+PD_REGISTER_KERNEL(floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int) {
+}
 PD_REGISTER_KERNEL(
     floor_divide_raw, KPS, ALL_LAYOUT, phi::FloorDivideRawKernel, int) {}
+PD_REGISTER_KERNEL(
+    elementwise_pow, KPS, ALL_LAYOUT, phi::ElementwisePowKernel, float) {}
+PD_REGISTER_KERNEL(
+    elementwise_pow_raw, KPS, ALL_LAYOUT, phi::ElementwisePowRawKernel, float) {
+}
 
 #else
 using float16 = phi::dtype::float16;
diff --git a/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu b/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu
index 8bede0198c2..99408ff2142 100644
--- a/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu
+++ b/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu
@@ -37,6 +37,7 @@ void MultiplyKernel(const Context& dev_ctx,
 }  // namespace phi
 
 #ifdef PADDLE_WITH_XPU_KP
+PD_REGISTER_KERNEL(multiply, KPS, ALL_LAYOUT, phi::MultiplyKernel, float) {}
 PD_REGISTER_KERNEL(
     multiply_raw, KPS, ALL_LAYOUT, phi::MultiplyRawKernel, float) {}
 #else
diff --git a/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu b/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu
index 757dedb99c9..b99f687b59f 100644
--- a/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu
+++ b/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu
@@ -37,6 +37,7 @@ void SubtractKernel(const Context& dev_ctx,
 }  // namespace phi
 
 #ifdef PADDLE_WITH_XPU_KP
+PD_REGISTER_KERNEL(subtract, KPS, ALL_LAYOUT, phi::SubtractKernel, float) {}
 PD_REGISTER_KERNEL(
     subtract_raw, KPS, ALL_LAYOUT, phi::SubtractRawKernel, float) {}
 #else
diff --git a/paddle/phi/kernels/kps/logical_kernel.cu b/paddle/phi/kernels/kps/logical_kernel.cu
index b732d371ad1..81567595395 100644
--- a/paddle/phi/kernels/kps/logical_kernel.cu
+++ b/paddle/phi/kernels/kps/logical_kernel.cu
@@ -65,9 +65,9 @@ void LogicalNotKernel(const Context& dev_ctx,
 
 #ifdef PADDLE_WITH_XPU_KP
 PD_REGISTER_KERNEL(logical_and, KPS, ALL_LAYOUT, phi::LogicalAndKernel, int) {}
-PD_REGISTER_KERNEL(logical_Or, KPS, ALL_LAYOUT, phi::LogicalOrKernel, int) {}
-PD_REGISTER_KERNEL(logical_Not, KPS, ALL_LAYOUT, phi::LogicalNotKernel, int) {}
-PD_REGISTER_KERNEL(logical_Xor, KPS, ALL_LAYOUT, phi::LogicalXorKernel, int) {}
+PD_REGISTER_KERNEL(logical_or, KPS, ALL_LAYOUT, phi::LogicalOrKernel, int) {}
+PD_REGISTER_KERNEL(logical_not, KPS, ALL_LAYOUT, phi::LogicalNotKernel, int) {}
+PD_REGISTER_KERNEL(logical_xor, KPS, ALL_LAYOUT, phi::LogicalXorKernel, int) {}
 #else
 #define REGISTER_LOGICAL_CUDA_KERNEL(logical_and, func_type) \
   PD_REGISTER_KERNEL(logical_and,                            \
diff --git a/paddle/phi/kernels/primitive/functor_primitives_xpu2.h b/paddle/phi/kernels/primitive/functor_primitives_xpu2.h
old mode 100755
new mode 100644
index b01e0474f2d..fdcbb5ec9cc
--- a/paddle/phi/kernels/primitive/functor_primitives_xpu2.h
+++ b/paddle/phi/kernels/primitive/functor_primitives_xpu2.h
@@ -124,7 +124,8 @@ struct MaxFunctor {
  */
 template <typename T>
 struct AddFunctor {
-  inline T initial() { return static_cast<T>(0.0f); }
+  // Additive identity. Must return: falling off a non-void function is UB.
+  inline T initial() { return static_cast<T>(0.0f); }
 
   __device__ T operator()(const T a, const T b) const { return b + a; }
 };
@@ -134,7 +135,8 @@ struct AddFunctor {
  */
 template <typename T>
 struct MulFunctor {
-  inline T initial() { return static_cast<T>(1.0f); }
+  // Multiplicative identity. Must return: falling off the end is UB.
+  inline T initial() { return static_cast<T>(1.0f); }
 
   __device__ T operator()(const T& a, const T& b) const { return b * a; }
 };
@@ -144,7 +146,8 @@ struct MulFunctor {
  */
 template <typename T>
 struct LogicalOrFunctor {
-  inline T initial() { return static_cast<T>(false); }
+  // Identity for logical OR. Must return: falling off the end is UB.
+  inline T initial() { return static_cast<T>(false); }
 
   __device__ T operator()(const T& a, const T& b) const { return b || a; }
 };
-- 
GitLab