Unverified · Commit a3d56a9c authored by Lijunhui, committed by GitHub

[KP] Complete registry of elementwise ops on XPU with KP (#42056)

Parent ba486c5e
......@@ -74,11 +74,12 @@ PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT);
PD_DECLARE_KERNEL(multiply, KPS, ALL_LAYOUT);
PD_DECLARE_KERNEL(multiply_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(divide, KPS, ALL_LAYOUT);
PD_DECLARE_KERNEL(maximum, GPU, ALL_LAYOUT);
#ifdef PADDLE_WITH_XPU_KP
PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(maximum, GPU, ALL_LAYOUT);
#else
PD_DECLARE_KERNEL(max_raw, KPS, ALL_LAYOUT);
PD_DECLARE_KERNEL(maximum, KPS, ALL_LAYOUT);
#endif
PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(mean_grad, GPU, ALL_LAYOUT);
......
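The hunk above (a test file's kernel declarations) shows the central idea of this commit: KPS is a backend tag that is resolved at compile time, so the declarations must match whichever registration the current build actually produced. As an assumption stated plainly: under Paddle's KP design, a KPS registration binds to XPU when PADDLE_WITH_XPU_KP is defined and to GPU otherwise. A minimal, self-contained sketch of that resolution (Backend and kKpsBackend are hypothetical stand-ins, not Paddle symbols):

    #include <iostream>

    // Hypothetical stand-ins for Paddle's backend tags; illustrative only.
    enum class Backend { GPU, XPU };

    // Mirrors how a KPS declaration/registration resolves per build.
    #ifdef PADDLE_WITH_XPU_KP
    constexpr Backend kKpsBackend = Backend::XPU;
    #else
    constexpr Backend kKpsBackend = Backend::GPU;
    #endif

    int main() {
      std::cout << "KPS resolves to "
                << (kKpsBackend == Backend::XPU ? "XPU" : "GPU") << '\n';
      return 0;
    }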
......@@ -11,6 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
......
......@@ -11,6 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
......
......@@ -29,7 +29,7 @@ limitations under the License. */
#include "paddle/phi/api/lib/utils/tensor_utils.h"
#include "paddle/phi/kernels/cpu/reduce.h"
#if defined(__HIPCC__) || defined(__NVCC__)
#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__)
#include "paddle/phi/kernels/gpu/reduce.h"
#include "paddle/phi/kernels/gpu/reduce_grad.h"
#endif
......@@ -613,7 +613,7 @@ If reduce_all is true, just reduce along all dimensions and output a scalar.
virtual std::string GetOpType() const = 0;
};
#if defined(__HIPCC__) || defined(__NVCC__)
#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__)
template <typename T, template <typename> class ReduceOp,
template <typename, typename> class TransformOp>
class ReduceCudaKernel : public framework::OpKernel<T> {
......@@ -626,9 +626,12 @@ class ReduceCudaKernel : public framework::OpKernel<T> {
auto pt_out_dtype = paddle::framework::TransToPhiDataType(
static_cast<framework::proto::VarType::Type>(out_dtype));
std::vector<int> dims = context.Attr<std::vector<int>>("dim");
#ifdef PADDLE_WITH_XPU_KP
auto& dev_ctx =
context.template device_context<paddle::platform::XPUDeviceContext>();
#else
auto& dev_ctx = context.cuda_device_context();
#endif
if (out_dtype >= 0) {
output->mutable_data(dev_ctx.GetPlace(), pt_out_dtype);
} else {
......@@ -642,6 +645,7 @@ class ReduceCudaKernel : public framework::OpKernel<T> {
}
};
#ifndef PADDLE_WITH_XPU_KP
template <typename T, template <typename, typename> class TransformOp>
class ReduceCudaGradKernel : public framework::OpKernel<T> {
public:
......@@ -686,6 +690,7 @@ class ReduceCudaGradKernel : public framework::OpKernel<T> {
}
};
#endif
#endif
} // namespace operators
} // namespace paddle
......
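The reduce_op.h hunk widens the __HIPCC__ || __NVCC__ guards to also cover __xpu__, and makes ReduceCudaKernel pick its device context at compile time, while the CUDA-only gradient kernel is fenced out with #ifndef PADDLE_WITH_XPU_KP. A minimal sketch of the context-selection pattern (CudaCtx and XpuCtx are hypothetical stand-ins for the paddle::platform device-context classes):

    // Hypothetical stand-ins for the real device-context classes.
    struct CudaCtx { /* CUDA stream, allocator, ... */ };
    struct XpuCtx  { /* XPU runtime handles, ... */ };

    #ifdef PADDLE_WITH_XPU_KP
    using ReduceCtx = XpuCtx;   // XPU-KP build: the kernel runs on XPU
    #else
    using ReduceCtx = CudaCtx;  // default: the kernel runs on CUDA
    #endif

    template <typename T>
    void ReduceSketch(const ReduceCtx& /*ctx*/ /*, tensors ... */) {
      // The same kernel body compiles against either context type.
    }

Because the gradient kernel has no XPU implementation here, keeping it inside the #ifndef block lets the shared header compile cleanly under the KP toolchain.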
......@@ -42,6 +42,8 @@ XPUOpMap& get_kp_ops() {
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"elementwise_floordiv",
XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})},
{"elementwise_pow",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
// activation op
{"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
......@@ -105,6 +107,8 @@ XPUOpMap& get_kp_ops() {
{"reduce_prod", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"reduce_all", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})},
{"reduce_any", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})},
{"reduce_amax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"reduce_amin", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
};
return s_xpu_kp_kernels;
......
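The get_kp_ops hunk extends the XPU-KP op whitelist with elementwise_pow, reduce_amax, and reduce_amin, each supporting FP32 on XPUPlace. A simplified, illustrative model of that registry (string keys stand in for the real pOpKernelType values; XPUOpMap and XPUKernelSet are the real Paddle types this mimics):

    #include <map>
    #include <set>
    #include <string>

    // Keys like "FP32@XPU" model pOpKernelType(vartype::FP32, XPUPlace()).
    using KernelSet = std::set<std::string>;
    using OpMap = std::map<std::string, KernelSet>;

    OpMap& GetKpOpsSketch() {
      static OpMap s_ops{
          {"elementwise_pow", {"FP32@XPU"}},
          {"reduce_amax", {"FP32@XPU"}},
          {"reduce_amin", {"FP32@XPU"}},
      };
      return s_ops;
    }

    // The framework can then ask whether an op may take the KP path:
    bool SupportsKp(const std::string& op) {
      return GetKpOpsSketch().count(op) > 0;
    }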
......@@ -103,7 +103,7 @@ PD_REGISTER_KERNEL(elementwise_pow,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(maximum,
GPU,
KPS,
ALL_LAYOUT,
phi::MaximumKernel,
float,
......@@ -113,7 +113,7 @@ PD_REGISTER_KERNEL(maximum,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(minimum,
GPU,
KPS,
ALL_LAYOUT,
phi::MinimumKernel,
float,
......@@ -125,9 +125,9 @@ PD_REGISTER_KERNEL(minimum,
PD_REGISTER_KERNEL(
modulo, GPU, ALL_LAYOUT, phi::ModuloKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
floor_divide, GPU, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
PD_REGISTER_KERNEL(elementwise_pow,
GPU,
KPS,
ALL_LAYOUT,
phi::ElementwisePowKernel,
float,
......
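Above, the registrations for maximum, minimum, floor_divide, and elementwise_pow move from the GPU tag to KPS, so one registration line serves both the CUDA build and the XPU-KP build of the same source file. A sketch of why a symbolic tag like this works at the preprocessor level (the macros below are illustrative, not Paddle's real implementation; note the two-step stringize, since # alone would not expand the argument):

    #include <iostream>

    #ifdef PADDLE_WITH_XPU_KP
    #define KPS_RESOLVED XPU
    #else
    #define KPS_RESOLVED GPU
    #endif

    #define STR_IMPL(x) #x
    #define STR(x) STR_IMPL(x)  // indirection so KPS_RESOLVED expands first

    int main() {
      // Prints "GPU" in a CUDA build, "XPU" when PADDLE_WITH_XPU_KP is set.
      std::cout << STR(KPS_RESOLVED) << '\n';
      return 0;
    }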
......@@ -18,6 +18,10 @@ limitations under the License. */
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/hostdevice.h"
#if defined(__xpu__)
#include <xpu/runtime.h>
#include "xpu/kernel/math_xpu2.h" //pow()
#endif
namespace phi {
namespace funcs {
......@@ -573,6 +577,9 @@ struct ElementwisePowFunctor {
return std::llrint(
std::pow(static_cast<double>(a), static_cast<double>(b)));
}
#endif
#ifdef PADDLE_WITH_XPU_KP
return pow(a, b);
#endif
return std::pow(a, b);
}
......
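ElementwisePowFunctor gains an XPU branch: on the KP path it calls the pow() from xpu/kernel/math_xpu2.h instead of std::pow. A self-contained model of that per-backend dispatch (PowSketch is illustrative; HOSTDEVICE and the XPU math header are assumed from the surrounding Paddle code):

    #include <cmath>

    template <typename T>
    struct PowSketch {
      T operator()(const T a, const T b) const {
    #ifdef PADDLE_WITH_XPU_KP
        return pow(a, b);       // XPU device math; unqualified on purpose
    #else
        return std::pow(a, b);  // host / CUDA fallback
    #endif
      }
    };

In the real functor the XPU return statement precedes the std::pow fallback, so under PADDLE_WITH_XPU_KP the fallback line is simply dead code rather than a conflict.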
......@@ -36,6 +36,7 @@ void AddKernel(const Context& dev_ctx,
} // namespace phi
#ifdef PADDLE_WITH_XPU_KP
PD_REGISTER_KERNEL(add, KPS, ALL_LAYOUT, phi::AddKernel, float) {}
PD_REGISTER_KERNEL(add_raw, KPS, ALL_LAYOUT, phi::AddRawKernel, float) {}
#else
......
......@@ -37,6 +37,7 @@ void DivideKernel(const Context& dev_ctx,
} // namespace phi
#ifdef PADDLE_WITH_XPU_KP
PD_REGISTER_KERNEL(divide, KPS, ALL_LAYOUT, phi::DivideKernel, float) {}
PD_REGISTER_KERNEL(divide_raw, KPS, ALL_LAYOUT, phi::DivideRawKernel, float) {}
#else
......
......@@ -24,24 +24,65 @@ namespace phi {
// Create the definition of Maximum
DEFINE_CUDA_ELEMENTWISE_OP(Maximum)
template <typename T, typename Context>
void MaximumKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
MaximumRawKernel<T>(dev_ctx, x, y, axis, out);
}
// Create the definition of Minimum
DEFINE_CUDA_ELEMENTWISE_OP(Minimum)
template <typename T, typename Context>
void MinimumKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
MinimumRawKernel<T>(dev_ctx, x, y, axis, out);
}
// Create the definition of Modulo
DEFINE_CUDA_ELEMENTWISE_OP(Modulo)
// Create the definition of FloorDivide
DEFINE_CUDA_ELEMENTWISE_OP(FloorDivide)
template <typename T, typename Context>
void FloorDivideKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
FloorDivideRawKernel<T>(dev_ctx, x, y, axis, out);
}
// Create the definition of Pow
DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow)
template <typename T, typename Context>
void ElementwisePowKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
ElementwisePowRawKernel<T>(dev_ctx, x, y, axis, out);
}
} // namespace phi
#ifdef PADDLE_WITH_XPU_KP
PD_REGISTER_KERNEL(maximum, KPS, ALL_LAYOUT, phi::MaximumKernel, float) {}
PD_REGISTER_KERNEL(maximum_raw, KPS, ALL_LAYOUT, phi::MaximumRawKernel, float) {
}
PD_REGISTER_KERNEL(minimum, KPS, ALL_LAYOUT, phi::MinimumKernel, float) {}
PD_REGISTER_KERNEL(minimum_raw, KPS, ALL_LAYOUT, phi::MinimumRawKernel, float) {
}
PD_REGISTER_KERNEL(floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int) {
}
PD_REGISTER_KERNEL(
floor_divide_raw, KPS, ALL_LAYOUT, phi::FloorDivideRawKernel, int) {}
PD_REGISTER_KERNEL(
elementwise_pow, KPS, ALL_LAYOUT, phi::ElementwisePowKernel, float) {}
PD_REGISTER_KERNEL(
elementwise_pow_raw, KPS, ALL_LAYOUT, phi::ElementwisePowRawKernel, float) {
}
#else
using float16 = phi::dtype::float16;
......
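The hunk above fills in the non-raw entry points (MaximumKernel, MinimumKernel, FloorDivideKernel, ElementwisePowKernel) as thin wrappers that pin the broadcast axis to -1, meaning "infer from the operand shapes", and then registers both the plain and the _raw variants for the KP build. The wrapper pattern in miniature (Tensor and the Raw function are hypothetical stand-ins for phi::DenseTensor and the real raw kernel):

    struct Tensor;  // stand-in for phi::DenseTensor

    // Assumed to exist elsewhere: the axis-aware worker kernel.
    template <typename T, typename Context>
    void MaximumRawSketch(const Context& ctx, const Tensor& x,
                          const Tensor& y, int axis, Tensor* out);

    // The plain kernel just fixes axis = -1 and forwards.
    template <typename T, typename Context>
    void MaximumSketch(const Context& ctx, const Tensor& x, const Tensor& y,
                       Tensor* out) {
      MaximumRawSketch<T>(ctx, x, y, /*axis=*/-1, out);
    }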
......@@ -37,6 +37,7 @@ void MultiplyKernel(const Context& dev_ctx,
} // namespace phi
#ifdef PADDLE_WITH_XPU_KP
PD_REGISTER_KERNEL(multiply, KPS, ALL_LAYOUT, phi::MultiplyKernel, float) {}
PD_REGISTER_KERNEL(
multiply_raw, KPS, ALL_LAYOUT, phi::MultiplyRawKernel, float) {}
#else
......
......@@ -37,6 +37,7 @@ void SubtractKernel(const Context& dev_ctx,
} // namespace phi
#ifdef PADDLE_WITH_XPU_KP
PD_REGISTER_KERNEL(subtract, KPS, ALL_LAYOUT, phi::SubtractKernel, float) {}
PD_REGISTER_KERNEL(
subtract_raw, KPS, ALL_LAYOUT, phi::SubtractRawKernel, float) {}
#else
......
......@@ -65,9 +65,9 @@ void LogicalNotKernel(const Context& dev_ctx,
#ifdef PADDLE_WITH_XPU_KP
PD_REGISTER_KERNEL(logical_and, KPS, ALL_LAYOUT, phi::LogicalAndKernel, int) {}
PD_REGISTER_KERNEL(logical_Or, KPS, ALL_LAYOUT, phi::LogicalOrKernel, int) {}
PD_REGISTER_KERNEL(logical_Not, KPS, ALL_LAYOUT, phi::LogicalNotKernel, int) {}
PD_REGISTER_KERNEL(logical_Xor, KPS, ALL_LAYOUT, phi::LogicalXorKernel, int) {}
PD_REGISTER_KERNEL(logical_or, KPS, ALL_LAYOUT, phi::LogicalOrKernel, int) {}
PD_REGISTER_KERNEL(logical_not, KPS, ALL_LAYOUT, phi::LogicalNotKernel, int) {}
PD_REGISTER_KERNEL(logical_xor, KPS, ALL_LAYOUT, phi::LogicalXorKernel, int) {}
#else
#define REGISTER_LOGICAL_CUDA_KERNEL(logical_and, func_type) \
PD_REGISTER_KERNEL(logical_and, \
......
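Besides keeping the logical kernels on KPS, the hunk fixes mixed-case registration names (logical_Or, logical_Not, logical_Xor) to the lowercase op names. Kernel lookup in phi is keyed by exact name strings, so a registration under the wrong case is never found at dispatch time; a toy illustration:

    #include <cassert>
    #include <map>
    #include <string>

    int main() {
      std::map<std::string, int> registry;
      registry["logical_Or"] = 1;                 // typo'd registration
      assert(registry.count("logical_or") == 0);  // dispatch by op name misses
      return 0;
    }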
......@@ -124,7 +124,8 @@ struct MaxFunctor {
*/
template <typename T>
struct AddFunctor {
inline T initial() { return static_cast<T>(0.0f); }
inline T initial() { /*return static_cast<T>(0.0f);*/
}
__device__ T operator()(const T a, const T b) const { return b + a; }
};
......@@ -134,7 +135,8 @@ struct AddFunctor {
*/
template <typename T>
struct MulFunctor {
inline T initial() { return static_cast<T>(1.0f); }
inline T initial() { /*return static_cast<T>(1.0f);*/
}
__device__ T operator()(const T& a, const T& b) const { return b * a; }
};
......@@ -144,7 +146,8 @@ struct MulFunctor {
*/
template <typename T>
struct LogicalOrFunctor {
inline T initial() { return static_cast<T>(false); }
inline T initial() { /*return static_cast<T>(false);*/
}
__device__ T operator()(const T& a, const T& b) const { return b || a; }
};
......
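For context on the last hunk: initial() in these reduce functors supplies the reduction's identity element, a value i with op(i, x) == x for every x (0 for +, 1 for *, false for ||); the commit comments the bodies out in the KP header, leaving the identities recorded only in the comments. A host-side illustration of the identity property, independent of the XPU change:

    #include <cassert>

    template <typename T>
    struct AddIdentity {
      static T initial() { return static_cast<T>(0); }  // identity of +
    };

    int main() {
      float x = 3.5f;
      assert(AddIdentity<float>::initial() + x == x);
      return 0;
    }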