From fbe2c31162d40b36bec0aabe12cd4780c10959ef Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Thu, 14 Apr 2022 20:41:35 +0800 Subject: [PATCH] [KP] Add registry for elementwise_add/max/min/sub/div/mul/floordiv on XPU2 with KP lib (#41494) * regist elementwise_xxx --- .../new_executor/standalone_executor_test.cc | 4 ++ .../elementwise/elementwise_add_op.kps | 2 +- .../platform/device/xpu/xpu_op_kpfirst_list.h | 12 ++++++ .../phi/kernels/funcs/elementwise_functor.h | 4 ++ .../kernels/impl/elementwise_kernel_impl.h | 2 +- .../{gpu => kps}/elementwise_kernel.cu | 41 +++++++++++++------ 6 files changed, 51 insertions(+), 14 deletions(-) rename paddle/phi/kernels/{gpu => kps}/elementwise_kernel.cu (84%) diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index fbcbb2ca23b..cbbb802b67d 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -69,7 +69,11 @@ PD_DECLARE_KERNEL(split, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(concat, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(concat_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +#ifdef PADDLE_WITH_XPU_KP PD_DECLARE_KERNEL(add_raw, GPU, ALL_LAYOUT); +#else +PD_DECLARE_KERNEL(add_raw, KPS, ALL_LAYOUT); +#endif PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sigmoid, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps index 3b7457d72e1..ecd52a310ac 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.kps +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.kps @@ -58,4 +58,4 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel>, ops::ElementwiseAddKernel>); -#endif \ No newline at end of file +#endif diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index f1742f8eb5a..9afde00a98b 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -30,6 +30,18 @@ XPUOpMap& get_kp_ops() { static XPUOpMap s_xpu_kp_kernels{ {"elementwise_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_div", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_sub", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_max", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_min", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_mul", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_floordiv", + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, // activation op {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index 0ea5ff0e82e..8d9dd657867 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -542,7 +542,9 @@ struct InverseModuloFunctor< template struct FloorDivideFunctor { inline HOSTDEVICE T operator()(const T a, const T b) const { +#ifndef PADDLE_WITH_XPU_KP PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO); +#endif return static_cast(std::trunc(a / b)); } }; @@ -550,7 +552,9 @@ struct FloorDivideFunctor { template struct InverseFloorDivideFunctor { inline HOSTDEVICE T operator()(const T a, const T b) const { +#ifndef PADDLE_WITH_XPU_KP PADDLE_ENFORCE(a != 0, DIV_ERROR_INFO); +#endif return static_cast(std::trunc(b / a)); } }; diff --git a/paddle/phi/kernels/impl/elementwise_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_kernel_impl.h index 0e69d00110e..d5c2c559b2c 100644 --- a/paddle/phi/kernels/impl/elementwise_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_kernel_impl.h @@ -17,7 +17,7 @@ #include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) #include "paddle/phi/kernels/funcs/broadcast_function.h" #endif diff --git a/paddle/phi/kernels/gpu/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu similarity index 84% rename from paddle/phi/kernels/gpu/elementwise_kernel.cu rename to paddle/phi/kernels/kps/elementwise_kernel.cu index 73964d31a34..01a34c0f85e 100644 --- a/paddle/phi/kernels/gpu/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/phi/backends/gpu/gpu_context.h" +#ifndef PADDLE_WITH_XPU_KP #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" +#endif #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" @@ -40,7 +42,6 @@ namespace phi { /** * Kernels */ - // Create the definition of Add DEFINE_CUDA_ELEMENTWISE_OP(Add) // Create the definition of Subtract @@ -62,19 +63,34 @@ DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow) } // namespace phi +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(add_raw, KPS, ALL_LAYOUT, phi::AddRawKernel, float) {} +PD_REGISTER_KERNEL( + subtract_raw, KPS, ALL_LAYOUT, phi::SubtractRawKernel, float) {} +PD_REGISTER_KERNEL(divide_raw, KPS, ALL_LAYOUT, phi::DivideRawKernel, float) {} +PD_REGISTER_KERNEL( + multiply_raw, KPS, ALL_LAYOUT, phi::MultiplyRawKernel, float) {} +PD_REGISTER_KERNEL(maximum_raw, KPS, ALL_LAYOUT, phi::MaximumRawKernel, float) { +} +PD_REGISTER_KERNEL(minimum_raw, KPS, ALL_LAYOUT, phi::MinimumRawKernel, float) { +} +PD_REGISTER_KERNEL( + floor_divide_raw, KPS, ALL_LAYOUT, phi::FloorDivideRawKernel, int) {} + +#else using float16 = phi::dtype::float16; using bfloat16 = phi::dtype::bfloat16; using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; PD_REGISTER_KERNEL( - fmax, GPU, ALL_LAYOUT, phi::FMaxKernel, float, double, int, int64_t) {} + fmax, KPS, ALL_LAYOUT, phi::FMaxKernel, float, double, int, int64_t) {} PD_REGISTER_KERNEL( - fmin, GPU, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {} + fmin, KPS, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {} PD_REGISTER_KERNEL(add_raw, - GPU, + KPS, ALL_LAYOUT, phi::AddRawKernel, float, @@ -87,7 +103,7 @@ PD_REGISTER_KERNEL(add_raw, complex64, complex128) {} PD_REGISTER_KERNEL(subtract_raw, - GPU, + KPS, ALL_LAYOUT, phi::SubtractRawKernel, float, @@ -100,7 +116,7 @@ PD_REGISTER_KERNEL(subtract_raw, complex64, complex128) {} PD_REGISTER_KERNEL(divide_raw, - GPU, + KPS, ALL_LAYOUT, phi::DivideRawKernel, float, @@ -112,7 +128,7 @@ PD_REGISTER_KERNEL(divide_raw, complex64, complex128) {} PD_REGISTER_KERNEL(multiply_raw, - GPU, + KPS, ALL_LAYOUT, phi::MultiplyRawKernel, float, @@ -125,7 +141,7 @@ PD_REGISTER_KERNEL(multiply_raw, complex128, bfloat16) {} PD_REGISTER_KERNEL(maximum_raw, - GPU, + KPS, ALL_LAYOUT, phi::MaximumRawKernel, float, @@ -135,7 +151,7 @@ PD_REGISTER_KERNEL(maximum_raw, float16, bfloat16) {} PD_REGISTER_KERNEL(minimum_raw, - GPU, + KPS, ALL_LAYOUT, phi::MinimumRawKernel, float, @@ -145,7 +161,7 @@ PD_REGISTER_KERNEL(minimum_raw, float16, bfloat16) {} PD_REGISTER_KERNEL(modulo_raw, - GPU, + KPS, ALL_LAYOUT, phi::ModuloRawKernel, float, @@ -153,16 +169,17 @@ PD_REGISTER_KERNEL(modulo_raw, int, int64_t) {} PD_REGISTER_KERNEL(floor_divide_raw, - GPU, + KPS, ALL_LAYOUT, phi::FloorDivideRawKernel, int, int64_t) {} PD_REGISTER_KERNEL(elementwise_pow_raw, - GPU, + KPS, ALL_LAYOUT, phi::ElementwisePowRawKernel, float, double, int, int64_t) {} +#endif -- GitLab