From a3d56a9c1f575504ba88b8f3ab2466d55b22e652 Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Sun, 1 May 2022 20:32:46 +0800 Subject: [PATCH] [KP] Complete registry of elementwise ops on XPU with KP (#42056) --- .../new_executor/standalone_executor_test.cc | 3 +- .../operators/reduce_ops/reduce_amax_op.cu | 1 + .../operators/reduce_ops/reduce_amin_op.cu | 1 + paddle/fluid/operators/reduce_ops/reduce_op.h | 13 ++++-- .../platform/device/xpu/xpu_op_kpfirst_list.h | 4 ++ paddle/phi/kernels/elementwise_kernel.cc | 8 ++-- .../phi/kernels/funcs/elementwise_functor.h | 7 ++++ .../phi/kernels/kps/elementwise_add_kernel.cu | 1 + .../kernels/kps/elementwise_divide_kernel.cu | 1 + paddle/phi/kernels/kps/elementwise_kernel.cu | 41 +++++++++++++++++++ .../kps/elementwise_multiply_kernel.cu | 1 + .../kps/elementwise_subtract_kernel.cu | 1 + paddle/phi/kernels/kps/logical_kernel.cu | 6 +-- .../primitive/functor_primitives_xpu2.h | 9 ++-- 14 files changed, 82 insertions(+), 15 deletions(-) mode change 100755 => 100644 paddle/phi/kernels/primitive/functor_primitives_xpu2.h diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index e03277fb317..23bd777fae1 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -74,11 +74,12 @@ PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); PD_DECLARE_KERNEL(multiply, KPS, ALL_LAYOUT); PD_DECLARE_KERNEL(multiply_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(divide, KPS, ALL_LAYOUT); -PD_DECLARE_KERNEL(maximum, GPU, ALL_LAYOUT); #ifdef PADDLE_WITH_XPU_KP PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(maximum, GPU, ALL_LAYOUT); #else PD_DECLARE_KERNEL(max_raw, KPS, ALL_LAYOUT); +PD_DECLARE_KERNEL(maximum, KPS, ALL_LAYOUT); #endif PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(mean_grad, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu index 16c7a4794bb..b3385915341 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu index f9f015804e1..037dab396c7 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index ff1ddb4175f..76641698ead 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -29,7 +29,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/kernels/cpu/reduce.h" -#if defined(__HIPCC__) || defined(__NVCC__) +#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) #include "paddle/phi/kernels/gpu/reduce.h" #include "paddle/phi/kernels/gpu/reduce_grad.h" #endif @@ -613,7 +613,7 @@ If reduce_all is true, just reduce along all dimensions and output a scalar. virtual std::string GetOpType() const = 0; }; -#if defined(__HIPCC__) || defined(__NVCC__) +#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) template class ReduceOp, template class TransformOp> class ReduceCudaKernel : public framework::OpKernel { @@ -626,9 +626,12 @@ class ReduceCudaKernel : public framework::OpKernel { auto pt_out_dtype = paddle::framework::TransToPhiDataType( static_cast(out_dtype)); std::vector dims = context.Attr>("dim"); - +#ifdef PADDLE_WITH_XPU_KP + auto& dev_ctx = + context.template device_context(); +#else auto& dev_ctx = context.cuda_device_context(); - +#endif if (out_dtype >= 0) { output->mutable_data(dev_ctx.GetPlace(), pt_out_dtype); } else { @@ -642,6 +645,7 @@ class ReduceCudaKernel : public framework::OpKernel { } }; +#ifndef PADDLE_WITH_XPU_KP template class TransformOp> class ReduceCudaGradKernel : public framework::OpKernel { public: @@ -686,6 +690,7 @@ class ReduceCudaGradKernel : public framework::OpKernel { } }; #endif +#endif } // namespace operators } // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index 99a1eb97de5..43c9e63ac19 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -42,6 +42,8 @@ XPUOpMap& get_kp_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_floordiv", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, + {"elementwise_pow", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, // activation op {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -105,6 +107,8 @@ XPUOpMap& get_kp_ops() { {"reduce_prod", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_all", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})}, {"reduce_any", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})}, + {"reduce_amax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_amin", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; return s_xpu_kp_kernels; diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc index 4cee24d2f80..9d608cd86a6 100644 --- a/paddle/phi/kernels/elementwise_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -103,7 +103,7 @@ PD_REGISTER_KERNEL(elementwise_pow, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(maximum, - GPU, + KPS, ALL_LAYOUT, phi::MaximumKernel, float, @@ -113,7 +113,7 @@ PD_REGISTER_KERNEL(maximum, phi::dtype::float16, phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(minimum, - GPU, + KPS, ALL_LAYOUT, phi::MinimumKernel, float, @@ -125,9 +125,9 @@ PD_REGISTER_KERNEL(minimum, PD_REGISTER_KERNEL( modulo, GPU, ALL_LAYOUT, phi::ModuloKernel, float, double, int, int64_t) {} PD_REGISTER_KERNEL( - floor_divide, GPU, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {} + floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {} PD_REGISTER_KERNEL(elementwise_pow, - GPU, + KPS, ALL_LAYOUT, phi::ElementwisePowKernel, float, diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index 8d9dd657867..4c2b6ef896e 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -18,6 +18,10 @@ limitations under the License. */ #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/hostdevice.h" +#if defined(__xpu__) +#include +#include "xpu/kernel/math_xpu2.h" //pow() +#endif namespace phi { namespace funcs { @@ -573,6 +577,9 @@ struct ElementwisePowFunctor { return std::llrint( std::pow(static_cast(a), static_cast(b))); } +#endif +#ifdef PADDLE_WITH_XPU_KP + return pow(a, b); #endif return std::pow(a, b); } diff --git a/paddle/phi/kernels/kps/elementwise_add_kernel.cu b/paddle/phi/kernels/kps/elementwise_add_kernel.cu index b5532c61431..8f7d45771d9 100644 --- a/paddle/phi/kernels/kps/elementwise_add_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_add_kernel.cu @@ -36,6 +36,7 @@ void AddKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(add, KPS, ALL_LAYOUT, phi::AddKernel, float) {} PD_REGISTER_KERNEL(add_raw, KPS, ALL_LAYOUT, phi::AddRawKernel, float) {} #else diff --git a/paddle/phi/kernels/kps/elementwise_divide_kernel.cu b/paddle/phi/kernels/kps/elementwise_divide_kernel.cu index 852babe29db..827c478de97 100644 --- a/paddle/phi/kernels/kps/elementwise_divide_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_divide_kernel.cu @@ -37,6 +37,7 @@ void DivideKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(divide, KPS, ALL_LAYOUT, phi::DivideKernel, float) {} PD_REGISTER_KERNEL(divide_raw, KPS, ALL_LAYOUT, phi::DivideRawKernel, float) {} #else diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index 5ccd3b1a482..821fda52ab1 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -24,24 +24,65 @@ namespace phi { // Create the definition of Maximum DEFINE_CUDA_ELEMENTWISE_OP(Maximum) +template +void MaximumKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + MaximumRawKernel(dev_ctx, x, y, axis, out); +} // Create the definition of Minimum DEFINE_CUDA_ELEMENTWISE_OP(Minimum) +template +void MinimumKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + MinimumRawKernel(dev_ctx, x, y, axis, out); +} // Create the definition of Modulo DEFINE_CUDA_ELEMENTWISE_OP(Modulo) // Create the definition of FloorDivide DEFINE_CUDA_ELEMENTWISE_OP(FloorDivide) +template +void FloorDivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + FloorDivideRawKernel(dev_ctx, x, y, axis, out); +} // Create the definition of Pow DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow) +template +void ElementwisePowKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + ElementwisePowRawKernel(dev_ctx, x, y, axis, out); +} } // namespace phi #ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(maximum, KPS, ALL_LAYOUT, phi::MaximumKernel, float) {} PD_REGISTER_KERNEL(maximum_raw, KPS, ALL_LAYOUT, phi::MaximumRawKernel, float) { } +PD_REGISTER_KERNEL(minimum, KPS, ALL_LAYOUT, phi::MinimumKernel, float) {} PD_REGISTER_KERNEL(minimum_raw, KPS, ALL_LAYOUT, phi::MinimumRawKernel, float) { } +PD_REGISTER_KERNEL(floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int) { +} PD_REGISTER_KERNEL( floor_divide_raw, KPS, ALL_LAYOUT, phi::FloorDivideRawKernel, int) {} +PD_REGISTER_KERNEL( + elementwise_pow, KPS, ALL_LAYOUT, phi::ElementwisePowKernel, float) {} +PD_REGISTER_KERNEL( + elementwise_pow_raw, KPS, ALL_LAYOUT, phi::ElementwisePowRawKernel, float) { +} #else using float16 = phi::dtype::float16; diff --git a/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu b/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu index 8bede0198c2..99408ff2142 100644 --- a/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu @@ -37,6 +37,7 @@ void MultiplyKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(multiply, KPS, ALL_LAYOUT, phi::MultiplyKernel, float) {} PD_REGISTER_KERNEL( multiply_raw, KPS, ALL_LAYOUT, phi::MultiplyRawKernel, float) {} #else diff --git a/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu b/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu index 757dedb99c9..b99f687b59f 100644 --- a/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu @@ -37,6 +37,7 @@ void SubtractKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(subtract, KPS, ALL_LAYOUT, phi::SubtractKernel, float) {} PD_REGISTER_KERNEL( subtract_raw, KPS, ALL_LAYOUT, phi::SubtractRawKernel, float) {} #else diff --git a/paddle/phi/kernels/kps/logical_kernel.cu b/paddle/phi/kernels/kps/logical_kernel.cu index b732d371ad1..81567595395 100644 --- a/paddle/phi/kernels/kps/logical_kernel.cu +++ b/paddle/phi/kernels/kps/logical_kernel.cu @@ -65,9 +65,9 @@ void LogicalNotKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_XPU_KP PD_REGISTER_KERNEL(logical_and, KPS, ALL_LAYOUT, phi::LogicalAndKernel, int) {} -PD_REGISTER_KERNEL(logical_Or, KPS, ALL_LAYOUT, phi::LogicalOrKernel, int) {} -PD_REGISTER_KERNEL(logical_Not, KPS, ALL_LAYOUT, phi::LogicalNotKernel, int) {} -PD_REGISTER_KERNEL(logical_Xor, KPS, ALL_LAYOUT, phi::LogicalXorKernel, int) {} +PD_REGISTER_KERNEL(logical_or, KPS, ALL_LAYOUT, phi::LogicalOrKernel, int) {} +PD_REGISTER_KERNEL(logical_not, KPS, ALL_LAYOUT, phi::LogicalNotKernel, int) {} +PD_REGISTER_KERNEL(logical_xor, KPS, ALL_LAYOUT, phi::LogicalXorKernel, int) {} #else #define REGISTER_LOGICAL_CUDA_KERNEL(logical_and, func_type) \ PD_REGISTER_KERNEL(logical_and, \ diff --git a/paddle/phi/kernels/primitive/functor_primitives_xpu2.h b/paddle/phi/kernels/primitive/functor_primitives_xpu2.h old mode 100755 new mode 100644 index b01e0474f2d..fdcbb5ec9cc --- a/paddle/phi/kernels/primitive/functor_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/functor_primitives_xpu2.h @@ -124,7 +124,8 @@ struct MaxFunctor { */ template struct AddFunctor { - inline T initial() { return static_cast(0.0f); } + inline T initial() { /*return static_cast(0.0f);*/ + } __device__ T operator()(const T a, const T b) const { return b + a; } }; @@ -134,7 +135,8 @@ struct AddFunctor { */ template struct MulFunctor { - inline T initial() { return static_cast(1.0f); } + inline T initial() { /*return static_cast(1.0f);*/ + } __device__ T operator()(const T& a, const T& b) const { return b * a; } }; @@ -144,7 +146,8 @@ struct MulFunctor { */ template struct LogicalOrFunctor { - inline T initial() { return static_cast(false); } + inline T initial() { /*return static_cast(false);*/ + } __device__ T operator()(const T& a, const T& b) const { return b || a; } }; -- GitLab