diff --git a/paddle/phi/kernels/funcs/index_impl.cu.h b/paddle/phi/kernels/funcs/index_impl.cu.h index 4e2e2a7508700f0351ffcb716679971076d03d8c..cfe95f87f6335dbbb3a6694a339aa35bdbf8dfc6 100644 --- a/paddle/phi/kernels/funcs/index_impl.cu.h +++ b/paddle/phi/kernels/funcs/index_impl.cu.h @@ -43,7 +43,7 @@ __global__ void VectorizedIndexKernel(T *out, out + data_offset, &result[0], BLOCK_NUM_X * VecSize); } size_t num = numel - data_offset; - if (num > 0) { + if (static_cast(num) > 0) { kps::InitWithDataIndex(&args[0], data_offset); kps::ElementwiseUnary( &result[0], &args[0], func); diff --git a/paddle/phi/kernels/gpu/prelu_funcs.h b/paddle/phi/kernels/gpu/prelu_funcs.h index efb22bfadfc921ab4d66dd37991a0dbc15bd4875..dc480bd739eae42bd0035fc8bf8ca7ac961c8ef8 100644 --- a/paddle/phi/kernels/gpu/prelu_funcs.h +++ b/paddle/phi/kernels/gpu/prelu_funcs.h @@ -28,157 +28,83 @@ inline static int PADDLE_GET_BLOCKS(const int N) { } template -__global__ void PReluChannelFirstWiseKernel(const T *input, - const T *alpha, - T *output, - size_t channel_num, - size_t plane_size, - size_t numel) { - CUDA_KERNEL_LOOP(index, numel) { - size_t temp = index / plane_size; - size_t channel_index = temp % channel_num; - T scale = alpha[channel_index]; - T x = input[index]; +struct PReluChannelFirstWiseCUDAFunctor { + const T* x_; + const T* alpha_; + size_t channel_num_; + size_t plane_size_; + int numel_; + + HOSTDEVICE inline PReluChannelFirstWiseCUDAFunctor(const T* x, + const T* alpha, + int numel, + size_t channel_num, + size_t plane_size) + : x_(x), + alpha_(alpha), + numel_(numel), + channel_num_(channel_num), + plane_size_(plane_size) {} + + HOSTDEVICE inline T operator()(const unsigned int n) const { T zero = static_cast(0); - output[index] = (x > zero) ? x : scale * x; + size_t temp = n / plane_size_; + size_t channel_index = temp % channel_num_; + T scale = alpha_[channel_index]; + T x = x_[n]; + return (x > zero) ? x : scale * x; } -} +}; template -__global__ void PReluChannelLastWiseKernel(const T *input, - const T *alpha, - T *output, - size_t channel_num, - size_t numel) { - CUDA_KERNEL_LOOP(index, numel) { - size_t channel_index = index % channel_num; - T scale = alpha[channel_index]; - T x = input[index]; - T zero = static_cast(0); - output[index] = (x > zero) ? x : scale * x; - } -} +struct PReluChannelLastWiseCUDAFunctor { + const T* x_; + const T* alpha_; + size_t channel_num_; -template -__global__ void PReluElementWiseKernel(const T *input, - const T *alpha, - T *output, - size_t spatial_size, - size_t numel) { - CUDA_KERNEL_LOOP(index, numel) { - size_t element_index = index % spatial_size; - T scale = alpha[element_index]; - T x = input[index]; - T zero = static_cast(0); - output[index] = (x > zero) ? x : scale * x; - } -} + HOSTDEVICE inline PReluChannelLastWiseCUDAFunctor(const T* x, + const T* alpha, + size_t channel_num) + : x_(x), alpha_(alpha), channel_num_(channel_num) {} -template -__global__ void PReluScalarKernel(const T *input, - const T *alpha, - T *output, - size_t numel) { - T scale = alpha[0]; - CUDA_KERNEL_LOOP(index, numel) { - T x = input[index]; + HOSTDEVICE inline T operator()(const unsigned int n) const { T zero = static_cast(0); - output[index] = (x > zero) ? x : scale * x; + size_t channel_index = n % channel_num_; + T scale = alpha_[channel_index]; + T x = x_[n]; + return (x > zero) ? x : scale * x; } -} - -template -class PreluChannelWiseDirectCUDAFunctor { - public: - void operator()(gpuStream_t stream, - const T *input, - const T *alpha, - T *output, - size_t batch_size, - size_t channel, - bool channel_last, - size_t numel); }; template -class PreluElementWiseDirectCUDAFunctor { - public: - void operator()(gpuStream_t stream, - const T *input, - const T *alpha, - T *output, - size_t batch_size, - size_t numel); -}; +struct PreluElementWiseDirectCUDAFunctor { + const T* x_; + const T* alpha_; + size_t spatial_size_; -template -class PreluScalarDirectCUDAFunctor { - public: - void operator()(gpuStream_t stream, - const T *input, - const T *alpha, - T *output, - size_t numel); -}; + HOSTDEVICE inline PreluElementWiseDirectCUDAFunctor(const T* x, + const T* alpha, + size_t spatial_size) + : x_(x), alpha_(alpha), spatial_size_(spatial_size) {} -template -void PreluChannelWiseDirectCUDAFunctor::operator()(gpuStream_t stream, - const T *input, - const T *alpha, - T *output, - size_t batch_size, - size_t channel, - bool channel_last, - size_t numel) { - if (channel_last) { - PReluChannelLastWiseKernel<<>>( - input, alpha, output, channel, numel); - } else { - PReluChannelFirstWiseKernel<<>>( - input, alpha, output, channel, numel / batch_size / channel, numel); + HOSTDEVICE inline T operator()(const unsigned int n) const { + T zero = static_cast(0); + size_t element_index = n % spatial_size_; + T scale = alpha_[element_index]; + T x = x_[n]; + return (x > zero) ? x : scale * x; } -} - -template -void PreluElementWiseDirectCUDAFunctor::operator()(gpuStream_t stream, - const T *input, - const T *alpha, - T *output, - size_t batch_size, - size_t numel) { - PReluElementWiseKernel<<>>( - input, alpha, output, numel / batch_size, numel); -} +}; template -void PreluScalarDirectCUDAFunctor::operator()(gpuStream_t stream, - const T *input, - const T *alpha, - T *output, - size_t numel) { - PReluScalarKernel<<>>( - input, alpha, output, numel); -} - -template class PreluChannelWiseDirectCUDAFunctor; -template class PreluChannelWiseDirectCUDAFunctor; -template class PreluChannelWiseDirectCUDAFunctor; - -template class PreluElementWiseDirectCUDAFunctor; -template class PreluElementWiseDirectCUDAFunctor; -template class PreluElementWiseDirectCUDAFunctor; - -template class PreluScalarDirectCUDAFunctor; -template class PreluScalarDirectCUDAFunctor; -template class PreluScalarDirectCUDAFunctor; +struct PreluScalarDirectCUDAFunctor { + const T* scalar_; + HOSTDEVICE inline PreluScalarDirectCUDAFunctor(const T* scalar) + : scalar_(scalar) {} + HOSTDEVICE inline T operator()(const T x) const { + T zero = static_cast(0); + return (x > zero) ? x : scalar_[0] * x; + } +}; } // namespace phi diff --git a/paddle/phi/kernels/gpu/prelu_kernel.cu b/paddle/phi/kernels/gpu/prelu_kernel.cu index c4730768982bb77b87bdfe1c11b08a74f72cface..d698d78a55cc23bc01318b015c8d798f99a44e61 100644 --- a/paddle/phi/kernels/gpu/prelu_kernel.cu +++ b/paddle/phi/kernels/gpu/prelu_kernel.cu @@ -16,6 +16,8 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/gpu/prelu_funcs.h" namespace phi { @@ -27,36 +29,43 @@ void PReluKernel(const Context& dev_ctx, const std::string& data_format, const std::string& mode, DenseTensor* out) { + dev_ctx.template Alloc(out); const T* x_ptr = x.data(); - T* o_ptr = dev_ctx.template Alloc(out); - const T* alpha_ptr = alpha.data(); + int numel = x.numel(); auto dim = x.dims(); auto x_rank = dim.size(); VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim[" - << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel; + << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel + << ", mode:" << mode << ", format:" << data_format; if (mode == "channel") { bool channel_last = data_format == "NHWC"; size_t channel = channel_last ? dim[x_rank - 1] : dim[1]; - PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; - prelu_channel_wise(dev_ctx.stream(), - x_ptr, - alpha_ptr, - o_ptr, - dim[0], - channel, - channel_last, - numel); + if (channel_last) { + auto func = PReluChannelLastWiseCUDAFunctor(x_ptr, alpha_ptr, channel); + phi::IndexKernel>( + dev_ctx, out, func); + } else { + size_t plane_size = numel / dim[0] / channel; + auto func = PReluChannelFirstWiseCUDAFunctor( + x_ptr, alpha_ptr, numel, channel, plane_size); + phi::IndexKernel>( + dev_ctx, out, func); + } } else if (mode == "element") { - PreluElementWiseDirectCUDAFunctor prelu_element_wise; - prelu_element_wise( - dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, dim[0], numel); + size_t spatial_size = numel / dim[0]; + auto func = + PreluElementWiseDirectCUDAFunctor(x_ptr, alpha_ptr, spatial_size); + phi::IndexKernel>( + dev_ctx, out, func); } else { - PreluScalarDirectCUDAFunctor prelu_scalar; - prelu_scalar(dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, numel); + std::vector ins = {&x}; + std::vector outs = {out}; + auto func = PreluScalarDirectCUDAFunctor(alpha_ptr); + phi::funcs::ElementwiseKernel(dev_ctx, ins, &outs, func); } }