Unverified commit 6bd5b7ce, authored by thunder95, committed by GitHub

【PaddlePaddle Hackathon 4 No.35】Optimize the GPU compute performance of the prelu op for Paddle (#51131)

* untracked files

* prelu_perf

* remove unused files

* upd

* fix bug
Parent 12d43da9
@@ -43,7 +43,7 @@ __global__ void VectorizedIndexKernel(T *out,
         out + data_offset, &result[0], BLOCK_NUM_X * VecSize);
   }
   size_t num = numel - data_offset;
-  if (num > 0) {
+  if (static_cast<int>(num) > 0) {
     kps::InitWithDataIndex<size_t, VecSize, 1>(&args[0], data_offset);
     kps::ElementwiseUnary<size_t, T, VecSize, 1, Functor>(
         &result[0], &args[0], func);
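For readers unfamiliar with this kernel, the change sits in the tail-handling path of a vectorized index kernel: full tiles of BLOCK_NUM_X * VecSize elements are written first, and the remaining num = numel - data_offset elements are handled behind a guard. The sketch below only illustrates that general pattern under made-up names (fill_index, kThreads, kVec); it is not the Paddle kernel.

```cuda
#include <cuda_runtime.h>

// Each thread owns kVec consecutive indices; the final partial tile is
// bounds-checked so out-of-range elements are never written.
template <int VecSize>
__global__ void fill_index(size_t *out, size_t numel) {
  size_t base =
      (static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x) * VecSize;
#pragma unroll
  for (int i = 0; i < VecSize; ++i) {
    size_t idx = base + i;
    if (idx < numel) out[idx] = idx;  // tail guard for the last partial tile
  }
}

int main() {
  const size_t n = 1000;  // deliberately not a multiple of kThreads * kVec
  size_t *d = nullptr;
  cudaMalloc(&d, n * sizeof(size_t));
  constexpr int kThreads = 256, kVec = 4;
  const int blocks =
      static_cast<int>((n + kThreads * kVec - 1) / (kThreads * kVec));
  fill_index<kVec><<<blocks, kThreads>>>(d, n);
  cudaDeviceSynchronize();
  cudaFree(d);
  return 0;
}
```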
@@ -28,157 +28,83 @@ inline static int PADDLE_GET_BLOCKS(const int N) {
 }
 
-template <typename T>
-__global__ void PReluChannelFirstWiseKernel(const T *input,
-                                            const T *alpha,
-                                            T *output,
-                                            size_t channel_num,
-                                            size_t plane_size,
-                                            size_t numel) {
-  CUDA_KERNEL_LOOP(index, numel) {
-    size_t temp = index / plane_size;
-    size_t channel_index = temp % channel_num;
-    T scale = alpha[channel_index];
-    T x = input[index];
-    T zero = static_cast<T>(0);
-    output[index] = (x > zero) ? x : scale * x;
-  }
-}
-
-template <typename T>
-__global__ void PReluChannelLastWiseKernel(const T *input,
-                                           const T *alpha,
-                                           T *output,
-                                           size_t channel_num,
-                                           size_t numel) {
-  CUDA_KERNEL_LOOP(index, numel) {
-    size_t channel_index = index % channel_num;
-    T scale = alpha[channel_index];
-    T x = input[index];
-    T zero = static_cast<T>(0);
-    output[index] = (x > zero) ? x : scale * x;
-  }
-}
-
-template <typename T>
-__global__ void PReluElementWiseKernel(const T *input,
-                                       const T *alpha,
-                                       T *output,
-                                       size_t spatial_size,
-                                       size_t numel) {
-  CUDA_KERNEL_LOOP(index, numel) {
-    size_t element_index = index % spatial_size;
-    T scale = alpha[element_index];
-    T x = input[index];
-    T zero = static_cast<T>(0);
-    output[index] = (x > zero) ? x : scale * x;
-  }
-}
-
-template <typename T>
-__global__ void PReluScalarKernel(const T *input,
-                                  const T *alpha,
-                                  T *output,
-                                  size_t numel) {
-  T scale = alpha[0];
-  CUDA_KERNEL_LOOP(index, numel) {
-    T x = input[index];
-    T zero = static_cast<T>(0);
-    output[index] = (x > zero) ? x : scale * x;
-  }
-}
-
-template <typename T>
-class PreluChannelWiseDirectCUDAFunctor {
- public:
-  void operator()(gpuStream_t stream,
-                  const T *input,
-                  const T *alpha,
-                  T *output,
-                  size_t batch_size,
-                  size_t channel,
-                  bool channel_last,
-                  size_t numel);
-};
-
-template <typename T>
-class PreluElementWiseDirectCUDAFunctor {
- public:
-  void operator()(gpuStream_t stream,
-                  const T *input,
-                  const T *alpha,
-                  T *output,
-                  size_t batch_size,
-                  size_t numel);
-};
-
-template <typename T>
-class PreluScalarDirectCUDAFunctor {
- public:
-  void operator()(gpuStream_t stream,
-                  const T *input,
-                  const T *alpha,
-                  T *output,
-                  size_t numel);
-};
-
-template <typename T>
-void PreluChannelWiseDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
-                                                      const T *input,
-                                                      const T *alpha,
-                                                      T *output,
-                                                      size_t batch_size,
-                                                      size_t channel,
-                                                      bool channel_last,
-                                                      size_t numel) {
-  if (channel_last) {
-    PReluChannelLastWiseKernel<<<PADDLE_GET_BLOCKS(numel),
-                                 CUDA_NUM_THREADS,
-                                 0,
-                                 stream>>>(
-        input, alpha, output, channel, numel);
-  } else {
-    PReluChannelFirstWiseKernel<<<PADDLE_GET_BLOCKS(numel),
-                                  CUDA_NUM_THREADS,
-                                  0,
-                                  stream>>>(
-        input, alpha, output, channel, numel / batch_size / channel, numel);
-  }
-}
-
-template <typename T>
-void PreluElementWiseDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
-                                                      const T *input,
-                                                      const T *alpha,
-                                                      T *output,
-                                                      size_t batch_size,
-                                                      size_t numel) {
-  PReluElementWiseKernel<<<PADDLE_GET_BLOCKS(numel),
-                           CUDA_NUM_THREADS,
-                           0,
-                           stream>>>(
-      input, alpha, output, numel / batch_size, numel);
-}
-
-template <typename T>
-void PreluScalarDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
-                                                 const T *input,
-                                                 const T *alpha,
-                                                 T *output,
-                                                 size_t numel) {
-  PReluScalarKernel<<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, stream>>>(
-      input, alpha, output, numel);
-}
-
-template class PreluChannelWiseDirectCUDAFunctor<float>;
-template class PreluChannelWiseDirectCUDAFunctor<phi::dtype::float16>;
-template class PreluChannelWiseDirectCUDAFunctor<double>;
-
-template class PreluElementWiseDirectCUDAFunctor<float>;
-template class PreluElementWiseDirectCUDAFunctor<phi::dtype::float16>;
-template class PreluElementWiseDirectCUDAFunctor<double>;
-
-template class PreluScalarDirectCUDAFunctor<float>;
-template class PreluScalarDirectCUDAFunctor<phi::dtype::float16>;
-template class PreluScalarDirectCUDAFunctor<double>;
-
+template <typename T>
+struct PReluChannelFirstWiseCUDAFunctor {
+  const T* x_;
+  const T* alpha_;
+  size_t channel_num_;
+  size_t plane_size_;
+  int numel_;
+
+  HOSTDEVICE inline PReluChannelFirstWiseCUDAFunctor(const T* x,
+                                                     const T* alpha,
+                                                     int numel,
+                                                     size_t channel_num,
+                                                     size_t plane_size)
+      : x_(x),
+        alpha_(alpha),
+        numel_(numel),
+        channel_num_(channel_num),
+        plane_size_(plane_size) {}
+
+  HOSTDEVICE inline T operator()(const unsigned int n) const {
+    T zero = static_cast<T>(0);
+    size_t temp = n / plane_size_;
+    size_t channel_index = temp % channel_num_;
+    T scale = alpha_[channel_index];
+    T x = x_[n];
+    return (x > zero) ? x : scale * x;
+  }
+};
+
+template <typename T>
+struct PReluChannelLastWiseCUDAFunctor {
+  const T* x_;
+  const T* alpha_;
+  size_t channel_num_;
+
+  HOSTDEVICE inline PReluChannelLastWiseCUDAFunctor(const T* x,
+                                                    const T* alpha,
+                                                    size_t channel_num)
+      : x_(x), alpha_(alpha), channel_num_(channel_num) {}
+
+  HOSTDEVICE inline T operator()(const unsigned int n) const {
+    T zero = static_cast<T>(0);
+    size_t channel_index = n % channel_num_;
+    T scale = alpha_[channel_index];
+    T x = x_[n];
+    return (x > zero) ? x : scale * x;
+  }
+};
+
+template <typename T>
+struct PreluElementWiseDirectCUDAFunctor {
+  const T* x_;
+  const T* alpha_;
+  size_t spatial_size_;
+
+  HOSTDEVICE inline PreluElementWiseDirectCUDAFunctor(const T* x,
+                                                      const T* alpha,
+                                                      size_t spatial_size)
+      : x_(x), alpha_(alpha), spatial_size_(spatial_size) {}
+
+  HOSTDEVICE inline T operator()(const unsigned int n) const {
+    T zero = static_cast<T>(0);
+    size_t element_index = n % spatial_size_;
+    T scale = alpha_[element_index];
+    T x = x_[n];
+    return (x > zero) ? x : scale * x;
+  }
+};
+
+template <typename T>
+struct PreluScalarDirectCUDAFunctor {
+  const T* scalar_;
+  HOSTDEVICE inline PreluScalarDirectCUDAFunctor(const T* scalar)
+      : scalar_(scalar) {}
+  HOSTDEVICE inline T operator()(const T x) const {
+    T zero = static_cast<T>(0);
+    return (x > zero) ? x : scalar_[0] * x;
+  }
+};
+
 }  // namespace phi
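For reference, a host-only sketch of the index math the new channel functors encode: with NCHW layout the channel of flat index n is (n / plane_size) % channel_num, while with NHWC it is n % channel_num. The shape and values below are invented purely for illustration and nothing in the snippet depends on Paddle.

```cuda
#include <cstdio>
#include <vector>

// PReLU as used throughout the diff: out = x > 0 ? x : alpha * x.
static float prelu(float x, float alpha) { return x > 0.f ? x : alpha * x; }

int main() {
  const size_t C = 2, plane_size = 4;                    // 2 channels, 2x2 plane
  std::vector<float> x = {-1, 2, -3, 4, -5, 6, -7, 8};   // flat buffer, N = 1
  std::vector<float> alpha = {0.1f, 0.5f};               // one slope per channel

  // Channel-first (NCHW): channel of element n is (n / plane_size) % C.
  for (size_t n = 0; n < x.size(); ++n)
    printf("%g ", prelu(x[n], alpha[(n / plane_size) % C]));
  printf("\n");

  // Channel-last (NHWC): channel of element n is n % C.
  for (size_t n = 0; n < x.size(); ++n)
    printf("%g ", prelu(x[n], alpha[n % C]));
  printf("\n");
  return 0;
}
```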
@@ -16,6 +16,8 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/funcs/index_impl.cu.h"
 #include "paddle/phi/kernels/gpu/prelu_funcs.h"
 
 namespace phi {
@@ -27,36 +29,43 @@ void PReluKernel(const Context& dev_ctx,
                  const std::string& data_format,
                  const std::string& mode,
                  DenseTensor* out) {
+  dev_ctx.template Alloc<T>(out);
   const T* x_ptr = x.data<T>();
-  T* o_ptr = dev_ctx.template Alloc<T>(out);
   const T* alpha_ptr = alpha.data<T>();
   int numel = x.numel();
   auto dim = x.dims();
   auto x_rank = dim.size();
 
   VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim["
-          << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel;
+          << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel
+          << ", mode:" << mode << ", format:" << data_format;
 
   if (mode == "channel") {
     bool channel_last = data_format == "NHWC";
     size_t channel = channel_last ? dim[x_rank - 1] : dim[1];
-    PreluChannelWiseDirectCUDAFunctor<T> prelu_channel_wise;
-    prelu_channel_wise(dev_ctx.stream(),
-                       x_ptr,
-                       alpha_ptr,
-                       o_ptr,
-                       dim[0],
-                       channel,
-                       channel_last,
-                       numel);
+    if (channel_last) {
+      auto func = PReluChannelLastWiseCUDAFunctor<T>(x_ptr, alpha_ptr, channel);
+      phi::IndexKernel<T, PReluChannelLastWiseCUDAFunctor<T>>(
+          dev_ctx, out, func);
+    } else {
+      size_t plane_size = numel / dim[0] / channel;
+      auto func = PReluChannelFirstWiseCUDAFunctor<T>(
+          x_ptr, alpha_ptr, numel, channel, plane_size);
+      phi::IndexKernel<T, PReluChannelFirstWiseCUDAFunctor<T>>(
+          dev_ctx, out, func);
+    }
   } else if (mode == "element") {
-    PreluElementWiseDirectCUDAFunctor<T> prelu_element_wise;
-    prelu_element_wise(
-        dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, dim[0], numel);
+    size_t spatial_size = numel / dim[0];
+    auto func =
+        PreluElementWiseDirectCUDAFunctor<T>(x_ptr, alpha_ptr, spatial_size);
+    phi::IndexKernel<T, PreluElementWiseDirectCUDAFunctor<T>>(
+        dev_ctx, out, func);
   } else {
-    PreluScalarDirectCUDAFunctor<T> prelu_scalar;
-    prelu_scalar(dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, numel);
+    std::vector<const DenseTensor*> ins = {&x};
+    std::vector<DenseTensor*> outs = {out};
+    auto func = PreluScalarDirectCUDAFunctor<T>(alpha_ptr);
+    phi::funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, func);
   }
 }
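With the functors above, the kernel no longer launches hand-written __global__ kernels; it hands the functor to phi::IndexKernel (channel and element modes) or phi::funcs::ElementwiseKernel (scalar mode), which reuse the vectorized index/elementwise machinery pulled in by the two new includes. The snippet below is a deliberately simplified stand-in for that idea, not phi::IndexKernel's real implementation; apply_by_index and ToyPRelu are invented for this sketch. It only shows one generic kernel evaluating out[n] = func(n) for an index functor shaped like those in prelu_funcs.h.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Generic index-driven launch: out[n] = func(n) for every flat index n.
template <typename T, typename Functor>
__global__ void apply_by_index(T *out, size_t numel, Functor func) {
  for (size_t n = blockIdx.x * blockDim.x + threadIdx.x; n < numel;
       n += static_cast<size_t>(gridDim.x) * blockDim.x) {
    out[n] = func(static_cast<unsigned int>(n));  // functor maps index to value
  }
}

// Toy channel-last functor with the same shape as the ones in prelu_funcs.h.
struct ToyPRelu {
  const float *x_;
  const float *alpha_;
  size_t channel_num_;
  __host__ __device__ float operator()(unsigned int n) const {
    float x = x_[n];
    return x > 0.f ? x : alpha_[n % channel_num_] * x;
  }
};

int main() {
  const size_t numel = 8, channels = 2;
  float h_x[numel] = {-1, 2, -3, 4, -5, 6, -7, 8};
  float h_alpha[channels] = {0.1f, 0.5f};
  float *d_x, *d_alpha, *d_out;
  cudaMalloc(&d_x, sizeof(h_x));
  cudaMalloc(&d_alpha, sizeof(h_alpha));
  cudaMalloc(&d_out, sizeof(h_x));
  cudaMemcpy(d_x, h_x, sizeof(h_x), cudaMemcpyHostToDevice);
  cudaMemcpy(d_alpha, h_alpha, sizeof(h_alpha), cudaMemcpyHostToDevice);

  apply_by_index<<<1, 128>>>(d_out, numel, ToyPRelu{d_x, d_alpha, channels});
  cudaMemcpy(h_x, d_out, sizeof(h_x), cudaMemcpyDeviceToHost);
  for (float v : h_x) printf("%g ", v);  // expect: -0.1 2 -0.3 4 -0.5 6 -0.7 8
  printf("\n");

  cudaFree(d_x);
  cudaFree(d_alpha);
  cudaFree(d_out);
  return 0;
}
```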