Unverified commit 34d4b40d, authored by Yiqun Liu, committed by GitHub

Simplify the softmax kernel and add the check of whether cudnn softmax can be used. (#40424)

Parent f452ad5c
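To make the new dispatch in this commit easier to follow, here is a minimal standalone C++ sketch of the decision logic only. It mirrors the branch structure that the diff below introduces in SoftmaxForwardCUDAKernelDriver, but the Route enum, the ChooseSoftmaxRoute helper, and the sizeof_t / cudnn_available parameters are illustrative stand-ins, not part of the Paddle code; in the real kernel the cuDNN availability comes from CanUseCudnnSoftmax<T>(dev_ctx) and the element size from sizeof(T).

    #include <iostream>

    enum class Route { WarpSoftmax, NormalSoftmax, Cudnn };

    // Sketch of the post-commit routing: the warp kernel handles the inner
    // axis (D == 1) when cuDNN cannot be used, or when the axis is short
    // (dim <= 512) and the element type is at most 4 bytes; D > 1 uses the
    // "normal" kernels; everything else goes to the cuDNN softmax helpers.
    Route ChooseSoftmaxRoute(int dim, int D, int sizeof_t, bool cudnn_available) {
      constexpr int max_dim = 512;
      if (D == 1 &&
          (!cudnn_available || (dim <= max_dim && sizeof_t <= 4))) {
        return Route::WarpSoftmax;
      } else if (D > 1) {
        return Route::NormalSoftmax;
      }
      return Route::Cudnn;
    }

    int main() {
      // float over the last axis of a [128, 256] tensor: dim = 256, D = 1,
      // sizeof(float) = 4, cuDNN usable -> warp softmax path (prints 0).
      std::cout << static_cast<int>(ChooseSoftmaxRoute(256, 1, 4, true)) << "\n";
      // Same layout but dim = 4096 exceeds max_dim, so the driver now falls
      // through to the cuDNN kernel instead (prints 2).
      std::cout << static_cast<int>(ChooseSoftmaxRoute(4096, 1, 4, true)) << "\n";
      return 0;
    }

Note that CanUseCudnnSoftmax<T> in the diff also reports false for bfloat16 when cuDNN is older than 8.1, so in that case the non-cuDNN kernels are kept even for long inner axes.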
@@ -79,7 +79,7 @@ class VecT2<phi::dtype::bfloat16> {
   using Type = int;
 };
-static inline int log2_ceil(int value) {
+static inline int Log2Ceil(int value) {
   int log2_value = 0;
   while ((1 << log2_value) < value) ++log2_value;
   return log2_value;
@@ -577,8 +577,8 @@ static void GetBlockDim(int mid_dim, int low_dim, dim3* block) {
 #else
   constexpr int max_num_threads = 1024;
 #endif
-  int block_x = 1 << log2_ceil(low_dim);
-  int block_y = 1 << log2_ceil(mid_dim);
+  int block_x = 1 << Log2Ceil(low_dim);
+  int block_y = 1 << Log2Ceil(mid_dim);
   block->x = std::min(block_x, 32);
   block->y = std::min(block_y, static_cast<int>(max_num_threads / block->x));
   block->x = std::min(block_x, static_cast<int>(max_num_threads / block->y));
@@ -739,6 +739,131 @@ void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx,
   }
 }
+static std::vector<int> GetSoftmaxTensorDims(const phi::DDim& dims,
+                                             const int axis) {
+  int dim = dims[axis];
+  int N = phi::funcs::SizeToAxis(axis, dims);
+  int D = phi::funcs::SizeOutAxis(axis, dims);
+  return {N, dim, D, 1};
+}
+
+template <typename T>
+void SoftmaxForwardCudnnKernel(const GPUContext& dev_ctx,
+                               const DenseTensor& x,
+                               const int axis,
+                               const bool log_mode,
+                               DenseTensor* out) {
+  auto* out_data = out->data<T>();
+  const int rank = x.dims().size();
+  std::vector<int> tensor_dims = GetSoftmaxTensorDims(x.dims(), axis);
+  auto handle = dev_ctx.cudnn_handle();
+  GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
+  ScopedTensorDescriptor scoped_desc;
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t desc =
+      scoped_desc.descriptor<T>(layout, tensor_dims);
+  auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
+                               : MIOPEN_SOFTMAX_MODE_CHANNEL;
+  auto algo = log_mode ? MIOPEN_SOFTMAX_LOG : MIOPEN_SOFTMAX_ACCURATE;
+  PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenSoftmaxForward_V2(
+      handle,
+      paddle::platform::CudnnDataType<T>::kOne(),
+      desc,
+      x.data<T>(),
+      paddle::platform::CudnnDataType<T>::kZero(),
+      desc,
+      out_data,
+      algo,
+      mode));
+#else
+  cudnnTensorDescriptor_t desc = scoped_desc.descriptor<T>(layout, tensor_dims);
+  auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
+                               : CUDNN_SOFTMAX_MODE_CHANNEL;
+  auto algo = log_mode ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
+  PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxForward(
+      handle,
+      algo,
+      mode,
+      paddle::platform::CudnnDataType<T>::kOne(),
+      desc,
+      x.data<T>(),
+      paddle::platform::CudnnDataType<T>::kZero(),
+      desc,
+      out_data));
+#endif
+}
+
+template <typename T>
+void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx,
+                                const DenseTensor& out,
+                                const DenseTensor& dout,
+                                const int axis,
+                                const bool log_mode,
+                                DenseTensor* dx) {
+  auto* dx_data = dx->data<T>();
+  int rank = out.dims().size();
+  std::vector<int> tensor_dims = GetSoftmaxTensorDims(out.dims(), axis);
+  auto handle = dev_ctx.cudnn_handle();
+  GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
+  ScopedTensorDescriptor scoped_desc;
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t desc =
+      scoped_desc.descriptor<T>(layout, tensor_dims);
+  auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
+                               : MIOPEN_SOFTMAX_MODE_CHANNEL;
+  auto algo = log_mode ? MIOPEN_SOFTMAX_LOG : MIOPEN_SOFTMAX_ACCURATE;
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      paddle::platform::dynload::miopenSoftmaxBackward_V2(
+          handle,
+          paddle::platform::CudnnDataType<T>::kOne(),
+          desc,
+          out.data<T>(),
+          desc,
+          dout.data<T>(),
+          paddle::platform::CudnnDataType<T>::kZero(),
+          desc,
+          dx_data,
+          algo,
+          mode));
+#else
+  cudnnTensorDescriptor_t desc = scoped_desc.descriptor<T>(layout, tensor_dims);
+  auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
+                               : CUDNN_SOFTMAX_MODE_CHANNEL;
+  auto algo = log_mode ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
+  PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxBackward(
+      handle,
+      algo,
+      mode,
+      paddle::platform::CudnnDataType<T>::kOne(),
+      desc,
+      out.data<T>(),
+      desc,
+      dout.data<T>(),
+      paddle::platform::CudnnDataType<T>::kZero(),
+      desc,
+      dx_data));
+#endif
+}
+
+template <typename T>
+static bool CanUseCudnnSoftmax(const GPUContext& dev_ctx) {
+  if (dev_ctx.cudnn_handle() != nullptr) {
+    if (std::is_same<T, phi::dtype::bfloat16>::value) {
+#if CUDNN_VERSION < 8100
+      return false;
+#endif
+    }
+    return true;
+  }
+  return false;
+}
 template <typename T, bool LogMode = false>
 void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                     const DenseTensor& x,
@@ -746,29 +871,29 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                     DenseTensor* out) {
   auto* out_data = out->data<T>();
-  auto dims = x.dims();
-  const int rank = dims.size();
-  const int axis = phi::funcs::CanonicalAxis(input_axis, rank);
-  const int dim = dims[axis];
-  const int N = phi::funcs::SizeToAxis(axis, dims);
-  const int D = phi::funcs::SizeOutAxis(axis, dims);
+  int rank = x.dims().size();
+  int axis = phi::funcs::CanonicalAxis(input_axis, rank);
+  std::vector<int> tensor_dims = GetSoftmaxTensorDims(x.dims(), axis);
+  int N = tensor_dims[0];
+  int dim = tensor_dims[1];
+  int D = tensor_dims[2];
   constexpr int max_dim = 512;
-  constexpr int warps_per_block = 4;
-  if (D == 1 && dim <= max_dim && sizeof(T) <= 4) {
-    const int kDimLog2 = static_cast<int>(log2_ceil(dim));
-    const int kDimCeil = 1 << kDimLog2;
-    int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
-    int batches_per_warp = (kDimCeil <= 32) ? 2 : 1;
+  if (D == 1 &&
+      (!CanUseCudnnSoftmax<T>(dev_ctx) || (dim <= max_dim && sizeof(T) <= 4))) {
+    int dim_log2 = static_cast<int>(Log2Ceil(dim));
+    int dim_ceil = 1 << dim_log2;
+    int warp_size = (dim_ceil < 32) ? dim_ceil : 32;
+    int batches_per_warp = (dim_ceil <= 32) ? 2 : 1;
     // use 128 threads per block to maximimize gpu utilization
     constexpr int threads_per_block = 128;
-    int warps_per_block = (threads_per_block / kWarpSize);
+    int warps_per_block = (threads_per_block / warp_size);
     int batches_per_block = warps_per_block * batches_per_warp;
     int blocks = (N + batches_per_block - 1) / batches_per_block;
-    dim3 threads(kWarpSize, warps_per_block, 1);
+    dim3 threads(warp_size, warps_per_block, 1);
     // vectorization read/write
     using T4 = typename VecT4<T>::Type;
@@ -783,7 +908,7 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                N,
                                                dim,
                                                dim,
-                                               kDimLog2);
+                                               dim_log2);
     } else if (dim % 2 == 0) {
       SwitchWarpSoftmaxForward<T, T2, LogMode>(blocks,
                                                threads,
@@ -793,7 +918,7 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                N,
                                                dim,
                                                dim,
-                                               kDimLog2);
+                                               dim_log2);
     } else {
       SwitchWarpSoftmaxForward<T, T, LogMode>(blocks,
                                               threads,
@@ -803,78 +928,13 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                               N,
                                               dim,
                                               dim,
-                                              kDimLog2);
+                                              dim_log2);
     }
   } else if (D > 1) {
     LaunchNormalSoftmaxForward<T, LogMode>(
         dev_ctx, out_data, x.data<T>(), N, dim, D);
   } else {
-    ScopedTensorDescriptor desc;
-    std::vector<int> tensor_dims = {N, dim, D, 1};
-    GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
-#ifdef PADDLE_WITH_HIP
-    miopenTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
-#else
-    cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
-#endif
-    auto handle = dev_ctx.cudnn_handle();
-#ifdef PADDLE_WITH_HIP
-    auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
-                                 : MIOPEN_SOFTMAX_MODE_CHANNEL;
-    if (LogMode) {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::miopenSoftmaxForward_V2(
-              handle,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              x.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              out_data,
-              MIOPEN_SOFTMAX_LOG,
-              mode));
-    } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::miopenSoftmaxForward_V2(
-              handle,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              x.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              out_data,
-              MIOPEN_SOFTMAX_ACCURATE,
-              mode));
-    }
-#else
-    auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
-                                 : CUDNN_SOFTMAX_MODE_CHANNEL;
-    if (LogMode) {
-      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxForward(
-          handle,
-          CUDNN_SOFTMAX_LOG,
-          mode,
-          paddle::platform::CudnnDataType<T>::kOne(),
-          desc_,
-          x.data<T>(),
-          paddle::platform::CudnnDataType<T>::kZero(),
-          desc_,
-          out_data));
-    } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxForward(
-          handle,
-          CUDNN_SOFTMAX_ACCURATE,
-          mode,
-          paddle::platform::CudnnDataType<T>::kOne(),
-          desc_,
-          x.data<T>(),
-          paddle::platform::CudnnDataType<T>::kZero(),
-          desc_,
-          out_data));
-    }
-#endif
+    SoftmaxForwardCudnnKernel<T>(dev_ctx, x, axis, LogMode, out);
   }
 }
@@ -886,27 +946,28 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                      DenseTensor* dx) {
   auto* dx_data = dx->data<T>();
-  auto dims = out.dims();
-  const int rank = dims.size();
-  const int axis = phi::funcs::CanonicalAxis(input_axis, rank);
-  const int dim = dims[axis];
-  const int N = phi::funcs::SizeToAxis(axis, dims);
-  const int D = phi::funcs::SizeOutAxis(axis, dims);
+  int rank = out.dims().size();
+  int axis = phi::funcs::CanonicalAxis(input_axis, rank);
+  std::vector<int> tensor_dims = GetSoftmaxTensorDims(out.dims(), axis);
+  int N = tensor_dims[0];
+  int dim = tensor_dims[1];
+  int D = tensor_dims[2];
   constexpr int max_dim = 512;
-  constexpr int warps_per_block = 4;
-  if (D == 1 && dim <= max_dim && sizeof(T) <= 4) {
-    const int kDimLog2 = log2_ceil(dim);
-    const int kDimCeil = 1 << kDimLog2;
-    int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
-    int batches_per_warp = (kDimCeil <= 128) ? 2 : 1;
+  if (D == 1 &&
+      (!CanUseCudnnSoftmax<T>(dev_ctx) || (dim <= max_dim && sizeof(T) <= 4))) {
+    int dim_log2 = Log2Ceil(dim);
+    int dim_ceil = 1 << dim_log2;
+    int warp_size = (dim_ceil < 32) ? dim_ceil : 32;
+    int batches_per_warp = (dim_ceil <= 128) ? 2 : 1;
     constexpr int threads_per_block = 128;
-    int warps_per_block = (threads_per_block / kWarpSize);
+    int warps_per_block = (threads_per_block / warp_size);
     int batches_per_block = warps_per_block * batches_per_warp;
     int blocks = (N + batches_per_block - 1) / batches_per_block;
-    dim3 threads(kWarpSize, warps_per_block, 1);
+    dim3 threads(warp_size, warps_per_block, 1);
     // vectorization read/write
     using T4 = typename VecT4<T>::Type;
@@ -921,7 +982,7 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                 N,
                                                 dim,
                                                 dim,
-                                                kDimLog2);
+                                                dim_log2);
     } else if (dim % 2 == 0) {
       SwitchWarpSoftmaxBackward<T, T2, LogMode>(blocks,
                                                 threads,
@@ -932,7 +993,7 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                 N,
                                                 dim,
                                                 dim,
-                                                kDimLog2);
+                                                dim_log2);
     } else {
       SwitchWarpSoftmaxBackward<T, T, LogMode>(blocks,
                                                threads,
@@ -943,88 +1004,13 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                N,
                                                dim,
                                                dim,
-                                               kDimLog2);
+                                               dim_log2);
     }
   } else if (D > 1) {
     LaunchNormalSoftmaxBackward<T, LogMode>(
         dev_ctx, dx_data, dout.data<T>(), out.data<T>(), N, dim, D);
   } else {
-    ScopedTensorDescriptor desc;
-    std::vector<int> tensor_dims = {N, dim, D, 1};
-    GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
-#ifdef PADDLE_WITH_HIP
-    miopenTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
-#else
-    cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
-#endif
-    auto handle = dev_ctx.cudnn_handle();
-#ifdef PADDLE_WITH_HIP
-    auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
-                                 : MIOPEN_SOFTMAX_MODE_CHANNEL;
-    if (LogMode) {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::miopenSoftmaxBackward_V2(
-              handle,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              out.data<T>(),
-              desc_,
-              dout.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              dx_data,
-              MIOPEN_SOFTMAX_LOG,
-              mode));
-    } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::miopenSoftmaxBackward_V2(
-              handle,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              out.data<T>(),
-              desc_,
-              dout.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              dx_data,
-              MIOPEN_SOFTMAX_ACCURATE,
-              mode));
-    }
-#else
-    auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
-                                 : CUDNN_SOFTMAX_MODE_CHANNEL;
-    if (LogMode) {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::cudnnSoftmaxBackward(
-              handle,
-              CUDNN_SOFTMAX_LOG,
-              mode,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              out.data<T>(),
-              desc_,
-              dout.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              dx_data));
-    } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::cudnnSoftmaxBackward(
-              handle,
-              CUDNN_SOFTMAX_ACCURATE,
-              mode,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              out.data<T>(),
-              desc_,
-              dout.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              dx_data));
-    }
-#endif
+    SoftmaxBackwardCudnnKernel<T>(dev_ctx, out, dout, axis, LogMode, dx);
   }
 }
...