diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
index c9c549379bbce6ceb4c2314f34f24ad659f5c272..0352fdf6fa2f1c1b74515d8e0023ef5a58e4efae 100644
--- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
+++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
@@ -79,7 +79,7 @@ class VecT2 {
   using Type = int;
 };
 
-static inline int log2_ceil(int value) {
+static inline int Log2Ceil(int value) {
   int log2_value = 0;
   while ((1 << log2_value) < value) ++log2_value;
   return log2_value;
@@ -577,8 +577,8 @@ static void GetBlockDim(int mid_dim, int low_dim, dim3* block) {
 #else
   constexpr int max_num_threads = 1024;
 #endif
-  int block_x = 1 << log2_ceil(low_dim);
-  int block_y = 1 << log2_ceil(mid_dim);
+  int block_x = 1 << Log2Ceil(low_dim);
+  int block_y = 1 << Log2Ceil(mid_dim);
   block->x = std::min(block_x, 32);
   block->y = std::min(block_y, static_cast<int>(max_num_threads / block->x));
   block->x = std::min(block_x, static_cast<int>(max_num_threads / block->y));
@@ -739,6 +739,131 @@ void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx,
   }
 }
 
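+// Collapse the shape around `axis` into {N, dim, D, 1}: N is the product of
+// dims before the axis, dim is dims[axis], and D is the product of dims
+// after it (D == 1 means softmax runs over the innermost axis).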
+static std::vector<int> GetSoftmaxTensorDims(const phi::DDim& dims,
+                                             const int axis) {
+  int dim = dims[axis];
+  int N = phi::funcs::SizeToAxis(axis, dims);
+  int D = phi::funcs::SizeOutAxis(axis, dims);
+  return {N, dim, D, 1};
+}
+
+template <typename T>
+void SoftmaxForwardCudnnKernel(const GPUContext& dev_ctx,
+                               const DenseTensor& x,
+                               const int axis,
+                               const bool log_mode,
+                               DenseTensor* out) {
+  auto* out_data = out->data<T>();
+
+  const int rank = x.dims().size();
+  std::vector<int> tensor_dims = GetSoftmaxTensorDims(x.dims(), axis);
+
+  auto handle = dev_ctx.cudnn_handle();
+  GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
+
+  ScopedTensorDescriptor scoped_desc;
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t desc =
+      scoped_desc.descriptor<T>(layout, tensor_dims);
+  auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
+                               : MIOPEN_SOFTMAX_MODE_CHANNEL;
+  auto algo = log_mode ? MIOPEN_SOFTMAX_LOG : MIOPEN_SOFTMAX_ACCURATE;
+  PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenSoftmaxForward_V2(
+      handle,
+      paddle::platform::CudnnDataType<T>::kOne(),
+      desc,
+      x.data<T>(),
+      paddle::platform::CudnnDataType<T>::kZero(),
+      desc,
+      out_data,
+      algo,
+      mode));
+#else
+  cudnnTensorDescriptor_t desc = scoped_desc.descriptor<T>(layout, tensor_dims);
+  auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
+                               : CUDNN_SOFTMAX_MODE_CHANNEL;
+  auto algo = log_mode ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
+  PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxForward(
+      handle,
+      algo,
+      mode,
+      paddle::platform::CudnnDataType<T>::kOne(),
+      desc,
+      x.data<T>(),
+      paddle::platform::CudnnDataType<T>::kZero(),
+      desc,
+      out_data));
+#endif
+}
+
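+// Backward counterpart of SoftmaxForwardCudnnKernel: computes dx from
+// (out, dout) through the cuDNN/MIOpen softmax-backward entry points, with
+// the same mode/algo selection as the forward pass.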
+template <typename T>
+void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx,
+                                const DenseTensor& out,
+                                const DenseTensor& dout,
+                                const int axis,
+                                const bool log_mode,
+                                DenseTensor* dx) {
+  auto* dx_data = dx->data<T>();
+
+  int rank = out.dims().size();
+  std::vector<int> tensor_dims = GetSoftmaxTensorDims(out.dims(), axis);
+
+  auto handle = dev_ctx.cudnn_handle();
+  GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
+
+  ScopedTensorDescriptor scoped_desc;
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t desc =
+      scoped_desc.descriptor<T>(layout, tensor_dims);
+  auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
+                               : MIOPEN_SOFTMAX_MODE_CHANNEL;
+  auto algo = log_mode ? MIOPEN_SOFTMAX_LOG : MIOPEN_SOFTMAX_ACCURATE;
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      paddle::platform::dynload::miopenSoftmaxBackward_V2(
+          handle,
+          paddle::platform::CudnnDataType<T>::kOne(),
+          desc,
+          out.data<T>(),
+          desc,
+          dout.data<T>(),
+          paddle::platform::CudnnDataType<T>::kZero(),
+          desc,
+          dx_data,
+          algo,
+          mode));
+#else
+  cudnnTensorDescriptor_t desc = scoped_desc.descriptor<T>(layout, tensor_dims);
+  auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
+                               : CUDNN_SOFTMAX_MODE_CHANNEL;
+  auto algo = log_mode ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
+  PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxBackward(
+      handle,
+      algo,
+      mode,
+      paddle::platform::CudnnDataType<T>::kOne(),
+      desc,
+      out.data<T>(),
+      desc,
+      dout.data<T>(),
+      paddle::platform::CudnnDataType<T>::kZero(),
+      desc,
+      dx_data));
+#endif
+}
+
+template <typename T>
+static bool CanUseCudnnSoftmax(const GPUContext& dev_ctx) {
+  if (dev_ctx.cudnn_handle() != nullptr) {
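+    // bfloat16 softmax goes through cuDNN only when built against
+    // cuDNN 8.1+; older builds fall back to the hand-written kernels.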
+    if (std::is_same<T, phi::dtype::bfloat16>::value) {
+#if CUDNN_VERSION < 8100
+      return false;
+#endif
+    }
+    return true;
+  }
+  return false;
+}
+
 template <typename T, bool LogMode = false>
 void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                     const DenseTensor& x,
@@ -746,29 +871,29 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                     DenseTensor* out) {
   auto* out_data = out->data<T>();
 
-  auto dims = x.dims();
-  const int rank = dims.size();
-  const int axis = phi::funcs::CanonicalAxis(input_axis, rank);
-  const int dim = dims[axis];
-  const int N = phi::funcs::SizeToAxis(axis, dims);
-  const int D = phi::funcs::SizeOutAxis(axis, dims);
+  int rank = x.dims().size();
+  int axis = phi::funcs::CanonicalAxis(input_axis, rank);
+  std::vector<int> tensor_dims = GetSoftmaxTensorDims(x.dims(), axis);
+  int N = tensor_dims[0];
+  int dim = tensor_dims[1];
+  int D = tensor_dims[2];
 
   constexpr int max_dim = 512;
-  constexpr int warps_per_block = 4;
 
-  if (D == 1 && dim <= max_dim && sizeof(T) <= 4) {
-    const int kDimLog2 = static_cast<int>(log2_ceil(dim));
-    const int kDimCeil = 1 << kDimLog2;
-    int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
-    int batches_per_warp = (kDimCeil <= 32) ? 2 : 1;
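+  // Take the warp-level kernel when softmax runs over the last axis (D == 1)
+  // and either cuDNN cannot be used or the axis is small enough for it
+  // (dim <= 512 with a 4-byte-or-smaller T); the remaining D == 1 cases fall
+  // through to the cuDNN path below.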
+  if (D == 1 &&
+      (!CanUseCudnnSoftmax<T>(dev_ctx) || (dim <= max_dim && sizeof(T) <= 4))) {
+    int dim_log2 = static_cast<int>(Log2Ceil(dim));
+    int dim_ceil = 1 << dim_log2;
+    int warp_size = (dim_ceil < 32) ? dim_ceil : 32;
+    int batches_per_warp = (dim_ceil <= 32) ? 2 : 1;
 
     // use 128 threads per block to maximize gpu utilization
     constexpr int threads_per_block = 128;
 
-    int warps_per_block = (threads_per_block / kWarpSize);
+    int warps_per_block = (threads_per_block / warp_size);
     int batches_per_block = warps_per_block * batches_per_warp;
     int blocks = (N + batches_per_block - 1) / batches_per_block;
-    dim3 threads(kWarpSize, warps_per_block, 1);
+    dim3 threads(warp_size, warps_per_block, 1);
 
     // vectorization read/write
     using T4 = typename VecT4<T>::Type;
@@ -783,7 +908,7 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                N,
                                                dim,
                                                dim,
-                                               kDimLog2);
+                                               dim_log2);
     } else if (dim % 2 == 0) {
       SwitchWarpSoftmaxForward<T, T2, LogMode>(blocks,
                                                threads,
@@ -793,7 +918,7 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                N,
                                                dim,
                                                dim,
-                                               kDimLog2);
+                                               dim_log2);
     } else {
       SwitchWarpSoftmaxForward<T, T, LogMode>(blocks,
                                               threads,
@@ -803,78 +928,13 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                              N,
                                              dim,
                                              dim,
-                                             kDimLog2);
+                                             dim_log2);
     }
   } else if (D > 1) {
     LaunchNormalSoftmaxForward<T, LogMode>(
         dev_ctx, out_data, x.data<T>(), N, dim, D);
   } else {
-    ScopedTensorDescriptor desc;
-    std::vector<int> tensor_dims = {N, dim, D, 1};
-    GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
-#ifdef PADDLE_WITH_HIP
-    miopenTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
-#else
-    cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
-#endif
-
-    auto handle = dev_ctx.cudnn_handle();
-
-#ifdef PADDLE_WITH_HIP
-    auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
-                                 : MIOPEN_SOFTMAX_MODE_CHANNEL;
-    if (LogMode) {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::miopenSoftmaxForward_V2(
-              handle,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              x.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              out_data,
-              MIOPEN_SOFTMAX_LOG,
-              mode));
-    } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::miopenSoftmaxForward_V2(
-              handle,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              x.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              out_data,
-              MIOPEN_SOFTMAX_ACCURATE,
-              mode));
-    }
-#else
-    auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
-                                 : CUDNN_SOFTMAX_MODE_CHANNEL;
-    if (LogMode) {
-      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxForward(
-          handle,
-          CUDNN_SOFTMAX_LOG,
-          mode,
-          paddle::platform::CudnnDataType<T>::kOne(),
-          desc_,
-          x.data<T>(),
-          paddle::platform::CudnnDataType<T>::kZero(),
-          desc_,
-          out_data));
-    } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxForward(
-          handle,
-          CUDNN_SOFTMAX_ACCURATE,
-          mode,
-          paddle::platform::CudnnDataType<T>::kOne(),
-          desc_,
-          x.data<T>(),
-          paddle::platform::CudnnDataType<T>::kZero(),
-          desc_,
-          out_data));
-    }
-#endif
+    SoftmaxForwardCudnnKernel<T>(dev_ctx, x, axis, LogMode, out);
   }
 }
 
@@ -886,27 +946,28 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                     DenseTensor* dx) {
   auto* dx_data = dx->data<T>();
 
-  auto dims = out.dims();
-  const int rank = dims.size();
-  const int axis = phi::funcs::CanonicalAxis(input_axis, rank);
-  const int dim = dims[axis];
-  const int N = phi::funcs::SizeToAxis(axis, dims);
-  const int D = phi::funcs::SizeOutAxis(axis, dims);
+  int rank = out.dims().size();
+  int axis = phi::funcs::CanonicalAxis(input_axis, rank);
+  std::vector<int> tensor_dims = GetSoftmaxTensorDims(out.dims(), axis);
+  int N = tensor_dims[0];
+  int dim = tensor_dims[1];
+  int D = tensor_dims[2];
 
   constexpr int max_dim = 512;
-  constexpr int warps_per_block = 4;
 
-  if (D == 1 && dim <= max_dim && sizeof(T) <= 4) {
-    const int kDimLog2 = log2_ceil(dim);
-    const int kDimCeil = 1 << kDimLog2;
-    int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
-    int batches_per_warp = (kDimCeil <= 128) ? 2 : 1;
+  if (D == 1 &&
+      (!CanUseCudnnSoftmax<T>(dev_ctx) || (dim <= max_dim && sizeof(T) <= 4))) {
+    int dim_log2 = Log2Ceil(dim);
+    int dim_ceil = 1 << dim_log2;
+    int warp_size = (dim_ceil < 32) ? dim_ceil : 32;
+    int batches_per_warp = (dim_ceil <= 128) ? 2 : 1;
+
     constexpr int threads_per_block = 128;
 
-    int warps_per_block = (threads_per_block / kWarpSize);
+    int warps_per_block = (threads_per_block / warp_size);
     int batches_per_block = warps_per_block * batches_per_warp;
     int blocks = (N + batches_per_block - 1) / batches_per_block;
-    dim3 threads(kWarpSize, warps_per_block, 1);
+    dim3 threads(warp_size, warps_per_block, 1);
 
     // vectorization read/write
     using T4 = typename VecT4<T>::Type;
@@ -921,7 +982,7 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                 N,
                                                 dim,
                                                 dim,
-                                                kDimLog2);
+                                                dim_log2);
     } else if (dim % 2 == 0) {
       SwitchWarpSoftmaxBackward<T, T2, LogMode>(blocks,
                                                 threads,
@@ -932,7 +993,7 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                 N,
                                                 dim,
                                                 dim,
-                                                kDimLog2);
+                                                dim_log2);
     } else {
       SwitchWarpSoftmaxBackward<T, T, LogMode>(blocks,
                                                threads,
@@ -943,88 +1004,13 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                N,
                                                dim,
                                                dim,
-                                               kDimLog2);
+                                               dim_log2);
     }
   } else if (D > 1) {
     LaunchNormalSoftmaxBackward<T, LogMode>(
         dev_ctx, dx_data, dout.data<T>(), out.data<T>(), N, dim, D);
   } else {
-    ScopedTensorDescriptor desc;
-    std::vector<int> tensor_dims = {N, dim, D, 1};
-    GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;
-#ifdef PADDLE_WITH_HIP
-    miopenTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
-#else
-    cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
-#endif
-
-    auto handle = dev_ctx.cudnn_handle();
-
-#ifdef PADDLE_WITH_HIP
-    auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
-                                 : MIOPEN_SOFTMAX_MODE_CHANNEL;
-    if (LogMode) {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::miopenSoftmaxBackward_V2(
-              handle,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              out.data<T>(),
-              desc_,
-              dout.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              dx_data,
-              MIOPEN_SOFTMAX_LOG,
-              mode));
-    } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::miopenSoftmaxBackward_V2(
-              handle,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              out.data<T>(),
-              desc_,
-              dout.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              dx_data,
-              MIOPEN_SOFTMAX_ACCURATE,
-              mode));
-    }
-#else
-    auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
-                                 : CUDNN_SOFTMAX_MODE_CHANNEL;
-    if (LogMode) {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::cudnnSoftmaxBackward(
-              handle,
-              CUDNN_SOFTMAX_LOG,
-              mode,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              out.data<T>(),
-              desc_,
-              dout.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              dx_data));
-    } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::cudnnSoftmaxBackward(
-              handle,
-              CUDNN_SOFTMAX_ACCURATE,
-              mode,
-              paddle::platform::CudnnDataType<T>::kOne(),
-              desc_,
-              out.data<T>(),
-              desc_,
-              dout.data<T>(),
-              paddle::platform::CudnnDataType<T>::kZero(),
-              desc_,
-              dx_data));
-    }
-#endif
+    SoftmaxBackwardCudnnKernel<T>(dev_ctx, out, dout, axis, LogMode, dx);
   }
 }
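
Usage note (not part of the diff): the refactor keeps both driver signatures unchanged, so existing call sites compile as before. A minimal caller sketch, assuming the drivers live in namespace `phi` (as the header path suggests) and that `dev_ctx`, `x`, and `out` are an initialized `phi::GPUContext` and pre-allocated `DenseTensor`s:

    #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"

    // Softmax over the last axis: CanonicalAxis maps -1 to rank - 1, then the
    // driver picks the warp kernel (dim <= 512, 4-byte T), the D > 1 kernel,
    // or the new SoftmaxForwardCudnnKernel path.
    phi::SoftmaxForwardCUDAKernelDriver<float>(dev_ctx, x, /*input_axis=*/-1, &out);

    // LogMode = true selects the log-softmax variants along every path.
    phi::SoftmaxForwardCUDAKernelDriver<float, /*LogMode=*/true>(dev_ctx, x, -1, &out);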