/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/platform/cuda_device_function.h" #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/miopen_helper.h" #else #include "paddle/fluid/platform/cudnn_helper.h" #endif namespace paddle { namespace operators { using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using DataLayout = platform::DataLayout; using Tensor = framework::Tensor; // Vectorization trait 4 * sizeof(T) template class VecT4 {}; template <> class VecT4 { public: using Type = long4; }; template <> class VecT4 { public: using Type = int4; }; template <> class VecT4 { public: using Type = int2; }; // Vectorization trait 2 * sizeof(T) template class VecT2 {}; template <> class VecT2 { public: using Type = int4; }; template <> class VecT2 { public: using Type = int2; }; template <> class VecT2 { public: using Type = int; }; static inline int log2_ceil(int value) { int log2_value = 0; while ((1 << log2_value) < value) ++log2_value; return log2_value; } template __device__ __forceinline__ void WarpReduceSum(T* sum) { #pragma unroll for (int offset = WarpSize / 2; offset > 0; offset /= 2) { #pragma unroll for (int i = 0; i < BatchSize; ++i) { T sum_val = platform::CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); sum[i] = sum[i] + sum_val; } } } template __device__ __forceinline__ void WarpReduceMax(T* sum) { #pragma unroll for (int offset = WarpSize / 2; offset > 0; offset /= 2) { #pragma unroll for (int i = 0; i < BatchSize; ++i) { T max_val = platform::CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); sum[i] = max(sum[i], max_val); } } } /* Core function of computing softmax forward for axis=-1. The computation includes - Compute maximum of batch: maxvalue_{i} = max_j src_{i,j} - Compute sum of exp batch: s_{i} = sum_{j}{ exp(src_{i,j} - maxvalue_{i} } - Compute: (a_{i,j} - maxvalue_{i}) / s_{i} One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). For reduction max (sum), firstly compute max (sum) to one warp, then use shuffle api to compute max (sum) in one warp. */ template __global__ void WarpSoftmaxForward(T* softmax, const T* src, const int batch_size, const int stride, const int element_count) { constexpr int kDimCeil = 1 << Log2Elements; constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; constexpr int kVSize = sizeof(VecT) / sizeof(T); constexpr int kIterations = kDimCeil / kWarpSize; constexpr int kIterationsV = (kIterations >= kVSize) ? (kIterations / kVSize) : 1; constexpr int kBatchSize = (kDimCeil <= 32) ? 2 : 1; int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * kBatchSize; // max index to read int idx_max_v[kBatchSize]; #pragma unroll for (int i = 0; i < kBatchSize; i++) { int idx_max = ((i + first_batch) < batch_size) ? element_count : 0; idx_max_v[i] = idx_max / kVSize; } // read data from global memory AccT srcdata[kBatchSize][kIterationsV][kVSize]; #pragma unroll for (int i = 0; i < kBatchSize; ++i) { // read data #pragma unroll for (int it = 0; it < kIterationsV; ++it) { int src_idx = threadIdx.x + it * kWarpSize; if (kVSize == 1) { if (src_idx < idx_max_v[i]) { srcdata[i][it][0] = static_cast(src[(first_batch + i) * stride + src_idx]); } else { srcdata[i][it][0] = -std::numeric_limits::infinity(); } } else { const VecT* src_v = reinterpret_cast(&src[(first_batch + i) * stride]); if (src_idx < idx_max_v[i]) { VecT srctmp = src_v[src_idx]; const T* srcinptr = reinterpret_cast(&srctmp); #pragma unroll for (int s = 0; s < kVSize; s++) { srcdata[i][it][s] = static_cast(srcinptr[s]); } } else { #pragma unroll for (int s = 0; s < kVSize; s++) { srcdata[i][it][s] = -std::numeric_limits::infinity(); } } } } } // compute max value AccT max_value[kBatchSize]; #pragma unroll for (int i = 0; i < kBatchSize; ++i) { // it = 0 AccT valmax = srcdata[i][0][0]; #pragma unroll for (int s = 1; s < kVSize; ++s) { valmax = (valmax > srcdata[i][0][s]) ? valmax : srcdata[i][0][s]; } max_value[i] = valmax; // it = 1, 2, ... #pragma unroll for (int it = 1; it < kIterationsV; ++it) { AccT valmax = srcdata[i][it][0]; #pragma unroll for (int s = 1; s < kVSize; ++s) { valmax = (valmax > srcdata[i][it][s]) ? valmax : srcdata[i][it][s]; } max_value[i] = (max_value[i] > valmax) ? max_value[i] : valmax; } } WarpReduceMax(max_value); // compute sum AccT sum[kBatchSize]; #pragma unroll for (int i = 0; i < kBatchSize; ++i) { // it = 0 if (LogMode) { sum[i] = std::exp(srcdata[i][0][0] - max_value[i]); } else { srcdata[i][0][0] = std::exp(srcdata[i][0][0] - max_value[i]); sum[i] = srcdata[i][0][0]; } #pragma unroll for (int s = 1; s < kVSize; ++s) { if (LogMode) { sum[i] += std::exp(srcdata[i][0][s] - max_value[i]); } else { srcdata[i][0][s] = std::exp(srcdata[i][0][s] - max_value[i]); sum[i] += srcdata[i][0][s]; } } // it = 1, 2, ... #pragma unroll for (int it = 1; it < kIterationsV; ++it) { #pragma unroll for (int s = 0; s < kVSize; ++s) { if (LogMode) { sum[i] += std::exp(srcdata[i][it][s] - max_value[i]); } else { srcdata[i][it][s] = std::exp(srcdata[i][it][s] - max_value[i]); sum[i] += srcdata[i][it][s]; } } } } WarpReduceSum(sum); // write result to global memory #pragma unroll for (int i = 0; i < kBatchSize; ++i) { if (LogMode) { sum[i] = std::log(sum[i]); } #pragma unroll for (int it = 0; it < kIterationsV; ++it) { int idx = threadIdx.x + it * kWarpSize; if (kVSize == 1) { if (idx < idx_max_v[i]) { if (LogMode) { softmax[(first_batch + i) * stride + idx] = srcdata[i][it][0] - max_value[i] - sum[i]; } else { softmax[(first_batch + i) * stride + idx] = srcdata[i][it][0] / sum[i]; } } else { break; } } else { VecT* softmax_v = reinterpret_cast(&softmax[(first_batch + i) * stride]); VecT tmpdata; T* tmpptr = reinterpret_cast(&tmpdata); #pragma unroll for (int s = 0; s < kVSize; ++s) { if (LogMode) { tmpptr[s] = srcdata[i][it][s] - max_value[i] - sum[i]; } else { tmpptr[s] = srcdata[i][it][s] / sum[i]; } } if (idx < idx_max_v[i]) { softmax_v[idx] = tmpdata; } else { break; } } } } } /* Core function of computing softmax backward for axis=-1. The computation includes - Compute sum of exp batch: s_{i} = sum_{j} {src_{i,j} * grad_{i,j} - Compute src_{i,j} * ( grad_{i,j}) - s_{i} ) One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). For reduction max (sum), firstly compute max (sum) to one warp, then use shuffle api to compute max (sum) in one warp. */ template __global__ void WarpSoftmaxBackward(T* dst, const T* grad, const T* src, int batch_size, int stride, int element_count) { constexpr int kVSize = sizeof(VecT) / sizeof(T); constexpr int kDimCeil = 1 << Log2Elements; constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; constexpr int kIterations = kDimCeil / kWarpSize; constexpr int kBatchSize = (kDimCeil <= 128) ? 2 : 1; constexpr int kIterationsV = (kIterations >= kVSize) ? (kIterations / kVSize) : 1; int element_count_v = element_count / kVSize; int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * kBatchSize; int local_batches = batch_size - first_batch; if (local_batches > kBatchSize) { local_batches = kBatchSize; } // read data from global memory VecT src_reg[kBatchSize][kIterationsV]; VecT grad_reg[kBatchSize][kIterationsV]; for (int i = 0; i < kBatchSize; ++i) { const VecT* src_v = reinterpret_cast(&src[(first_batch + i) * stride]); const VecT* grad_v = reinterpret_cast(&grad[(first_batch + i) * stride]); // max index to read int idx_max = (i < local_batches) ? element_count : 0; int idx_max_v = idx_max / kVSize; // read data for (int it = 0; it < kIterationsV; ++it) { int src_idx = threadIdx.x + it * kWarpSize; if (src_idx < idx_max_v) { src_reg[i][it] = src_v[src_idx]; grad_reg[i][it] = grad_v[src_idx]; } else { #pragma unroll for (int s = 0; s < kVSize; s++) { reinterpret_cast(&src_reg[i][it])[s] = 0.0; reinterpret_cast(&grad_reg[i][it])[s] = 0.0; } } } } // compute sum AccT sum[kBatchSize]{0.0}; #pragma unroll for (int i = 0; i < kBatchSize; ++i) { #pragma unroll for (int it = 0; it < kIterationsV; ++it) { T* gradptr = reinterpret_cast(&grad_reg[i][it]); T* srcptr = reinterpret_cast(&src_reg[i][it]); #pragma unroll for (int s = 0; s < kVSize; ++s) { if (LogMode) { sum[i] += static_cast(gradptr[s]); } else { sum[i] += static_cast(gradptr[s] * srcptr[s]); } } } } WarpReduceSum(sum); // write result #pragma unroll for (int i = 0; i < kBatchSize; ++i) { if (i >= local_batches) break; VecT* dst_v = reinterpret_cast(&dst[(first_batch + i) * stride]); // max index to write int idx_max = (i < local_batches) ? element_count : 0; int idx_max_v = idx_max / kVSize; #pragma unroll for (int it = 0; it < kIterationsV; ++it) { VecT tmpdata; T* tmpptr = reinterpret_cast(&tmpdata); T* gradptr = reinterpret_cast(&grad_reg[i][it]); T* srcptr = reinterpret_cast(&src_reg[i][it]); #pragma unroll for (int s = 0; s < kVSize; ++s) { if (LogMode) { tmpptr[s] = static_cast(gradptr[s]) - std::exp(static_cast(srcptr[s])) * sum[i]; } else { tmpptr[s] = static_cast(srcptr[s]) * (static_cast(gradptr[s]) - sum[i]); } } int idx = threadIdx.x + it * kWarpSize; if (idx < idx_max_v) { dst_v[idx] = tmpdata; } } } } #define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, AccT) \ case Log2Elements: \ WarpSoftmaxForward<<>>( \ dst, src, batch_size, stride, element_count); \ break; /* Wrapper of softmax formward with template instantiation on size of input. */ template void SwitchWarpSoftmaxForward(const int blocks, const dim3 threads, const platform::CUDADeviceContext& dev_ctx, T* dst, const T* src, const int batch_size, const int stride, const int element_count, int Log2Elements) { using AccT = typename details::MPTypeTrait::Type; switch (Log2Elements) { SOFTMAX_WARP_FORWARD_CASE(0, AccT); SOFTMAX_WARP_FORWARD_CASE(1, AccT); SOFTMAX_WARP_FORWARD_CASE(2, AccT); SOFTMAX_WARP_FORWARD_CASE(3, AccT); SOFTMAX_WARP_FORWARD_CASE(4, AccT); SOFTMAX_WARP_FORWARD_CASE(5, AccT); SOFTMAX_WARP_FORWARD_CASE(6, AccT); SOFTMAX_WARP_FORWARD_CASE(7, AccT); SOFTMAX_WARP_FORWARD_CASE(8, AccT); SOFTMAX_WARP_FORWARD_CASE(9, AccT); default: break; } } #define SOFTMAX_WARP_BACKWARD_CASE(Log2Elements, AccT) \ case Log2Elements: \ WarpSoftmaxBackward<<>>( \ dst, grad, src, batch_size, stride, element_count); \ break; /* Wrapper of softmax backward with template instantiation on size of input. */ template void SwitchWarpSoftmaxBackward(const int blocks, const dim3 threads, const platform::CUDADeviceContext& dev_ctx, T* dst, const T* grad, const T* src, const int batch_size, const int stride, const int element_count, int Log2Elements) { using AccT = typename details::MPTypeTrait::Type; switch (Log2Elements) { SOFTMAX_WARP_BACKWARD_CASE(0, AccT); SOFTMAX_WARP_BACKWARD_CASE(1, AccT); SOFTMAX_WARP_BACKWARD_CASE(2, AccT); SOFTMAX_WARP_BACKWARD_CASE(3, AccT); SOFTMAX_WARP_BACKWARD_CASE(4, AccT); SOFTMAX_WARP_BACKWARD_CASE(5, AccT); SOFTMAX_WARP_BACKWARD_CASE(6, AccT); SOFTMAX_WARP_BACKWARD_CASE(7, AccT); SOFTMAX_WARP_BACKWARD_CASE(8, AccT); SOFTMAX_WARP_BACKWARD_CASE(9, AccT); default: break; } } #undef SOFTMAX_WARP_FORWARD_CASE #undef SOFTMAX_WARP_BACKWARD_CASE template void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, const Tensor& x, const int input_axis, Tensor* out) { auto* out_data = out->data(); auto dims = x.dims(); const int rank = dims.size(); const int axis = CanonicalAxis(input_axis, rank); const int dim = dims[axis]; const int N = SizeToAxis(axis, dims); const int D = SizeOutAxis(axis, dims); constexpr int max_dim = 320; constexpr int warps_per_block = 4; if (D == 1 && dim <= max_dim && sizeof(T) <= 4) { const int kDimLog2 = static_cast(log2_ceil(dim)); const int kDimCeil = 1 << kDimLog2; int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; int batches_per_warp = (kDimCeil <= 32) ? 2 : 1; // use 128 threads per block to maximimize gpu utilization constexpr int threads_per_block = 128; int warps_per_block = (threads_per_block / kWarpSize); int batches_per_block = warps_per_block * batches_per_warp; int blocks = (N + batches_per_block - 1) / batches_per_block; dim3 threads(kWarpSize, warps_per_block, 1); // vectorization read/write using T4 = typename VecT4::Type; using T2 = typename VecT2::Type; if (dim % 4 == 0) { SwitchWarpSoftmaxForward(blocks, threads, dev_ctx, out_data, x.data(), N, dim, dim, kDimLog2); } else if (dim % 2 == 0) { SwitchWarpSoftmaxForward(blocks, threads, dev_ctx, out_data, x.data(), N, dim, dim, kDimLog2); } else { SwitchWarpSoftmaxForward(blocks, threads, dev_ctx, out_data, x.data(), N, dim, dim, kDimLog2); } } else { ScopedTensorDescriptor desc; std::vector tensor_dims = {N, dim, D, 1}; DataLayout layout = DataLayout::kNCHW; #ifdef PADDLE_WITH_HIP miopenTensorDescriptor_t desc_ = desc.descriptor(layout, tensor_dims); #else cudnnTensorDescriptor_t desc_ = desc.descriptor(layout, tensor_dims); #endif auto handle = dev_ctx.cudnn_handle(); #ifdef PADDLE_WITH_HIP auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; if (LogMode) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( handle, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data, MIOPEN_SOFTMAX_LOG, mode)); } else { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( handle, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data, MIOPEN_SOFTMAX_ACCURATE, mode)); } #else auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; if (LogMode) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data)); } else { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( handle, CUDNN_SOFTMAX_ACCURATE, mode, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data)); } #endif } } template void SoftmaxBackwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, const Tensor& out, const Tensor& dout, const int input_axis, Tensor* dx) { auto* dx_data = dx->data(); auto dims = out.dims(); const int rank = dims.size(); const int axis = CanonicalAxis(input_axis, rank); const int dim = dims[axis]; const int N = SizeToAxis(axis, dims); const int D = SizeOutAxis(axis, dims); constexpr int max_dim = 320; constexpr int warps_per_block = 4; if (D == 1 && dim <= max_dim && sizeof(T) <= 4) { const int kDimLog2 = log2_ceil(dim); const int kDimCeil = 1 << kDimLog2; int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; int batches_per_warp = (kDimCeil <= 128) ? 2 : 1; constexpr int threads_per_block = 128; int warps_per_block = (threads_per_block / kWarpSize); int batches_per_block = warps_per_block * batches_per_warp; int blocks = (N + batches_per_block - 1) / batches_per_block; dim3 threads(kWarpSize, warps_per_block, 1); // vectorization read/write using T4 = typename VecT4::Type; using T2 = typename VecT2::Type; if (dim % 4 == 0) { SwitchWarpSoftmaxBackward( blocks, threads, dev_ctx, dx_data, dout.data(), out.data(), N, dim, dim, kDimLog2); } else if (dim % 2 == 0) { SwitchWarpSoftmaxBackward( blocks, threads, dev_ctx, dx_data, dout.data(), out.data(), N, dim, dim, kDimLog2); } else { SwitchWarpSoftmaxBackward( blocks, threads, dev_ctx, dx_data, dout.data(), out.data(), N, dim, dim, kDimLog2); } } else { ScopedTensorDescriptor desc; std::vector tensor_dims = {N, dim, D, 1}; DataLayout layout = DataLayout::kNCHW; #ifdef PADDLE_WITH_HIP miopenTensorDescriptor_t desc_ = desc.descriptor(layout, tensor_dims); #else cudnnTensorDescriptor_t desc_ = desc.descriptor(layout, tensor_dims); #endif auto handle = dev_ctx.cudnn_handle(); #ifdef PADDLE_WITH_HIP auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; if (LogMode) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( handle, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data, MIOPEN_SOFTMAX_LOG, mode)); } else { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( handle, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data, MIOPEN_SOFTMAX_ACCURATE, mode)); } #else auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; if (LogMode) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward( handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data)); } else { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward( handle, CUDNN_SOFTMAX_ACCURATE, mode, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data)); } #endif } } } // namespace operators } // namespace paddle