diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu
index 6015266dde9e7af0ca8523ec97e661efb507317b..7dae6f49a37b273392d17c740b57dc8e38e6739f 100644
--- a/paddle/phi/kernels/funcs/fc_functor.cu
+++ b/paddle/phi/kernels/funcs/fc_functor.cu
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <algorithm>
 
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/fc_functor.h"
 
@@ -127,37 +128,54 @@ void AddReluKernel(
 }
 
 #if defined(PADDLE_WITH_CUDA)
-template <bool DoRelu>
-__global__ void bias_relu_v2(const int num,
-                             const half2* bias,
-                             half2* data,
-                             int K) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+template <bool DoRelu, int Half2VecSize>
+__global__ void bias_relu_v4_half2(const int num,
+                                   const half2* bias,
+                                   half2* data,
+                                   int K) {
+  using LoadT = phi::AlignedVector<half2, Half2VecSize>;
+  LoadT data_vec;
+  LoadT bias_vec;
+  const int32_t global_thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int32_t grid_stride = gridDim.x * blockDim.x;
 
-  if (tid < num) {
-    int bias_idx = tid % K;
-    const half2 bias_ptr = bias[bias_idx];
-    const half2 in_ptr = data[tid];
-    half2 packed_val;
+  for (int32_t linear_idx = global_thread_idx * Half2VecSize; linear_idx < num;
+       linear_idx += grid_stride * Half2VecSize) {
+    phi::Load(&data[linear_idx], &data_vec);
+    const int bias_idx = linear_idx % K;
+    phi::Load(&bias[bias_idx], &bias_vec);
+
+#pragma unroll
+    for (int unroll_idx = 0; unroll_idx < Half2VecSize; unroll_idx++) {
+// Do biasAdd
 #if __CUDA_ARCH__ >= 530
-    packed_val = __hadd2(bias_ptr, in_ptr);
+      data_vec[unroll_idx] =
+          __hadd2(data_vec[unroll_idx], bias_vec[unroll_idx]);
 #else
-    packed_val.x = __hadd(bias_ptr.x, in_ptr.x);
-    packed_val.y = __hadd(bias_ptr.y, in_ptr.y);
+      data_vec[unroll_idx].x =
+          __hadd(data_vec[unroll_idx].x, bias_vec[unroll_idx].x);
+      data_vec[unroll_idx].y =
+          __hadd(data_vec[unroll_idx].y, bias_vec[unroll_idx].y);
 #endif
-    if (DoRelu) {
+
+      // Do relu
+      if (DoRelu) {
 #if __CUDA_ARCH__ >= 800
-      packed_val = __hmax2(__half2(0, 0), packed_val);
+        data_vec[unroll_idx] = __hmax2(__half2(0, 0), data_vec[unroll_idx]);
 #elif __CUDA_ARCH__ >= 530
-      packed_val = __hmul2(__hgt2(__half2(0, 0), packed_val), packed_val);
+        data_vec[unroll_idx] = __hmul2(
+            __hgt2(__half2(0, 0), data_vec[unroll_idx]), data_vec[unroll_idx]);
 #else
-      packed_val.x = static_cast<int>(static_cast<float>(packed_val.x) > 0) *
-                     static_cast<float>(packed_val.x);
-      packed_val.y = static_cast<int>(static_cast<float>(packed_val.y) > 0) *
-                     static_cast<float>(packed_val.y);
+        data_vec[unroll_idx].x =
+            static_cast<int>(static_cast<float>(data_vec[unroll_idx].x) > 0) *
+            static_cast<float>(data_vec[unroll_idx].x);
+        data_vec[unroll_idx].y =
+            static_cast<int>(static_cast<float>(data_vec[unroll_idx].y) > 0) *
+            static_cast<float>(data_vec[unroll_idx].y);
 #endif
+      }
     }
-    data[tid] = packed_val;
+    phi::Store(data_vec, &data[linear_idx]);
   }
 }
 
@@ -188,6 +206,53 @@ __global__ void InplaceAddReluKernel(const int N,
   }
 }
 
+/**
+ * @brief Launch the fused bias-add kernel, with or without ReLU.
+ **/
+template <int Half2VecSize>
+void LaunchBiasAddReluHalf2Kernel(cudaStream_t stream,
+                                  const int32_t rows,
+                                  const int32_t cols,
+                                  float16* Y,
+                                  const float16* B,
+                                  bool relu) {
+  const int threads = 256;
+  const int vec_num = rows * cols / (Half2VecSize * 2);
+  const int half2_num = rows * cols / 2;
+  const int blocks = (vec_num + threads - 1) / threads;
+  // Reinterpret the half data as packed half2.
+  typedef typename FcTypeTraits<float16>::Type trans_type;
+  auto* bias_half2_ptr = reinterpret_cast<const trans_type*>(B);
+  auto* data_half2_ptr = reinterpret_cast<trans_type*>(Y);
+  if (relu) {
+    bias_relu_v4_half2<true, Half2VecSize><<<blocks, threads, 0, stream>>>(
+        half2_num, bias_half2_ptr, data_half2_ptr, cols / 2);
+  } else {
+    bias_relu_v4_half2<false, Half2VecSize><<<blocks, threads, 0, stream>>>(
+        half2_num, bias_half2_ptr, data_half2_ptr, cols / 2);
+  }
+}
+
+/**
+ * @brief Dispatch the half2 bias-add (+ReLU) kernel with a vector size of
+ * 8 / 4 / 2 halves.
+ **/
+void DispatchBiasAddReluKernelHalf2VecSize(cudaStream_t stream,
+                                           const int32_t rows,
+                                           const int32_t cols,
+                                           float16* Y,
+                                           const float16* B,
+                                           bool relu) {
+  // The max vector size for half is 128 / 16 = 8; since we operate on half2,
+  // Half2VecSize is divided by 2.
+  if (cols % 8 == 0) {
+    LaunchBiasAddReluHalf2Kernel<4>(stream, rows, cols, Y, B, relu);
+  } else if (cols % 4 == 0) {
+    LaunchBiasAddReluHalf2Kernel<2>(stream, rows, cols, Y, B, relu);
+  } else {
+    LaunchBiasAddReluHalf2Kernel<1>(stream, rows, cols, Y, B, relu);
+  }
+}
+
 template <>
 void AddReluKernel(cudaStream_t stream,
                    const int M,
@@ -196,19 +261,7 @@ void AddReluKernel(cudaStream_t stream,
                    const float16* B,
                    bool relu) {
   if (N % 2 == 0) {
-    const int threads = 256;
-    const int num = M * N / 2;
-    const int blocks = (num + threads - 1) / threads;
-    typedef typename FcTypeTraits<float16>::Type trans_type;
-    auto* bias_ptr_v2 = reinterpret_cast<const trans_type*>(B);
-    auto* data_ptr_v2 = reinterpret_cast<trans_type*>(Y);
-    if (relu) {
-      bias_relu_v2<true><<<blocks, threads, 0, stream>>>(
-          num, bias_ptr_v2, data_ptr_v2, N / 2);
-    } else {
-      bias_relu_v2<false><<<blocks, threads, 0, stream>>>(
-          num, bias_ptr_v2, data_ptr_v2, N / 2);
-    }
+    DispatchBiasAddReluKernelHalf2VecSize(stream, M, N, Y, B, relu);
   } else {
     const int threads = 256;
     const int blocks = M;
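
A note on the access pattern for reviewers: `phi::AlignedVector` plus `phi::Load`/`phi::Store` exist so that `Half2VecSize` consecutive `half2` values move in a single wide (up to 128-bit) memory transaction, and the grid-stride loop lets one launch cover any `rows * cols`. The standalone sketch below reproduces that pattern outside the phi headers so it can be compiled and run in isolation. Every name in it is illustrative rather than from this PR, it replaces `phi::AlignedVector` with a local over-aligned struct, and it does the arithmetic in `float` for brevity instead of the `__hadd2`/`__hmax2`/`__hgt2` paths the kernel selects per `__CUDA_ARCH__`. It assumes a CUDA 11+ toolchain (host-callable half conversion intrinsics).

```cuda
// Standalone sketch of the vectorized grid-stride bias-add(+ReLU) pattern.
#include <cuda_fp16.h>

#include <cstdio>
#include <vector>

// Stand-in for phi::AlignedVector<half2, VecSize>: an over-aligned POD array
// the compiler can move with one wide (up to 128-bit) load/store.
template <int VecSize>
struct alignas(sizeof(half2) * VecSize) AlignedVec {
  half2 val[VecSize];
};

template <bool DoRelu, int VecSize>
__global__ void BiasAddRelu(int num_half2, const half2* bias, half2* data,
                            int k) {
  const int stride = gridDim.x * blockDim.x * VecSize;
  for (int i = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize;
       i < num_half2; i += stride) {
    // One wide load each for the data tile and the matching bias tile.
    AlignedVec<VecSize> d =
        *reinterpret_cast<const AlignedVec<VecSize>*>(&data[i]);
    AlignedVec<VecSize> b =
        *reinterpret_cast<const AlignedVec<VecSize>*>(&bias[i % k]);
#pragma unroll
    for (int j = 0; j < VecSize; ++j) {
      // Float math for portability; the PR uses half intrinsics where available.
      float2 v = __half22float2(d.val[j]);
      float2 bv = __half22float2(b.val[j]);
      v.x += bv.x;
      v.y += bv.y;
      if (DoRelu) {
        v.x = v.x > 0.f ? v.x : 0.f;
        v.y = v.y > 0.f ? v.y : 0.f;
      }
      d.val[j] = __float22half2_rn(v);
    }
    *reinterpret_cast<AlignedVec<VecSize>*>(&data[i]) = d;  // one wide store
  }
}

int main() {
  const int rows = 4, cols = 16;  // cols % 8 == 0, so 8 halves = 4 half2 per load
  const int num_half2 = rows * cols / 2;
  const int k = cols / 2;  // bias length in half2, matches the kernel's K
  std::vector<half2> h_data(num_half2, __floats2half2_rn(-1.f, 2.f));
  std::vector<half2> h_bias(k, __floats2half2_rn(0.5f, 0.5f));

  half2 *d_data, *d_bias;
  cudaMalloc(&d_data, num_half2 * sizeof(half2));
  cudaMalloc(&d_bias, k * sizeof(half2));
  cudaMemcpy(d_data, h_data.data(), num_half2 * sizeof(half2),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_bias, h_bias.data(), k * sizeof(half2), cudaMemcpyHostToDevice);

  constexpr int kVecSize = 4;  // mirrors LaunchBiasAddReluHalf2Kernel<4>
  const int threads = 256;
  const int blocks = (num_half2 / kVecSize + threads - 1) / threads;
  BiasAddRelu<true, kVecSize><<<blocks, threads>>>(num_half2, d_bias, d_data, k);

  cudaMemcpy(h_data.data(), d_data, num_half2 * sizeof(half2),
             cudaMemcpyDeviceToHost);
  float2 out = __half22float2(h_data[0]);
  printf("relu(-1+0.5)=%.1f relu(2+0.5)=%.1f\n", out.x, out.y);  // 0.0 2.5
  cudaFree(d_data);
  cudaFree(d_bias);
  return 0;
}
```

Compiled with e.g. `nvcc -arch=sm_70 -o bias_add_relu bias_add_relu.cu`, the sample values print `0.0 2.5`, i.e. `relu(-1 + 0.5)` and `relu(2 + 0.5)`. The same alignment argument the dispatch relies on applies here: the casts are only legal because `cols` (and hence `k` and every `i`) is a multiple of the chosen vector size, which is exactly what the `cols % 8 / % 4` checks in `DispatchBiasAddReluKernelHalf2VecSize` guarantee.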