From 856f741a4e4535543b8f6a1ceb8e9b2312bbacf7 Mon Sep 17 00:00:00 2001
From: ming1753 <61511741+ming1753@users.noreply.github.com>
Date: Fri, 29 Jul 2022 10:35:32 +0800
Subject: [PATCH] fused_fc_elementwise_layernorm_op support fp16 (#44710)

* fused_fc_elementwise_layernorm support fp16

* fused_fc_elementwise_layernorm support double
---
 .../inference/api/paddle_pass_builder.cc      |   4 +-
 .../fused_fc_elementwise_layernorm_op.cu      | 303 +++++++++++++++---
 2 files changed, 265 insertions(+), 42 deletions(-)

diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 67acbd7e511..42d58dd7828 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -161,7 +161,9 @@ const std::vector<std::string> kGpuLowerPrecisionPasses{
     "conv_elementwise_add_fuse_pass",
     "gpu_cpu_map_matmul_v2_to_mul_pass",     //
     "gpu_cpu_map_matmul_v2_to_matmul_pass",  //
-    "fc_fuse_pass"};
+    "fc_fuse_pass",
+    "fc_elementwise_layernorm_fuse_pass",
+};
 
 const std::vector<std::string> kTrtLowerPrecisionPasses{
     // "conv_bn_fuse_pass",
diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
index 53bc3db9b0f..37aa5cbd14d 100644
--- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
+++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
@@ -20,6 +20,10 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
 
+#if defined(PADDLE_WITH_CUDA)
+#include <cuda_fp16.h>
+#endif
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
@@ -28,9 +32,11 @@ namespace cub = hipcub;
 namespace paddle {
 namespace operators {
 
+using float16 = phi::dtype::float16;
+
 template <typename T>
 static __device__ __forceinline__ T Relu(T x) {
-  return (x > 0) ? x : 0;
+  return static_cast<T>(fmaxf(0.f, x));
 }
 
 static __device__ __forceinline__ float RealSqrt(float x) { return sqrtf(x); }
@@ -137,6 +143,243 @@ __global__ void InplaceAddReluAddLayerNormKernel(const T* y,
   }
 }
 
+template <bool DoRelu, int BlockDim>
+__global__ void InplaceAddReluAddLayerNormKernel(const float16* y_data,
+                                                 const float16* bias_0_data,
+                                                 const float16* bias_1_data,
+                                                 const float16* scale_data,
+                                                 float16* out_data,
+                                                 float16* mean_data,
+                                                 float16* variance_data,
+                                                 int M,
+                                                 int N,
+                                                 float epsilon) {
+#if defined(PADDLE_WITH_CUDA)
+  const half* y = reinterpret_cast<const half*>(y_data);
+  const half* bias_0 = reinterpret_cast<const half*>(bias_0_data);
+  const half* bias_1 = reinterpret_cast<const half*>(bias_1_data);
+  const half* scale = reinterpret_cast<const half*>(scale_data);
+  half* out = reinterpret_cast<half*>(out_data);
+  half* mean = reinterpret_cast<half*>(mean_data);
+  half* variance = reinterpret_cast<half*>(variance_data);
+#else
+  const float16* y = y_data;
+  const float16* bias_0 = bias_0_data;
+  const float16* bias_1 = bias_1_data;
+  const float16* scale = scale_data;
+  float16* out = out_data;
+  float16* mean = mean_data;
+  float16* variance = variance_data;
+#endif
+  using BlockReduce = cub::BlockReduce<PairForLayerNorm<float>, BlockDim>;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+#if defined(PADDLE_WITH_CUDA)
+  __shared__ half shared_mem[BlockDim + 2];
+#else
+  __shared__ float16 shared_mem[BlockDim + 2];
+#endif
+
+  for (int i = blockIdx.x; i < M; i += gridDim.x) {
+    int index = i * N + threadIdx.x;
+
+    // The first BlockDim elements will be saved to shared memory.
+    int save_index = threadIdx.x;
+#if defined(PADDLE_WITH_CUDA)
+    half* save_ptr = shared_mem;
+#else
+    float16* save_ptr = shared_mem;
+#endif
+    float sum_i = 0;
+    float square_sum_i = 0;
+    for (int j = threadIdx.x; j < N; j += blockDim.x) {
+#if defined(PADDLE_WITH_CUDA)
+      half tmp_0 = out[index];
+      // Add bias
+      half tmp_1;
+      if (bias_0 != nullptr) {
+        tmp_1 = __hadd(tmp_0, bias_0[j]);
+      } else {
+        tmp_1 = tmp_0;
+      }
+      // Relu
+      half tmp_2 = DoRelu ? Relu(tmp_1) : tmp_1;
+      // elementwise_add
+      half tmp_3 = __hadd(tmp_2, y[index]);
+#else
+      float16 tmp_0 = out[index];
+      // Add bias
+      float16 tmp_1 = bias_0 ? tmp_0 + bias_0[j] : tmp_0;
+      // Relu
+      float16 tmp_2 = DoRelu ? Relu(tmp_1) : tmp_1;
+      // elementwise_add
+      float16 tmp_3 = tmp_2 + y[index];
+#endif
+      // Save
+      save_ptr[save_index] = tmp_3;
+      save_ptr = out;
+
+      index += blockDim.x;
+      save_index = index;
+
+      // For layer_norm, reduce to calculate mean and std
+      sum_i += static_cast<float>(tmp_3);
+#if defined(PADDLE_WITH_CUDA) && __CUDA_ARCH__ >= 530
+      square_sum_i += static_cast<float>(__hmul(tmp_3, tmp_3));
+#elif defined(PADDLE_WITH_CUDA)
+      square_sum_i += static_cast<float>(tmp_3) * static_cast<float>(tmp_3);
+#else
+      square_sum_i += static_cast<float>(tmp_3 * tmp_3);
+#endif
+    }
+    auto pair = BlockReduce(temp_storage)
+                    .Reduce(PairForLayerNorm<float>(sum_i, square_sum_i),
+                            PairForLayerNormAddFunctor<float>());
+    if (threadIdx.x == 0) {
+#if defined(PADDLE_WITH_CUDA)
+      half mean_i = static_cast<half>(pair.first_ / N);
+#if __CUDA_ARCH__ >= 530
+      half variance_i = static_cast<half>(
+          pair.second_ / N - static_cast<float>(__hmul(mean_i, mean_i)));
+#else
+      half variance_i =
+          static_cast<half>(pair.second_ / N - static_cast<float>(mean_i) *
+                                                   static_cast<float>(mean_i));
+#endif
+#else
+      float16 mean_i = static_cast<float16>(pair.first_ / N);
+      float16 variance_i = static_cast<float16>(
+          pair.second_ / N - static_cast<float>(mean_i * mean_i));
+#endif
+      shared_mem[BlockDim] = mean_i;
+      shared_mem[BlockDim + 1] = variance_i;
+      if (mean) {
+        mean[blockIdx.x] = mean_i;
+      }
+      if (variance) {
+        variance[blockIdx.x] = variance_i;
+      }
+    }
+    __syncthreads();
+#if defined(PADDLE_WITH_CUDA)
+    half mean_i = shared_mem[BlockDim];
+    half std_i = static_cast<half>(
+        RealSqrt(static_cast<float>(shared_mem[BlockDim + 1]) + epsilon));
+#else
+    float16 mean_i = shared_mem[BlockDim];
+    float16 std_i = static_cast<float16>(
+        RealSqrt(static_cast<float>(shared_mem[BlockDim + 1]) + epsilon));
+#endif
+
+    index = i * N + threadIdx.x;
+    // The first BlockDim elements are loaded from shared memory.
+    save_index = threadIdx.x;
+    save_ptr = shared_mem;
+
+    // For layer_norm, calculate out
+    for (int j = threadIdx.x; j < N; j += blockDim.x) {
+#if defined(PADDLE_WITH_CUDA)
+#if __CUDA_ARCH__ >= 530
+      half tmp_0 = __hdiv(__hsub(save_ptr[save_index], mean_i), std_i);
+      half tmp_1 = scale ? __hmul(scale[j], tmp_0) : tmp_0;
+#else
+      half tmp_0 =
+          static_cast<half>((static_cast<float>(save_ptr[save_index]) -
+                             static_cast<float>(mean_i)) /
+                            static_cast<float>(std_i));
+      half tmp_1 = scale ? static_cast<half>(static_cast<float>(scale[j]) *
+                                             static_cast<float>(tmp_0))
+                         : tmp_0;
+#endif
+      if (bias_1 != nullptr) {
+        out[index] = __hadd(tmp_1, bias_1[j]);
+      } else {
+        out[index] = tmp_1;
+      }
+#else
+      float16 tmp_0 = (save_ptr[save_index] - mean_i) / std_i;
+      float16 tmp_1 = scale ? scale[j] * tmp_0 : tmp_0;
+      out[index] = bias_1 ? tmp_1 + bias_1[j] : tmp_1;
+#endif
+      save_ptr = out;
+      index += blockDim.x;
+      save_index = index;
+    }
+  }
+}
+
+template <typename T>
+void AddReluAddLayerNorm(gpuStream_t stream,
+                         bool with_relu,
+                         int max_threads,
+                         const T* y,
+                         const T* bias_0,
+                         const T* bias_1,
+                         const T* scale,
+                         T* out,
+                         T* mean,
+                         T* variance,
+                         int M,
+                         int N,
+                         float epsilon) {
+  if (with_relu) {
+    switch (platform::RoundToPowerOfTwo(N)) {
+      CUDA_LAUNCH_KERNEL_HELPER(
+          InplaceAddReluAddLayerNormKernel<T, true, kPowerOfTwoDim>
+          <<<std::max(max_threads / kPowerOfTwoDim, 1),
+             kPowerOfTwoDim,
+             0,
+             stream>>>(
+              y, bias_0, bias_1, scale, out, mean, variance, M, N, epsilon));
+    }
+  } else {
+    switch (platform::RoundToPowerOfTwo(N)) {
+      CUDA_LAUNCH_KERNEL_HELPER(
+          InplaceAddReluAddLayerNormKernel<T, false, kPowerOfTwoDim>
+          <<<std::max(max_threads / kPowerOfTwoDim, 1),
+             kPowerOfTwoDim,
+             0,
+             stream>>>(
+              y, bias_0, bias_1, scale, out, mean, variance, M, N, epsilon));
+    }
+  }
+}
+
+template <>
+void AddReluAddLayerNorm(gpuStream_t stream,
+                         bool with_relu,
+                         int max_threads,
+                         const float16* y,
+                         const float16* bias_0,
+                         const float16* bias_1,
+                         const float16* scale,
+                         float16* out,
+                         float16* mean,
+                         float16* variance,
+                         int M,
+                         int N,
+                         float epsilon) {
+  if (with_relu) {
+    switch (platform::RoundToPowerOfTwo(N)) {
+      CUDA_LAUNCH_KERNEL_HELPER(
+          InplaceAddReluAddLayerNormKernel<true, kPowerOfTwoDim>
+          <<<std::max(max_threads / kPowerOfTwoDim, 1),
+             kPowerOfTwoDim,
+             0,
+             stream>>>(
+              y, bias_0, bias_1, scale, out, mean, variance, M, N, epsilon));
+    }
+  } else {
+    switch (platform::RoundToPowerOfTwo(N)) {
+      CUDA_LAUNCH_KERNEL_HELPER(
+          InplaceAddReluAddLayerNormKernel<false, kPowerOfTwoDim>
+          <<<std::max(max_threads / kPowerOfTwoDim, 1),
+             kPowerOfTwoDim,
+             0,
+             stream>>>(
+              y, bias_0, bias_1, scale, out, mean, variance, M, N, epsilon));
    }
+  }
+}
+
 template <typename T>
 class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
  public:
@@ -169,7 +412,6 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
               static_cast<T>(0.0),
               out_data,
               N);
-
     auto* y = ctx.Input<framework::Tensor>("Y");
     auto* bias_0 = ctx.Input<framework::Tensor>("Bias0");
     auto* bias_1 = ctx.Input<framework::Tensor>("Bias1");
@@ -192,43 +434,19 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
     float epsilon = ctx.Attr<float>("epsilon");
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
 
-    if (with_relu) {
-      switch (platform::RoundToPowerOfTwo(N)) {
-        CUDA_LAUNCH_KERNEL_HELPER(
-            InplaceAddReluAddLayerNormKernel<T, true, kPowerOfTwoDim>
-            <<<std::max(max_threads / kPowerOfTwoDim, 1),
-               kPowerOfTwoDim,
-               0,
-               dev_ctx.stream()>>>(y_data,
-                                   bias_0_data,
-                                   bias_1_data,
-                                   scale_data,
-                                   out_data,
-                                   mean_data,
-                                   variance_data,
-                                   M,
-                                   N,
-                                   epsilon));
-      }
-    } else {
-      switch (platform::RoundToPowerOfTwo(N)) {
-        CUDA_LAUNCH_KERNEL_HELPER(
-            InplaceAddReluAddLayerNormKernel<T, false, kPowerOfTwoDim>
-            <<<std::max(max_threads / kPowerOfTwoDim, 1),
-               kPowerOfTwoDim,
-               0,
-               dev_ctx.stream()>>>(y_data,
-                                   bias_0_data,
-                                   bias_1_data,
-                                   scale_data,
-                                   out_data,
-                                   mean_data,
-                                   variance_data,
-                                   M,
-                                   N,
-                                   epsilon));
-      }
-    }
+    AddReluAddLayerNorm(dev_ctx.stream(),
+                        with_relu,
+                        max_threads,
+                        y_data,
+                        bias_0_data,
+                        bias_1_data,
+                        scale_data,
+                        out_data,
+                        mean_data,
+                        variance_data,
+                        M,
+                        N,
+                        epsilon);
   }
 };
 
@@ -236,5 +454,8 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(fused_fc_elementwise_layernorm,
-                        ops::FusedFCElementwiseLayerNormOpKernel<float>);
+REGISTER_OP_CUDA_KERNEL(
+    fused_fc_elementwise_layernorm,
+    ops::FusedFCElementwiseLayerNormOpKernel<float16>,
+    ops::FusedFCElementwiseLayerNormOpKernel<float>,
+    ops::FusedFCElementwiseLayerNormOpKernel<double>);
--
GitLab
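
Note on the approach, with a minimal standalone sketch (illustrative only, not part of the patch): the new fp16 kernel uses native half intrinsics (__hadd, __hmul, __hsub, __hdiv) only where the target architecture supports them (__CUDA_ARCH__ >= 530) and otherwise falls back to float arithmetic, while the non-CUDA (ROCm) build keeps using phi::dtype::float16 operators. The sketch below shows that arch-guarded pattern for a single bias-add + relu step; the file name, kernel name, and test data are made up for illustration.

// add_bias_relu_fp16_sketch.cu (hypothetical file): compile with `nvcc add_bias_relu_fp16_sketch.cu`.
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdio>

// Arch-guarded fp16 helper: native half math on sm_53+, fp32 fallback elsewhere.
__device__ __forceinline__ half AddBiasRelu(half x, half b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  half y = __hadd(x, b);  // native fp16 add
  return __hgt(y, __float2half(0.f)) ? y : __float2half(0.f);
#else
  float y = __half2float(x) + __half2float(b);  // compute in fp32, convert back
  return __float2half(fmaxf(y, 0.f));
#endif
}

__global__ void AddBiasReluKernel(const half* in, const half* bias, half* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = AddBiasRelu(in[i], bias[i]);
  }
}

int main() {
  const int n = 8;
  half h_in[n], h_bias[n], h_out[n];
  for (int i = 0; i < n; ++i) {
    h_in[i] = __float2half(i - 4.0f);  // mix of negative and positive inputs
    h_bias[i] = __float2half(0.5f);
  }
  half *d_in, *d_bias, *d_out;
  cudaMalloc(reinterpret_cast<void**>(&d_in), n * sizeof(half));
  cudaMalloc(reinterpret_cast<void**>(&d_bias), n * sizeof(half));
  cudaMalloc(reinterpret_cast<void**>(&d_out), n * sizeof(half));
  cudaMemcpy(d_in, h_in, n * sizeof(half), cudaMemcpyHostToDevice);
  cudaMemcpy(d_bias, h_bias, n * sizeof(half), cudaMemcpyHostToDevice);
  AddBiasReluKernel<<<1, 32>>>(d_in, d_bias, d_out, n);
  cudaMemcpy(h_out, d_out, n * sizeof(half), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) {
    printf("%.1f ", __half2float(h_out[i]));  // prints 0.0 0.0 0.0 0.0 0.5 1.5 2.5 3.5
  }
  printf("\n");
  cudaFree(d_in);
  cudaFree(d_bias);
  cudaFree(d_out);
  return 0;
}

The patch applies the same guard at every fp16 step of the fused bias-add, relu, residual-add, and layer-norm computation, while accumulating the layer-norm mean and variance in float regardless of the storage type.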