// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // clang-format will try to sort headers according to google c++ style, // and that cause compiling problems. // clang-format off #include "paddle/phi/kernels/gelu_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/gpu/gelu_funcs.h" // clang-format on DECLARE_bool(use_fast_math); namespace phi { template struct GeluWithApproximateFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; inline HOSTDEVICE T operator()(T arg_x) { // this function is tanh approximation of gelu MPType x = static_cast(arg_x); MPType one = static_cast(1); MPType half = static_cast(0.5); MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); auto tanh_out = tanh(kAlpha * x * (one + static_cast(GELU_CONSTANT) * x * x)); MPType out = x * half * (one + tanh_out); return static_cast(out); } }; template struct GeluWithoutApproximateFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; inline HOSTDEVICE T operator()(T arg_x) { // actual gelu with approximation = false MPType x = static_cast(arg_x); return static_cast(x * normcdf(x)); } }; template void GeluKernel(const Context& dev_ctx, const DenseTensor& x, bool approximate, DenseTensor* out) { dev_ctx.template Alloc(out); std::vector ins = {&x}; std::vector outs = {out}; if (approximate) { #ifdef __NVCC__ if (std::is_same::value) { size_t n = x.numel(); const auto* in_ptr = reinterpret_cast(x.data()); auto* out_ptr = reinterpret_cast<__half*>(out->data()); if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( dev_ctx, in_ptr, out_ptr, n)) { return; } } #endif using Functor = GeluWithApproximateFunctor; phi::funcs::ElementwiseKernel( dev_ctx, ins, &outs, Functor()); } else { using Functor = GeluWithoutApproximateFunctor; phi::funcs::ElementwiseKernel( dev_ctx, ins, &outs, Functor()); } } } // namespace phi PD_REGISTER_KERNEL(gelu, GPU, ALL_LAYOUT, phi::GeluKernel, float, double, phi::dtype::float16, phi::dtype::bfloat16) {}