From 29d75c14f1e25ca9c4b741270859027fa390179a Mon Sep 17 00:00:00 2001
From: limingshu <61349199+JamesLim-sy@users.noreply.github.com>
Date: Wed, 23 Nov 2022 14:00:32 +0800
Subject: [PATCH] Add bfloat16 type support for abs op (#48205)

* first commit

* 2nd commit
---
 paddle/phi/kernels/funcs/complex_functors.h   | 69 ++++++-------------
 paddle/phi/kernels/gpu/abs_grad_kernel.cu     |  1 +
 paddle/phi/kernels/gpu/abs_kernel.cu          | 16 ++++-
 .../phi/kernels/impl/abs_grad_kernel_impl.h   | 68 +++++++++++++++++-
 .../tests/unittests/test_activation_op.py     |  1 +
 5 files changed, 105 insertions(+), 50 deletions(-)

diff --git a/paddle/phi/kernels/funcs/complex_functors.h b/paddle/phi/kernels/funcs/complex_functors.h
index 8b292cb5dc..e6ffeb3b56 100644
--- a/paddle/phi/kernels/funcs/complex_functors.h
+++ b/paddle/phi/kernels/funcs/complex_functors.h
@@ -110,53 +110,6 @@ struct AbsFunctor<T, NoComplex<T, Real<T>>> {
   int64_t numel_;
 };
 
-template <typename T>
-struct AbsGradCUDAFunctor {
-  HOSTDEVICE inline AbsGradCUDAFunctor() {}
-
-  HOSTDEVICE inline T operator()(const T x, const T dout) const {
-    T output;
-    if (x == T(0)) {
-      output = T(0);
-    } else {
-      output = T(dout) * (x / T(std::abs(x)));
-    }
-    return output;
-  }
-};
-
-template <>
-struct AbsGradCUDAFunctor<phi::dtype::complex<float>> {
-  HOSTDEVICE inline AbsGradCUDAFunctor() {}
-  HOSTDEVICE inline phi::dtype::complex<float> operator()(
-      const phi::dtype::complex<float> x, const float dout) const {
-    phi::dtype::complex<float> output;
-    if (x == phi::dtype::complex<float>(0)) {
-      output = phi::dtype::complex<float>(0);
-    } else {
-      output = phi::dtype::complex<float>(dout) *
-               (x / phi::dtype::complex<float>(abs(x)));
-    }
-    return output;
-  }
-};
-
-template <>
-struct AbsGradCUDAFunctor<phi::dtype::complex<double>> {
-  HOSTDEVICE inline AbsGradCUDAFunctor() {}
-  HOSTDEVICE inline phi::dtype::complex<double> operator()(
-      const phi::dtype::complex<double> x, const double dout) const {
-    phi::dtype::complex<double> output;
-    if (x == phi::dtype::complex<double>(0)) {
-      output = phi::dtype::complex<double>(0);
-    } else {
-      output = phi::dtype::complex<double>(dout) *
-               (x / phi::dtype::complex<double>(abs(x)));
-    }
-    return output;
-  }
-};
-
 template <typename T>
 struct AbsGradFunctor {
   AbsGradFunctor(const dtype::Real<T>* dout,
@@ -179,6 +132,28 @@ struct AbsGradFunctor {
   int64_t numel_;
 };
 
+template <>
+struct AbsGradFunctor<phi::dtype::bfloat16> {
+  AbsGradFunctor(const dtype::Real<phi::dtype::bfloat16>* dout,
+                 const phi::dtype::bfloat16* x,
+                 phi::dtype::bfloat16* output,
+                 int64_t numel)
+      : dout_(dout), x_(x), output_(output), numel_(numel) {}
+
+  HOSTDEVICE void operator()(int64_t idx) const {
+    if (x_[idx] == static_cast<phi::dtype::bfloat16>(0)) {
+      output_[idx] = static_cast<phi::dtype::bfloat16>(0);
+    } else {
+      output_[idx] = dout_[idx] * (x_[idx] / (abs(x_[idx])));
+    }
+  }
+
+  const dtype::Real<phi::dtype::bfloat16>* dout_;
+  const phi::dtype::bfloat16* x_;
+  phi::dtype::bfloat16* output_;
+  int64_t numel_;
+};
+
 template <>
 struct AbsGradFunctor<phi::dtype::complex<float>> {
   AbsGradFunctor(const float* dout,
diff --git a/paddle/phi/kernels/gpu/abs_grad_kernel.cu b/paddle/phi/kernels/gpu/abs_grad_kernel.cu
index 8edb6b7122..a1afa8569b 100644
--- a/paddle/phi/kernels/gpu/abs_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/abs_grad_kernel.cu
@@ -31,6 +31,7 @@ PD_REGISTER_KERNEL(abs_grad,
                    int,
                    int64_t,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    complex<float>,
                    complex<double>) {
   kernel->InputAt(1).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
diff --git a/paddle/phi/kernels/gpu/abs_kernel.cu b/paddle/phi/kernels/gpu/abs_kernel.cu
index d025f4b61e..9f27c98616 100644
--- a/paddle/phi/kernels/gpu/abs_kernel.cu
+++ b/paddle/phi/kernels/gpu/abs_kernel.cu
@@ -16,8 +16,8 @@
 
 #include <algorithm>
 #include <vector>
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
@@ -36,7 +36,18 @@ struct CudaAbsFunctor<T, phi::funcs::Complex<T, phi::dtype::Real<T>>> {
 };
 
 template <typename T>
-struct CudaAbsFunctor<T, phi::funcs::NoComplex<T, phi::dtype::Real<T>>> {
+struct CudaAbsFunctor<
+    T,
+    std::enable_if_t<std::is_same<T, dtype::Real<T>>::value &&
+                     std::is_same<T, dtype::bfloat16>::value>> {
+  __device__ __forceinline__ T operator()(const T x) const { return abs(x); }
+};
+
+template <typename T>
+struct CudaAbsFunctor<
+    T,
+    std::enable_if_t<std::is_same<T, dtype::Real<T>>::value &&
+                     !std::is_same<T, dtype::bfloat16>::value>> {
   __device__ __forceinline__ T operator()(const T x) const {
     return std::abs(x);
   }
@@ -63,5 +74,6 @@ PD_REGISTER_KERNEL(abs,
                    int,
                    int64_t,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h
index 9dad40b57c..7064eec4f9 100644
--- a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/kernels/abs_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
@@ -22,6 +23,70 @@
 namespace phi {
 
 #if defined(__NVCC__)
+
+template <typename T>
+struct AbsGradCUDAFunctor {
+  HOSTDEVICE inline AbsGradCUDAFunctor() {}
+
+  HOSTDEVICE inline T operator()(const T x, const T dout) const {
+    T output;
+    if (x == T(0)) {
+      output = T(0);
+    } else {
+      output = T(dout) * (x / T(std::abs(x)));
+    }
+    return output;
+  }
+};
+
+template <>
+struct AbsGradCUDAFunctor<phi::dtype::bfloat16> {
+  HOSTDEVICE inline AbsGradCUDAFunctor() {}
+
+  HOSTDEVICE inline phi::dtype::bfloat16 operator()(
+      const phi::dtype::bfloat16 x, const phi::dtype::bfloat16 dout) const {
+    phi::dtype::bfloat16 output;
+    if (x == phi::dtype::bfloat16(0)) {
+      output = static_cast<phi::dtype::bfloat16>(0);
+    } else {
+      output = (dout) * (x / abs(x));
+    }
+    return output;
+  }
+};
+
+template <>
+struct AbsGradCUDAFunctor<phi::dtype::complex<float>> {
+  HOSTDEVICE inline AbsGradCUDAFunctor() {}
+  HOSTDEVICE inline phi::dtype::complex<float> operator()(
+      const phi::dtype::complex<float> x, const float dout) const {
+    phi::dtype::complex<float> output;
+    if (x == phi::dtype::complex<float>(0)) {
+      output = phi::dtype::complex<float>(0);
+    } else {
+      output = phi::dtype::complex<float>(dout) *
+               (x / phi::dtype::complex<float>(abs(x)));
+    }
+    return output;
+  }
+};
+
+template <>
+struct AbsGradCUDAFunctor<phi::dtype::complex<double>> {
+  HOSTDEVICE inline AbsGradCUDAFunctor() {}
+  HOSTDEVICE inline phi::dtype::complex<double> operator()(
+      const phi::dtype::complex<double> x, const double dout) const {
+    phi::dtype::complex<double> output;
+    if (x == phi::dtype::complex<double>(0)) {
+      output = phi::dtype::complex<double>(0);
+    } else {
+      output = phi::dtype::complex<double>(dout) *
+               (x / phi::dtype::complex<double>(abs(x)));
+    }
+    return output;
+  }
+};
+
 template <typename T>
 void AbsGradKernelImpl(const GPUContext& dev_ctx,
                        const DenseTensor& x,
@@ -30,9 +95,10 @@ void AbsGradKernelImpl(const GPUContext& dev_ctx,
   std::vector<const DenseTensor*> ins = {&x, &dout};
   std::vector<DenseTensor*> outs = {dx};
   dev_ctx.Alloc<T>(dx);
-  phi::funcs::AbsGradCUDAFunctor<T> abs_grad_cuda_functor;
+  AbsGradCUDAFunctor<T> abs_grad_cuda_functor;
   phi::funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, abs_grad_cuda_functor);
 }
+
 template <typename T, typename Context>
 void AbsGradKernel(const Context& dev_ctx,
                    const DenseTensor& x,
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 4411fdc3d1..913777c251 100755
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -3699,6 +3699,7 @@ def create_test_act_bf16_class(
 
 create_test_act_bf16_class(TestRelu)
+create_test_act_bf16_class(TestAbs)
 
 
 if __name__ == "__main__":
    unittest.main()
-- 
GitLab
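
For reference, a minimal dygraph sketch of how the newly registered bfloat16 kernels can be exercised end to end. This is illustrative only and not part of the patch: it assumes a CUDA build of Paddle that includes this change, the tensor values are arbitrary, and the float32 casts are only for construction and readable output.

    # Usage sketch (not part of the patch); assumes a CUDA build of Paddle
    # containing this change.
    import paddle

    paddle.set_device('gpu')

    x = paddle.to_tensor([-1.5, 0.0, 2.25], stop_gradient=False)
    x_bf16 = paddle.cast(x, 'bfloat16')     # route onto the bfloat16 code path
    y = paddle.abs(x_bf16)                  # forward: bfloat16 CudaAbsFunctor
    loss = paddle.cast(y, 'float32').sum()  # reduce in fp32 for convenience
    loss.backward()                         # backward: bfloat16 AbsGradCUDAFunctor
    print(paddle.cast(y, 'float32'))        # [1.5, 0.0, 2.25]
    print(x.grad)                           # sign(x): [-1.0, 0.0, 1.0]

As in the existing float16 and complex paths, the bfloat16 functors compute dout * x / |x| and define the gradient at x == 0 to be 0, which is why x.grad above is the sign of x.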