Unverified · Commit 6d7efd09, authored by cyberslack_lee, committed by GitHub

Add FP16 & BF16 for erfinv (#55287)

Parent: 19da5c0c
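The change registers float16/bfloat16 for the erfinv forward and backward GPU kernels, specializes the forward functor to compute through float, relaxes the Python-side dtype check, and adds FP16/BF16 op tests. The hunks below cover, in that order, the grad-kernel registration, the forward kernel and its registration, the grad formula, the Python API, and the tests. A minimal usage sketch (illustrative; assumes a CUDA build of Paddle that includes this change):

```python
import paddle

paddle.set_device('gpu')  # FP16/BF16 erfinv is registered for the GPU kernels
x = paddle.uniform([4], min=-0.9, max=0.9, dtype='float32')

y_fp16 = paddle.erfinv(x.astype('float16'))   # newly supported
y_bf16 = paddle.erfinv(x.astype('bfloat16'))  # newly supported
print(y_fp16.dtype, y_bf16.dtype)  # paddle.float16 paddle.bfloat16
```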
@@ -22,5 +22,11 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h"
 
-PD_REGISTER_KERNEL(
-    erfinv_grad, GPU, ALL_LAYOUT, phi::ErfinvGradKernel, float, double) {}
+PD_REGISTER_KERNEL(erfinv_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ErfinvGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
@@ -23,7 +23,21 @@
 template <typename T>
 struct ErfinvFunctor {
   HOSTDEVICE inline T operator()(const T x) const { return erfinv(x); }
 };
+template <>
+struct ErfinvFunctor<float16> {
+  HOSTDEVICE inline float16 operator()(const float16 x) const {
+    auto x_ = static_cast<float>(x);
+    return static_cast<float16>(erfinv(x_));
+  }
+};
+template <>
+struct ErfinvFunctor<bfloat16> {
+  HOSTDEVICE inline bfloat16 operator()(const bfloat16 x) const {
+    auto x_ = static_cast<float>(x);
+    return static_cast<bfloat16>(erfinv(x_));
+  }
+};
 template <typename T, typename Context>
 void ErfinvKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
   ctx.template Alloc<T>(out);
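Both specializations use the standard mixed-precision pattern: upcast the 16-bit input to float, evaluate erfinv there, and cast the result back, so no native half-precision erfinv is needed. A rough NumPy/SciPy sketch of the per-element behavior of the float16 path (illustrative only, not Paddle code):

```python
import numpy as np
from scipy.special import erfinv

def erfinv_fp16(x):
    # Mirror of ErfinvFunctor<float16>: compute in float32, cast back.
    return np.float16(erfinv(np.float32(x)))

print(erfinv_fp16(np.float16(0.5)))  # ~0.477 (0.47694 rounded to float16)
```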
@@ -34,4 +48,11 @@ void ErfinvKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
 }  // namespace phi
 
-PD_REGISTER_KERNEL(erfinv, GPU, ALL_LAYOUT, phi::ErfinvKernel, float, double) {}
+PD_REGISTER_KERNEL(erfinv,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ErfinvKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
@@ -29,7 +29,7 @@ void ErfinvGradKernel(const Context& ctx,
   auto eigen_dout = EigenVector<T>::Flatten(out_grad);
   auto eigen_dx = EigenVector<T>::Flatten(*x_grad);
   auto& place = *ctx.eigen_device();
-  constexpr T half_sqrt_pi = static_cast<T>(1 / M_2_SQRTPI);
+  T half_sqrt_pi = static_cast<T>(1 / M_2_SQRTPI);
   eigen_dx.device(place) = half_sqrt_pi * eigen_dout * eigen_out.square().exp();
 }
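Two notes on the hunk above. The `constexpr` qualifier is dropped, presumably because `phi::dtype::float16` and `bfloat16` are not literal types, so a `constexpr T` local would no longer compile for the newly registered dtypes. And since `M_2_SQRTPI` is 2/√π, `half_sqrt_pi` equals √π/2, the constant in the erfinv derivative obtained by differentiating erf(erfinv(x)) = x:

```latex
\operatorname{erf}\bigl(\operatorname{erfinv}(x)\bigr) = x
\;\Longrightarrow\;
\frac{d}{dx}\,\operatorname{erfinv}(x)
  = \frac{1}{\operatorname{erf}'\bigl(\operatorname{erfinv}(x)\bigr)}
  = \frac{\sqrt{\pi}}{2}\, e^{\operatorname{erfinv}(x)^{2}},
\qquad \operatorname{erf}'(y) = \frac{2}{\sqrt{\pi}}\, e^{-y^{2}}.
```

This is exactly `half_sqrt_pi * eigen_dout * eigen_out.square().exp()`, with `eigen_out` holding erfinv(x).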
@@ -4760,7 +4760,7 @@ def erfinv(x, name=None):
         erfinv(erf(x)) = x.
 
     Args:
-        x (Tensor): An N-D Tensor, the data type is float32, float64.
+        x (Tensor): An N-D Tensor, the data type is float16, bfloat16, float32, float64.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -4779,7 +4779,9 @@ def erfinv(x, name=None):
     if in_dynamic_mode():
         return _C_ops.erfinv(x)
     else:
-        check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'erfinv')
+        check_variable_and_dtype(
+            x, 'x', ['float32', 'float64', 'float16', 'uint16'], 'erfinv'
+        )
         helper = LayerHelper('erfinv', **locals())
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
         helper.append_op(type='erfinv', inputs={'X': x}, outputs={'Out': out})
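A note on the `'uint16'` entry in the dtype list above: Paddle's static-graph checks identify bfloat16 tensors by the same numpy-facing tag as uint16, because bfloat16 values are carried as raw 16-bit patterns, the high half of the float32 encoding. A hedged NumPy sketch of that representation, in the spirit of the `convert_float_to_uint16` helper used by the tests below (function names here are illustrative, and the real helper may round rather than truncate):

```python
import numpy as np

def float32_to_bf16_bits(x):
    # Keep the upper 16 bits of the float32 encoding (truncation).
    return (np.asarray(x, np.float32).view(np.uint32) >> 16).astype(np.uint16)

def bf16_bits_to_float32(b):
    # Re-expand the 16-bit pattern; the low mantissa bits become zero.
    return (np.asarray(b, np.uint16).astype(np.uint32) << 16).view(np.float32)

v = np.float32([0.5, -0.9])
print(bf16_bits_to_float32(float32_to_bf16_bits(v)))  # ≈ [0.5, -0.8984]
```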
@@ -15,7 +15,11 @@
 import unittest
 
 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import (
+    OpTest,
+    convert_float_to_uint16,
+    convert_uint16_to_float,
+)
 from scipy.special import erfinv
 
 import paddle
@@ -25,7 +29,7 @@
 paddle.enable_static()
 np.random.seed(0)
 
-class TestErfinv(OpTest):
+class TestErfinvOp(OpTest):
     def setUp(self):
         self.op_type = "erfinv"
         self.python_api = paddle.erfinv
@@ -55,12 +59,12 @@
     )
 
-class TestErfinvFP32(TestErfinv):
+class TestErfinvFP64Op(TestErfinvOp):
     def init_dtype(self):
-        self.dtype = np.float32
+        self.dtype = np.float64
 
-class TestErfinvAPI(unittest.TestCase):
+class TestErfinvAPIOp(unittest.TestCase):
     def init_dtype(self):
         self.dtype = 'float32'
@@ -110,5 +114,49 @@
         run(place)
 
+class TestErfinvFP16Op(TestErfinvOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or does not support bfloat16",
+)
+class TestErfinvBF16Op(OpTest):
+    def setUp(self):
+        self.op_type = "erfinv"
+        self.public_python_api = paddle.erfinv
+        self.python_api = paddle.erfinv
+        self.dtype = np.uint16
+        self.shape = [11, 17]
+        self.datatype = np.float32
+        self.input_data = np.random.uniform(-1, 1, size=self.shape).astype(
+            self.datatype
+        )
+        self.inputs = {'X': convert_float_to_uint16(self.input_data)}
+        self.inputs_data = convert_uint16_to_float(self.inputs['X'])
+        out_ref = erfinv(self.input_data)
+        self.grad_out = np.ones(self.shape, self.datatype)
+        self.gradient = (
+            np.sqrt(np.pi) / 2 * np.exp(np.square(out_ref)) * self.grad_out
+        )
+        self.outputs = {'Out': convert_float_to_uint16(out_ref)}
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(
+            place,
+            ['X'],
+            'Out',
+        )
+
+
 if __name__ == "__main__":
     unittest.main()