Unverified commit 6bd5fd75, authored by Vvsmile and committed by GitHub

[AMP OP&Test] Add fp16 and bf16 test to activation (#52521)

* adjust default tolerance of output and grad

* fix a bug in the grad of OpTest

* fix the type of setting the default value in OpTest, both forward and backward

* add default

* fix test_sum_op

* adjust tolerance

* fix the tolerance of eager

* add bf16 and fp16 to the activation tests

* remove some fixes

* fix activation

* fix fp16

* fix gelu

* fix the activation tests

* add bfloat16 specialization to singrad and cosgrad

* fix bugs

* fix bugs

* add unittest

* add skip

* add fp/bf to rrelu/rrelu_grad

* git add rrelu

* fix bugs
Parent 70eaf9de
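The registrations below add bfloat16 to GPU kernels that previously stopped at float16. A rough usage sketch of what this enables (not part of the commit; it assumes a CUDA build of Paddle that includes this change, and runs rrelu in inference mode):

import paddle
import paddle.nn.functional as F

if paddle.is_compiled_with_cuda():
    paddle.set_device('gpu')
    # cast to bfloat16; the rrelu GPU kernel now has a bfloat16 registration
    x = paddle.randn([2, 3, 4, 5]).astype('bfloat16')
    y = F.rrelu(x, lower=1.0 / 8.0, upper=1.0 / 3.0, training=False)
    print(y.dtype)  # paddle.bfloat16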
@@ -70,6 +70,13 @@ struct Sine<dtype::float16> {
   }
 };
 
+template <>
+struct Sine<dtype::bfloat16> {
+  HOSTDEVICE dtype::bfloat16 operator()(const dtype::bfloat16& val) const {
+    return dtype::bfloat16(sin(static_cast<float>(val)));
+  }
+};
+
 template <typename T>
 struct Cosine {
   HOSTDEVICE T operator()(const T& val) const { return cos(val); }
@@ -82,6 +89,13 @@ struct Cosine<dtype::float16> {
   }
 };
 
+template <>
+struct Cosine<dtype::bfloat16> {
+  HOSTDEVICE dtype::bfloat16 operator()(const dtype::bfloat16& val) const {
+    return dtype::bfloat16(cos(static_cast<float>(val)));
+  }
+};
+
 // sine'(x) = cos(x)
 template <typename T>
 struct SinGradFunctor : public BaseActivationFunctor<T> {
@@ -2664,10 +2678,12 @@ struct CudaExpGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaReciprocalFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
 
-  // reciprocal(x) = 1 / x
-  __device__ __forceinline__ T operator()(const T x) const { return one / x; }
+  __device__ __forceinline__ T operator()(const T x) const {
+    return static_cast<T>(one / static_cast<MPType>(x));
+  }
 };
 
 template <typename T>
......
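The bfloat16 specializations of Sine/Cosine above, and the MPType change in CudaReciprocalFunctor, follow the same recipe: widen to float, do the math, narrow the result back to the 16-bit type. A standalone NumPy illustration of that narrowing (not part of the diff; the helper names are invented, and bfloat16 is emulated through its raw bits, i.e. the high 16 bits of a float32, which is the same representation the test helper convert_float_to_uint16 works with):

import numpy as np

def float32_to_bf16_bits(x):
    """Round a float32 array to bfloat16 and return the raw uint16 payload."""
    u = np.asarray(x, dtype=np.float32).view(np.uint32)
    bias = ((u >> 16) & 1) + np.uint32(0x7FFF)  # round half to even
    return ((u + bias) >> 16).astype(np.uint16)

def bf16_bits_to_float32(b):
    """Widen a uint16 bfloat16 payload back to float32."""
    return (np.asarray(b, dtype=np.uint32) << 16).view(np.float32)

x = np.float32(0.5)
wide = np.sin(x)  # computed in float32, like sin(static_cast<float>(val))
narrow = bf16_bits_to_float32(float32_to_bf16_bits(wide))
print(wide, narrow)  # equal up to bfloat16's ~8 bits of mantissa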
@@ -425,7 +425,8 @@ PD_REGISTER_KERNEL(sin_double_grad,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_KERNEL(sin_triple_grad,
                    GPU,
@@ -435,7 +436,8 @@ PD_REGISTER_KERNEL(sin_triple_grad,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_KERNEL(cos_double_grad,
                    GPU,
@@ -445,7 +447,8 @@ PD_REGISTER_KERNEL(cos_double_grad,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_KERNEL(cos_triple_grad,
                    GPU,
@@ -455,7 +458,8 @@ PD_REGISTER_KERNEL(cos_triple_grad,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(softsign_grad, SoftsignGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
@@ -473,7 +477,8 @@ PD_REGISTER_KERNEL(log_double_grad,
                    phi::LogDoubleGradKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(hardswish_grad, HardSwishGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(swish_grad, SwishGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(round_grad, RoundGradKernel)
......
@@ -83,4 +83,5 @@ PD_REGISTER_KERNEL(rrelu_grad,
                    phi::RReluGradKernel,
                    float,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    double) {}

@@ -110,4 +110,5 @@ PD_REGISTER_KERNEL(rrelu,
                    phi::RReluKernel,
                    float,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    double) {}
@@ -885,7 +885,9 @@ class OpTest(unittest.TestCase):
                 np_dyg,
                 rtol=1e-05,
                 equal_nan=False,
-                err_msg='Output ('
+                err_msg='Operator ('
+                + self.op_type
+                + ') Output ('
                 + name
                 + ') has diff at '
                 + str(place)
@@ -1137,7 +1139,9 @@ class OpTest(unittest.TestCase):
                 actual_out,
                 rtol=1e-05,
                 atol=inplace_atol,
-                err_msg='Output ('
+                err_msg='Operator ('
+                + self.op_type
+                + ') Output ('
                 + name
                 + ') has diff at '
                 + str(place)
@@ -1626,7 +1630,9 @@ class OpTest(unittest.TestCase):
                     rtol=self.rtol if hasattr(self, 'rtol') else rtol,
                     equal_nan=equal_nan,
                     err_msg=(
-                        "Output ("
+                        "Operator ("
+                        + self.op_type
+                        + ") Output ("
                         + name
                         + ") has diff at "
                         + str(place)
@@ -1643,7 +1649,9 @@ class OpTest(unittest.TestCase):
                         rtol=self.rtol if hasattr(self, 'rtol') else rtol,
                         equal_nan=equal_nan,
                     ),
-                    "Output ("
+                    "Operator ("
+                    + self.op_type
+                    + ") Output ("
                     + name
                     + ") has diff at "
                     + str(place)
@@ -1815,7 +1823,9 @@ class OpTest(unittest.TestCase):
                     rtol=self.rtol if hasattr(self, 'rtol') else rtol,
                     equal_nan=equal_nan,
                     err_msg=(
-                        "Output ("
+                        "Operator ("
+                        + self.op_type
+                        + ") Output ("
                         + name
                         + ") has diff at "
                         + str(place)
@@ -1832,7 +1842,9 @@ class OpTest(unittest.TestCase):
                     rtol=self.rtol if hasattr(self, 'rtol') else rtol,
                     equal_nan=equal_nan,
                 ),
-                "Output ("
+                "Operator ("
+                + self.op_type
+                + ") Output ("
                 + name
                 + ") has diff at "
                 + str(place)
@@ -1882,7 +1894,9 @@ class OpTest(unittest.TestCase):
                 .get_tensor()
                 .recursive_sequence_lengths(),
                 expect[1],
-                "Output ("
+                "Operator ("
+                + self.op_type
+                + ") Output ("
                 + name
                 + ") has different lod at "
                 + str(place)
......
@@ -15,7 +15,7 @@
 import unittest
 
 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16
 
 import paddle
 import paddle.nn.functional as F
@@ -327,7 +327,7 @@ class RReluTest(OpTest):
         ]  # python out sig is customized output signature.
 
     def init_params(self):
-        self.dtype = "float64"
+        self.init_dtype()
         self.x_shape = [2, 3, 4, 5]
 
         x_np = np.random.uniform(-1, 1, self.x_shape).astype(self.dtype)
@@ -337,12 +337,19 @@ class RReluTest(OpTest):
 
         self.inputs = {'X': x_np}
         self.outputs = {'Out': out_np, 'Noise': noise_np}
+        self.convert_input_output()
         self.attrs = {
            'lower': self.lower,
            "upper": self.upper,
            "is_test": self.is_test,
        }
 
+    def init_dtype(self):
+        self.dtype = "float64"
+
+    def convert_input_output(self):
+        pass
+
     def test_check_output(self):
         self.check_output(no_check_set=['Noise'])
@@ -363,5 +370,67 @@ class RReluTrainingTest(RReluTest):
         ]  # python out sig is customized output signature.
 
 
+class RReluTestFP16OP(RReluTest):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
+class RReluTestBF16OP(RReluTest):
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def convert_input_output(self):
+        self.inputs = {'X': convert_float_to_uint16(self.inputs['X'])}
+        self.outputs = {
+            'Out': convert_float_to_uint16(self.outputs['Out']),
+            'Noise': convert_float_to_uint16(self.outputs['Noise']),
+        }
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place, no_check_set=['Noise'])
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X'], 'Out')
+
+
+class RReluTrainingTestFP16OP(RReluTrainingTest):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
+class RReluTrainingTestBF16OP(RReluTrainingTest):
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def convert_input_output(self):
+        self.inputs = {'X': convert_float_to_uint16(self.inputs['X'])}
+        self.outputs = {
+            'Out': convert_float_to_uint16(self.outputs['Out']),
+            'Noise': convert_float_to_uint16(self.outputs['Noise']),
+        }
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place, no_check_set=['Noise'])
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X'], 'Out')
+
+
 if __name__ == "__main__":
     unittest.main()
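To run only the new low-precision RRelu cases, the standard unittest loader is enough. A hedged sketch (the module name test_rrelu_op is assumed here, it is not spelled out in this diff; the BF16 cases skip themselves on machines without CUDA or bfloat16 support):

import unittest

# Assumed module name; adjust to wherever this test file actually lives.
from test_rrelu_op import RReluTestBF16OP, RReluTestFP16OP

loader = unittest.defaultTestLoader
suite = unittest.TestSuite()
suite.addTests(loader.loadTestsFromTestCase(RReluTestFP16OP))
suite.addTests(loader.loadTestsFromTestCase(RReluTestBF16OP))
unittest.TextTestRunner(verbosity=2).run(suite)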