Unverified · Commit 01eddc1a · authored by qingqing01, committed by GitHub

Support fp16 in GPU impl of fused_elemwise_activation_op. (#20636)

* Support fp16 in fused_elemwise_activation_op.
* Fix unit testing in ONLY-CPU mode.
Parent db9fbcbc
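The CUDA kernels below are now registered for paddle::platform::float16 alongside float and double, and the shared math functors are adjusted so they also instantiate for that type: bare numeric literals become static_cast<T>(...) and std::exp calls become real_exp. A minimal, self-contained sketch of the literal issue, using a toy Float16 stand-in with only explicit conversions (not the real paddle::platform::float16), compiled and run on the host:

// Toy illustration only; Float16 below is a hypothetical stand-in, not
// paddle::platform::float16, and its "encoding" is fake.
#include <cstdint>
#include <iostream>

struct Float16 {
  uint16_t bits = 0;
  Float16() = default;
  explicit Float16(float f) : bits(static_cast<uint16_t>(f)) {}  // fake encoding
  explicit operator float() const { return static_cast<float>(bits); }
};

template <typename T>
struct AddGradFunctorSketch {
  // `return 1;` would not compile for T = Float16 (no implicit int -> T
  // conversion); the explicit cast compiles for float, double and Float16 alike.
  T Dx(T /*x*/, T /*y*/) const { return static_cast<T>(1.); }
  T Dy(T /*x*/, T /*y*/) const { return static_cast<T>(1.); }
};

int main() {
  AddGradFunctorSketch<float> grad_f32;
  AddGradFunctorSketch<Float16> grad_f16;
  std::cout << grad_f32.Dx(2.f, 3.f) << "\n";                                // 1
  std::cout << static_cast<float>(grad_f16.Dx(Float16(2.f), Float16(3.f)))  // 1
            << "\n";
}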
@@ -20,11 +20,15 @@ REGISTER_OP_CUDA_KERNEL(
     ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
                                        float>,
     ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
-                                       double>);
+                                       double>,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
+                                       paddle::platform::float16>);
 
 REGISTER_OP_CUDA_KERNEL(
     fused_elemwise_activation_grad,
     ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
                                            float>,
     ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
-                                           double>);
+                                           double>,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                           paddle::platform::float16>);
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/operators/math.h"
+
 namespace paddle {
 namespace operators {
 namespace math {
@@ -40,8 +42,8 @@ struct AddFunctor {
 
 template <typename T>
 struct AddGradFunctor {
-  inline HOSTDEVICE T Dx(T x, T y) { return 1; }
-  inline HOSTDEVICE T Dy(T x, T y) { return 1; }
+  inline HOSTDEVICE T Dx(T x, T y) { return static_cast<T>(1.); }
+  inline HOSTDEVICE T Dy(T x, T y) { return static_cast<T>(1.); }
 };
 
 template <typename T>
@@ -68,14 +70,22 @@ struct ScaleGradFunctor {
 
 template <typename T>
 struct ReluFunctor {
-  inline HOSTDEVICE T operator()(T x) { return x * (x > 0); }
+  inline HOSTDEVICE T operator()(T x) {
+    return x * (x > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0));
+  }
 };
 
 template <typename T>
 struct ReluGradFunctor {
-  inline HOSTDEVICE T UseX(T x) { return x > 0 ? 1 : 0; }
-  inline HOSTDEVICE T UseOut(T out) { return out > 0 ? 1 : 0; }
-  inline HOSTDEVICE T UseXAndOut(T x, T out) { return out > 0 ? 1 : 0; }
+  inline HOSTDEVICE T UseX(T x) {
+    return x > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0);
+  }
+  inline HOSTDEVICE T UseOut(T out) {
+    return out > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0);
+  }
+  inline HOSTDEVICE T UseXAndOut(T x, T out) {
+    return out > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0);
+  }
 };
 
 template <typename T>
@@ -84,9 +94,9 @@ struct TanhFunctor {
   const T kMax = static_cast<T>(13);
   inline HOSTDEVICE T operator()(T x) {
     // y = 2 / (1 + e^-2x) - 1
-    T t0 = 2 * x;
+    T t0 = static_cast<T>(2) * x;
     T t1 = (t0 < kMin) ? kMin : ((t0 > kMax) ? kMax : t0);
-    return static_cast<T>(2) / (static_cast<T>(1) + std::exp(-t1)) -
+    return static_cast<T>(2) / (static_cast<T>(1) + real_exp(-t1)) -
            static_cast<T>(1);
   }
 };
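The formula in TanhFunctor is tanh written as y = 2 / (1 + e^-2x) - 1, with 2x clamped so the exponential stays in range at low precision; only kMax is visible in this hunk, so the kMin bound of -40 used below is assumed purely for the demo. A quick host-side check against std::tanh:

// Plain host-side check of the clamped tanh formula; not the Paddle functor.
#include <algorithm>
#include <cmath>
#include <cstdio>

float clamped_tanh(float x, float k_min = -40.f, float k_max = 13.f) {
  float t0 = 2.f * x;                               // 2x
  float t1 = std::min(std::max(t0, k_min), k_max);  // clamp to [k_min, k_max]
  return 2.f / (1.f + std::exp(-t1)) - 1.f;         // 2 / (1 + e^-2x) - 1
}

int main() {
  const float xs[] = {-3.f, -0.5f, 0.f, 0.5f, 3.f};
  for (float x : xs) {
    std::printf("x=%+.1f  clamped=%+.6f  std::tanh=%+.6f\n",
                x, clamped_tanh(x), std::tanh(x));
  }
}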
@@ -107,7 +117,7 @@ struct SigmoidFunctor {
   inline HOSTDEVICE T operator()(T x) {
     // y = 1 / (1 + e^-x)
     T tmp = (x < kMin) ? kMin : ((x > kMax) ? kMax : x);
-    return static_cast<T>(1) / (static_cast<T>(1) + std::exp(-tmp));
+    return static_cast<T>(1) / (static_cast<T>(1) + real_exp(-tmp));
   }
 };
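real_exp comes from the newly included paddle/fluid/operators/math.h; switching TanhFunctor and SigmoidFunctor away from std::exp is what lets them instantiate for float16. The sketch below shows the kind of overload-based dispatch such a helper typically uses, with the half-precision case widened to float. It is an assumption about the shape of that header, not its actual contents, and the Half type here is a toy stand-in:

// Standalone sketch of an exp() overload set; real_exp in Paddle's math.h is
// assumed to dispatch along similar lines, but this is not the actual source.
#include <cmath>
#include <cstdio>

struct Half { float v; };  // toy half-precision stand-in (stored as a float here)

inline float real_exp_sketch(float x) { return std::exp(x); }
inline double real_exp_sketch(double x) { return std::exp(x); }
inline Half real_exp_sketch(Half x) {
  // No half-precision libm routine is assumed: widen to float, exp, narrow back.
  return Half{std::exp(x.v)};
}

int main() {
  std::printf("float : %f\n", real_exp_sketch(1.0f));
  std::printf("double: %f\n", real_exp_sketch(1.0));
  std::printf("half  : %f\n", real_exp_sketch(Half{1.0f}).v);
}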
@@ -33,17 +33,24 @@ from op_test import OpTest
 # TestFusedElementwiseActivationOp_channelwise_add
 
 
-def create_test_class(test_case, callback, attrs):
+def create_test_class(test_case,
+                      callback,
+                      attrs,
+                      dtype=np.float32,
+                      grad_chek=True):
     class TestFusedElementwiseActivationOp_base(OpTest):
         def setUp(self):
            self.op_type = "fused_elemwise_activation"
-            self.dtype = np.float32
+            self.dtype = dtype
            self.axis = -1
 
            self.init_input()
            self.init_output()
            self.init_attr()
 
+            self.out = self.out.astype(self.dtype)
+            self.intermediate_out = self.intermediate_out.astype(self.dtype)
+
            self.inputs = {
                'X': OpTest.np_dtype_to_fluid_dtype(self.x),
                'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
@@ -71,16 +78,25 @@ def create_test_class(test_case, callback, attrs):
                self.attrs[key] = attrs[key]
 
        def test_check_output(self):
-            self.check_output()
+            if self.dtype == np.float16 and core.is_compiled_with_cuda():
+                place = core.CUDAPlace(0)
+                if core.is_float16_supported(place):
+                    self.check_output_with_place(place, atol=1e-3)
+            else:
+                self.check_output()
 
        # FIXME(zcd): the intermediate_out_grad is not checked.
        def test_check_grad_normal(self):
+            if not grad_chek:
+                return
            if self.attrs["save_intermediate_out"]:
                self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005)
            else:
                self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005)
 
        def test_check_grad_ingore_x(self):
+            if not grad_chek:
+                return
            if self.attrs["save_intermediate_out"]:
                self.check_grad(
                    ['Y'], ['Out'],
@@ -93,6 +109,8 @@ def create_test_class(test_case, callback, attrs):
                    no_grad_set=set("X"))
 
        def test_check_grad_ingore_y(self):
+            if not grad_chek:
+                return
            if self.attrs["save_intermediate_out"]:
                self.check_grad(
                    ['X'], ['Out'],
@@ -307,11 +325,29 @@ for mode in {0, 1}:
            'functor_list': ["scale", "elementwise_add"],
            'save_intermediate_out': save_intermediate_out,
        })
+        create_test_class(
+            'scale_add_fp16' + suffix,
+            scale_add_func, {
+                'scale': scale,
+                'functor_list': ["scale", "elementwise_add"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
        create_test_class('add_scale' + suffix, add_scale_func, {
            'scale': scale,
            'functor_list': ["elementwise_add", "scale"],
            'save_intermediate_out': save_intermediate_out,
        })
+        create_test_class(
+            'add_scale_fp16' + suffix,
+            add_scale_func, {
+                'scale': scale,
+                'functor_list': ["elementwise_add", "scale"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
        create_test_class('add_relu' + suffix, add_relu_func, {
            'functor_list': ["elementwise_add", "relu"],
            'save_intermediate_out': save_intermediate_out,
@@ -320,11 +356,36 @@ for mode in {0, 1}:
            'functor_list': ["relu", "elementwise_add"],
            'save_intermediate_out': save_intermediate_out,
        })
+        create_test_class(
+            'add_relu_fp16' + suffix,
+            add_relu_func, {
+                'functor_list': ["elementwise_add", "relu"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
+        create_test_class(
+            'relu_add_fp16' + suffix,
+            relu_add_func, {
+                'functor_list': ["relu", "elementwise_add"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
        create_test_class('mul_scale' + suffix, mul_scale_func, {
            'scale': scale,
            'functor_list': ["elementwise_mul", "scale"],
            'save_intermediate_out': save_intermediate_out,
        })
+        create_test_class(
+            'mul_scale_fp16' + suffix,
+            mul_scale_func, {
+                'scale': scale,
+                'functor_list': ["elementwise_mul", "scale"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
 
 if __name__ == '__main__':
     unittest.main()