From 01eddc1a04787b27e1b682fcc8c799f643ebafef Mon Sep 17 00:00:00 2001
From: qingqing01
Date: Wed, 16 Oct 2019 09:50:21 +0800
Subject: [PATCH] Support fp16 in GPU impl of fused_elemwise_activation_op.
 (#20636)

* Support fp16 in fused_elemwise_activation_op.

* Fix unit testing in ONLY-CPU mode.
---
 .../fused/fused_elemwise_activation_op.cu       |  8 ++-
 paddle/fluid/operators/math/functors.h          | 28 +++++---
 .../test_fused_elemwise_activation_op.py        | 67 ++++++++++++++++++-
 3 files changed, 89 insertions(+), 14 deletions(-)

diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu
index e10693bae18..dba4097c7f3 100644
--- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu
+++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu
@@ -20,11 +20,15 @@ REGISTER_OP_CUDA_KERNEL(
     ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
                                        float>,
     ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
-                                       double>);
+                                       double>,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
+                                       paddle::platform::float16>);
 
 REGISTER_OP_CUDA_KERNEL(
     fused_elemwise_activation_grad,
     ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
                                            float>,
     ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
-                                           double>);
+                                           double>,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                           paddle::platform::float16>);
diff --git a/paddle/fluid/operators/math/functors.h b/paddle/fluid/operators/math/functors.h
index e98bf82169a..bf64d7e8ceb 100644
--- a/paddle/fluid/operators/math/functors.h
+++ b/paddle/fluid/operators/math/functors.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/operators/math.h"
+
 namespace paddle {
 namespace operators {
 namespace math {
@@ -40,8 +42,8 @@ struct AddFunctor {
 
 template <typename T>
 struct AddGradFunctor {
-  inline HOSTDEVICE T Dx(T x, T y) { return 1; }
-  inline HOSTDEVICE T Dy(T x, T y) { return 1; }
+  inline HOSTDEVICE T Dx(T x, T y) { return static_cast<T>(1.); }
+  inline HOSTDEVICE T Dy(T x, T y) { return static_cast<T>(1.); }
 };
 
 template <typename T>
@@ -68,14 +70,22 @@ struct ScaleGradFunctor {
 
 template <typename T>
 struct ReluFunctor {
-  inline HOSTDEVICE T operator()(T x) { return x * (x > 0); }
+  inline HOSTDEVICE T operator()(T x) {
+    return x * (x > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0));
+  }
 };
 
 template <typename T>
 struct ReluGradFunctor {
-  inline HOSTDEVICE T UseX(T x) { return x > 0 ? 1 : 0; }
-  inline HOSTDEVICE T UseOut(T out) { return out > 0 ? 1 : 0; }
-  inline HOSTDEVICE T UseXAndOut(T x, T out) { return out > 0 ? 1 : 0; }
+  inline HOSTDEVICE T UseX(T x) {
+    return x > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0);
+  }
+  inline HOSTDEVICE T UseOut(T out) {
+    return out > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0);
+  }
+  inline HOSTDEVICE T UseXAndOut(T x, T out) {
+    return out > static_cast<T>(0) ? static_cast<T>(1) : static_cast<T>(0);
+  }
 };
 
 template <typename T>
@@ -84,9 +94,9 @@ struct TanhFunctor {
   const T kMax = static_cast<T>(13);
   inline HOSTDEVICE T operator()(T x) {
     // y = 2 / (1 + e^-2x) - 1
-    T t0 = 2 * x;
+    T t0 = static_cast<T>(2) * x;
     T t1 = (t0 < kMin) ? kMin : ((t0 > kMax) ? kMax : t0);
-    return static_cast<T>(2) / (static_cast<T>(1) + std::exp(-t1)) -
+    return static_cast<T>(2) / (static_cast<T>(1) + real_exp(-t1)) -
            static_cast<T>(1);
   }
 };
@@ -107,7 +117,7 @@ struct SigmoidFunctor {
   inline HOSTDEVICE T operator()(T x) {
     // y = 1 / (1 + e^-x)
     T tmp = (x < kMin) ? kMin : ((x > kMax) ? kMax : x);
-    return static_cast<T>(1) / (static_cast<T>(1) + std::exp(-tmp));
+    return static_cast<T>(1) / (static_cast<T>(1) + real_exp(-tmp));
   }
 };
 
diff --git a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
index 3cf8e7229ad..5141bd47a80 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
@@ -33,17 +33,24 @@ from op_test import OpTest
 #   TestFusedElementwiseActivationOp_channelwise_add
 
 
-def create_test_class(test_case, callback, attrs):
+def create_test_class(test_case,
+                      callback,
+                      attrs,
+                      dtype=np.float32,
+                      grad_chek=True):
     class TestFusedElementwiseActivationOp_base(OpTest):
         def setUp(self):
             self.op_type = "fused_elemwise_activation"
-            self.dtype = np.float32
+            self.dtype = dtype
             self.axis = -1
 
             self.init_input()
             self.init_output()
             self.init_attr()
 
+            self.out = self.out.astype(self.dtype)
+            self.intermediate_out = self.intermediate_out.astype(self.dtype)
+
             self.inputs = {
                 'X': OpTest.np_dtype_to_fluid_dtype(self.x),
                 'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
@@ -71,16 +78,25 @@ def create_test_class(test_case, callback, attrs):
                 self.attrs[key] = attrs[key]
 
         def test_check_output(self):
-            self.check_output()
+            if self.dtype == np.float16 and core.is_compiled_with_cuda():
+                place = core.CUDAPlace(0)
+                if core.is_float16_supported(place):
+                    self.check_output_with_place(place, atol=1e-3)
+            else:
+                self.check_output()
 
         # FIXME(zcd): the intermediate_out_grad is not checked.
         def test_check_grad_normal(self):
+            if not grad_chek:
+                return
             if self.attrs["save_intermediate_out"]:
                 self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005)
             else:
                 self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005)
 
         def test_check_grad_ingore_x(self):
+            if not grad_chek:
+                return
             if self.attrs["save_intermediate_out"]:
                 self.check_grad(
                     ['Y'], ['Out'],
@@ -93,6 +109,8 @@ def create_test_class(test_case, callback, attrs):
                     no_grad_set=set("X"))
 
         def test_check_grad_ingore_y(self):
+            if not grad_chek:
+                return
             if self.attrs["save_intermediate_out"]:
                 self.check_grad(
                     ['X'], ['Out'],
@@ -307,11 +325,29 @@ for mode in {0, 1}:
             'functor_list': ["scale", "elementwise_add"],
             'save_intermediate_out': save_intermediate_out,
         })
+        create_test_class(
+            'scale_add_fp16' + suffix,
+            scale_add_func, {
+                'scale': scale,
+                'functor_list': ["scale", "elementwise_add"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
         create_test_class('add_scale' + suffix, add_scale_func, {
             'scale': scale,
             'functor_list': ["elementwise_add", "scale"],
             'save_intermediate_out': save_intermediate_out,
         })
+        create_test_class(
+            'add_scale_fp16' + suffix,
+            add_scale_func, {
+                'scale': scale,
+                'functor_list': ["elementwise_add", "scale"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
         create_test_class('add_relu' + suffix, add_relu_func, {
             'functor_list': ["elementwise_add", "relu"],
             'save_intermediate_out': save_intermediate_out,
@@ -320,11 +356,36 @@ for mode in {0, 1}:
             'functor_list': ["relu", "elementwise_add"],
             'save_intermediate_out': save_intermediate_out,
         })
+        create_test_class(
+            'add_relu_fp16' + suffix,
+            add_relu_func, {
+                'functor_list': ["elementwise_add", "relu"],
+                'save_intermediate_out': save_intermediate_out,
+            },
+            dtype=np.float16,
+            grad_chek=False)
+        create_test_class(
+            'relu_add_fp16' + suffix,
+            relu_add_func, {
["relu", "elementwise_add"], + 'save_intermediate_out': save_intermediate_out, + }, + dtype=np.float16, + grad_chek=False) create_test_class('mul_scale' + suffix, mul_scale_func, { 'scale': scale, 'functor_list': ["elementwise_mul", "scale"], 'save_intermediate_out': save_intermediate_out, }) + create_test_class( + 'mul_scale' + suffix, + mul_scale_func, { + 'scale': scale, + 'functor_list': ["elementwise_mul", "scale"], + 'save_intermediate_out': save_intermediate_out, + }, + dtype=np.float16, + grad_chek=False) if __name__ == '__main__': unittest.main() -- GitLab