From fbd83812c7b3a086970263975e1baff1cd7f3f9e Mon Sep 17 00:00:00 2001
From: Zhang Ting
Date: Mon, 29 Aug 2022 15:09:44 +0800
Subject: [PATCH] Fix HardSwish inf (#35386)

* fix hard_swish inf

* skip_check_grad for mkldnn op

* 'fix code style'

* fix unittest
---
 paddle/phi/kernels/funcs/activation_functor.h | 35 +++++++++++--------
 .../mkldnn/test_activation_mkldnn_op.py       | 10 +++++-
 .../tests/unittests/test_activation_op.py     | 15 ++++----
 3 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h
index 318f2e8b6b3..51420c5ecb6 100644
--- a/paddle/phi/kernels/funcs/activation_functor.h
+++ b/paddle/phi/kernels/funcs/activation_functor.h
@@ -3363,7 +3363,8 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaHardSwishFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  const MPType zero = static_cast<MPType>(0.0f);
   float threshold;
   float scale;
   float offset;
@@ -3377,19 +3378,19 @@ struct CudaHardSwishFunctor : public BaseActivationFunctor<T> {
   // x * (x + offset) / scale, otherwise
   // threshold = scale = 6, offset = 3 by default
   __device__ __forceinline__ T operator()(const T x) const {
-    T t = static_cast<T>(threshold);
-    T temp = x + static_cast<T>(offset);
-    T temp_max = temp > zero ? temp : zero;
-    T temp_min = temp_max < t ? temp_max : t;
-    return temp_min * x / static_cast<T>(scale);
+    const MPType x_t = static_cast<MPType>(x);
+    const MPType temp_max = std::max(x_t + static_cast<MPType>(offset), zero);
+    const MPType temp_min = std::min(temp_max, static_cast<MPType>(threshold));
+    return static_cast<T>(temp_min * x_t / static_cast<MPType>(scale));
   }
 };
 
 template <typename T>
 struct CudaHardSwishGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  T one = static_cast<T>(1.0f);
-  T two = static_cast<T>(2.0f);
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  const MPType zero = static_cast<MPType>(0.0f);
+  const MPType one = static_cast<MPType>(1.0f);
+  const MPType two = static_cast<MPType>(2.0f);
   float threshold;
   float scale;
   float offset;
@@ -3403,11 +3404,17 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor<T> {
   // dout * (2 * x / scale + offset / scale), otherwise
   // threshold = scale = 6, offset = 3 by default
   __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    T o = static_cast<T>(offset);
-    T s = static_cast<T>(scale);
-    T temp1 = static_cast<T>(x + o > zero);
-    T temp2 = static_cast<T>(x + o < static_cast<T>(threshold));
-    return dout * (temp1 * temp2 * (two * x + o) / s + one - temp2);
+    const MPType dout_t = static_cast<MPType>(dout);
+    const MPType x_t = static_cast<MPType>(x);
+    const MPType offset_t = static_cast<MPType>(offset);
+    const MPType scale_t = static_cast<MPType>(scale);
+    const MPType temp1 = static_cast<MPType>(x_t + offset_t > zero);
+    const MPType temp2 =
+        static_cast<MPType>(x_t + offset_t < static_cast<MPType>(threshold));
+
+    return static_cast<T>(
+        dout_t *
+        (temp1 * temp2 * (two * x_t + offset_t) / scale_t + one - temp2));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
index 6796773ae65..ac880b02c45 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
@@ -18,7 +18,7 @@ import unittest
 import numpy as np
 from scipy.special import expit, erf
 import paddle.fluid.core as core
-from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16
+from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16, skip_check_grad_ci
 from paddle.fluid.tests.unittests.test_activation_op import TestActivation, TestRelu, TestTanh, TestSqrt, TestAbs, TestLeakyRelu, TestSwish, TestHardSwish, TestRelu6, TestSigmoid
 from paddle.fluid.tests.unittests.test_gelu_op import gelu
 from mkldnn_op_test import check_if_mkldnn_primitives_exist_in_bwd
@@ -128,6 +128,7 @@ class TestMKLDNNSwishDim2(TestSwish):
         self.dtype = np.float32
 
 
+@skip_check_grad_ci(reason="not implemented yet")
 class TestMKLDNNHardSwishDim2(TestHardSwish):
 
     def setUp(self):
@@ -138,6 +139,9 @@ class TestMKLDNNHardSwishDim2(TestHardSwish):
     def init_dtype(self):
         self.dtype = np.float32
 
+    def test_check_grad(self):
+        pass
+
 
 class TestMKLDNNSigmoidDim2(TestSigmoid):
 
@@ -317,6 +321,7 @@ def ref_hardswish(x, threshold=6.0, scale=6.0, offset=3.0):
             scale).astype(x.dtype)
 
 
+@skip_check_grad_ci(reason="not implemented yet")
 class TestMKLDNNHardSwishDim4(TestHardSwish):
 
     def setUp(self):
@@ -338,6 +343,9 @@ class TestMKLDNNHardSwishDim4(TestHardSwish):
     def init_dtype(self):
         self.dtype = np.float32
 
+    def test_check_grad(self):
+        pass
+
 
 class TestMKLDNNMish(TestActivation):
 
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index b849d6e6f59..fe31e3bacd5 100755
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -1815,8 +1815,12 @@ class TestRelu6API(unittest.TestCase):
 
 
 def ref_hardswish(x, threshold=6.0, scale=6.0, offset=3.0):
+    x_dtype = x.dtype
+    if x_dtype == 'float16':
+        x_dtype = 'float16'
+        x = x.astype('float32')
     return (x * np.minimum(np.maximum(x + offset, 0.), threshold) /
-            scale).astype(x.dtype)
+            scale).astype(x_dtype)
 
 
 class TestHardSwish(TestActivation):
@@ -1825,7 +1829,6 @@
         self.op_type = 'hard_swish'
         self.init_dtype()
         self.python_api = paddle.nn.functional.hardswish
-        skip_check_grad_ci(reason="not implemented yet")
 
         np.random.seed(1024)
         x = np.random.uniform(-6, 6, [10, 12]).astype(self.dtype)
@@ -1842,10 +1845,6 @@
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-
-        return # not implemented yet
         self.check_grad(['X'], 'Out', check_eager=True)
 
     def test_check_output(self):
@@ -1873,11 +1872,11 @@ class TestHardswishAPI(unittest.TestCase):
 
     def test_dygraph_api(self):
         paddle.disable_static(self.place)
-        x = paddle.to_tensor(self.x_np)
+        x = paddle.to_tensor([11648., 11448.])
         out1 = F.hardswish(x)
         m = paddle.nn.Hardswish()
         out2 = m(x)
-        out_ref = ref_hardswish(self.x_np)
+        out_ref = [11648., 11448.]
         for r in [out1, out2]:
             np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05)
         paddle.enable_static()
--
GitLab
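
For context on the bug this patch addresses: in a pure-float16 HardSwish, the intermediate product x * min(max(x + offset, 0), threshold) can exceed the float16 maximum (~65504) even when the final result is representable, which is exactly what the new test values (11648., 11448.) exercise. The sketch below is not part of the patch; it is a minimal NumPy illustration (variable names are illustrative) of that overflow and of the mixed-precision pattern the patched CUDA functors use: do the arithmetic in float32, cast only the result back to float16.

import numpy as np

x = np.float16(11648.0)
offset, threshold, scale = np.float16(3.0), np.float16(6.0), np.float16(6.0)

# All-float16 arithmetic: the intermediate x * 6 = 69888 exceeds the
# float16 maximum (~65504), so the product overflows to inf.
clipped = np.minimum(np.maximum(x + offset, np.float16(0.0)), threshold)
naive = x * clipped / scale  # inf (NumPy also emits an overflow warning)

# Mixed precision, mirroring the patched functor: compute in float32,
# cast only the final value back to float16.
x32 = np.float32(x)
clipped32 = np.minimum(np.maximum(x32 + np.float32(3.0), np.float32(0.0)),
                       np.float32(6.0))
mixed = np.float16(x32 * clipped32 / np.float32(6.0))  # 11648.0, finite

print(naive, mixed)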