From 6c71c1f8f91a3223cc18e4793acc23f4229f4f38 Mon Sep 17 00:00:00 2001
From: Clementine
Date: Tue, 27 Nov 2018 21:51:05 +0800
Subject: [PATCH] Add activation gelu (#14569)

---
 paddle/fluid/operators/activation_op.cc       | 10 ++++++
 paddle/fluid/operators/activation_op.h        | 31 +++++++++++++++++++
 paddle/fluid/platform/float16.h               |  5 +++
 .../tests/unittests/test_activation_op.py     | 20 +++++++++++-
 4 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index bb9ea3f3ba0..832245371e0 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -149,6 +149,13 @@ $out = \max(x, 0)$
 
 )DOC";
 
+UNUSED constexpr char GeluDoc[] = R"DOC(
+Gelu Activation Operator.
+
+$out = \\frac{1 + erf(\\frac{x}{\\sqrt{2}})}{2} x$
+
+)DOC";
+
 UNUSED constexpr char TanhDoc[] = R"DOC(
 Tanh Activation Operator.
 
@@ -472,6 +479,7 @@ REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc);
 REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc);
 REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc);
 REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc);
+REGISTER_ACTIVATION_OP_MAKER(Gelu, GeluDoc);
 REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc);
 REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc);
 REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc);
@@ -489,6 +497,7 @@ REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc);
 
 REGISTER_ACTIVATION_OP_GRAD_MAKER(Sigmoid, sigmoid);
 REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu, relu);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Gelu, gelu);
 REGISTER_ACTIVATION_OP_GRAD_MAKER(Exp, exp);
 REGISTER_ACTIVATION_OP_GRAD_MAKER(Tanh, tanh);
 REGISTER_ACTIVATION_OP_GRAD_MAKER(Ceil, ceil);
@@ -525,6 +534,7 @@ namespace ops = paddle::operators;
   __macro(Round, round);                     \
   __macro(Log, log);                         \
   __macro(Square, square);                   \
+  __macro(Gelu, gelu);                       \
   __macro(BRelu, brelu);                     \
   __macro(Pow, pow);                         \
   __macro(STanh, stanh);                     \
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 4ffc7f364bc..c60ca18d132 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -16,6 +16,11 @@ limitations under the License.
 */
 #include
 #include
+#include <cmath>
+#ifndef _USE_MATH_DEFINES
+#define _USE_MATH_DEFINES
+#endif
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
@@ -212,6 +217,31 @@ struct ReluGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+// gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
+template <typename T>
+struct GeluFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto temp =
+        ((x * static_cast<T>(M_SQRT1_2)).erf()).template cast<T>().eval();
+    out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
+  }
+};
+
+template <typename T>
+struct GeluGradFunctor : BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("gelu"); }
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto temp = (static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2) * x *
+                 ((-static_cast<T>(0.5) * x.square()).exp()))
+                    .template cast<T>()
+                    .eval();
+    dx.device(d) = dout * (out / x + temp);
+  }
+};
+
 // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
 template <typename T>
 struct TanhFunctor : public BaseActivationFunctor<T> {
@@ -877,6 +907,7 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
   __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);  \
   __macro(exp, ExpFunctor, ExpGradFunctor);                       \
   __macro(relu, ReluFunctor, ReluGradFunctor);                    \
+  __macro(gelu, GeluFunctor, GeluGradFunctor);                    \
   __macro(tanh, TanhFunctor, TanhGradFunctor);                    \
   __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);  \
   __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                    \
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index ee16fc66e4a..9d48557caf7 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -1039,6 +1039,11 @@ HOSTDEVICE inline float16 exp(const float16& a) {
   return float16(::expf(static_cast<float>(a)));
 }
 
+template <>
+HOSTDEVICE inline float16 erf(const float16& a) {
+  return float16(::erff(static_cast<float>(a)));
+}
+
 template <>
 HOSTDEVICE inline float16 log(const float16& a) {
   return float16(::logf(static_cast<float>(a)));
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index ad7591417ec..55c43ef115a 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -18,7 +18,7 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest
-from scipy.special import expit
+from scipy.special import expit, erf
 
 
 class TestActivation(OpTest):
@@ -295,6 +295,23 @@ class TestRelu(TestActivation):
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
+class TestGelu(TestActivation):
+    def setUp(self):
+        self.op_type = "gelu"
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+        out = 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
 class TestBRelu(TestActivation):
     def setUp(self):
         self.op_type = "brelu"
@@ -628,6 +645,7 @@ create_test_act_fp16_class(TestCos, grad_atol=0.85)
 create_test_act_fp16_class(TestSin)
 create_test_act_fp16_class(TestRound, grad_check=False)
 create_test_act_fp16_class(TestRelu)
+create_test_act_fp16_class(TestGelu)
 create_test_act_fp16_class(TestBRelu)
 create_test_act_fp16_class(TestRelu6)
 create_test_act_fp16_class(TestSoftRelu)
-- 
GitLab
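
A standalone sketch, not part of the patch above: it mirrors the forward and backward formulas implemented by GeluFunctor and GeluGradFunctor using only NumPy/SciPy, and checks the analytic gradient against a central finite difference. The helper names gelu and gelu_grad are illustrative only, not Paddle APIs.

    # Verify the GELU formulas used in the patch with NumPy/SciPy.
    import numpy as np
    from scipy.special import erf


    def gelu(x):
        # Forward, as in GeluFunctor: 0.5 * x * (1 + erf(x / sqrt(2))).
        return 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))


    def gelu_grad(x):
        # Gradient, as in GeluGradFunctor:
        #   d/dx gelu(x) = gelu(x) / x + x * exp(-x^2 / 2) / sqrt(2 * pi)
        # where 1 / sqrt(2 * pi) is the 0.5 * M_2_SQRTPI * M_SQRT1_2 constant.
        return gelu(x) / x + x * np.exp(-0.5 * x * x) / np.sqrt(2.0 * np.pi)


    if __name__ == "__main__":
        # Same shape as TestGelu; positive inputs keep the gelu(x) / x term
        # (the out / x factor in the C++ kernel) well defined.
        x = np.random.uniform(0.1, 1.0, [11, 17])
        eps = 1e-6
        numeric = (gelu(x + eps) - gelu(x - eps)) / (2.0 * eps)
        assert np.allclose(gelu_grad(x), numeric, atol=1e-5)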