diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
index 92db62907924d8e9e3e6acde88f3d66b7f69ec0a..a6bb738af3ad30b9f1e5b0340fb7a7419794427a 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -49,6 +49,18 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LogSigmoidOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of LogSigmoid operator");
+    AddOutput("Y", "Output of LogSigmoid operator");
+    AddComment(
+        "Logsigmoid activation operator, logsigmoid = log (1 / (1 + exp(-x)))");
+  }
+};
+
 class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ExpOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
@@ -85,6 +97,23 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+template
+class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SoftShrinkOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Softshrink operator");
+    AddOutput("Y", "Output of Softshrink operator");
+    AddComment(
+        "Softshrink activation operator, "
+        "softshrink = x - lambda, if x > lambda;"
+        " x + lambda, if x < -lambda; 0 otherwise");
+    AddAttr("lambda", "non-negative offset")
+        .SetDefault(static_cast(0.5f));
+  }
+};
+
 class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   TanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
@@ -271,6 +300,9 @@ namespace ops = paddle::operators;
 REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad,
             ops::ActivationOpGrad);
 
+REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker,
+            logsigmoid_grad, ops::ActivationOpGrad);
+
 REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
             ops::ActivationOpGrad);
 
@@ -283,6 +315,9 @@ REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
 REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
             tanh_shrink_grad, ops::ActivationOpGrad);
 
+REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
+            softshrink_grad, ops::ActivationOpGrad);
+
 REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
             ops::ActivationOpGrad);
 
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index 123f0c4dbca6537c9bd167ca74a06987db6e1893..70d5a620521b234186d8d0a16695ce630e804aa3 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -95,6 +95,41 @@ struct SigmoidGradFunctor : public BaseActivationFunctor {
   }
 };
 
+// Originally: logsigmoid(x) = -log (1 + exp(-x))
+// For numerical stability, we can use the log-sum-exp trick:
+// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
+// We can rewrite the above equation as:
+// y = -log( exp(0) + exp(-x)) [since exp(0) = 1]
+//   = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0)))
+//   = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) + exp(max(-x, 0)) * exp(-x -
+//           max(-x, 0)))
+//   = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
+//   = -log( exp(max(-x, 0))) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
+//
+// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0))
+// + exp(-x - max(-x, 0))))
+template
+struct LogSigmoidFunctor : public BaseActivationFunctor {
+  template
+  void operator()(Device d, X x, Y y) const {
+    auto temp = (-x).cwiseMax(static_cast(0));  // temp = max(-x, 0)
+    y.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log());
+  }
+};
+
+// Originally: f' = exp(-x) / (1 + exp(-x))
+// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) +
+// exp(-x - max(-x, 0)))
+template
+struct LogSigmoidGradFunctor : public BaseActivationFunctor {
+  template
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    auto temp = (-x).cwiseMax(static_cast(0));  // temp = max(-x, 0)
+    dx.device(d) =
+        dy * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
+  }
+};
+
 // exp(x) = e^x
 template
 struct ExpFunctor : public BaseActivationFunctor {
@@ -164,6 +199,37 @@ struct TanhShrinkGradFunctor : public BaseActivationFunctor {
   }
 };
 
+// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
+// otherwise
+template
+struct SoftShrinkFunctor : public BaseActivationFunctor {
+  float lambda;
+  typename BaseActivationFunctor::AttrPair GetAttrs() {
+    return {{"lambda", &lambda}};
+  }
+
+  template
+  void operator()(Device d, X x, Y y) const {
+    auto temp1 = (x > lambda).template cast().eval();
+    auto temp2 = (x < -lambda).template cast().eval();
+    y.device(d) = temp1 * (x - lambda) + temp2 * (x + lambda);
+  }
+};
+
+template
+struct SoftShrinkGradFunctor : public BaseActivationFunctor {
+  float lambda;
+  typename BaseActivationFunctor::AttrPair GetAttrs() {
+    return {{"lambda", &lambda}};
+  }
+  template
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    auto temp1 = (x > lambda).template cast().eval();
+    auto temp2 = (x < -lambda).template cast().eval();
+    dx.device(d) = dy * (temp1 + temp2).template cast();
+  }
+};
+
 // sqrt(x) = x^(1/2)
 template
 struct SqrtFunctor : public BaseActivationFunctor {
@@ -471,9 +537,11 @@ struct STanhGradFunctor : public BaseActivationFunctor {
 
 #define FOR_EACH_KERNEL_FUNCTOR(__macro) \
   __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \
+  __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
   __macro(exp, ExpFunctor, ExpGradFunctor); \
   __macro(relu, ReluFunctor, ReluGradFunctor); \
   __macro(tanh, TanhFunctor, TanhGradFunctor); \
+  __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
   __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \
   __macro(abs, AbsFunctor, AbsGradFunctor); \
   __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
@@ -484,7 +552,7 @@ struct STanhGradFunctor : public BaseActivationFunctor {
   __macro(pow, PowFunctor, PowGradFunctor); \
   __macro(stanh, STanhFunctor, STanhGradFunctor); \
   __macro(softsign, SoftsignFunctor, SoftsignGradFunctor); \
-  __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor); \
   __macro(relu6, Relu6Functor, Relu6GradFunctor); \
+  __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor); \
   __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
   __macro(elu, ELUFunctor, ELUGradFunctor)
diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/framework/tests/test_activation_op.py
index 4528ed555d6bd316a9a0d8f76de861f2b8a61030..9157e00f6e00f2fe90245c2b5c55984c37a069bc 100644
--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
@@ -33,6 +33,21 @@ class TestSigmoid(OpTest):
         self.check_grad(['X'], 'Y', max_relative_error=0.008)
 
 
+class TestLogSigmoid(OpTest):
+    def setUp(self):
+        self.op_type = "logsigmoid"
+        self.inputs = {
+            'X': np.random.uniform(-1, 1, [11, 17]).astype("float32")
+        }
+        self.outputs = {'Y': np.log(1 / (1 + np.exp(-self.inputs['X'])))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.008)
+
+
 class TestTanh(OpTest):
     def setUp(self):
         self.op_type = "tanh"
@@ -63,6 +78,26 @@ class TestTanhShrink(OpTest):
         self.check_grad(['X'], 'Y', max_relative_error=0.008)
 
 
+class TestSoftShrink(OpTest):
+    def setUp(self):
+        self.op_type = "softshrink"
+        lambda_val = 0.1
+        self.attrs = {'lambda': lambda_val}
+        self.inputs = {
+            'X': np.random.uniform(0.25, 10, [4, 4]).astype("float32")
+        }
+        y = np.copy(self.inputs['X'])
+        y = (y < -lambda_val) * (y + lambda_val) + (y > lambda_val) * (
+            y - lambda_val)
+        self.outputs = {'Y': y}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+
+
 class TestSqrt(OpTest):
     def setUp(self):
         self.op_type = "sqrt"