// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include #include #include #include #include #include #include #include #ifndef _USE_MATH_DEFINES #define _USE_MATH_DEFINES #endif #include #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h" #ifdef PADDLE_WITH_XPU_KP #define __forceinline__ __inline__ #endif namespace phi { namespace funcs { enum ActBwdOpFwdDeps { kNoDeps = 0x00, // Do not need any forward input/output kDepX = 0x01, // Only need forward input X kDepOut = 0x02, // Only need forward output Out }; template struct BaseActivationFunctor { using ELEMENT_TYPE = T; using AttrPair = std::vector>; AttrPair GetAttrs() { return AttrPair(); } }; template struct Sine { HOSTDEVICE T operator()(const T& val) const { return sin(val); } }; template <> struct Sine { HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { return dtype::float16(sin(static_cast(val))); } }; template <> struct Sine { HOSTDEVICE dtype::bfloat16 operator()(const dtype::bfloat16& val) const { return dtype::bfloat16(sin(static_cast(val))); } }; template struct Cosine { HOSTDEVICE T operator()(const T& val) const { return cos(val); } }; template <> 
struct Cosine { HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { return dtype::float16(cos(static_cast(val))); } }; template <> struct Cosine { HOSTDEVICE dtype::bfloat16 operator()(const dtype::bfloat16& val) const { return dtype::bfloat16(cos(static_cast(val))); } }; // sine'(x) = cos(x) template struct SinGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { dx.device(d) = dout * x.unaryExpr(Cosine()); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // sine(x) = sin(x) template struct SinFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { // Note(GGBond8488): Since Eigen3.3, Behavior like {A = (B * A).cwiseAbs()} // will give wrong result, details see // http://eigen.tuxfamily.org/dox/group__TopicAliasing.html out.device(d) = x.unaryExpr(Sine()).eval(); } }; // sine''(x) = -sin(x) template struct SinDoubleGradFunctor : public BaseActivationFunctor { template void operator()(const Device& dev, const DenseTensor* X, const DenseTensor* dOut, const DenseTensor* ddX, DenseTensor* dX, DenseTensor* ddOut) const { auto* d = dev.eigen_device(); auto d2d1x = EigenVector::Flatten( GET_DATA_SAFELY(ddX, "Input", "d2d1x", "SinDoubleGrad")); auto x = EigenVector::Flatten( GET_DATA_SAFELY(X, "Input", "x", "SinDoubleGrad")); // calculate d2x first, so d2d1y can inplace d2d1x auto d2x = EigenVector::Flatten( GET_DATA_SAFELY(dX, "Output", "d2x", "SinDoubleGrad")); if (dX) { if (dOut) { auto d1y = EigenVector::Flatten( GET_DATA_SAFELY(dOut, "Output", "d1y", "SinDoubleGrad")); d2x.device(*d) = -d2d1x * x.unaryExpr(Sine()) * d1y; } else { d2x.device(*d) = -d2d1x * x.unaryExpr(Sine()) * static_cast(0); } } // calculate d2d1y if (ddOut) { auto d2d1y = EigenVector::Flatten( GET_DATA_SAFELY(ddOut, "Output", "d2d1y", "SinDoubleGrad")); d2d1y.device(*d) = d2d1x * x.unaryExpr(Cosine()); } } static constexpr ActBwdOpFwdDeps FwdDeps() 
{ return kDepX; } }; // 1st reverse grad // y = sin(x) // x --> y // d1x = d1y * cos(x) // // 2nd reverse grad // x, d1y --> d1x // d2x = -sin(x) * d1y * d2d1x // d2d1y = cos(x) * d2d1x // // 3rd reverse grad // x, d1y, d2d1x --> d2x, d2d1y // d3x = -cos(x) * d1y * d2d1x * d3d2x - sin(x) * d2d1x * d3d2d1y // d3d1y = -sin(x) * d2d1x * d3d2x // d3d2d1x = -sin(x) * d1y * d3d2x + cos(x) * d3d2d1y template struct SinTripleGradFunctor : public BaseActivationFunctor { template void operator()(const Device& dev, const DenseTensor* X, const DenseTensor* ddX, const DenseTensor* dOut, const DenseTensor* d_DDOut, const DenseTensor* d_dx_New, DenseTensor* d_d_Out, DenseTensor* d_x_New, DenseTensor* d_DDx) const { auto* d = dev.eigen_device(); auto x = EigenVector::Flatten( GET_DATA_SAFELY(X, "Input", "x", "SinTripleGrad")); auto d3d2x = EigenVector::Flatten( GET_DATA_SAFELY(d_dx_New, "Input", "d3d2x", "SinTripleGrad")); if (d_x_New) { auto d3x = EigenVector::Flatten( GET_DATA_SAFELY(d_x_New, "Output", "d3x", "SinTripleGrad")); if (dOut && ddX && d_DDOut) { auto d2d1x = EigenVector::Flatten( GET_DATA_SAFELY(ddX, "Input", "d2d1x", "SinTripleGrad")); auto d1y = EigenVector::Flatten( GET_DATA_SAFELY(dOut, "Input", "d1y", "SinTripleGrad")); auto d3d2d1y = EigenVector::Flatten( GET_DATA_SAFELY(d_DDOut, "Input", "d3d2d1y", "SinTripleGrad")); d3x.device(*d) = -x.unaryExpr(Cosine()) * d1y * d2d1x * d3d2x - x.unaryExpr(Sine()) * d2d1x * d3d2d1y; } else if (!dOut && ddX && d_DDOut) { auto d2d1x = EigenVector::Flatten( GET_DATA_SAFELY(ddX, "Input", "d2d1x", "SinTripleGrad")); auto d3d2d1y = EigenVector::Flatten( GET_DATA_SAFELY(d_DDOut, "Input", "d3d2d1y", "SinTripleGrad")); d3x.device(*d) = -x.unaryExpr(Sine()) * d2d1x * d3d2d1y; } else if (dOut && ddX && !d_DDOut) { auto d2d1x = EigenVector::Flatten( GET_DATA_SAFELY(ddX, "Input", "d2d1x", "SinTripleGrad")); auto d1y = EigenVector::Flatten( GET_DATA_SAFELY(dOut, "Input", "d1y", "SinTripleGrad")); d3x.device(*d) = -x.unaryExpr(Cosine()) * 
d1y * d2d1x * d3d2x; } else { d3x.device(*d) = x * static_cast(0); } } if (d_d_Out) { auto d3d1y = EigenVector::Flatten( GET_DATA_SAFELY(d_d_Out, "Output", "d3d1y", "SinTripleGrad")); if (ddX) { auto d2d1x = EigenVector::Flatten( GET_DATA_SAFELY(ddX, "Input", "d2d1x", "SinTripleGrad")); d3d1y.device(*d) = -x.unaryExpr(Sine()) * d2d1x * d3d2x; } else { d3d1y.device(*d) = static_cast(0) * x; } } if (d_DDx) { auto d3d2d1x = EigenVector::Flatten( GET_DATA_SAFELY(d_DDx, "Output", "d3d2d1x", "SinTripleGrad")); if (dOut && d_DDOut) { auto d1y = EigenVector::Flatten( GET_DATA_SAFELY(dOut, "Input", "d1y", "SinTripleGrad")); auto d3d2d1y = EigenVector::Flatten( GET_DATA_SAFELY(d_DDOut, "Input", "d3d2d1y", "SinTripleGrad")); d3d2d1x.device(*d) = -x.unaryExpr(Sine()) * d1y * d3d2x + x.unaryExpr(Cosine()) * d3d2d1y; } else if (dOut && !d_DDOut) { auto d1y = EigenVector::Flatten( GET_DATA_SAFELY(dOut, "Input", "d1y", "SinTripleGrad")); d3d2d1x.device(*d) = -x.unaryExpr(Sine()) * d1y * d3d2x; } else if (!dOut && d_DDOut) { auto d3d2d1y = EigenVector::Flatten( GET_DATA_SAFELY(d_DDOut, "Input", "d3d2d1y", "SinTripleGrad")); d3d2d1x.device(*d) = x.unaryExpr(Cosine()) * d3d2d1y; } else { d3d2d1x.device(*d) = x * static_cast(0); } } } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepOut; } }; // reciprocal(x) = 1 / x template struct ReciprocalFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = static_cast(1) / x; } }; template struct ReciprocalGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x UNUSED, Out out, dOut dout, dX dx) const { dx.device(d) = dout * static_cast(-1) * out * out; } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepOut; } }; // 1st reverse grad // y = cos(x) // x --> y // d1x = d1y * -sin(x) // // 2nd reverse grad // x, d1y --> d1x // d2x = -cos(x) * d1y * d2d1x // d2d1y = -sin(x) * d2d1x // // 3rd reverse grad // x, d1y, 
d2d1x --> d2x, d2d1y // d3x = sin(x) * d1y * d2d1x * d3d2x - cos(x) * d2d1x * d3d2d1y // d3d1y = -cos(x) * d2d1x * d3d2x // d3d2d1x = -cos(x) * d1y * d3d2x - sin(x) * d3d2d1y // cosine'(x) = -sin(x) template struct CosGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { dx.device(d) = -dout * x.unaryExpr(Sine()); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // cos''(x) = -cos(x) template struct CosDoubleGradFunctor : public BaseActivationFunctor { template void operator()(const Device& dev, const DenseTensor* X, const DenseTensor* dOut, const DenseTensor* ddX, DenseTensor* dX, DenseTensor* ddOut) const { auto* d = dev.eigen_device(); auto d2d1x = EigenVector::Flatten( GET_DATA_SAFELY(ddX, "Input", "d2d1x", "CosDoubleGrad")); auto x = EigenVector::Flatten( GET_DATA_SAFELY(X, "Input", "x", "CosDoubleGrad")); // calculate d2x first, so d2d1y can inplace d2d1x auto d2x = EigenVector::Flatten( GET_DATA_SAFELY(dX, "Output", "d2x", "CosDoubleGrad")); if (ddOut) { if (dOut) { auto d1y = EigenVector::Flatten( GET_DATA_SAFELY(dOut, "Output", "d1y", "CosDoubleGrad")); d2x.device(*d) = -d2d1x * x.unaryExpr(Cosine()) * d1y; } else { d2x.device(*d) = x * static_cast(0); } } if (dX) { // calculate d2d1y auto d2d1y = EigenVector::Flatten( GET_DATA_SAFELY(ddOut, "Output", "d2d1y", "CosDoubleGrad")); d2d1y.device(*d) = -d2d1x * x.unaryExpr(Sine()); } } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template struct CosTripleGradFunctor : public BaseActivationFunctor { template void operator()(const Device& dev, const DenseTensor* X, const DenseTensor* ddX, const DenseTensor* dOut, const DenseTensor* d_DDOut, const DenseTensor* d_dx_New, DenseTensor* d_d_Out, DenseTensor* d_x_New, DenseTensor* d_DDx) const { auto* d = dev.eigen_device(); auto x = EigenVector::Flatten( GET_DATA_SAFELY(X, "Input", "x", "CosTripleGrad")); auto d3d2x = EigenVector::Flatten( 
GET_DATA_SAFELY(d_dx_New, "Input", "d3d2x", "CosTripleGrad"));

    // d3x = sin(x) * d1y * d2d1x * d3d2x - cos(x) * d2d1x * d3d2d1y
    if (d_x_New) {
      auto d3x = EigenVector<T>::Flatten(
          GET_DATA_SAFELY(d_x_New, "Output", "d3x", "CosTripleGrad"));
      if (dOut && ddX && d_DDOut) {
        auto d2d1x = EigenVector<T>::Flatten(
            GET_DATA_SAFELY(ddX, "Input", "d2d1x", "CosTripleGrad"));
        auto d1y = EigenVector<T>::Flatten(
            GET_DATA_SAFELY(dOut, "Input", "d1y", "CosTripleGrad"));
        auto d3d2d1y = EigenVector<T>::Flatten(
            GET_DATA_SAFELY(d_DDOut, "Input", "d3d2d1y", "CosTripleGrad"));
        d3x.device(*d) = x.unaryExpr(Sine<T>()) * d1y * d2d1x * d3d2x -
                         x.unaryExpr(Cosine<T>()) * d2d1x * d3d2d1y;
      } else if (dOut && ddX && !d_DDOut) {
        auto d2d1x = EigenVector<T>::Flatten(
            GET_DATA_SAFELY(ddX, "Input", "d2d1x", "CosTripleGrad"));
        auto d1y = EigenVector<T>::Flatten(
            GET_DATA_SAFELY(dOut, "Input", "d1y", "CosTripleGrad"));
        d3x.device(*d) = x.unaryExpr(Sine<T>()) * d1y * d2d1x * d3d2x;
      } else if (!dOut && ddX && d_DDOut) {
        auto d2d1x = EigenVector<T>::Flatten(
            GET_DATA_SAFELY(ddX, "Input", "d2d1x", "CosTripleGrad"));
        auto d3d2d1y = EigenVector<T>::Flatten(
            GET_DATA_SAFELY(d_DDOut, "Input", "d3d2d1y", "CosTripleGrad"));
        d3x.device(*d) = -x.unaryExpr(Cosine<T>()) * d2d1x * d3d2d1y;
      } else {
        // None of the contributing inputs is available: write zeros.
        d3x.device(*d) = static_cast<T>(0) * x;
      }
    }

    // d3d1y = -cos(x) * d2d1x * d3d2x
    if (d_d_Out) {
      auto d3d1y = EigenVector<T>::Flatten(
          GET_DATA_SAFELY(d_d_Out, "Output", "d3d1y", "CosTripleGrad"));
      if (ddX) {
        auto d2d1x = EigenVector<T>::Flatten(
            GET_DATA_SAFELY(ddX, "Input", "d2d1x", "CosTripleGrad"));
        d3d1y.device(*d) = -x.unaryExpr(Cosine<T>()) * d2d1x * d3d2x;
      } else {
        d3d1y.device(*d) = static_cast<T>(0) * x;
      }
    }

    // d3d2d1x = -cos(x) * d1y * d3d2x - sin(x) * d3d2d1y
    if (d_DDx) {
      auto d3d2d1x = EigenVector<T>::Flatten(
          GET_DATA_SAFELY(d_DDx, "Output", "d3d2d1x", "CosTripleGrad"));
      if (dOut && d_DDOut) {
        auto d1y = EigenVector<T>::Flatten(
            GET_DATA_SAFELY(dOut, "Input", "d1y", "CosTripleGrad"));
        auto d3d2d1y = EigenVector<T>::Flatten(
            GET_DATA_SAFELY(d_DDOut, "Input", "d3d2d1y", "CosTripleGrad"));
        d3d2d1x.device(*d) = -x.unaryExpr(Cosine<T>()) * d1y * d3d2x -
                             x.unaryExpr(Sine<T>()) * d3d2d1y;
      } else if (!dOut && d_DDOut) {
        auto d3d2d1y
= EigenVector::Flatten( GET_DATA_SAFELY(d_DDOut, "Input", "d3d2d1y", "CosTripleGrad")); d3d2d1x.device(*d) = -x.unaryExpr(Sine()) * d3d2d1y; } else if (dOut && !d_DDOut) { auto d1y = EigenVector::Flatten( GET_DATA_SAFELY(dOut, "Input", "d1y", "CosTripleGrad")); d3d2d1x.device(*d) = -x.unaryExpr(Cosine()) * d1y * d3d2x; } else { d3d2d1x.device(*d) = static_cast(0) * x; } } } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepOut; } }; // cosine(x) = cos(x) template struct CosFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = x.unaryExpr(Cosine()).eval(); } }; template struct LogitFunctor { template void operator()(Device d, X x, Out out, P p, float eps) const { // logit(x) = ln(x/(1-x)) auto tmp_x = (x.cwiseMin(static_cast(1.0 - eps))).cwiseMax(static_cast(eps)); if (!eps) { out.device(d) = (x < static_cast(0.0) || x > static_cast(1.0)) .select(p.constant(static_cast(NAN)), (tmp_x / (static_cast(1) - tmp_x)).log()); } else { out.device(d) = (tmp_x / (static_cast(1) - tmp_x)).log(); } } }; // mish(x) = x * tanh(softplus(x)) // softplus(x) = x, if x > threshold // = ln(1 + exp(x)), otherwise template struct MishFunctor : public BaseActivationFunctor { float threshold; typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"threshold", &threshold}}; } template void operator()(Device d, X x, Out out) const { auto sp = (x > static_cast(threshold)) .select(x, (static_cast(1) + x.exp()).log()); out.device(d) = x * sp.tanh(); } }; // dx = dout * (tanh(sp) + x * (1 - tanh(sp) ** 2) * (1 - exp(-sp))) // sp = softplus(x) template struct MishGradFunctor : public BaseActivationFunctor { float threshold; typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"threshold", &threshold}}; } template void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { auto sp = (x > static_cast(threshold)) .select(x, (static_cast(1) + x.exp()).log()); auto gsp = static_cast(1) - 
(-sp).exp(); auto tsp = sp.tanh(); dx.device(d) = dout * (tsp + x * (static_cast(1) - tsp * tsp) * gsp); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template struct STanhFunctor : public BaseActivationFunctor { float scale_a; float scale_b; typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; } template void operator()(Device d, X x, Out out) const { out.device(d) = static_cast(scale_b) * (static_cast(scale_a) * x).tanh(); } }; template struct STanhGradFunctor : public BaseActivationFunctor { float scale_a; float scale_b; typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; } template void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { auto a = static_cast(scale_a); auto b = static_cast(scale_b); auto temp = (a * x).tanh() * (a * x).tanh(); dx.device(d) = dout * a * b * (static_cast(1) - temp); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template struct Tangent { HOSTDEVICE T operator()(const T& val) const { return tan(val); } }; template <> struct Tangent { HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { return dtype::float16(tan(static_cast(val))); } }; // Tangent'(x) = -Tangent(x) template struct TanGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { dx.device(d) = dout / x.unaryExpr(Cosine()).square(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // square(x) = x^2 template struct SquareFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = x.square(); } }; template struct SquareGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { dx.device(d) = dout * static_cast(2) * x; } static constexpr ActBwdOpFwdDeps 
FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // sqrt(x) = x^(1/2) template struct SqrtFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = x.sqrt(); } }; template struct SqrtGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x UNUSED, Out out, dOut dout, dX dx) const { dx.device(d) = static_cast(0.5) * dout / out; } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepOut; } }; // rsqrt(x) = x^(-1/2) template struct RsqrtFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = x.rsqrt(); } }; template struct RsqrtGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x UNUSED, Out out, dOut dout, dX dx) const { dx.device(d) = static_cast(-0.5) * dout * out * out * out; } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepOut; } }; // // For numerical stability, using the following formula instead of // softplus(x) = // // log(1 + exp(x)) // // softplus(x) = log(1 + exp(beta * x)) / beta when beta * x <= // threshold(beta = // // 1, threshold = 20 by default), otherwise x template struct SoftplusFunctor : public BaseActivationFunctor { float beta; float threshold; typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"beta", &beta}, {"threshold", &threshold}}; } template void operator()(Device d, X x, Out out) const { auto x_beta = static_cast(beta) * x; out.device(d) = (x_beta > static_cast(threshold)) .select(x, (static_cast(1) + x_beta.exp()).log() / static_cast(beta)); } }; // For numerical stability, using the following formula instead of // d(softplus(x))/dx = 1 / (1 + exp(-x)) // d(softplus(x))/dx = 1 / (1 + exp(-beta * x)) when beta * x <= threshold(beta // = 1, threshold = 20 by default), otherwise x template struct SoftplusGradFunctor : public BaseActivationFunctor { float beta; float threshold; typename 
BaseActivationFunctor::AttrPair GetAttrs() { return {{"beta", &beta}, {"threshold", &threshold}}; } template void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { auto x_beta = static_cast(beta) * x; dx.device(d) = (x_beta > static_cast(threshold)) .select(dout, dout / (static_cast(1) + (-x_beta).exp())); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template struct SoftplusDoubleGradFunctor : public BaseActivationFunctor { float beta; float threshold; typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"beta", &beta}, {"threshold", &threshold}}; } template void operator()(const Device& dev, const DenseTensor* X, const DenseTensor* dOut, const DenseTensor* ddX, DenseTensor* dX, DenseTensor* ddOut) const { auto* d = dev.eigen_device(); auto x = EigenVector::Flatten( GET_DATA_SAFELY(X, "Input", "X", "SoftplusDoubleGrad")); auto x_beta = static_cast(beta) * x; auto ddx = EigenVector::Flatten( GET_DATA_SAFELY(ddX, "Input", "DDX", "SoftplusDoubleGrad")); if (dX) { auto dx = EigenVector::Flatten( GET_DATA_SAFELY(dX, "Output", "DX", "SoftplusDoubleGrad")); auto dout = EigenVector::Flatten( GET_DATA_SAFELY(dOut, "Output", "DOut", "SoftplusDoubleGrad")); // ddx * dout * beta * exp(x_beta) / (exp(x_beta) + 1) ^ 2, if x_beta // <= threshold // 0, if x_beta > threshold dx.device(*d) = (x_beta > static_cast(threshold)) .select(x.constant(static_cast(0)), ddx * dout * static_cast(beta) * x_beta.exp() / (x_beta.exp() + static_cast(1)) .pow(static_cast(2))); } if (ddOut) { auto ddout = EigenVector::Flatten( GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SoftplusDoubleGrad")); // ddx / (1 + exp(-x_beta)), if x_beta <= threshold // ddx, if x_beta > threshold ddout.device(*d) = (x_beta > static_cast(threshold)) .select(ddx, ddx / (static_cast(1) + (-x_beta).exp())); } } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // Tangent(x) = tan(x) template struct TanFunctor : public 
BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { // Note(GGBond8488): Since Eigen3.3, Behavior like {A = (B * A).cwiseAbs()} // will give wrong result, details see // http://eigen.tuxfamily.org/dox/group__TopicAliasing.html out.device(d) = x.unaryExpr(Tangent()).eval(); } }; template struct Sinh { HOSTDEVICE T operator()(const T& val) const { return sinh(val); } }; template <> struct Sinh { HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { return dtype::float16(sinhf(static_cast(val))); } }; template struct Cosh { HOSTDEVICE T operator()(const T& val) const { return cosh(val); } }; template <> struct Cosh { HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { return dtype::float16(coshf(static_cast(val))); } }; // sinh(x) = sinh(x) template struct SinhFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = x.unaryExpr(Sinh()).eval(); } }; // cosh(x) = cosh(x) template struct CoshFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = x.unaryExpr(Cosh()).eval(); } }; // sinh'(x) = cosh(x) template struct SinhGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { dx.device(d) = dout * x.unaryExpr(Cosh()); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // cosh'(x) = sinh(x) template struct CoshGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { dx.device(d) = dout * x.unaryExpr(Sinh()); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template struct Acos { HOSTDEVICE T operator()(const T& val) const { return acos(val); } }; template <> struct Acos { HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { return dtype::float16(acos(static_cast(val))); } }; // Acos(x) = acos(x) template 
struct AcosFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = x.unaryExpr(Acos()).eval(); } }; // acos'(x) = -1/sqrt(1-x^2) template struct AcosGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { dx.device(d) = -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template struct Asin { HOSTDEVICE T operator()(const T& val) const { return asin(val); } }; template <> struct Asin { HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { return dtype::float16(asin(static_cast(val))); } }; // Asin(x) = asin(x) template struct AsinFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = x.unaryExpr(Asin()).eval(); } }; // asin'(x) = 1/sqrt(1-x^2) template struct AsinGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { dx.device(d) = dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template struct Atan { HOSTDEVICE T operator()(const T& val) const { return atan(val); } }; template <> struct Atan { HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { return dtype::float16(atan(static_cast(val))); } }; // Atan(x) = atan(x) template struct AtanFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = x.unaryExpr(Atan()).eval(); } }; // atan'(x) = 1 / (1 + x^2) template struct AtanGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template struct LogitGradFunctor 
{ template void operator()(Device d, X x, dOut dout, dX dx, P p, float eps) const { // logit(x)' = 1/(x*(1-x)) dx.device(d) = (x < static_cast(eps) || x > static_cast(1.0 - eps)) .select(p.constant(static_cast(0)), dout * (static_cast(1) / ((static_cast(1) - x) * x))); } }; template struct Acosh { HOSTDEVICE T operator()(const T& val) const { return acosh(val); } }; template <> struct Acosh { HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { return dtype::float16(acosh(static_cast(val))); } }; // Acosh(x) = acosh(x) template struct AcoshFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = x.unaryExpr(Acosh()).eval(); } }; // acosh'(x) = 1/sqrt(x^2 - 1) template struct AcoshGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { dx.device(d) = dout * static_cast(1) / (x * x - static_cast(1)).sqrt(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template struct Asinh { HOSTDEVICE T operator()(const T& val) const { return asinh(val); } }; template <> struct Asinh { HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { return dtype::float16(asinh(static_cast(val))); } }; // Asinh(x) = asinh(x) template struct AsinhFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = x.unaryExpr(Asinh()).eval(); } }; // asinh'(x) = 1/sqrt(x^2 + 1) template struct AsinhGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { dx.device(d) = dout * static_cast(1) / (x.square() + static_cast(1)).sqrt(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template struct Atanh { HOSTDEVICE T operator()(const T& val) const { return atanh(val); } }; template <> struct Atanh { HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { return 
dtype::float16(atanh(static_cast(val))); } }; // Atanh(x) = atanh(x) template struct AtanhFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = x.unaryExpr(Atanh()).eval(); } }; // atanh'(x) = 1/(1 - x^2) template struct AtanhGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { dx.device(d) = dout * static_cast(1) / (static_cast(1) - x.square()); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; // exp functor // exp(x) = e^x template struct ExpFunctor : public BaseActivationFunctor { using U = typename std::conditional_t::value, float, T>; template void operator()(Device d, X x, Out out) const { out.device(d) = x.template cast().exp(); } }; template struct ExpGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x UNUSED, Out out, dOut dout, dX dx) const { dx.device(d) = dout * out; } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepOut; } }; // expm1(x) = e^x - 1 template struct Expm1Functor : public BaseActivationFunctor { using U = typename std::conditional_t::value, float, T>; template void operator()(Device d, X x, Out out) const { out.device(d) = x.template cast().expm1(); } }; template struct Expm1GradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x UNUSED, Out out, dOut dout, dX dx) const { dx.device(d) = dout * out + dout; } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepOut; } }; // relu(x) = max(x, 0) template struct ReluCPUFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = x.unaryExpr([] HOSTDEVICE(T v) { return v > static_cast(0) ? 
v : static_cast(0); }); } }; template struct ReluCUDAFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = x.cwiseMax(static_cast(0)); } }; template struct ReluGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x UNUSED, Out out, dOut dout, dX dx) const { dx.device(d) = dout * (out > static_cast(0)).template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepOut; } }; template struct ReluGradGradFunctor : public BaseActivationFunctor { template void operator()(const Device& dev, const DenseTensor* X UNUSED, const DenseTensor* Out, const DenseTensor* ddX, DenseTensor* ddOut, DenseTensor* dOut UNUSED, DenseTensor* dX UNUSED) const { auto* d = dev.eigen_device(); auto ddx = EigenVector::Flatten( GET_DATA_SAFELY(ddX, "Input", "DDX", "ReluGradGrad")); auto out = EigenVector::Flatten( GET_DATA_SAFELY(Out, "Output", "Out", "ReluGradGrad")); if (ddOut) { auto ddout = EigenVector::Flatten( GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ReluGradGrad")); ddout.device(*d) = ddx * (out > static_cast(0)).template cast(); } } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepOut; } }; // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template struct TanhFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { out.device(d) = x.tanh(); } }; template struct TanhGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x UNUSED, Out out, dOut dout, dX dx) const { dx.device(d) = dout * (static_cast(1) - out * out); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepOut; } }; template struct TanhGradGradFunctor : public BaseActivationFunctor { template void operator()(const Device& dev, const DenseTensor* Out, const DenseTensor* ddX, const DenseTensor* dOut, DenseTensor* dOutNew, DenseTensor* ddOut) const { auto* d = dev.eigen_device(); auto ddx = 
EigenVector::Flatten( GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); auto out = EigenVector::Flatten( GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out // * ddx) if (dOutNew) { auto dout = EigenVector::Flatten( GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); auto dout_new = EigenVector::Flatten( GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad")); dout_new.device(*d) = static_cast(-1) * dout * static_cast(2) * out * ddx; } if (ddOut) { auto ddout = EigenVector::Flatten( GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad")); ddout.device(*d) = (static_cast(1) - out * out) * ddx; } } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepOut; } }; /* Out DOut D_Dout DDx -> TanhTripleGrad -> D_DDx D_DDout d_OutNew D_Dout_new D_Dout = (-2) * Out * DDx * D_Dout_new D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new Out, DDX, DOut, D_DDOut, D_DOut_New // input D_OutNew, D_DOut, D_DDx // output */ template struct TanhTripleGradFunctor : public BaseActivationFunctor { template void operator()(const Device& dev, const DenseTensor* Out, const DenseTensor* ddX, const DenseTensor* dOut, const DenseTensor* d_DDOut, const DenseTensor* d_dOut_New, DenseTensor* d_d_Out, DenseTensor* d_Out_New, DenseTensor* d_DDx) const { auto* d = dev.eigen_device(); auto ddx = EigenVector::Flatten( GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad")); auto out = EigenVector::Flatten( GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad")); auto dout = EigenVector::Flatten( GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad")); if (d_Out_New) { auto d_OutNew = EigenVector::Flatten( GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad")); if (d_DDOut && d_dOut_New) { auto d_ddOut = EigenVector::Flatten( GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad")); auto 
d_dOutNew = EigenVector::Flatten(GET_DATA_SAFELY( d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad")); d_OutNew.device(*d) = (static_cast(-2) * out * ddx * d_ddOut) - (static_cast(2) * dout * ddx * d_dOutNew); } else if (d_DDOut && !d_dOut_New) { auto d_ddOut = EigenVector