未验证 提交 d7112180 编写于 作者: Y YuanRisheng 提交者: GitHub

[Phi]Move Tanh/BRelu/LeakyRelu/ThresholdedRelu Kernels to Phi (#40385)

* move activation op

* adjust code format

* fix compile bugs

* fix ci bugs

* code format adjust

* code format adjust2

* activate ci status

* modify according to comment

* move activation kernel

* revert relu6

* reduce add code

* perfect use_phi_functor

* completing func name

* fix bugs when run ci

* fix bugs when run infr

* modifpy infrt get kernel signature
上级 42c7bb47
......@@ -25,11 +25,11 @@ USE_OP_ITSELF(softmax);
USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
USE_OP_ITSELF(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
USE_OP(leaky_relu);
USE_OP_ITSELF(leaky_relu);
USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN);
USE_OP(gelu);
USE_OP_ITSELF(relu);
USE_OP(tanh);
USE_OP_ITSELF(tanh);
USE_OP_DEVICE_KERNEL(tanh, MKLDNN);
namespace paddle {
......
......@@ -32,7 +32,7 @@ USE_OP(concat);
USE_OP(matmul);
USE_OP_ITSELF(elementwise_add);
USE_OP(sigmoid);
USE_OP(tanh);
USE_OP_ITSELF(tanh);
USE_OP(elementwise_mul);
USE_OP(softmax_with_cross_entropy);
USE_OP_ITSELF(reduce_mean);
......@@ -48,7 +48,7 @@ USE_OP(transpose2_grad);
USE_OP(concat_grad);
USE_OP_ITSELF(elementwise_mul_grad);
USE_OP(sigmoid_grad);
USE_OP(tanh_grad);
USE_OP_ITSELF(tanh_grad);
USE_OP(sum);
USE_OP(slice_grad);
USE_OP(lookup_table_grad);
......
......@@ -54,5 +54,5 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); }
USE_OP_ITSELF(relu);
USE_OP(sigmoid);
USE_OP(tanh);
USE_OP_ITSELF(tanh);
USE_OP(relu6);
......@@ -45,4 +45,4 @@ TEST(leaky_relu_op, test_leaky_relu) {
} // namespace paddle
// USE_OP(leaky_relu);
USE_OP(leaky_relu);
USE_OP_ITSELF(leaky_relu);
......@@ -1482,6 +1482,9 @@ REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor);
REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor);
REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor);
REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor);
REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor);
REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu,
ThresholdedReluFunctor, ThresholdedReluGradFunctor);
/* ========================== sigmoid register =============================
*/
......@@ -1567,23 +1570,6 @@ REGISTER_OPERATOR(
ops::ActivationOpTripleGrad<ops::TanhTripleGradFunctor<float>::FwdDeps()>,
ops::ActivationTripleGradOpInplaceInferer);
REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor);
REGISTER_OP_CPU_KERNEL(
tanh_grad_grad, ops::TanhDoubleGradKernel<plat::CPUDeviceContext,
ops::TanhGradGradFunctor<float>>,
ops::TanhDoubleGradKernel<plat::CPUDeviceContext,
ops::TanhGradGradFunctor<double>>,
ops::TanhDoubleGradKernel<plat::CPUDeviceContext,
ops::TanhGradGradFunctor<plat::float16>>);
// Register TripleGrad Kernel
REGISTER_OP_CPU_KERNEL(
tanh_triple_grad,
ops::TanhTripeGradKernel<plat::CPUDeviceContext,
ops::TanhTripleGradFunctor<float>>,
ops::TanhTripeGradKernel<plat::CPUDeviceContext,
ops::TanhTripleGradFunctor<double>>,
ops::TanhTripeGradKernel<plat::CPUDeviceContext,
ops::TanhTripleGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================== relu register ============================= */
......@@ -1623,16 +1609,6 @@ REGISTER_OPERATOR(
ops::ActivationOpDoubleGrad2<ops::LeakyReluGradFunctor<float>::FwdDeps()>,
ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor,
LeakyReluGradFunctor);
REGISTER_OP_CPU_KERNEL(
leaky_relu_grad_grad,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::LeakyReluGradGradFunctor<float>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::LeakyReluGradGradFunctor<double>>,
ops::ActivationDoubleGradKernel<
plat::CPUDeviceContext, ops::LeakyReluGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ======================== elu register ============================ */
......
......@@ -253,6 +253,14 @@ struct SigmoidFunctor : public BaseActivationFunctor<T> {
template <typename T> \
using name##GradFunctor = phi::funcs::name##GradFunctor<T>;
#define USE_PHI_DOUBLE_GRAD_FUNCTOR(name) \
template <typename T> \
using name##GradGradFunctor = phi::funcs::name##GradGradFunctor<T>;
#define USE_PHI_TRIPLE_GRAD_FUNCTOR(name) \
template <typename T> \
using name##TripleGradFunctor = phi::funcs::name##TripleGradFunctor<T>;
USE_PHI_FUNCTOR(Cos)
USE_PHI_FUNCTOR(Tan)
USE_PHI_FUNCTOR(Acos)
......@@ -264,6 +272,13 @@ USE_PHI_FUNCTOR(Cosh)
USE_PHI_FUNCTOR(Asinh)
USE_PHI_FUNCTOR(Acosh)
USE_PHI_FUNCTOR(Atanh)
USE_PHI_FUNCTOR(Tanh)
USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh)
USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh)
USE_PHI_FUNCTOR(BRelu)
USE_PHI_FUNCTOR(ThresholdedRelu)
USE_PHI_FUNCTOR(LeakyRelu)
USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu)
template <typename T>
struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
......@@ -497,117 +512,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor<T>;
template <typename T>
using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor<T>;
// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.tanh();
}
};
template <typename T>
struct TanhGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (static_cast<T>(1) - out * out);
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct TanhGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* Out,
const framework::Tensor* ddX, const framework::Tensor* dOut,
framework::Tensor* dOutNew, framework::Tensor* ddOut) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad"));
auto out = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad"));
// tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out
// * ddx)
if (dOutNew) {
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad"));
auto dout_new = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad"));
dout_new.device(*d) =
static_cast<T>(-1) * dout * static_cast<T>(2) * out * ddx;
}
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad"));
ddout.device(*d) = (static_cast<T>(1) - out * out) * ddx;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
/*
Out
DOut D_Dout
DDx -> TanhTripleGrad -> D_DDx
D_DDout d_OutNew
D_Dout_new
D_Dout = (-2) * Out * DDx * D_Dout_new
D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new
D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new
Out, DDX, DOut, D_DDOut, D_DOut_New // input
D_OutNew, D_DOut, D_DDx // output
*/
template <typename T>
struct TanhTripleGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* Out,
const framework::Tensor* ddX, const framework::Tensor* dOut,
const framework::Tensor* d_DDOut,
const framework::Tensor* d_dOut_New,
framework::Tensor* d_d_Out, framework::Tensor* d_Out_New,
framework::Tensor* d_DDx) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad"));
auto out = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad"));
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad"));
auto d_ddOut = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad"));
auto d_dOutNew = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad"));
if (d_Out_New) {
auto d_OutNew = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad"));
d_OutNew.device(*d) = (static_cast<T>(-2) * out * ddx * d_ddOut) -
(static_cast<T>(2) * dout * ddx * d_dOutNew);
}
if (d_d_Out) {
auto d_dOut = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad"));
d_dOut.device(*d) = static_cast<T>(-2) * out * ddx * d_dOutNew;
}
if (d_DDx) {
auto d_ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad"));
d_ddx.device(*d) = (static_cast<T>(1) - (out * out)) * d_ddOut -
static_cast<T>(2) * out * dout * d_dOutNew;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
// tanhshrink(x) = x - tanh(x)
// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
......@@ -909,42 +813,6 @@ struct SquareGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct BReluFunctor : public BaseActivationFunctor<T> {
float t_min;
float t_max;
// NOTE: Explicit hides the `BaseActivationFunctor<T>::GetAttrs`
// not polymorphism for speed.
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"t_min", &t_min}, {"t_max", &t_max}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) =
x.cwiseMax(static_cast<T>(t_min)).cwiseMin(static_cast<T>(t_max));
}
};
template <typename T>
struct BReluGradFunctor : public BaseActivationFunctor<T> {
float t_min;
float t_max;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"t_min", &t_min}, {"t_max", &t_max}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout *
((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
.template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// relu6(x) = min(max(0, x), 6)
template <typename T>
struct Relu6Functor : public BaseActivationFunctor<T> {
......@@ -1168,41 +1036,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct LeakyReluFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
if (alpha < 1.f) {
out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
} else {
out.device(d) = x.cwiseMin(static_cast<T>(alpha) * x);
}
}
};
template <typename T>
struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 =
static_cast<T>(alpha) * (x < static_cast<T>(0)).template cast<T>();
auto temp2 = (x >= static_cast<T>(0)).template cast<T>();
dx.device(d) = dout * (temp1 + temp2).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct ELUFunctor : public BaseActivationFunctor<T> {
float alpha;
......@@ -1430,37 +1263,6 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto th = static_cast<T>(threshold);
out.device(d) = (x > th).template cast<T>() * x;
}
};
template <typename T>
struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto th = static_cast<T>(threshold);
dx.device(d) = dout * (x > th).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct HardSigmoidFunctor : public BaseActivationFunctor<T> {
float slope;
......@@ -1531,121 +1333,6 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
/*
* in arguments: x, out, ddx
* out arguments: ddout, dout, dx
*/
template <ActBwdOpFwdDeps kDepValue>
inline void ExtractActivationDoubleGradTensor(
const framework::ExecutionContext& ctx, const framework::Tensor** X,
const framework::Tensor** Out, const framework::Tensor** ddX,
framework::Tensor** dX, framework::Tensor** dOut,
framework::Tensor** ddOut) {
auto ddx_var = ctx.InputVar("DDX");
auto ddo_var = ctx.OutputVar("DDOut");
PADDLE_ENFORCE_NOT_NULL(
ddx_var, platform::errors::NotFound(
"Cannot get input Variable Out, variable name = %s",
ctx.InputName("DDX")));
if (CanBeUsedBySelectedRows.count(ctx.Type())) {
*ddX = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*ddx_var);
if (ddo_var) {
*ddOut = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
ddo_var);
}
} else {
*ddX = ctx.Input<framework::Tensor>("DDX");
if (ddo_var) {
*ddOut = ctx.Output<framework::Tensor>("DDOut");
}
}
PADDLE_ENFORCE_NOT_NULL(
*ddX,
platform::errors::NotFound(
"Cannot get the tensor from the Variable Output, variable name = %s",
ctx.OutputName("DDX")));
if (static_cast<int>(kDepValue) & static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
auto x_var = ctx.InputVar("X");
PADDLE_ENFORCE_NOT_NULL(
x_var, platform::errors::NotFound(
"Cannot get input Variable Out, variable name = %s",
ctx.InputName("X")));
auto dx_var = ctx.OutputVar("DX");
if (CanBeUsedBySelectedRows.count(ctx.Type())) {
*X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var);
if (dx_var) {
*dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
dx_var);
}
} else {
*X = ctx.Input<framework::Tensor>("X");
if (dx_var) {
*dX = ctx.Output<framework::Tensor>("DX");
}
}
} else {
VLOG(10) << "Inplace activation of Op: " << ctx.Type();
*X = *ddX;
}
if (static_cast<int>(kDepValue) &
static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
auto out_var = ctx.InputVar("Out");
PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound(
"Cannot get the tensor from the Variable Out, variable name = %s",
ctx.InputName("Out")));
auto dout_var = ctx.OutputVar("DOut");
if (CanBeUsedBySelectedRows.count(ctx.Type())) {
*Out =
paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var);
if (dout_var) {
*dOut =
paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
dout_var);
}
} else {
*Out = ctx.Input<framework::Tensor>("Out");
if (dout_var) {
*dOut = ctx.Output<framework::Tensor>("DOut");
}
}
} else {
VLOG(10) << "Inplace activation of Op: " << ctx.Type();
*Out = *ddX;
}
}
template <typename DeviceContext, typename Functor>
class ActivationDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& ctx) const override {
const framework::Tensor *X, *Out, *ddX;
X = Out = ddX = nullptr;
framework::Tensor *ddOut, *dOut, *dX;
ddOut = dOut = dX = nullptr;
ExtractActivationDoubleGradTensor<Functor::FwdDeps()>(ctx, &X, &Out, &ddX,
&dX, &dOut, &ddOut);
if (ddOut) ddOut->mutable_data<T>(ctx.GetPlace());
if (dOut) dOut->mutable_data<T>(ctx.GetPlace());
if (dX) dX->mutable_data<T>(Out->dims(), ctx.GetPlace());
auto& place = ctx.template device_context<DeviceContext>();
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
functor(place, X, Out, ddX, ddOut, dOut, dX);
}
};
template <typename T>
struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
......@@ -1667,35 +1354,6 @@ struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* X,
const framework::Tensor* Out, const framework::Tensor* ddX,
framework::Tensor* ddOut, framework::Tensor* dOut,
framework::Tensor* dX) const {
if (ddOut) {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad"));
auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
ddout.device(*d) =
ddx *
((x > static_cast<T>(0)).template cast<T>() +
static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
.template cast<T>();
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
float alpha;
......@@ -2504,7 +2162,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
__macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \
__macro(log2, Log2, Log2Functor, Log2GradFunctor); \
__macro(log10, Log10, Log10Functor, Log10GradFunctor); \
__macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \
__macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \
__macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \
__macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \
......@@ -2515,7 +2172,5 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
__macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \
HardSigmoidGradFunctor); \
__macro(swish, Swish, SwishFunctor, SwishGradFunctor); \
__macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, \
ThresholdedReluGradFunctor); \
__macro(mish, Mish, MishFunctor, MishGradFunctor); \
__macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor);
......@@ -18,38 +18,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
struct CudaLeakyReluFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// leakyrelu(x) = x > 0 ? x : alpha * x
__device__ __forceinline__ T operator()(const T x) const {
return x > zero ? x : static_cast<T>(alpha) * x;
}
};
template <typename T>
struct CudaLeakyReluGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// dx = dout * (x > 0 ? 1 : alpha)
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return x > zero ? dout : static_cast<T>(alpha) * dout;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
......@@ -224,31 +192,6 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct CudaTanhFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// tanh(x) = tanh(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(tanh(x));
}
};
template <typename T>
struct CudaTanhGradFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
// dx = dout * (1 - out^2)
__device__ __forceinline__ T operator()(const T dout, const T out) const {
return dout * (one - out * out);
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaReciprocalFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
......@@ -476,45 +419,6 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaBReluFunctor : public BaseActivationFunctor<T> {
float t_min;
float t_max;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"t_min", &t_min}, {"t_max", &t_max}};
}
// brelu(x) = min(max(x, t_min), t_max)
__device__ __forceinline__ T operator()(const T x) const {
T t_min_cast = static_cast<T>(t_min);
T t_max_cast = static_cast<T>(t_max);
T temp_max = x > t_min_cast ? x : t_min_cast;
T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast;
return temp_min;
}
};
template <typename T>
struct CudaBReluGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float t_min;
float t_max;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"t_min", &t_min}, {"t_max", &t_max}};
}
// dx = (x > t_min && x < t_max) ? dout : 0
__device__ __forceinline__ T operator()(const T dout, const T x) const {
T t_min_cast = static_cast<T>(t_min);
T t_max_cast = static_cast<T>(t_max);
return (x > t_min_cast && x < t_max_cast) ? dout : zero;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaSoftReluFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
......@@ -907,38 +811,6 @@ struct CudaMishGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaThresholdedReluFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// thresholded_relu(x) = x > threshold ? x : 0
__device__ __forceinline__ T operator()(const T x) const {
return x > static_cast<T>(threshold) ? x : zero;
}
};
template <typename T>
struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// dx = x > threshold ? dout : 0
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return x > static_cast<T>(threshold) ? dout : zero;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaHardSwishFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
......@@ -1212,6 +1084,22 @@ class ActivationGradCudaKernel
}
};
USE_PHI_FUNCTOR(CudaCos)
USE_PHI_FUNCTOR(CudaTan)
USE_PHI_FUNCTOR(CudaAcos)
USE_PHI_FUNCTOR(CudaSin)
USE_PHI_FUNCTOR(CudaAsin)
USE_PHI_FUNCTOR(CudaAtan)
USE_PHI_FUNCTOR(CudaSinh)
USE_PHI_FUNCTOR(CudaCosh)
USE_PHI_FUNCTOR(CudaAsinh)
USE_PHI_FUNCTOR(CudaAcosh)
USE_PHI_FUNCTOR(CudaAtanh)
USE_PHI_FUNCTOR(CudaTanh)
USE_PHI_FUNCTOR(CudaBRelu)
USE_PHI_FUNCTOR(CudaLeakyRelu)
USE_PHI_FUNCTOR(CudaThresholdedRelu)
} // namespace operators
} // namespace paddle
......@@ -1270,20 +1158,6 @@ namespace plat = paddle::platform;
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::bfloat16>>);
/* ======================== leaky relu register ============================ */
REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor,
CudaLeakyReluGradFunctor);
REGISTER_OP_CUDA_KERNEL(
leaky_relu_grad_grad,
ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
ops::LeakyReluGradGradFunctor<float>>,
ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
ops::LeakyReluGradGradFunctor<double>>,
ops::ActivationDoubleGradKernel<
plat::CUDADeviceContext, ops::LeakyReluGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ======================== elu register ============================ */
REGISTER_OP_CUDA_KERNEL(
elu, ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
......@@ -1348,29 +1222,6 @@ REGISTER_OP_CUDA_KERNEL(
ops::SigmoidTripleGradFunctor<plat::bfloat16>>);
/* ========================================================================== */
/* =========================== tanh register ============================ */
REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor,
CudaTanhGradFunctor);
REGISTER_OP_CUDA_KERNEL(
tanh_grad_grad,
ops::TanhDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::TanhGradGradFunctor<float>>,
ops::TanhDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::TanhGradGradFunctor<double>>,
ops::TanhDoubleGradKernel<plat::CUDADeviceContext,
ops::TanhGradGradFunctor<plat::float16>>);
REGISTER_OP_CUDA_KERNEL(
tanh_triple_grad,
ops::TanhTripeGradKernel<paddle::platform::CUDADeviceContext,
ops::TanhTripleGradFunctor<float>>,
ops::TanhTripeGradKernel<paddle::platform::CUDADeviceContext,
ops::TanhTripleGradFunctor<double>>,
ops::TanhTripeGradKernel<plat::CUDADeviceContext,
ops::TanhTripleGradFunctor<plat::float16>>);
/* ========================================================================== */
/* =========================== sqrt register ============================= */
REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor,
CudaSqrtGradFunctor);
......@@ -1521,7 +1372,6 @@ REGISTER_OP_CUDA_KERNEL(
__macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor); \
__macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor); \
__macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor); \
__macro(brelu, BRelu, CudaBReluFunctor, CudaBReluGradFunctor); \
__macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \
__macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor); \
__macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \
......@@ -1535,8 +1385,6 @@ REGISTER_OP_CUDA_KERNEL(
CudaHardSigmoidGradFunctor); \
__macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \
__macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \
__macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor, \
CudaThresholdedReluGradFunctor); \
__macro(hard_swish, HardSwish, CudaHardSwishFunctor, \
CudaHardSwishGradFunctor);
FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL)
......
......@@ -39,6 +39,54 @@ void ReluDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& ddx,
DenseTensor* ddout);
template <typename T, typename Context>
void TanhDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& out,
const DenseTensor& ddx,
const DenseTensor& dout,
DenseTensor* dout_new,
DenseTensor* ddout);
template <typename T, typename Context>
void TanhTripleGradKernel(const Context& dev_ctx,
const DenseTensor& out,
const DenseTensor& ddx,
const DenseTensor& dout,
const DenseTensor& d_ddout,
const DenseTensor& d_dout_new,
DenseTensor* d_out_new,
DenseTensor* d_dout,
DenseTensor* d_ddx);
template <typename T, typename Context>
void BReluGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
float t_min,
float t_max,
DenseTensor* dx);
template <typename T, typename Context>
void LeakyReluGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
float alpha,
DenseTensor* dx);
template <typename T, typename Context>
void LeakyReluDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& ddx,
float alpha,
DenseTensor* ddout);
template <typename T, typename Context>
void ThresholdedReluGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
float threshold,
DenseTensor* dx);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos);
......@@ -51,5 +99,6 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu);
DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh);
} // namespace phi
......@@ -36,5 +36,25 @@ DECLARE_ACTIVATION_KERNEL(Asinh)
DECLARE_ACTIVATION_KERNEL(Acosh)
DECLARE_ACTIVATION_KERNEL(Atanh)
DECLARE_ACTIVATION_KERNEL(Relu)
DECLARE_ACTIVATION_KERNEL(Tanh)
template <typename T, typename Context>
void BReluKernel(const Context& dev_ctx,
const DenseTensor& x,
float t_min,
float t_max,
DenseTensor* out);
template <typename T, typename Context>
void LeakyReluKernel(const Context& dev_ctx,
const DenseTensor& x,
float alpha,
DenseTensor* out);
template <typename T, typename Context>
void ThresholdedReluKernel(const Context& dev_ctx,
const DenseTensor& x,
float threshold,
DenseTensor* out);
} // namespace phi
......@@ -27,65 +27,135 @@ namespace phi {
const DenseTensor& x, \
const DenseTensor& dout, \
DenseTensor* dx) { \
functor_class functor; \
ActivationGradImpl<T, Context, functor_class>( \
functor_class<T> functor; \
ActivationGradImpl<T, Context, functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \
}
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \
name, functor_class, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
float attr, \
DenseTensor* dx) { \
functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \
ActivationGradImpl<T, Context, functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \
}
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \
name, functor_class, attr1, attr2) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
float attr1, \
float attr2, \
DenseTensor* dx) { \
functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr1; \
*(attrs[1].second) = attr2; \
ActivationGradImpl<T, Context, functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \
}
#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
DenseTensor* dx) { \
functor_class functor; \
ActivationGradImpl<T, Context, functor_class>( \
functor_class<T> functor; \
ActivationGradImpl<T, Context, functor_class<T>>( \
dev_ctx, nullptr, &out, &dout, dx, functor); \
}
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor<T>);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor<T>);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor<T>);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor<T>);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor<T>);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor<T>);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor<T>);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor<T>);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor<T>);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor<T>);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor<T>);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor<T>);
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \
name, functor_class, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
float attr, \
DenseTensor* dx) { \
functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \
ActivationGradImpl<T, Context, functor_class<T>>( \
dev_ctx, nullptr, &out, &dout, dx, functor); \
}
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, funcs::TanhGradFunctor);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu,
funcs::LeakyReluGradFunctor,
alpha);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(
ThresholdedRelu, funcs::ThresholdedReluGradFunctor, threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu,
funcs::BReluGradFunctor,
t_min,
t_max);
} // namespace phi
PD_REGISTER_KERNEL(
cos_grad, CPU, ALL_LAYOUT, phi::CosGradKernel, float, double) {}
PD_REGISTER_KERNEL(
tan_grad, CPU, ALL_LAYOUT, phi::TanGradKernel, float, double) {}
PD_REGISTER_KERNEL(
acos_grad, CPU, ALL_LAYOUT, phi::AcosGradKernel, float, double) {}
PD_REGISTER_KERNEL(
sin_grad, CPU, ALL_LAYOUT, phi::SinGradKernel, float, double) {}
PD_REGISTER_KERNEL(
asin_grad, CPU, ALL_LAYOUT, phi::AsinGradKernel, float, double) {}
PD_REGISTER_KERNEL(
atan_grad, CPU, ALL_LAYOUT, phi::AtanGradKernel, float, double) {}
PD_REGISTER_KERNEL(
sinh_grad, CPU, ALL_LAYOUT, phi::SinhGradKernel, float, double) {}
PD_REGISTER_KERNEL(
cosh_grad, CPU, ALL_LAYOUT, phi::CoshGradKernel, float, double) {}
PD_REGISTER_KERNEL(
asinh_grad, CPU, ALL_LAYOUT, phi::AsinhGradKernel, float, double) {}
PD_REGISTER_KERNEL(
acosh_grad, CPU, ALL_LAYOUT, phi::AcoshGradKernel, float, double) {}
PD_REGISTER_KERNEL(
atanh_grad, CPU, ALL_LAYOUT, phi::AtanhGradKernel, float, double) {}
PD_REGISTER_KERNEL(
relu_grad, CPU, ALL_LAYOUT, phi::ReluGradKernel, float, double) {}
PD_REGISTER_KERNEL(relu_double_grad,
#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \
PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {}
#define PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(name, func) \
PD_REGISTER_KERNEL( \
name, CPU, ALL_LAYOUT, phi::func, float, double, phi::dtype::float16) {}
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(tan_grad, TanGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
ThresholdedReluGradKernel)
PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad,
ReluDoubleGradKernel)
PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad,
TanhDoubleGradKernel)
PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad,
LeakyReluDoubleGradKernel)
PD_REGISTER_KERNEL(tanh_triple_grad,
CPU,
ALL_LAYOUT,
phi::ReluDoubleGradKernel,
phi::TanhTripleGradKernel,
float,
double,
phi::dtype::float16) {}
......@@ -27,6 +27,33 @@ namespace phi {
ActivationImpl<T, Context, functor_class>(dev_ctx, x, out, functor); \
}
#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
float attr, \
DenseTensor* out) { \
functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \
ActivationImpl<T, Context, functor_class<T>>(dev_ctx, x, out, functor); \
}
#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \
name, functor_class, attr1, attr2) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
float attr1, \
float attr2, \
DenseTensor* out) { \
functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr1; \
*(attrs[1].second) = attr2; \
ActivationImpl<T, Context, functor_class<T>>(dev_ctx, x, out, functor); \
}
DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor<T>)
......@@ -39,17 +66,31 @@ DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Tanh, funcs::TanhFunctor<T>)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, funcs::LeakyReluFunctor, alpha)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
funcs::ThresholdedReluFunctor,
threshold)
DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, funcs::BReluFunctor, t_min, t_max)
} // namespace phi
PD_REGISTER_KERNEL(sin, CPU, ALL_LAYOUT, phi::SinKernel, float, double) {}
PD_REGISTER_KERNEL(cos, CPU, ALL_LAYOUT, phi::CosKernel, float, double) {}
PD_REGISTER_KERNEL(tan, CPU, ALL_LAYOUT, phi::TanKernel, float, double) {}
PD_REGISTER_KERNEL(acos, CPU, ALL_LAYOUT, phi::AcosKernel, float, double) {}
PD_REGISTER_KERNEL(asin, CPU, ALL_LAYOUT, phi::AsinKernel, float, double) {}
PD_REGISTER_KERNEL(atan, CPU, ALL_LAYOUT, phi::AtanKernel, float, double) {}
PD_REGISTER_KERNEL(sinh, CPU, ALL_LAYOUT, phi::SinhKernel, float, double) {}
PD_REGISTER_KERNEL(cosh, CPU, ALL_LAYOUT, phi::CoshKernel, float, double) {}
PD_REGISTER_KERNEL(asinh, CPU, ALL_LAYOUT, phi::AsinhKernel, float, double) {}
PD_REGISTER_KERNEL(acosh, CPU, ALL_LAYOUT, phi::AcoshKernel, float, double) {}
PD_REGISTER_KERNEL(atanh, CPU, ALL_LAYOUT, phi::AtanhKernel, float, double) {}
PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {}
#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \
PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func##Kernel, float, double) {}
PD_REGISTER_ACTIVATION_KERNEL(sin, Sin)
PD_REGISTER_ACTIVATION_KERNEL(cos, Cos)
PD_REGISTER_ACTIVATION_KERNEL(tan, Tan)
PD_REGISTER_ACTIVATION_KERNEL(acos, Acos)
PD_REGISTER_ACTIVATION_KERNEL(asin, Asin)
PD_REGISTER_ACTIVATION_KERNEL(atan, Atan)
PD_REGISTER_ACTIVATION_KERNEL(sinh, Sinh)
PD_REGISTER_ACTIVATION_KERNEL(cosh, Cosh)
PD_REGISTER_ACTIVATION_KERNEL(asinh, Asinh)
PD_REGISTER_ACTIVATION_KERNEL(acosh, Acosh)
PD_REGISTER_ACTIVATION_KERNEL(atanh, Atanh)
PD_REGISTER_ACTIVATION_KERNEL(tanh, Tanh)
PD_REGISTER_ACTIVATION_KERNEL(brelu, BRelu)
PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyRelu)
PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedRelu)
......@@ -513,7 +513,270 @@ struct ReluGradGradFunctor : public BaseActivationFunctor<T> {
}
};
#if defined(__NVCC__) || defined(__HIPCC__)
// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.tanh();
}
};
template <typename T>
struct TanhGradFunctor : public BaseActivationFunctor<T> {
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (static_cast<T>(1) - out * out);
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct TanhGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev,
const DenseTensor* Out,
const DenseTensor* ddX,
const DenseTensor* dOut,
DenseTensor* dOutNew,
DenseTensor* ddOut) const {
auto* d = dev.eigen_device();
auto ddx = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad"));
auto out = EigenVector<T>::Flatten(
GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad"));
// tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out
// * ddx)
if (dOutNew) {
auto dout = EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad"));
auto dout_new = EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad"));
dout_new.device(*d) =
static_cast<T>(-1) * dout * static_cast<T>(2) * out * ddx;
}
if (ddOut) {
auto ddout = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad"));
ddout.device(*d) = (static_cast<T>(1) - out * out) * ddx;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
/*
Out
DOut D_Dout
DDx -> TanhTripleGrad -> D_DDx
D_DDout d_OutNew
D_Dout_new
D_Dout = (-2) * Out * DDx * D_Dout_new
D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new
D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new
Out, DDX, DOut, D_DDOut, D_DOut_New // input
D_OutNew, D_DOut, D_DDx // output
*/
template <typename T>
struct TanhTripleGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev,
const DenseTensor* Out,
const DenseTensor* ddX,
const DenseTensor* dOut,
const DenseTensor* d_DDOut,
const DenseTensor* d_dOut_New,
DenseTensor* d_d_Out,
DenseTensor* d_Out_New,
DenseTensor* d_DDx) const {
auto* d = dev.eigen_device();
auto ddx = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad"));
auto out = EigenVector<T>::Flatten(
GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad"));
auto dout = EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad"));
auto d_ddOut = EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad"));
auto d_dOutNew = EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad"));
if (d_Out_New) {
auto d_OutNew = EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad"));
d_OutNew.device(*d) = (static_cast<T>(-2) * out * ddx * d_ddOut) -
(static_cast<T>(2) * dout * ddx * d_dOutNew);
}
if (d_d_Out) {
auto d_dOut = EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad"));
d_dOut.device(*d) = static_cast<T>(-2) * out * ddx * d_dOutNew;
}
if (d_DDx) {
auto d_ddx = EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad"));
d_ddx.device(*d) = (static_cast<T>(1) - (out * out)) * d_ddOut -
static_cast<T>(2) * out * dout * d_dOutNew;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct BReluFunctor : public BaseActivationFunctor<T> {
float t_min;
float t_max;
// NOTE: Explicit hides the `BaseActivationFunctor<T>::GetAttrs`
// not polymorphism for speed.
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"t_min", &t_min}, {"t_max", &t_max}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) =
x.cwiseMax(static_cast<T>(t_min)).cwiseMin(static_cast<T>(t_max));
}
};
template <typename T>
struct BReluGradFunctor : public BaseActivationFunctor<T> {
float t_min;
float t_max;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"t_min", &t_min}, {"t_max", &t_max}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout *
((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
.template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct LeakyReluFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
if (alpha < 1.f) {
out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
} else {
out.device(d) = x.cwiseMin(static_cast<T>(alpha) * x);
}
}
};
template <typename T>
struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 =
static_cast<T>(alpha) * (x < static_cast<T>(0)).template cast<T>();
auto temp2 = (x >= static_cast<T>(0)).template cast<T>();
dx.device(d) = dout * (temp1 + temp2).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device>
void operator()(const Device& dev,
const DenseTensor* X,
const DenseTensor* Out,
const DenseTensor* ddX,
DenseTensor* ddOut,
DenseTensor* dOut,
DenseTensor* dX) const {
if (ddOut) {
auto* d = dev.eigen_device();
auto ddx = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad"));
auto x = EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
auto ddout = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
ddout.device(*d) =
ddx *
((x > static_cast<T>(0)).template cast<T>() +
static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
.template cast<T>();
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto th = static_cast<T>(threshold);
out.device(d) = (x > th).template cast<T>() * x;
}
};
template <typename T>
struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto th = static_cast<T>(threshold);
dx.device(d) = dout * (x > th).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
template <typename T>
struct CudaReluFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
......@@ -824,6 +1087,133 @@ struct CudaAtanGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaTanhFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
// tanh(x) = tanh(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(tanh(x));
}
};
template <typename T>
struct CudaTanhGradFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
// dx = dout * (1 - out^2)
__device__ __forceinline__ T operator()(const T dout, const T out) const {
return dout * (one - out * out);
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaBReluFunctor : public BaseActivationFunctor<T> {
float t_min;
float t_max;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"t_min", &t_min}, {"t_max", &t_max}};
}
// brelu(x) = min(max(x, t_min), t_max)
__device__ __forceinline__ T operator()(const T x) const {
T t_min_cast = static_cast<T>(t_min);
T t_max_cast = static_cast<T>(t_max);
T temp_max = x > t_min_cast ? x : t_min_cast;
T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast;
return temp_min;
}
};
template <typename T>
struct CudaBReluGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float t_min;
float t_max;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"t_min", &t_min}, {"t_max", &t_max}};
}
// dx = (x > t_min && x < t_max) ? dout : 0
__device__ __forceinline__ T operator()(const T dout, const T x) const {
T t_min_cast = static_cast<T>(t_min);
T t_max_cast = static_cast<T>(t_max);
return (x > t_min_cast && x < t_max_cast) ? dout : zero;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaThresholdedReluFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// thresholded_relu(x) = x > threshold ? x : 0
__device__ __forceinline__ T operator()(const T x) const {
return x > static_cast<T>(threshold) ? x : zero;
}
};
template <typename T>
struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// dx = x > threshold ? dout : 0
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return x > static_cast<T>(threshold) ? dout : zero;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaLeakyReluFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// leakyrelu(x) = x > 0 ? x : alpha * x
__device__ __forceinline__ T operator()(const T x) const {
return x > zero ? x : static_cast<T>(alpha) * x;
}
};
template <typename T>
struct CudaLeakyReluGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// dx = dout * (x > 0 ? 1 : alpha)
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return x > zero ? dout : static_cast<T>(alpha) * dout;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
#endif
} // namespace funcs
......
......@@ -79,113 +79,97 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
const DenseTensor& x, \
const DenseTensor& dout, \
DenseTensor* dx) { \
functor_class functor; \
ActivationGradGPUImpl<T, Context, functor_class>( \
funcs::functor_class<T> functor; \
ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \
}
#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \
name, functor_class, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
float attr, \
DenseTensor* dx) { \
funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \
ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \
}
#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \
name, functor_class, attr1, attr2) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
float attr1, \
float attr2, \
DenseTensor* dx) { \
funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr1; \
*(attrs[1].second) = attr2; \
ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \
}
#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
DenseTensor* dx) { \
functor_class functor; \
ActivationGradGPUImpl<T, Context, functor_class>( \
funcs::functor_class<T> functor; \
ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, nullptr, &out, &dout, dx, functor); \
}
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::CudaReluGradFunctor<T>);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CudaCosGradFunctor<T>);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::CudaTanGradFunctor<T>);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::CudaAcosGradFunctor<T>);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::CudaSinGradFunctor<T>);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::CudaAsinGradFunctor<T>);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::CudaAtanGradFunctor<T>);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::CudaSinhGradFunctor<T>);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CudaCoshGradFunctor<T>);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::CudaAsinhGradFunctor<T>);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::CudaAcoshGradFunctor<T>);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::CudaAtanhGradFunctor<T>);
#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \
name, functor_class, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
float attr, \
DenseTensor* dx) { \
funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \
ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, nullptr, &out, &dout, dx, functor); \
}
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, CudaReluGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, CudaTanhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, CudaCosGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, CudaTanGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, CudaAcosGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, CudaSinGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, CudaAsinGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, CudaAtanGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, CudaSinhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, CudaCoshGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, CudaAsinhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, CudaAcoshGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, CudaAtanhGradFunctor);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu,
CudaLeakyReluGradFunctor,
alpha);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu,
CudaThresholdedReluGradFunctor,
threshold);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu,
CudaBReluGradFunctor,
t_min,
t_max);
} // namespace phi
PD_REGISTER_KERNEL(cos_grad,
GPU,
ALL_LAYOUT,
phi::CosGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(tan_grad,
GPU,
ALL_LAYOUT,
phi::TanGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(acos_grad,
GPU,
ALL_LAYOUT,
phi::AcosGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(sin_grad,
GPU,
ALL_LAYOUT,
phi::SinGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(asin_grad,
GPU,
ALL_LAYOUT,
phi::AsinGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(atan_grad,
GPU,
ALL_LAYOUT,
phi::AtanGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(sinh_grad,
GPU,
ALL_LAYOUT,
phi::SinhGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(cosh_grad,
GPU,
ALL_LAYOUT,
phi::CoshGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(asinh_grad,
GPU,
ALL_LAYOUT,
phi::AsinhGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(acosh_grad,
GPU,
ALL_LAYOUT,
phi::AcoshGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(atanh_grad,
GPU,
ALL_LAYOUT,
phi::AtanhGradKernel,
float,
double,
phi::dtype::float16) {}
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(relu_grad,
GPU,
......@@ -219,3 +203,34 @@ PD_REGISTER_KERNEL(relu_double_grad,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#endif
#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \
PD_REGISTER_KERNEL(name, \
GPU, \
ALL_LAYOUT, \
phi::func, \
float, \
double, \
phi::dtype::float16, \
phi::dtype::bfloat16) {}
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(tan_grad, TanGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_double_grad, TanhDoubleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_triple_grad, TanhTripleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad,
LeakyReluDoubleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
ThresholdedReluGradKernel)
......@@ -46,6 +46,35 @@ void ActivationGPUImpl(const Context& dev_ctx,
ActivationGPUImpl<T, Context, functor_class>(dev_ctx, x, out, functor); \
}
#define DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
float attr, \
DenseTensor* out) { \
funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \
ActivationGPUImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, x, out, functor); \
}
#define DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS( \
name, functor_class, attr1, attr2) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
float attr1, \
float attr2, \
DenseTensor* out) { \
funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr1; \
*(attrs[1].second) = attr2; \
ActivationGPUImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, x, out, functor); \
}
DEFINE_GPU_ACTIVATION_KERNEL(Cos, funcs::CudaCosFunctor<T>)
DEFINE_GPU_ACTIVATION_KERNEL(Tan, funcs::CudaTanFunctor<T>)
DEFINE_GPU_ACTIVATION_KERNEL(Acos, funcs::CudaAcosFunctor<T>)
......@@ -58,6 +87,14 @@ DEFINE_GPU_ACTIVATION_KERNEL(Asinh, funcs::CudaAsinhFunctor<T>)
DEFINE_GPU_ACTIVATION_KERNEL(Acosh, funcs::CudaAcoshFunctor<T>)
DEFINE_GPU_ACTIVATION_KERNEL(Atanh, funcs::CudaAtanhFunctor<T>)
DEFINE_GPU_ACTIVATION_KERNEL(Relu, funcs::CudaReluFunctor<T>)
DEFINE_GPU_ACTIVATION_KERNEL(Tanh, funcs::CudaTanhFunctor<T>)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
CudaThresholdedReluFunctor,
threshold)
DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max)
} // namespace phi
......@@ -79,65 +116,29 @@ PD_REGISTER_KERNEL(relu,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#endif
PD_REGISTER_KERNEL(
sin, GPU, ALL_LAYOUT, phi::SinKernel, float, double, phi::dtype::float16) {}
PD_REGISTER_KERNEL(
cos, GPU, ALL_LAYOUT, phi::CosKernel, float, double, phi::dtype::float16) {}
PD_REGISTER_KERNEL(
tan, GPU, ALL_LAYOUT, phi::TanKernel, float, double, phi::dtype::float16) {}
PD_REGISTER_KERNEL(acos,
GPU,
ALL_LAYOUT,
phi::AcosKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(asin,
GPU,
ALL_LAYOUT,
phi::AsinKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(atan,
GPU,
ALL_LAYOUT,
phi::AtanKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(sinh,
GPU,
ALL_LAYOUT,
phi::SinhKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(cosh,
GPU,
ALL_LAYOUT,
phi::CoshKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(asinh,
GPU,
ALL_LAYOUT,
phi::AsinhKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(acosh,
GPU,
ALL_LAYOUT,
phi::AcoshKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(atanh,
GPU,
ALL_LAYOUT,
phi::AtanhKernel,
float,
double,
phi::dtype::float16) {}
#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \
PD_REGISTER_KERNEL(name, \
GPU, \
ALL_LAYOUT, \
phi::func, \
float, \
double, \
phi::dtype::float16, \
phi::dtype::bfloat16) {}
PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel)
PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel)
PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel)
PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel)
PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel)
PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel)
PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel)
PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel)
PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel)
PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel)
PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
......@@ -130,4 +130,76 @@ void ReluDoubleGradKernel(const Context& dev_ctx,
relu_double_grad_functor);
}
template <typename T, typename Context>
void LeakyReluDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& ddx,
float alpha,
DenseTensor* ddout) {
funcs::LeakyReluGradGradFunctor<T> leaky_relu_double_grad_functor;
leaky_relu_double_grad_functor.alpha = alpha;
ActivationDoubleGradImpl<T, Context, funcs::LeakyReluGradGradFunctor<T>>(
dev_ctx,
&x,
nullptr,
&ddx,
nullptr,
nullptr,
ddout,
leaky_relu_double_grad_functor);
}
template <typename T, typename Context>
void TanhDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& out,
const DenseTensor& ddx,
const DenseTensor& dout,
DenseTensor* dout_new,
DenseTensor* ddout) {
if (dout_new) {
dout_new->Resize(out.dims());
dev_ctx.template Alloc<T>(dout_new);
}
if (ddout) {
ddout->Resize(out.dims());
dev_ctx.template Alloc<T>(ddout);
}
funcs::TanhGradGradFunctor<T> functor;
functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout);
}
template <typename T, typename Context>
void TanhTripleGradKernel(const Context& dev_ctx,
const DenseTensor& out,
const DenseTensor& ddx,
const DenseTensor& dout,
const DenseTensor& d_ddout,
const DenseTensor& d_dout_new,
DenseTensor* d_out_new,
DenseTensor* d_dout,
DenseTensor* d_ddx) {
if (d_dout) {
d_dout->Resize(out.dims());
dev_ctx.template Alloc<T>(d_dout);
}
if (d_out_new) {
d_dout->Resize(out.dims());
dev_ctx.template Alloc<T>(d_out_new);
}
if (d_ddx) {
d_dout->Resize(ddx.dims());
dev_ctx.template Alloc<T>(d_ddx);
}
funcs::TanhTripleGradFunctor<T> functor;
functor(dev_ctx,
&out,
&ddx,
&dout,
&d_ddout,
&d_dout_new, // input
d_dout,
d_out_new,
d_ddx); // output
}
} // namespace phi
......@@ -16,40 +16,80 @@ limitations under the License. */
namespace phi {
#define DefineActGradDepXOpArgMap(func_name, op_name) \
KernelSignature func_name##GradOpArgumentMapping( \
const ArgumentMappingContext& ctx) { \
return KernelSignature( \
op_name "_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); \
#define DefineActGradDepXOpArgMap(func_name, op_name, attrs) \
KernelSignature func_name##GradOpArgumentMapping( \
const ArgumentMappingContext& ctx) { \
return KernelSignature(op_name "_grad", \
{"X", GradVarName("Out")}, \
{attrs}, \
{GradVarName("X")}); \
}
#define DefineActGradDepOutOpArgMap(func_name, op_name) \
KernelSignature func_name##GradOpArgumentMapping( \
const ArgumentMappingContext& ctx) { \
return KernelSignature( \
op_name "_grad", {"Out", GradVarName("Out")}, {}, {GradVarName("X")}); \
#define DefineActGradDepOutOpArgMap(func_name, op_name, attrs) \
KernelSignature func_name##GradOpArgumentMapping( \
const ArgumentMappingContext& ctx) { \
return KernelSignature(op_name "_grad", \
{"Out", GradVarName("Out")}, \
{attrs}, \
{GradVarName("X")}); \
}
#define comma ,
DefineActGradDepXOpArgMap(Cos, "cos", ); // NOLINT
DefineActGradDepXOpArgMap(Tan, "tan", ); // NOLINT
DefineActGradDepXOpArgMap(Acos, "acos", ); // NOLINT
DefineActGradDepXOpArgMap(Sin, "sin", ); // NOLINT
DefineActGradDepXOpArgMap(Asin, "asin", ); // NOLINT
DefineActGradDepXOpArgMap(Atan, "atan", ); // NOLINT
DefineActGradDepXOpArgMap(Sinh, "sinh", ); // NOLINT
DefineActGradDepXOpArgMap(Cosh, "cosh", ); // NOLINT
DefineActGradDepXOpArgMap(Asinh, "asinh", ); // NOLINT
DefineActGradDepXOpArgMap(Acosh, "acosh", ); // NOLINT
DefineActGradDepXOpArgMap(Atanh, "atanh", ); // NOLINT
DefineActGradDepXOpArgMap(BRelu, "brelu", "t_min" comma "t_max"); // NOLINT
DefineActGradDepXOpArgMap(LeakyRelu, "leaky_relu", "alpha"); // NOLINT
DefineActGradDepXOpArgMap(ThresholdedRelu,
"thresholded_relu",
"threshold"); // NOLINT
DefineActGradDepOutOpArgMap(Relu, "relu", ); // NOLINT
DefineActGradDepOutOpArgMap(Tanh, "tanh", ); // NOLINT
KernelSignature ReluDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("relu_double_grad", {"Out", "DDX"}, {}, {"DDOut"});
}
DefineActGradDepXOpArgMap(Cos, "cos");
DefineActGradDepXOpArgMap(Tan, "tan");
DefineActGradDepXOpArgMap(Acos, "acos");
DefineActGradDepXOpArgMap(Sin, "sin");
DefineActGradDepXOpArgMap(Asin, "asin");
DefineActGradDepXOpArgMap(Atan, "atan");
DefineActGradDepXOpArgMap(Sinh, "sinh");
DefineActGradDepXOpArgMap(Cosh, "cosh");
DefineActGradDepXOpArgMap(Asinh, "asinh");
DefineActGradDepXOpArgMap(Acosh, "acosh");
DefineActGradDepXOpArgMap(Atanh, "atanh");
DefineActGradDepOutOpArgMap(Relu, "relu");
KernelSignature TanhDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"tanh_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"});
}
KernelSignature TanhTripleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("tanh_triple_grad",
{"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"},
{},
{"D_OutNew", "D_DOut", "D_DDx"});
}
KernelSignature LeakyReluDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"leaky_relu_double_grad", {"X", "DDX"}, {"alpha"}, {"DDOut"});
}
KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"});
}
} // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad);
PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping);
......@@ -65,3 +105,16 @@ PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad,
phi::ReluDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(tanh_grad, phi::TanhGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(tanh_grad_grad,
phi::TanhDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(tanh_triple_grad,
phi::TanhTripleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(brelu_grad, phi::BReluGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(leaky_relu, phi::LeakyReluOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad,
phi::LeakyReluGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad,
phi::LeakyReluDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad,
phi::ThresholdedReluGradOpArgumentMapping);
......@@ -58,8 +58,9 @@ def get_compat_kernels_info():
content += line
if (registry and ";" in line):
data = content.replace("\n", "").replace(
" ", "").strip("return").strip(
"KernelSignature(").strip("\);").replace("\"", "")
" ",
"").strip("return").strip("KernelSignature(").strip(
"\);").replace("\"", "").replace("\\", "")
registry = False
name, registry_info = parse_compat_registry(data)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册