Revert "Move some activation to phi (#40727)" (#41056)

This reverts commit e77a947e.

Revert "Move some activation to phi (#40727)" (#41056)
This reverts commit e77a947e.
05f3d48e · tianshuo78520a · GitHub · 9c0eaada · 05f3d48e · 05f3d48e
29 changed files
--- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
@@ -53,7 +53,7 @@ USE_OP_ITSELF(tanh_grad);
 USE_OP(sum);
 USE_OP_ITSELF(slice_grad);
 USE_OP_ITSELF(lookup_table_grad);
-USE_OP_ITSELF(sqrt);
+USE_OP(sqrt);
 USE_OP_ITSELF(elementwise_max);
 USE_OP_ITSELF(elementwise_div);
 USE_OP_ITSELF(sgd);
@@ -83,7 +83,6 @@ PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(sgd, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(slice, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(slice_grad, GPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(sqrt, GPU, ALL_LAYOUT);

 DECLARE_double(eager_delete_tensor_gb);


--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -1496,14 +1496,6 @@ REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,
                       HardSigmoidGradFunctor);
 REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor,
                       LogSigmoidGradFunctor);
-REGISTER_ACTIVATION_OP(expm1, Expm1, Expm1Functor, Expm1GradFunctor);
-REGISTER_ACTIVATION_OP(softplus, Softplus, SoftplusFunctor,
-                       SoftplusGradFunctor);
-REGISTER_ACTIVATION_OP(mish, Mish, MishFunctor, MishGradFunctor);
-REGISTER_ACTIVATION_OP(stanh, STanh, STanhFunctor, STanhGradFunctor);
-REGISTER_ACTIVATION_OP(reciprocal, Reciprocal, ReciprocalFunctor,
-                       ReciprocalGradFunctor);
-
 REGISTER_ACTIVATION_OP(log2, Log2, Log2Functor, Log2GradFunctor);
 REGISTER_ACTIVATION_OP(log10, Log10, Log10Functor, Log10GradFunctor);
 REGISTER_ACTIVATION_OP(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);
@@ -1638,7 +1630,12 @@ REGISTER_OPERATOR(logit, ops::LogitOp, ops::LogitOpMaker,
                  ops::LogitGradOpMaker<paddle::framework::OpDesc>,
                  ops::LogitGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(logit_grad, ops::LogitGradOp);
-
+REGISTER_OP_CPU_KERNEL(
+    logit, ops::LogitKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LogitKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    logit_grad, ops::LogitGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LogitGradKernel<paddle::platform::CPUDeviceContext, double>);
 /* ========================================================================== */

 /* ========================    celu  register     ============================
@@ -1687,6 +1684,7 @@ REGISTER_OPERATOR(
    ops::ActivationOpDoubleGrad<ops::SqrtGradGradFunctor<float>::FwdDeps()>,
    ops::ActivationDoubleGradOpInplaceInferer);

+REGISTER_ACTIVATION_CPU_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor);
 REGISTER_OP_CPU_KERNEL(
    sqrt_grad_grad, ops::SqrtDoubleGradKernel<plat::CPUDeviceContext,
                                              ops::SqrtGradGradFunctor<float>>,
@@ -1714,6 +1712,7 @@ REGISTER_OPERATOR(
    ops::ActivationOpDoubleGrad<ops::RsqrtGradGradFunctor<float>::FwdDeps()>,
    ops::ActivationDoubleGradOpInplaceInferer);

+REGISTER_ACTIVATION_CPU_KERNEL(rsqrt, Rsqrt, RsqrtFunctor, RsqrtGradFunctor);
 REGISTER_OP_CPU_KERNEL(
    rsqrt_grad_grad,
    ops::RsqrtDoubleGradKernel<plat::CPUDeviceContext,
@@ -1742,6 +1741,25 @@ REGISTER_OPERATOR(
    ops::ActivationOpDoubleGrad<ops::SquareGradGradFunctor<float>::FwdDeps()>,
    ops::ActivationDoubleGradOpInplaceInferer);

+REGISTER_OP_CPU_KERNEL(square,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::SquareFunctor<float>>,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::SquareFunctor<double>>,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::SquareFunctor<int>>,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::SquareFunctor<int64_t>>);
+REGISTER_OP_CPU_KERNEL(
+    square_grad, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                                           ops::SquareGradFunctor<float>>,
+    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                              ops::SquareGradFunctor<double>>,
+    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                              ops::SquareGradFunctor<int>>,
+    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                              ops::SquareGradFunctor<int64_t>>);
+
 REGISTER_OP_CPU_KERNEL(
    square_grad_grad,
    ops::SquareDoubleGradKernel<plat::CPUDeviceContext,
@@ -1780,6 +1798,54 @@ REGISTER_OPERATOR(
 REGISTER_OPERATOR(exp_grad, ops::ActivationOpGrad,
                  ops::ActivationGradOpInplaceInferer);

+REGISTER_OP_CPU_KERNEL(exp,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::ExpFunctor<float>>,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::ExpFunctor<double>>,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::ExpFunctor<int>>,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::ExpFunctor<int64_t>>);
+REGISTER_OP_CPU_KERNEL(
+    exp_grad, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                                        ops::ExpGradFunctor<float>>,
+    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                              ops::ExpGradFunctor<double>>,
+    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                              ops::ExpGradFunctor<int>>,
+    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                              ops::ExpGradFunctor<int64_t>>);
+/* ========================================================================== */
+
+/* ==========================   expm1 register  ============================ */
+REGISTER_OPERATOR(
+    expm1, ops::ActivationOp, ops::Expm1OpMaker, ops::ActivationOpInferVarType,
+    ops::ActivationGradOpMaker<ops::Expm1GradFunctor<float>::FwdDeps(),
+                               paddle::framework::OpDesc>,
+    ops::ActivationGradOpMaker<ops::Expm1GradFunctor<float>::FwdDeps(),
+                               paddle::imperative::OpBase>,
+    std::conditional<ops::CanInplaceAct<ops::Expm1GradFunctor<float>>(),
+                     ops::ActFwdInplaceInferer, void>::type);
+REGISTER_OPERATOR(expm1_grad, ops::ActivationOpGrad,
+                  ops::ActivationGradOpInplaceInferer);
+
+REGISTER_OP_CPU_KERNEL(expm1,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::Expm1Functor<float>>,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::Expm1Functor<double>>,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::Expm1Functor<plat::float16>>);
+REGISTER_OP_CPU_KERNEL(
+    expm1_grad, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                                          ops::Expm1GradFunctor<float>>,
+    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                              ops::Expm1GradFunctor<double>>,
+    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                              ops::Expm1GradFunctor<plat::float16>>);
+/* ========================================================================== */
+
 /* ==========================  Log register ==================================*/
 REGISTER_OPERATOR(
    log, ops::ActivationOp, ops::LogOpMaker, ops::ActivationOpInferVarType,
@@ -1798,6 +1864,8 @@ REGISTER_OPERATOR(
    ops::ActivationOpDoubleGrad<ops::LogGradGradFunctor<float>::FwdDeps()>,
    ops::ActivationDoubleGradOpInplaceInferer);

+/* ========================================================================== */
+
 /* ==========================  register checkpoint ===========================*/
 REGISTER_OP_VERSION(leaky_relu)
    .AddCheckpoint(

--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -264,7 +264,6 @@ USE_PHI_FUNCTOR(Asinh)
 USE_PHI_FUNCTOR(Acosh)
 USE_PHI_FUNCTOR(Atanh)
 USE_PHI_FUNCTOR(Tanh)
-USE_PHI_FUNCTOR(Exp)
 USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh)
 USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh)
 USE_PHI_FUNCTOR(BRelu)
@@ -290,15 +289,6 @@ USE_PHI_FUNCTOR(Log1p)
 USE_PHI_FUNCTOR(Swish)
 USE_PHI_FUNCTOR(HardSwish)
 USE_PHI_FUNCTOR(Pow)
-USE_PHI_FUNCTOR(Exp)
-USE_PHI_FUNCTOR(Expm1)
-USE_PHI_FUNCTOR(Mish)
-USE_PHI_FUNCTOR(STanh)
-USE_PHI_FUNCTOR(Reciprocal)
-USE_PHI_FUNCTOR(Square)
-USE_PHI_FUNCTOR(Sqrt)
-USE_PHI_FUNCTOR(Rsqrt)
-USE_PHI_FUNCTOR(Softplus)

 template <typename T>
 using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor<T>;
@@ -315,8 +305,49 @@ using CeilFunctor = phi::funcs::CeilFunctor<T>;
 template <typename T>
 using ZeroGradFunctor = phi::funcs::ZeroGradFunctor<T>;

+// exp(x) = e^x
 template <typename T>
-using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor<T>;
+struct ExpFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.exp();
+  }
+};
+
+template <typename T>
+struct ExpGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * out;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+// expm1(x) = e^x - 1
+template <typename T>
+struct Expm1Functor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.expm1();
+  }
+};
+
+template <typename T>
+struct Expm1GradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * out + dout;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};

 // relu(x) = max(x, 0)

@@ -331,68 +362,92 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor<T>;
 template <typename T>
 using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor<T>;

+// sqrt(x) = x^(1/2)
 template <typename T>
-struct SqrtGradGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* Out,
-                  const framework::Tensor* ddX, framework::Tensor* ddOut,
-                  framework::Tensor* dOut, const framework::Tensor* dX) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "SqrtGradGrad"));
-    auto out = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(Out, "Output", "Out", "SqrtGradGrad"));
-    // sqrt GradGrad: ddy = 0.5 * ddx / y, dy = -1 * dx * ddx
-    // calculate dy first, so ddy can inplace ddx
-    if (dOut) {
-      auto dx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dX, "Output", "DX", "SqrtGradGrad"));
-      auto dout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOut, "Output", "DOut", "SqrtGradGrad"));
-      dout.device(*d) = dx * ddx * static_cast<T>(-1) / out;
-    }
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SqrtGradGrad"));
-      ddout.device(*d) = ddx * static_cast<T>(0.5) / out;
-    }
+struct SqrtFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.sqrt();
  }
+};
+
+template <typename T>
+struct SqrtGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = static_cast<T>(0.5) * dout / out;
+  }
+
  static constexpr ActBwdOpFwdDeps FwdDeps() {
    return ActBwdOpFwdDeps::kDepOut;
  }
 };

+// rsqrt(x) = x^(-1/2)
 template <typename T>
-struct RsqrtGradGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* Out,
-                  const framework::Tensor* ddX, framework::Tensor* ddOut,
-                  framework::Tensor* dOut, const framework::Tensor* dX) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "RsqrtGradGrad"));
-    auto out = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(Out, "Output", "Out", "RsqrtGradGrad"));
+struct RsqrtFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.rsqrt();
+  }
+};

-    // rsqrt GradGrad: ddy = -0.5 * ddx * y * y * y, dy = (3/y) * dx * ddx
-    if (dOut) {
-      auto dx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dX, "Output", "DX", "RsqrtGradGrad"));
-      auto dout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOut, "Output", "DOut", "RsqrtGradGrad"));
-      dout.device(*d) = (static_cast<T>(3.0) / out) * dx * ddx;
-    }
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "RsqrtGradGrad"));
-      ddout.device(*d) = ddx * static_cast<T>(-0.5) * out * out * out;
-    }
+template <typename T>
+struct RsqrtGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = static_cast<T>(-0.5) * dout * out * out * out;
  }
+
  static constexpr ActBwdOpFwdDeps FwdDeps() {
    return ActBwdOpFwdDeps::kDepOut;
  }
 };

+// reciprocal(x) = 1 / x
+template <typename T>
+struct ReciprocalFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = static_cast<T>(1) / x;
+  }
+};
+
+template <typename T>
+struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * static_cast<T>(-1) * out * out;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+// square(x) = x^2
+template <typename T>
+struct SquareFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.square();
+  }
+};
+
+template <typename T>
+struct SquareGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * static_cast<T>(2) * x;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
 // relu6(x) = min(max(0, x), 6)
 template <typename T>
 struct Relu6Functor : public BaseActivationFunctor<T> {
@@ -429,6 +484,114 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
  }
 };

+// For numerical stability, using the following formula instead of softplus(x) =
+// log(1 + exp(x))
+// softplus(x) = log(1 + exp(beta * x)) / beta when beta * x <= threshold(beta =
+// 1, threshold = 20 by default), otherwise x
+template <typename T>
+struct SoftplusFunctor : public BaseActivationFunctor<T> {
+  float beta;
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}, {"threshold", &threshold}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) {
+    auto x_beta = static_cast<T>(beta) * x;
+    out.device(d) = (x_beta > static_cast<T>(threshold))
+                        .select(x, (static_cast<T>(1) + x_beta.exp()).log() /
+                                       static_cast<T>(beta));
+  }
+};
+
+// For numerical stability, using the following formula instead of
+// d(softplus(x))/dx = 1 / (1 + exp(-x))
+// d(softplus(x))/dx = 1 / (1 + exp(-beta * x)) when beta * x <= threshold(beta
+// = 1, threshold = 20 by default), otherwise 1 (i.e. dx = dout)
+template <typename T>
+struct SoftplusGradFunctor : public BaseActivationFunctor<T> {
+  float beta;
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}, {"threshold", &threshold}};
+  }
+
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) {
+    auto x_beta = static_cast<T>(beta) * x;
+    dx.device(d) =
+        (x_beta > static_cast<T>(threshold))
+            .select(dout, dout / (static_cast<T>(1) + (-x_beta).exp()));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+// mish(x) = x * tanh(softplus(x))
+// softplus(x) = x, if x > threshold
+//             = ln(1 + exp(x)), otherwise
+template <typename T>
+struct MishFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) {
+    auto sp = (x > static_cast<T>(threshold))
+                  .select(x, (static_cast<T>(1) + x.exp()).log());
+    out.device(d) = x * sp.tanh();
+  }
+};
+
+// dx = dout * (tanh(sp) + x * (1 - tanh(sp) ** 2) * (1 - exp(-sp)))
+// sp = softplus(x)
+template <typename T>
+struct MishGradFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) {
+    auto sp = (x > static_cast<T>(threshold))
+                  .select(x, (static_cast<T>(1) + x.exp()).log());
+    auto gsp = static_cast<T>(1) - (-sp).exp();
+    auto tsp = sp.tanh();
+    dx.device(d) = dout * (tsp + x * (static_cast<T>(1) - tsp * tsp) * gsp);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+// softsign(x) = x / (1 + |x|)
+template <typename T>
+struct SoftsignFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) {
+    out.device(d) = x / (static_cast<T>(1) + x.abs());
+  }
+};
+
+// d(softsign(x))/dx = 1 / (1 + |x|)^2
+// Taken from https://en.wikipedia.org/wiki/Activation_function
+template <typename T>
+struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) {
+    dx.device(d) =
+        dout * (static_cast<T>(1) / (static_cast<T>(1) + x.abs()).square());
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
 template <typename T>
 struct SoftReluFunctor : public BaseActivationFunctor<T> {
  float threshold;
@@ -543,6 +706,71 @@ struct CELUGradFunctor : public BaseActivationFunctor<T> {
  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };

+template <typename T>
+struct LogitFunctor {
+  template <typename Device, typename X, typename Out, typename P>
+  void operator()(Device d, X x, Out out, P p, float eps) const {
+    // logit(x) = ln(x/(1-x))
+    auto tmp_x =
+        (x.cwiseMin(static_cast<T>(1.0 - eps))).cwiseMax(static_cast<T>(eps));
+
+    if (!eps) {
+      out.device(d) = (x < static_cast<T>(0.0) || x > static_cast<T>(1.0))
+                          .select(p.constant(static_cast<T>(NAN)),
+                                  (tmp_x / (static_cast<T>(1) - tmp_x)).log());
+    } else {
+      out.device(d) = (tmp_x / (static_cast<T>(1) - tmp_x)).log();
+    }
+  }
+};
+
+template <typename T>
+struct LogitGradFunctor {
+  template <typename Device, typename X, typename dOut, typename dX, typename P>
+  void operator()(Device d, X x, dOut dout, dX dx, P p, float eps) const {
+    // logit(x)' = 1/(x*(1-x))
+    dx.device(d) =
+        (x < static_cast<T>(eps) || x > static_cast<T>(1.0 - eps))
+            .select(p.constant(static_cast<T>(0)),
+                    dout * (static_cast<T>(1) / ((static_cast<T>(1) - x) * x)));
+  }
+};
+
+template <typename T>
+struct STanhFunctor : public BaseActivationFunctor<T> {
+  float scale_a;
+  float scale_b;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) =
+        static_cast<T>(scale_b) * (static_cast<T>(scale_a) * x).tanh();
+  }
+};
+
+template <typename T>
+struct STanhGradFunctor : public BaseActivationFunctor<T> {
+  float scale_a;
+  float scale_b;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
+  }
+
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto a = static_cast<T>(scale_a);
+    auto b = static_cast<T>(scale_b);
+    auto temp = (a * x).tanh() * (a * x).tanh();
+    dx.device(d) = dout * a * b * (static_cast<T>(1) - temp);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
 template <typename T>
 struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device>
@@ -603,6 +831,68 @@ struct CELUGradGradFunctor : public BaseActivationFunctor<T> {
  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };

+template <typename T>
+struct SqrtGradGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev, const framework::Tensor* Out,
+                  const framework::Tensor* ddX, framework::Tensor* ddOut,
+                  framework::Tensor* dOut, const framework::Tensor* dX) const {
+    auto* d = dev.eigen_device();
+    auto ddx = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "SqrtGradGrad"));
+    auto out = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Output", "Out", "SqrtGradGrad"));
+    // sqrt GradGrad: ddy = 0.5 * ddx / y, dy = -1 * dx * ddx / y
+    // calculate dy first, so ddy can inplace ddx
+    if (dOut) {
+      auto dx = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dX, "Output", "DX", "SqrtGradGrad"));
+      auto dout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Output", "DOut", "SqrtGradGrad"));
+      dout.device(*d) = dx * ddx * static_cast<T>(-1) / out;
+    }
+    if (ddOut) {
+      auto ddout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SqrtGradGrad"));
+      ddout.device(*d) = ddx * static_cast<T>(0.5) / out;
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct RsqrtGradGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev, const framework::Tensor* Out,
+                  const framework::Tensor* ddX, framework::Tensor* ddOut,
+                  framework::Tensor* dOut, const framework::Tensor* dX) const {
+    auto* d = dev.eigen_device();
+    auto ddx = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "RsqrtGradGrad"));
+    auto out = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Output", "Out", "RsqrtGradGrad"));
+
+    // rsqrt GradGrad: ddy = -0.5 * ddx * y * y * y, dy = (3/y) * dx * ddx
+    if (dOut) {
+      auto dx = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dX, "Output", "DX", "RsqrtGradGrad"));
+      auto dout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Output", "DOut", "RsqrtGradGrad"));
+      dout.device(*d) = (static_cast<T>(3.0) / out) * dx * ddx;
+    }
+    if (ddOut) {
+      auto ddout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "RsqrtGradGrad"));
+      ddout.device(*d) = ddx * static_cast<T>(-0.5) * out * out * out;
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
 template <typename T>
 struct SquareGradGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device>
@@ -698,29 +988,6 @@ class SquareDoubleGradKernel
  }
 };

-template <typename T>
-struct SoftsignFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x / (static_cast<T>(1) + x.abs());
-  }
-};
-
-// d(softsign(x))/dx = 1 / (1 + |x|)^2
-// Taken from https://en.wikipedia.org/wiki/Activation_function
-
-template <typename T>
-struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) =
-        dout * (static_cast<T>(1) / (static_cast<T>(1) + x.abs()).square());
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename DeviceContext, typename Functor>
 class CELUDoubleGradKernel
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -868,10 +1135,57 @@ class RsqrtDoubleGradKernel
  }
 };

+template <typename DeviceContext, typename T>
+class LogitKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* in = context.Input<framework::Tensor>("X");
+    auto eps = context.Attr<float>("eps");
+    out->mutable_data<T>(in->place());
+
+    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    auto eigen_p = framework::EigenVector<T>::Flatten(*out);
+
+    LogitFunctor<T> functor;
+    functor(place, eigen_in, eigen_out, eigen_p, eps);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LogitGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<framework::Tensor>("X");
+    auto* dout =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto eps = context.Attr<float>("eps");
+    dx->mutable_data<T>(dout->place());
+
+    auto eigen_x = framework::EigenVector<T>::Flatten(*x);
+    auto eigen_dout = framework::EigenVector<T>::Flatten(*dout);
+    auto eigen_dx = framework::EigenVector<T>::Flatten(*dx);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    auto eigen_p = framework::EigenVector<T>::Flatten(*x);
+
+    LogitGradFunctor<T> functor;
+    functor(place, eigen_x, eigen_dout, eigen_dx, eigen_p, eps);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

-#define FOR_EACH_ACTIVATION_OP(__macro)                               \
-  __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \
-  __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);  \
-  __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);
+#define FOR_EACH_ACTIVATION_OP(__macro)                                      \
+  __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
+  __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor);        \
+  __macro(stanh, STanh, STanhFunctor, STanhGradFunctor);                     \
+  __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor);         \
+  __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);         \
+  __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);                     \
+  __macro(mish, Mish, MishFunctor, MishGradFunctor);
--- a/paddle/fluid/operators/activation_op.kps
+++ b/paddle/fluid/operators/activation_op.kps
@@ -20,6 +20,140 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

+template <typename T>
+struct CudaReciprocalFunctor : public BaseActivationFunctor<T> {
+  T one = static_cast<T>(1.0f);
+
+  // reciprocal(x) = 1 / x
+  __device__ __forceinline__ T operator()(const T x) const { return one / x; }
+};
+
+template <typename T>
+struct CudaReciprocalGradFunctor : public BaseActivationFunctor<T> {
+  // dx = -dout * out^2
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return -dout * out * out;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct CudaExpFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
+  // exp(x) = exp(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(exp(x));
+  }
+};
+
+template <typename T>
+struct CudaExpGradFunctor : public BaseActivationFunctor<T> {
+  // dx = dout * out
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return dout * out;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct CudaExpm1Functor : public BaseActivationFunctor<T> {
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
+  // expm1(x) = expm1(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(expm1(x));
+  }
+};
+
+template <typename T>
+struct CudaExpm1GradFunctor : public BaseActivationFunctor<T> {
+  // dx = dout * (out + 1) = dout * out + dout
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return dout * out + dout;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct CudaSquareFunctor : public BaseActivationFunctor<T> {
+  // square(x) = x * x
+  __device__ __forceinline__ T operator()(const T x) const { return x * x; }
+};
+
+template <typename T>
+struct CudaSquareGradFunctor : public BaseActivationFunctor<T> {
+  T two = static_cast<T>(2.0f);
+
+  // dx = dout * 2 * x
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    return dout * two * x;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaSqrtFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
+  // sqrt(x) = sqrt(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(sqrt(x));
+  }
+};
+
+template <typename T>
+struct CudaSqrtGradFunctor : public BaseActivationFunctor<T> {
+  T one_half = static_cast<T>(0.5f);
+
+  // dx = dout * 0.5 / out
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return one_half * dout / out;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct CudaRsqrtFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
+  // rsqrt(x) = rsqrt(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(rsqrt(x));
+  }
+};
+
+template <typename T>
+struct CudaRsqrtGradFunctor : public BaseActivationFunctor<T> {
+  T minus_one_half = static_cast<T>(-0.5f);
+
+  // dx = -0.5 * dout * out^3
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return minus_one_half * dout * out * out * out;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
 template <typename T>
 struct CudaSoftReluFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
@@ -67,6 +201,119 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor<T> {
  }
 };

+template <typename T>
+struct CudaSTanhFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  float scale_a;
+  float scale_b;
+  // Returns (name, pointer) pairs for the scale_a / scale_b attribute members.
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
+  }
+
+  // stanh(x) = scale_b * tanh(scale_a * x), evaluated in MPType
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    MPType a = static_cast<MPType>(scale_a);
+    MPType b = static_cast<MPType>(scale_b);
+    return static_cast<T>(b * tanh(a * x));
+  }
+};
+
+template <typename T>
+struct CudaSTanhGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+  float scale_a;
+  float scale_b;
+  // Returns (name, pointer) pairs for the scale_a / scale_b attribute members.
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
+  }
+
+  // dx = dout * a * b * (1 - tanh(a * x)^2), with a = scale_a, b = scale_b
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    MPType a = static_cast<MPType>(scale_a);
+    MPType b = static_cast<MPType>(scale_b);
+    MPType temp = tanh(a * x);
+    return static_cast<T>(dout * a * b * (one - temp * temp));
+  }
+  // Declares the forward dependency as kDepX, matching operator()'s arg_x.
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaSoftplusFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+  float beta;
+  float threshold;
+  // Returns (name, pointer) pairs for the beta / threshold attribute members.
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}, {"threshold", &threshold}};
+  }
+
+  // softplus(x) = beta * x > threshold ? x : log(1 + exp(beta * x)) / beta
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    MPType b = static_cast<MPType>(beta);
+    MPType t = static_cast<MPType>(threshold);
+    MPType x_beta = x * b;  // all-MPType product; same value as x * beta
+    return static_cast<T>(x_beta > t ? x : log(one + exp(x_beta)) / b);
+  }
+};
+
+template <typename T>
+struct CudaSoftplusGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+  float beta;
+  float threshold;
+  // Returns (name, pointer) pairs for the beta / threshold attribute members.
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}, {"threshold", &threshold}};
+  }
+
+  // dx = x * beta > threshold ? dout : dout / (1 + exp(-beta * x))
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    MPType b = static_cast<MPType>(beta);
+    MPType t = static_cast<MPType>(threshold);
+    MPType x_beta = x * b;  // uses b (previously unused); same value as before
+    return x_beta > t ? arg_dout : static_cast<T>(dout / (one + exp(-x_beta)));
+  }
+  // Declares the forward dependency as kDepX, matching operator()'s arg_x.
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaSoftsignFunctor : public BaseActivationFunctor<T> {
+  T one = static_cast<T>(1.0f);
+
+  // softsign(x) = x / (1 + |x|), computed directly in T (no MPType upcast)
+  __device__ __forceinline__ T operator()(const T x) const {
+    return x / (one + abs(x));
+  }
+};
+
+template <typename T>
+struct CudaSoftsignGradFunctor : public BaseActivationFunctor<T> {
+  T one = static_cast<T>(1.0f);
+
+  // dx = dout * d(softsign(x))/dx = dout / (1 + |x|)^2, computed in T
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    T temp = one + abs(x);
+    return dout / (temp * temp);
+  }
+  // Declares the forward dependency as kDepX, matching operator()'s x.
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
 template <typename T>
 struct CudaRelu6Functor : public BaseActivationFunctor<T> {
  T zero = static_cast<T>(0.0f);
@@ -104,23 +351,49 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
 };

 template <typename T>
-struct CudaSoftsignFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
+struct CudaMishFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+  float threshold;

-  // softsign(x) = x / (1 + abs(x))
-  __device__ __forceinline__ T operator()(const T x) const {
-    return x / (one + abs(x));
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  // mish(x) = x * tanh(softplus(x)), evaluated in MPType
+  // softplus(x) = x, if x > threshold
+  //             = ln(1 + exp(x)), otherwise
+  // Input: arg_x, the forward input x
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    MPType sp = (x > static_cast<MPType>(threshold)) ? x : log(one + exp(x));
+    return static_cast<T>(x * tanh(sp));
  }
 };

 template <typename T>
-struct CudaSoftsignGradFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
+struct CudaMishGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+  float threshold;

-  // dx = dout / (1 + abs(x))^2
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    T temp = one + abs(x);
-    return dout / (temp * temp);
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  // dx = dout * (tanh(sp) + x * (1 - tanh(sp) ** 2) * (1 - exp(-sp)))
+  // sp = softplus(x)
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    MPType sp = (x > static_cast<MPType>(threshold)) ? x : log(one + exp(x));
+    MPType gsp =
+        (x > static_cast<MPType>(threshold)) ? one : one / (one + exp(-x));
+    MPType tsp = tanh(sp);
+    return static_cast<T>(dout * (tsp + x * (one - tsp * tsp) * gsp));
  }

  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
@@ -285,16 +558,6 @@ using CudaCeilFunctor = phi::funcs::CudaCeilFunctor<T>;
 template <typename T>
 using CudaZeroGradFunctor = phi::funcs::CudaZeroGradFunctor<T>;

-USE_PHI_FUNCTOR(CudaExp)
-USE_PHI_FUNCTOR(CudaExpm1)
-USE_PHI_FUNCTOR(CudaMish)
-USE_PHI_FUNCTOR(CudaSTanh)
-USE_PHI_FUNCTOR(CudaReciprocal)
-USE_PHI_FUNCTOR(CudaSquare)
-USE_PHI_FUNCTOR(CudaSqrt)
-USE_PHI_FUNCTOR(CudaRsqrt)
-USE_PHI_FUNCTOR(CudaSoftplus)
-
 template <typename T>
 using CudaELUGradNegativeAlphaFunctor =
    phi::funcs::CudaELUGradNegativeAlphaFunctor<T>;
@@ -373,6 +636,8 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */

 /* ===========================   sqrt register  ============================= */
+REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor,
+                                CudaSqrtGradFunctor);

 REGISTER_OP_CUDA_KERNEL(
    sqrt_grad_grad,
@@ -388,6 +653,8 @@ REGISTER_OP_CUDA_KERNEL(

 /* ===========================   rsqrt register  =============================
 */
+REGISTER_ACTIVATION_CUDA_KERNEL(rsqrt, Rsqrt, CudaRsqrtFunctor,
+                                CudaRsqrtGradFunctor);

 REGISTER_OP_CUDA_KERNEL(
    rsqrt_grad_grad,
@@ -400,6 +667,8 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */

 /* ===========================  square register  ============================ */
+REGISTER_ACTIVATION_CUDA_KERNEL_INT(square, Square, CudaSquareFunctor,
+                                    CudaSquareGradFunctor);

 REGISTER_OP_CUDA_KERNEL(
    square_grad_grad,
@@ -419,19 +688,75 @@ REGISTER_OP_CUDA_KERNEL(

 /* ==========================   logit register  ============================ */
 namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    logit, ops::LogitKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LogitKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LogitKernel<paddle::platform::CUDADeviceContext,
+                     paddle::platform::float16>);
+REGISTER_OP_CUDA_KERNEL(
+    logit_grad,
+    ops::LogitGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LogitGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LogitGradKernel<paddle::platform::CUDADeviceContext,
+                         paddle::platform::float16>);
 /* ========================================================================== */

 /* ==========================   exp register  ============================ */
+REGISTER_OP_CUDA_KERNEL(
+    exp, ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                                   ops::CudaExpFunctor<float>>,
+    ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                              ops::CudaExpFunctor<double>>,
+    ops::ActivationKernel<plat::CUDADeviceContext, ops::ExpFunctor<int>>,
+    ops::ActivationKernel<plat::CUDADeviceContext, ops::ExpFunctor<int64_t>>,
+    ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                              ops::CudaExpFunctor<plat::float16>>);
+REGISTER_OP_CUDA_KERNEL(
+    exp_grad, ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                            ops::CudaExpGradFunctor<float>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaExpGradFunctor<double>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaExpGradFunctor<int>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaExpGradFunctor<int64_t>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaExpGradFunctor<plat::float16>>);
 /* ========================================================================== */

 /* ==========================   expm1 register  ============================ */
+
+REGISTER_OP_CUDA_KERNEL(
+    expm1, ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                                     ops::CudaExpm1Functor<float>>,
+    ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                              ops::CudaExpm1Functor<double>>,
+    ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                              ops::CudaExpm1Functor<plat::float16>>);
+REGISTER_OP_CUDA_KERNEL(
+    expm1_grad, ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                              ops::CudaExpm1GradFunctor<float>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaExpm1GradFunctor<double>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaExpm1GradFunctor<plat::float16>>);
 /* ========================================================================== */

 #define FOR_EACH_ACTIVATION_CUDA_OP(__macro)                                  \
+  __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor,                      \
+          CudaSoftShrinkGradFunctor);                                         \
+  __macro(reciprocal, Reciprocal, CudaReciprocalFunctor,                      \
+          CudaReciprocalGradFunctor);                                         \
  __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \
+  __macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor);              \
+  __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor);  \
+  __macro(softsign, Softsign, CudaSoftsignFunctor, CudaSoftsignGradFunctor);  \
  __macro(relu6, Relu6, CudaRelu6Functor, CudaRelu6GradFunctor);              \
-  __macro(softsign, Softsign, CudaSoftsignFunctor, CudaSoftsignGradFunctor);
-
+  __macro(tanh_shrink, TanhShrink, CudaTanhShrinkFunctor,                     \
+          CudaTanhShrinkGradFunctor);                                         \
+  __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor,                     \
+          CudaHardShrinkGradFunctor);                                         \
+  __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor);
 FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL)

 #ifdef PADDLE_WITH_XPU_KP

--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -22,9 +22,9 @@ math_library(sampler DEPS generator)
 math_library(maxouting)

 if(WITH_MKLDNN)
-    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler mixed_vector)
+    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler)
 else()
-    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mixed_vector)
+    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas)
 endif()

 math_library(sequence_padding)

--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"

 #ifdef PADDLE_WITH_MKLDNN

--- a/paddle/fluid/operators/temporal_shift_op.h
+++ b/paddle/fluid/operators/temporal_shift_op.h
@@ -19,6 +19,56 @@ namespace operators {
 using Tensor = framework::Tensor;
 using DataLayout = framework::DataLayout;

+template <typename T>
+void TemporalShiftFwNCHW(const T* input, T* output, const int ntchw,
+                         const int tchw, const int chw, const int hw,
+                         const int t, const int c1, const int c2) {
+  int src_it = 0;
+  for (int i = 0; i < ntchw; i++) {
+    int it = (i % tchw) / chw;  // slot of element i within its tchw group
+    int ic = (i % chw) / hw;    // channel of element i within its chw group
+    // ic < c1 reads slot it - 1; c1 <= ic < c2 reads it + 1; others read it.
+    if (ic < c1) {
+      src_it = it - 1;
+    } else if (ic < c2) {
+      src_it = it + 1;
+    } else {
+      src_it = it;
+    }
+    // Source slots outside [0, t) yield 0; others copy from the shifted offset.
+    if (src_it < 0 || src_it >= t) {
+      output[i] = 0;
+    } else {
+      output[i] = input[i + (src_it - it) * chw];
+    }
+  }
+}
+
+template <typename T>
+void TemporalShiftFwNHWC(const T* input, T* output, const int nthwc,
+                         const int thwc, const int hwc, const int t,
+                         const int c, const int c1, const int c2) {
+  int src_it = 0;
+  for (int i = 0; i < nthwc; i++) {
+    int it = (i % thwc) / hwc;  // slot of element i within its thwc group
+    int ic = i % c;             // channel of element i (innermost index)
+    // ic < c1 reads slot it - 1; c1 <= ic < c2 reads it + 1; others read it.
+    if (ic < c1) {
+      src_it = it - 1;
+    } else if (ic < c2) {
+      src_it = it + 1;
+    } else {
+      src_it = it;
+    }
+    // Source slots outside [0, t) yield 0; others copy from the shifted offset.
+    if (src_it < 0 || src_it >= t) {
+      output[i] = 0;
+    } else {
+      output[i] = input[i + (src_it - it) * hwc];
+    }
+  }
+}
+
 template <typename T>
 void TemporalShiftBwNCHW(const T* output_grad, T* input_grad, const int ntchw,
                         const int tchw, const int chw, const int hw,
@@ -72,7 +122,45 @@ void TemporalShiftBwNHWC(const T* output_grad, T* input_grad, const int nthwc,
 template <typename T>
 class TemporalShiftKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& ctx) const override {}
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    int t = ctx.Attr<int>("seg_num");
+    float shift_ratio = ctx.Attr<float>("shift_ratio");
+    const std::string data_format_str = ctx.Attr<std::string>("data_format");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_format_str);
+    // Dim 0 is read as nt; c/h/w come from dims 1..3 in layout-dependent order.
+    const int nt = input->dims()[0];
+    const int c = (data_layout == DataLayout::kNCHW ? input->dims()[1]
+                                                    : input->dims()[3]);
+    const int h = (data_layout == DataLayout::kNCHW ? input->dims()[2]
+                                                    : input->dims()[1]);
+    const int w = (data_layout == DataLayout::kNCHW ? input->dims()[3]
+                                                    : input->dims()[2]);
+    // Products used for flat indexing: h*w, c*h*w, t*c*h*w and nt*c*h*w.
+    const int hw = h * w;
+    const int chw = c * hw;
+    const int tchw = t * chw;
+    const int ntchw = nt * chw;
+    // c1/c2: channel boundaries at shift_ratio and 2 * shift_ratio of c.
+    const int c1 = static_cast<int>(c * shift_ratio);
+    const int c2 = static_cast<int>(c * 2 * shift_ratio);
+
+    framework::DDim out_dims =
+        (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
+                                          : phi::make_ddim({nt, h, w, c}));
+    const T* input_data = input->data<T>();
+    T* output_data = output->mutable_data<T>(out_dims, ctx.GetPlace());
+    // NHWC reuses ntchw/tchw/chw as nthwc/thwc/hwc; the products are equal.
+    if (data_layout == DataLayout::kNCHW) {
+      TemporalShiftFwNCHW<T>(input_data, output_data, ntchw, tchw, chw, hw, t,
+                             c1, c2);
+    } else {
+      TemporalShiftFwNHWC<T>(input_data, output_data, ntchw, tchw, chw, t, c,
+                             c1, c2);
+    }
+  }
 };

 template <typename T>

--- a/paddle/phi/kernels/activation_kernel.h
+++ b/paddle/phi/kernels/activation_kernel.h
@@ -53,13 +53,6 @@ DECLARE_ACTIVATION_KERNEL(Acosh)
 DECLARE_ACTIVATION_KERNEL(Atanh)
 DECLARE_ACTIVATION_KERNEL(Relu)
 DECLARE_ACTIVATION_KERNEL(Tanh)
-DECLARE_ACTIVATION_KERNEL(Exp)
-DECLARE_ACTIVATION_KERNEL(Expm1)
-DECLARE_ACTIVATION_KERNEL(Reciprocal)
-DECLARE_ACTIVATION_KERNEL(Square)
-DECLARE_ACTIVATION_KERNEL(Sqrt)
-DECLARE_ACTIVATION_KERNEL(Rsqrt)
-
 DECLARE_ACTIVATION_KERNEL(TanhShrink)
 DECLARE_ACTIVATION_KERNEL(Silu)
 DECLARE_ACTIVATION_KERNEL(Sigmoid)
@@ -80,23 +73,8 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Swish, beta)

 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max)
-DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b)
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset)

-DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(Softplus, beta, threshold)
-
-template <typename T, typename Context>
-void LogitKernel(const Context& dev_ctx,
-                 const DenseTensor& x,
-                 float eps,
-                 DenseTensor* out);
-
-template <typename T, typename Context>
-void MishKernel(const Context& dev_ctx,
-                const DenseTensor& x,
-                float threshold,
-                DenseTensor* out);
-
 template <typename T, typename Context>
 void HardSwishKernel(const Context& dev_ctx,
                     const DenseTensor& x,

--- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
@@ -129,13 +129,6 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Square, SquareGradFunctor);
-
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, ExpGradFunctor);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, Expm1GradFunctor);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, ReciprocalGradFunctor);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, SqrtGradFunctor);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, RsqrtGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, LogGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, Log2GradFunctor);
@@ -164,24 +157,11 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
                                               threshold);
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, SwishGradFunctor, beta);

-DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
-                                               MishGradFunctor,
-                                               threshold);
-
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
                                               BReluGradFunctor,
                                               t_min,
                                               t_max);

-DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh,
-                                               STanhGradFunctor,
-                                               scale_a,
-                                               scale_b);
-
-DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus,
-                                               SoftplusGradFunctor,
-                                               beta,
-                                               threshold);
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid,
                                                 HardSigmoidGradFunctor,
                                                 slope,
@@ -267,12 +247,6 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
-PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel)
-PD_REGISTER_ACTIVATION_GRAD_KERNEL(stanh_grad, STanhGradKernel)
-PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel)
-PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel)
-PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel)
-PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_grad, SoftplusGradKernel)

 PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad,
                                          ReluDoubleGradKernel)
@@ -289,34 +263,6 @@ PD_REGISTER_KERNEL(tanh_triple_grad,
                   float,
                   double,
                   phi::dtype::float16) {}
-
-PD_REGISTER_KERNEL(exp_grad,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::ExpGradKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t) {}
-
-PD_REGISTER_KERNEL(expm1_grad,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::Expm1GradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-
-PD_REGISTER_KERNEL(
-    logit_grad, CPU, ALL_LAYOUT, phi::LogitGradKernel, float, double) {}
-PD_REGISTER_KERNEL(square_grad,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::SquareGradKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t) {}
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)

--- a/paddle/phi/kernels/cpu/activation_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_kernel.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/phi/kernels/activation_kernel.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/funcs/activation_functor.h"
 #include "paddle/phi/kernels/impl/activation_impl.h"

 namespace phi {
@@ -73,12 +72,6 @@ DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor)
-DEFINE_CPU_ACTIVATION_KERNEL(Exp, ExpFunctor)
-DEFINE_CPU_ACTIVATION_KERNEL(Expm1, Expm1Functor)
-DEFINE_CPU_ACTIVATION_KERNEL(Reciprocal, ReciprocalFunctor)
-DEFINE_CPU_ACTIVATION_KERNEL(Square, SquareFunctor)
-DEFINE_CPU_ACTIVATION_KERNEL(Sqrt, SqrtFunctor)
-DEFINE_CPU_ACTIVATION_KERNEL(Rsqrt, RsqrtFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Log, LogFunctor)
@@ -90,19 +83,15 @@ DEFINE_CPU_ACTIVATION_KERNEL(Floor, FloorFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Ceil, CeilFunctor)

 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
-
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
                                     ThresholdedReluFunctor,
                                     threshold)
-DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, MishFunctor, threshold)
-DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max)
-DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(STanh, STanhFunctor, scale_a, scale_b)
-DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, SoftplusFunctor, beta, threshold)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Swish, SwishFunctor, beta)

+DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max)
 DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
                                     HardSigmoidFunctor,
                                     slope,
@@ -150,25 +139,6 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
-PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
-PD_REGISTER_ACTIVATION_KERNEL(stanh, STanhKernel)
-PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel)
-PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel)
-PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel)
-PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel)
-
-PD_REGISTER_KERNEL(
-    exp, CPU, ALL_LAYOUT, phi::ExpKernel, float, double, int, int64_t) {}
-PD_REGISTER_KERNEL(expm1,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::Expm1Kernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(logit, CPU, ALL_LAYOUT, phi::LogitKernel, float, double) {}
-PD_REGISTER_KERNEL(
-    square, CPU, ALL_LAYOUT, phi::SquareKernel, float, double, int, int64_t) {}
 PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
 PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
 PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)

--- a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/kernels/temporal_shift_grad_kernel.h"
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/common/layout.h"
-#include "paddle/phi/core/kernel_registry.h"
-
-namespace phi {
-
-template <typename T>
-void TemporalShiftBwNCHW(const T* output_grad,
-                         T* input_grad,
-                         const int ntchw,
-                         const int tchw,
-                         const int chw,
-                         const int hw,
-                         const int t,
-                         const int c1,
-                         const int c2) {
-  int src_it = 0;
-  for (int i = 0; i < ntchw; i++) {
-    int it = (i % tchw) / chw;
-    int ic = (i % chw) / hw;
-
-    if (ic < c1) {
-      src_it = it + 1;
-    } else if (ic < c2) {
-      src_it = it - 1;
-    } else {
-      src_it = it;
-    }
-
-    if (src_it >= 0 && src_it < t) {
-      input_grad[i] = output_grad[i + (src_it - it) * chw];
-    } else {
-      input_grad[i] = 0;
-    }
-  }
-}
-
-template <typename T>
-void TemporalShiftBwNHWC(const T* output_grad,
-                         T* input_grad,
-                         const int nthwc,
-                         const int thwc,
-                         const int hwc,
-                         const int t,
-                         const int c,
-                         const int c1,
-                         const int c2) {
-  int src_it = 0;
-  for (int i = 0; i < nthwc; i++) {
-    int it = (i % thwc) / hwc;
-    int ic = i % c;
-
-    if (ic < c1) {
-      src_it = it + 1;
-    } else if (ic < c2) {
-      src_it = it - 1;
-    } else {
-      src_it = it;
-    }
-
-    if (src_it >= 0 && src_it < t) {
-      input_grad[i] = output_grad[i + (src_it - it) * hwc];
-    } else {
-      input_grad[i] = 0;
-    }
-  }
-}
-
-template <typename T, typename Context>
-void TemporalShiftGradKernel(const Context& dev_ctx,
-                             const DenseTensor& out_grad,
-                             int seg_num,
-                             float shift_ratio,
-                             const std::string& data_format_str,
-                             DenseTensor* x_grad) {
-  auto* input_grad = x_grad;
-  auto* output_grad = &out_grad;
-  int t = seg_num;
-  const DataLayout data_layout =
-      paddle::framework::StringToDataLayout(data_format_str);
-
-  const int nt = output_grad->dims()[0];
-  const int c = (data_layout == DataLayout::kNCHW ? output_grad->dims()[1]
-                                                  : output_grad->dims()[3]);
-  const int h = (data_layout == DataLayout::kNCHW ? output_grad->dims()[2]
-                                                  : output_grad->dims()[1]);
-  const int w = (data_layout == DataLayout::kNCHW ? output_grad->dims()[3]
-                                                  : output_grad->dims()[2]);
-
-  const int hw = h * w;
-  const int chw = c * hw;
-  const int tchw = t * chw;
-  const int ntchw = nt * chw;
-
-  const int c1 = static_cast<int>(c * shift_ratio);
-  const int c2 = static_cast<int>(c * 2 * shift_ratio);
-
-  DDim in_grad_dims =
-      (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
-                                        : phi::make_ddim({nt, h, w, c}));
-  const T* output_grad_data = output_grad->data<T>();
-  T* input_grad_data =
-      input_grad->mutable_data<T>(in_grad_dims, dev_ctx.GetPlace());
-
-  if (data_layout == DataLayout::kNCHW) {
-    TemporalShiftBwNCHW<T>(
-        output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1, c2);
-  } else {
-    TemporalShiftBwNHWC<T>(
-        output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1, c2);
-  }
-}
-
-}  // namespace phi
-
-PD_REGISTER_KERNEL(temporal_shift_grad,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::TemporalShiftGradKernel,
-                   float,
-                   double) {}
--- a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc
+++ b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/kernels/temporal_shift_kernel.h"
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/common/layout.h"
-#include "paddle/phi/core/kernel_registry.h"
-
-namespace phi {
-
-template <typename T>
-void TemporalShiftFwNCHW(const T* input,
-                         T* output,
-                         const int ntchw,
-                         const int tchw,
-                         const int chw,
-                         const int hw,
-                         const int t,
-                         const int c1,
-                         const int c2) {
-  int src_it = 0;
-  for (int i = 0; i < ntchw; i++) {
-    int it = (i % tchw) / chw;
-    int ic = (i % chw) / hw;
-
-    if (ic < c1) {
-      src_it = it - 1;
-    } else if (ic < c2) {
-      src_it = it + 1;
-    } else {
-      src_it = it;
-    }
-
-    if (src_it < 0 || src_it >= t) {
-      output[i] = 0;
-    } else {
-      output[i] = input[i + (src_it - it) * chw];
-    }
-  }
-}
-
-template <typename T>
-void TemporalShiftFwNHWC(const T* input,
-                         T* output,
-                         const int nthwc,
-                         const int thwc,
-                         const int hwc,
-                         const int t,
-                         const int c,
-                         const int c1,
-                         const int c2) {
-  int src_it = 0;
-  for (int i = 0; i < nthwc; i++) {
-    int it = (i % thwc) / hwc;
-    int ic = i % c;
-
-    if (ic < c1) {
-      src_it = it - 1;
-    } else if (ic < c2) {
-      src_it = it + 1;
-    } else {
-      src_it = it;
-    }
-
-    if (src_it < 0 || src_it >= t) {
-      output[i] = 0;
-    } else {
-      output[i] = input[i + (src_it - it) * hwc];
-    }
-  }
-}
-
-template <typename T, typename Context>
-void TemporalShiftKernel(const Context& dev_ctx,
-                         const DenseTensor& x,
-                         int seg_num,
-                         float shift_ratio,
-                         const std::string& data_format_str,
-                         DenseTensor* out) {
-  auto* input = &x;
-  auto* output = out;
-  int t = seg_num;
-  const DataLayout data_layout =
-      paddle::framework::StringToDataLayout(data_format_str);
-
-  const int nt = input->dims()[0];
-  const int c =
-      (data_layout == DataLayout::kNCHW ? input->dims()[1] : input->dims()[3]);
-  const int h =
-      (data_layout == DataLayout::kNCHW ? input->dims()[2] : input->dims()[1]);
-  const int w =
-      (data_layout == DataLayout::kNCHW ? input->dims()[3] : input->dims()[2]);
-
-  const int hw = h * w;
-  const int chw = c * hw;
-  const int tchw = t * chw;
-  const int ntchw = nt * chw;
-
-  const int c1 = static_cast<int>(c * shift_ratio);
-  const int c2 = static_cast<int>(c * 2 * shift_ratio);
-
-  DDim out_dims =
-      (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
-                                        : phi::make_ddim({nt, h, w, c}));
-  const T* input_data = input->data<T>();
-  T* output_data = output->mutable_data<T>(out_dims, dev_ctx.GetPlace());
-
-  if (data_layout == DataLayout::kNCHW) {
-    TemporalShiftFwNCHW<T>(
-        input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2);
-  } else {
-    TemporalShiftFwNHWC<T>(
-        input_data, output_data, ntchw, tchw, chw, t, c, c1, c2);
-  }
-}
-
-}  // namespace phi
-
-PD_REGISTER_KERNEL(
-    temporal_shift, CPU, ALL_LAYOUT, phi::TemporalShiftKernel, float, double) {}
--- a/paddle/phi/kernels/funcs/activation_functor.h
+++ b/paddle/phi/kernels/funcs/activation_functor.h
@@ -106,31 +106,6 @@ struct SinFunctor : public BaseActivationFunctor<T> {
  }
 };

-// reciprocal(x) = 1 / x
-template <typename T>
-struct ReciprocalFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = static_cast<T>(1) / x;
-  }
-};
-
-template <typename T>
-struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device,
-            typename X,
-            typename Out,
-            typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * static_cast<T>(-1) * out * out;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
 // cosine'(x) = -sin(x)
 template <typename T>
 struct CosGradFunctor : public BaseActivationFunctor<T> {
@@ -155,108 +130,6 @@ struct CosFunctor : public BaseActivationFunctor<T> {
  }
 };

-template <typename T>
-struct LogitFunctor {
-  template <typename Device, typename X, typename Out, typename P>
-  void operator()(Device d, X x, Out out, P p, float eps) const {
-    // logit(x) = ln(x/(1-x))
-    auto tmp_x =
-        (x.cwiseMin(static_cast<T>(1.0 - eps))).cwiseMax(static_cast<T>(eps));
-
-    if (!eps) {
-      out.device(d) = (x < static_cast<T>(0.0) || x > static_cast<T>(1.0))
-                          .select(p.constant(static_cast<T>(NAN)),
-                                  (tmp_x / (static_cast<T>(1) - tmp_x)).log());
-    } else {
-      out.device(d) = (tmp_x / (static_cast<T>(1) - tmp_x)).log();
-    }
-  }
-};
-
-// mish(x) = x * tanh(softplus(x))
-// softplus(x) = x, if x > threshold
-//             = ln(1 + exp(x)), otherwise
-
-template <typename T>
-struct MishFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto sp = (x > static_cast<T>(threshold))
-                  .select(x, (static_cast<T>(1) + x.exp()).log());
-    out.device(d) = x * sp.tanh();
-  }
-};
-
-// dx = dout * (tanh(sp) + x * (1 - tanh(sp) ** 2) * (1 - exp(-sp)))
-// sp = softplus(x)
-
-template <typename T>
-struct MishGradFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  template <typename Device,
-            typename X,
-            typename Out,
-            typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto sp = (x > static_cast<T>(threshold))
-                  .select(x, (static_cast<T>(1) + x.exp()).log());
-    auto gsp = static_cast<T>(1) - (-sp).exp();
-    auto tsp = sp.tanh();
-    dx.device(d) = dout * (tsp + x * (static_cast<T>(1) - tsp * tsp) * gsp);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct STanhFunctor : public BaseActivationFunctor<T> {
-  float scale_a;
-  float scale_b;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) =
-        static_cast<T>(scale_b) * (static_cast<T>(scale_a) * x).tanh();
-  }
-};
-
-template <typename T>
-struct STanhGradFunctor : public BaseActivationFunctor<T> {
-  float scale_a;
-  float scale_b;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
-  }
-
-  template <typename Device,
-            typename X,
-            typename Out,
-            typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto a = static_cast<T>(scale_a);
-    auto b = static_cast<T>(scale_b);
-    auto temp = (a * x).tanh() * (a * x).tanh();
-    dx.device(d) = dout * a * b * (static_cast<T>(1) - temp);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct Tangent {
  HOSTDEVICE T operator()(const T& val) const { return tan(val); }
@@ -284,132 +157,6 @@ struct TanGradFunctor : public BaseActivationFunctor<T> {
  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };

-// square(x) = x^2
-template <typename T>
-struct SquareFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.square();
-  }
-};
-
-template <typename T>
-struct SquareGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device,
-            typename X,
-            typename Out,
-            typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * static_cast<T>(2) * x;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-// sqrt(x) = x^(1/2)
-template <typename T>
-struct SqrtFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.sqrt();
-  }
-};
-
-template <typename T>
-struct SqrtGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device,
-            typename X,
-            typename Out,
-            typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = static_cast<T>(0.5) * dout / out;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-// rsqrt(x) = x^(-1/2)
-template <typename T>
-struct RsqrtFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.rsqrt();
-  }
-};
-
-template <typename T>
-struct RsqrtGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device,
-            typename X,
-            typename Out,
-            typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = static_cast<T>(-0.5) * dout * out * out * out;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-// // For numerical stability, using the following formula instead of
-// softplus(x) =
-// // log(1 + exp(x))
-// // softplus(x) = log(1 + exp(beta * x)) / beta when beta * x <=
-// threshold(beta =
-// // 1, threshold = 20 by default), otherwise x
-
-template <typename T>
-struct SoftplusFunctor : public BaseActivationFunctor<T> {
-  float beta;
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}, {"threshold", &threshold}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto x_beta = static_cast<T>(beta) * x;
-    out.device(d) = (x_beta > static_cast<T>(threshold))
-                        .select(x,
-                                (static_cast<T>(1) + x_beta.exp()).log() /
-                                    static_cast<T>(beta));
-  }
-};
-
-// For numerical stability, using the following formula instead of
-// d(softplus(x))/dx = 1 / (1 + exp(-x))
-// d(softplus(x))/dx = 1 / (1 + exp(-beta * x)) when beta * x <= threshold(beta
-// = 1, threshold = 20 by default), otherwise x
-
-template <typename T>
-struct SoftplusGradFunctor : public BaseActivationFunctor<T> {
-  float beta;
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}, {"threshold", &threshold}};
-  }
-  template <typename Device,
-            typename X,
-            typename Out,
-            typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto x_beta = static_cast<T>(beta) * x;
-    dx.device(d) =
-        (x_beta > static_cast<T>(threshold))
-            .select(dout, dout / (static_cast<T>(1) + (-x_beta).exp()));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 // Tangent(x) = tan(x)
 template <typename T>
 struct TanFunctor : public BaseActivationFunctor<T> {
@@ -601,18 +348,6 @@ struct AtanGradFunctor : public BaseActivationFunctor<T> {
  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };

-template <typename T>
-struct LogitGradFunctor {
-  template <typename Device, typename X, typename dOut, typename dX, typename P>
-  void operator()(Device d, X x, dOut dout, dX dx, P p, float eps) const {
-    // logit(x)' = 1/(x*(1-x))
-    dx.device(d) =
-        (x < static_cast<T>(eps) || x > static_cast<T>(1.0 - eps))
-            .select(p.constant(static_cast<T>(0)),
-                    dout * (static_cast<T>(1) / ((static_cast<T>(1) - x) * x)));
-  }
-};
-
 template <typename T>
 struct Acosh {
  HOSTDEVICE T operator()(const T& val) const { return acosh(val); }
@@ -723,57 +458,6 @@ struct AtanhGradFunctor : public BaseActivationFunctor<T> {
  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };

-// exp functor
-// exp(x) = e^x
-template <typename T>
-struct ExpFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.exp();
-  }
-};
-
-template <typename T>
-struct ExpGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device,
-            typename X,
-            typename Out,
-            typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * out;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-// expm1(x) = e^x - 1
-template <typename T>
-struct Expm1Functor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.expm1();
-  }
-};
-
-template <typename T>
-struct Expm1GradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device,
-            typename X,
-            typename Out,
-            typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * out + dout;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
 // relu(x) = max(x, 0)
 template <typename T>
 struct ReluCPUFunctor : public BaseActivationFunctor<T> {
@@ -1876,90 +1560,6 @@ struct CudaCosGradFunctor : public BaseActivationFunctor<T> {
  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };

-template <typename T>
-struct CudaExpFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-
-  // exp(x) = exp(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(exp(x));
-  }
-};
-
-template <typename T>
-struct CudaSquareFunctor : public BaseActivationFunctor<T> {
-  // square(x) = x * x
-  __device__ __forceinline__ T operator()(const T x) const { return x * x; }
-};
-
-template <typename T>
-struct CudaSquareGradFunctor : public BaseActivationFunctor<T> {
-  T two = static_cast<T>(2.0f);
-
-  // dx = dout * 2 * x
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return dout * two * x;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaExpGradFunctor : public BaseActivationFunctor<T> {
-  // dx = dout * out
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return dout * out;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-template <typename T>
-struct CudaReciprocalFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
-
-  // reciprocal(x) = 1 / x
-  __device__ __forceinline__ T operator()(const T x) const { return one / x; }
-};
-
-template <typename T>
-struct CudaReciprocalGradFunctor : public BaseActivationFunctor<T> {
-  // dx = -dout * out^2
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return -dout * out * out;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-template <typename T>
-struct CudaExpm1Functor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-
-  // expm1(x) = expm1(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(expm1(x));
-  }
-};
-
-template <typename T>
-struct CudaExpm1GradFunctor : public BaseActivationFunctor<T> {
-  // dx = dout * out
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return dout * out + dout;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
 template <typename T>
 struct CudaSinFunctor : public BaseActivationFunctor<T> {
  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
@@ -2182,96 +1782,6 @@ struct CudaAtanhFunctor : public BaseActivationFunctor<T> {
  }
 };

-template <typename T>
-struct CudaSTanhFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-  float scale_a;
-  float scale_b;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
-  }
-
-  // stanh(x) = b * tanh(a * x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    MPType a = static_cast<MPType>(scale_a);
-    MPType b = static_cast<MPType>(scale_b);
-    return static_cast<T>(b * tanh(a * x));
-  }
-};
-
-template <typename T>
-struct CudaSTanhGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-  float scale_a;
-  float scale_b;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
-  }
-
-  // dx = dout * a * b * (1 - tanh(a * x) * tanh(a * x))
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    MPType a = static_cast<MPType>(scale_a);
-    MPType b = static_cast<MPType>(scale_b);
-    MPType temp = tanh(a * x);
-    return static_cast<T>(dout * a * b * (one - temp * temp));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaSoftplusFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-  float beta;
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}, {"threshold", &threshold}};
-  }
-
-  // softplus(x) = beta * x > threshold ? x : log(1 + exp(beta * x)) / beta
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    MPType b = static_cast<MPType>(beta);
-    MPType t = static_cast<MPType>(threshold);
-    MPType x_beta = x * beta;
-    return static_cast<T>(x_beta > t ? x : log(one + exp(x_beta)) / b);
-  }
-};
-
-template <typename T>
-struct CudaSoftplusGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-  float beta;
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}, {"threshold", &threshold}};
-  }
-
-  // dx = x * beta > threshold ? dout : dout / (1 + exp(-beta * x))
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    MPType b = static_cast<MPType>(beta);
-    MPType t = static_cast<MPType>(threshold);
-    MPType x_beta = x * beta;
-    return x_beta > t ? arg_dout : static_cast<T>(dout / (one + exp(-x_beta)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CudaAtanhGradFunctor : public BaseActivationFunctor<T> {
  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
@@ -2287,56 +1797,6 @@ struct CudaAtanhGradFunctor : public BaseActivationFunctor<T> {
  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };

-template <typename T>
-struct CudaSqrtFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-
-  // sqrt(x) = sqrt(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(sqrt(x));
-  }
-};
-
-template <typename T>
-struct CudaSqrtGradFunctor : public BaseActivationFunctor<T> {
-  T one_half = static_cast<T>(0.5f);
-
-  // dx = dout * 0.5 / out
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return one_half * dout / out;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-template <typename T>
-struct CudaRsqrtFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-
-  // rsqrt(x) = rsqrt(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(rsqrt(x));
-  }
-};
-
-template <typename T>
-struct CudaRsqrtGradFunctor : public BaseActivationFunctor<T> {
-  T minus_one_half = static_cast<T>(-0.5f);
-
-  // dx = -0.5 * dout * out^3
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return minus_one_half * dout * out * out * out;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
 template <typename T>
 struct CudaAtanFunctor : public BaseActivationFunctor<T> {
  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
@@ -2404,55 +1864,6 @@ struct CudaBReluFunctor : public BaseActivationFunctor<T> {
  }
 };

-template <typename T>
-struct CudaMishFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  // mish(x) = x * tanh(softplus(x))
-  // softplus(x) = x, if x > threshold
-  //             = ln(1 + exp(x)), otherwise
-  // Inputs: args[0], the input x
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    MPType sp = (x > static_cast<MPType>(threshold)) ? x : log(one + exp(x));
-    return static_cast<T>(x * tanh(sp));
-  }
-};
-
-template <typename T>
-struct CudaMishGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  // dx = dout * (tanh(sp) + x * (1 - tanh(sp) ** 2) * (1 - exp(-sp)))
-  // sp = softplus(x)
-  // Inputs: args[0], the input dout
-  //         args[1], the input x
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    MPType sp = (x > static_cast<MPType>(threshold)) ? x : log(one + exp(x));
-    MPType gsp =
-        (x > static_cast<MPType>(threshold)) ? one : one / (one + exp(-x));
-    MPType tsp = tanh(sp);
-    return static_cast<T>(dout * (tsp + x * (one - tsp * tsp) * gsp));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CudaBReluGradFunctor : public BaseActivationFunctor<T> {
  T zero = static_cast<T>(0.0f);

--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -189,13 +189,6 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Square, CudaSquareGradFunctor);
-
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, CudaExpGradFunctor);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, CudaExpm1GradFunctor);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, CudaReciprocalGradFunctor);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, CudaSqrtGradFunctor);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, CudaRsqrtGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, CudaLogGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, CudaLog2GradFunctor);
@@ -218,24 +211,11 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish,
                                               CudaSwishGradFunctor,
                                               beta);

-DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
-                                               CudaMishGradFunctor,
-                                               threshold);
-
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
                                               CudaBReluGradFunctor,
                                               t_min,
                                               t_max);

-DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh,
-                                               CudaSTanhGradFunctor,
-                                               scale_a,
-                                               scale_b);
-
-DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus,
-                                               CudaSoftplusGradFunctor,
-                                               beta,
-                                               threshold);
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid,
                                                 CudaHardSigmoidGradFunctor,
                                                 slope,
@@ -346,57 +326,12 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad,
                                   LeakyReluDoubleGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
                                   ThresholdedReluGradKernel)
-PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel)
-PD_REGISTER_ACTIVATION_GRAD_KERNEL(stanh_grad, STanhGradKernel)
-PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel)
-PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_grad, SoftplusGradKernel)
-PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel)
-PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel)
-
-PD_REGISTER_KERNEL(exp_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::ExpGradKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   phi::dtype::float16) {}
-
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
-
-PD_REGISTER_KERNEL(expm1_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::Expm1GradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-
-PD_REGISTER_KERNEL(logit_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::LogitGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-
-PD_REGISTER_KERNEL(square_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::SquareGradKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
-
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)

--- a/paddle/phi/kernels/gpu/activation_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -19,7 +19,6 @@ limitations under the License. */
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
-#include "paddle/phi/kernels/impl/activation_grad_impl.h"
 #include "paddle/phi/kernels/impl/activation_impl.h"

 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
@@ -92,12 +91,6 @@ DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor)
-DEFINE_GPU_ACTIVATION_KERNEL(Exp, CudaExpFunctor)
-DEFINE_GPU_ACTIVATION_KERNEL(Expm1, CudaExpm1Functor)
-DEFINE_GPU_ACTIVATION_KERNEL(Reciprocal, CudaReciprocalFunctor)
-DEFINE_GPU_ACTIVATION_KERNEL(Square, CudaSquareFunctor)
-DEFINE_GPU_ACTIVATION_KERNEL(Sqrt, CudaSqrtFunctor)
-DEFINE_GPU_ACTIVATION_KERNEL(Rsqrt, CudaRsqrtFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Log, CudaLogFunctor)
@@ -119,14 +112,7 @@ DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Swish, CudaSwishFunctor, beta)

-DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, CudaMishFunctor, threshold)
-
 DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max)
-DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Stanh, CudaSTanhFunctor, scale_a, scale_b)
-DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus,
-                                     CudaSoftplusFunctor,
-                                     beta,
-                                     threshold)
 DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
                                     CudaHardSigmoidFunctor,
                                     slope,
@@ -194,46 +180,6 @@ PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
 PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
-PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
-PD_REGISTER_ACTIVATION_KERNEL(stanh, StanhKernel)
-PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel)
-PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel)
-PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel)
-PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel)
-
-PD_REGISTER_KERNEL(exp,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::ExpKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(expm1,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::Expm1Kernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(logit,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::LogitKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(square,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::SquareKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
 PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)

--- a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/common/layout.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/temporal_shift_grad_kernel.h"
-
-namespace phi {
-
-template <typename T>
-__global__ void KeTemporalShiftBwNCHW(const T* output_grad,
-                                      T* input_grad,
-                                      const int ntchw,
-                                      const int tchw,
-                                      const int chw,
-                                      const int hw,
-                                      const int t,
-                                      const int c1,
-                                      const int c2) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  int src_it = 0;
-
-  for (; tid < ntchw; tid += stride) {
-    int it = (tid % tchw) / chw;
-    int ic = (tid % chw) / hw;
-
-    if (ic < c1) {
-      src_it = it + 1;
-    } else if (ic < c2) {
-      src_it = it - 1;
-    } else {
-      src_it = it;
-    }
-
-    if (src_it >= 0 && src_it < t) {
-      input_grad[tid] = output_grad[tid + (src_it - it) * chw];
-    } else {
-      input_grad[tid] = 0;
-    }
-  }
-}
-
-template <typename T>
-__global__ void KeTemporalShiftBwNHWC(const T* output_grad,
-                                      T* input_grad,
-                                      const int nthwc,
-                                      const int thwc,
-                                      const int hwc,
-                                      const int t,
-                                      const int c,
-                                      const int c1,
-                                      const int c2) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  int src_it = 0;
-
-  for (; tid < nthwc; tid += stride) {
-    int it = (tid % thwc) / hwc;
-    int ic = tid % c;
-
-    if (ic < c1) {
-      src_it = it + 1;
-    } else if (ic < c2) {
-      src_it = it - 1;
-    } else {
-      src_it = it;
-    }
-
-    if (src_it >= 0 && src_it < t) {
-      input_grad[tid] = output_grad[tid + (src_it - it) * hwc];
-    } else {
-      input_grad[tid] = 0;
-    }
-  }
-}
-
-template <typename T, typename Context>
-void TemporalShiftGradKernel(const Context& dev_ctx,
-                             const DenseTensor& out_grad,
-                             int seg_num,
-                             float shift_ratio,
-                             const std::string& data_format_str,
-                             DenseTensor* x_grad) {
-  auto* input_grad = x_grad;
-  auto* output_grad = &out_grad;
-  int t = seg_num;
-  const DataLayout data_layout =
-      paddle::framework::StringToDataLayout(data_format_str);
-
-  const int nt = output_grad->dims()[0];
-  const int c = (data_layout == DataLayout::kNCHW ? output_grad->dims()[1]
-                                                  : output_grad->dims()[3]);
-  const int h = (data_layout == DataLayout::kNCHW ? output_grad->dims()[2]
-                                                  : output_grad->dims()[1]);
-  const int w = (data_layout == DataLayout::kNCHW ? output_grad->dims()[3]
-                                                  : output_grad->dims()[2]);
-
-  const int hw = h * w;
-  const int chw = c * hw;
-  const int tchw = t * chw;
-  const int ntchw = nt * chw;
-
-  const int c1 = static_cast<int>(c * shift_ratio);
-  const int c2 = static_cast<int>(c * 2 * shift_ratio);
-
-  DDim in_grad_dims =
-      (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
-                                        : phi::make_ddim({nt, h, w, c}));
-  const T* output_grad_data = output_grad->data<T>();
-  T* input_grad_data =
-      input_grad->mutable_data<T>(in_grad_dims, dev_ctx.GetPlace());
-
-  int pixelNum = nt * chw;
-  int threads = 1024;
-  int grid = (pixelNum + threads - 1) / threads;
-  int blocks_per_sm = dev_ctx.GetMaxPhysicalThreadCount() / threads;
-  grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid);
-
-  if (data_layout == DataLayout::kNCHW) {
-    KeTemporalShiftBwNCHW<T><<<grid, threads, 0, dev_ctx.stream()>>>(
-        output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1, c2);
-  } else {
-    KeTemporalShiftBwNHWC<T><<<grid, threads, 0, dev_ctx.stream()>>>(
-        output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1, c2);
-  }
-}
-
-}  // namespace phi
-
-PD_REGISTER_KERNEL(temporal_shift_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::TemporalShiftGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
--- a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu
+++ b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/common/layout.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/temporal_shift_kernel.h"
-
-namespace phi {
-
-template <typename T>
-__global__ void KeTemporalShiftFwNCHW(const T* input,
-                                      T* output,
-                                      const int ntchw,
-                                      const int tchw,
-                                      const int chw,
-                                      const int hw,
-                                      const int t,
-                                      const int c1,
-                                      const int c2) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  int src_it = 0;
-
-  for (; tid < ntchw; tid += stride) {
-    int it = (tid % tchw) / chw;
-    int ic = (tid % chw) / hw;
-
-    if (ic < c1) {
-      src_it = it - 1;
-    } else if (ic < c2) {
-      src_it = it + 1;
-    } else {
-      src_it = it;
-    }
-
-    if (src_it < 0 || src_it >= t) {
-      output[tid] = 0;
-    } else {
-      output[tid] = input[tid + (src_it - it) * chw];
-    }
-  }
-}
-
-template <typename T>
-__global__ void KeTemporalShiftFwNHWC(const T* input,
-                                      T* output,
-                                      const int nthwc,
-                                      const int thwc,
-                                      const int hwc,
-                                      const int t,
-                                      const int c,
-                                      const int c1,
-                                      const int c2) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  int src_it = 0;
-
-  for (; tid < nthwc; tid += stride) {
-    int it = (tid % thwc) / hwc;
-    int ic = tid % c;
-
-    if (ic < c1) {
-      src_it = it - 1;
-    } else if (ic < c2) {
-      src_it = it + 1;
-    } else {
-      src_it = it;
-    }
-
-    if (src_it < 0 || src_it >= t) {
-      output[tid] = 0;
-    } else {
-      output[tid] = input[tid + (src_it - it) * hwc];
-    }
-  }
-}
-
-template <typename T, typename Context>
-void TemporalShiftKernel(const Context& dev_ctx,
-                         const DenseTensor& x,
-                         int seg_num,
-                         float shift_ratio,
-                         const std::string& data_format_str,
-                         DenseTensor* out) {
-  auto* input = &x;
-  auto* output = out;
-  int t = seg_num;
-  const DataLayout data_layout =
-      paddle::framework::StringToDataLayout(data_format_str);
-
-  const int nt = input->dims()[0];
-  const int c =
-      (data_layout == DataLayout::kNCHW ? input->dims()[1] : input->dims()[3]);
-  const int h =
-      (data_layout == DataLayout::kNCHW ? input->dims()[2] : input->dims()[1]);
-  const int w =
-      (data_layout == DataLayout::kNCHW ? input->dims()[3] : input->dims()[2]);
-
-  const int hw = h * w;
-  const int chw = c * hw;
-  const int tchw = t * chw;
-  const int ntchw = nt * chw;
-
-  const int c1 = static_cast<int>(c * shift_ratio);
-  const int c2 = static_cast<int>(c * 2 * shift_ratio);
-
-  DDim out_dims =
-      (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
-                                        : phi::make_ddim({nt, h, w, c}));
-  const T* input_data = input->data<T>();
-  T* output_data = output->mutable_data<T>(out_dims, dev_ctx.GetPlace());
-
-  int pixelNum = nt * chw;
-  int threads = 1024;
-  int grid = (pixelNum + threads - 1) / threads;
-  int blocks_per_sm = dev_ctx.GetMaxPhysicalThreadCount() / threads;
-  grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid);
-
-  if (data_layout == DataLayout::kNCHW) {
-    KeTemporalShiftFwNCHW<T><<<grid, threads, 0, dev_ctx.stream()>>>(
-        input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2);
-  } else {
-    KeTemporalShiftFwNHWC<T><<<grid, threads, 0, dev_ctx.stream()>>>(
-        input_data, output_data, ntchw, tchw, chw, t, c, c1, c2);
-  }
-}
-
-}  // namespace phi
-
-PD_REGISTER_KERNEL(temporal_shift,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::TemporalShiftKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
--- a/paddle/phi/kernels/impl/activation_grad_impl.h
+++ b/paddle/phi/kernels/impl/activation_grad_impl.h
@@ -222,24 +222,6 @@ void EluDoubleGradKernel(const Context& dev_ctx,
  functor(dev_ctx, &x, &ddx, ddout, &dout, dx);
 }

-template <typename T, typename Context>
-void LogitGradKernel(const Context& dev_ctx,
-                     const DenseTensor& x,
-                     const DenseTensor& out_grad,
-                     float eps,
-                     DenseTensor* x_grad) {
-  dev_ctx.template Alloc<T>(x_grad);
-
-  auto eigen_x = EigenVector<T>::Flatten(x);
-  auto eigen_dout = EigenVector<T>::Flatten(out_grad);
-  auto eigen_dx = EigenVector<T>::Flatten(*x_grad);
-  auto& place = *dev_ctx.eigen_device();
-  auto eigen_p = EigenVector<T>::Flatten(x);
-
-  funcs::LogitGradFunctor<T> functor;
-  functor(place, eigen_x, eigen_dout, eigen_dx, eigen_p, eps);
-}
-
 template <typename T, typename Context>
 void SigmoidDoubleGradKernel(const Context& dev_ctx,
                             const DenseTensor& out,

--- a/paddle/phi/kernels/impl/activation_impl.h
+++ b/paddle/phi/kernels/impl/activation_impl.h
@@ -47,22 +47,6 @@ void ActivationImpl(const Context& dev_ctx,
  }
 }

-template <typename T, typename Context>
-void LogitKernel(const Context& dev_ctx,
-                 const DenseTensor& x,
-                 float eps,
-                 DenseTensor* out) {
-  dev_ctx.template Alloc<T>(out);
-
-  auto eigen_out = EigenVector<T>::Flatten(*out);
-  auto eigen_in = EigenVector<T>::Flatten(x);
-  auto& place = *dev_ctx.eigen_device();
-  auto eigen_p = EigenVector<T>::Flatten(*out);
-
-  funcs::LogitFunctor<T> functor;
-  functor(place, eigen_in, eigen_out, eigen_p, eps);
-}
-
 template <typename T, typename Context>
 void PowKernel(const Context& dev_ctx,
               const DenseTensor& x,

--- a/paddle/phi/kernels/selected_rows/activation_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/activation_kernel.cc
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/kernels/selected_rows/activation_kernel.h"
-
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/activation_kernel.h"
-
-#include "paddle/phi/backends/cpu/cpu_context.h"
-
-#include "paddle/phi/backends/gpu/gpu_context.h"
-
-namespace phi {
-namespace sr {
-
-template <typename T, typename Context>
-void SquareKernel(const Context& dev_ctx,
-                  const SelectedRows& x,
-                  SelectedRows* out) {
-  out->set_rows(x.rows());
-  out->set_height(x.height());
-  phi::SquareKernel<T, Context>(dev_ctx, x.value(), out->mutable_value());
-}
-
-template <typename T, typename Context>
-void SqrtKernel(const Context& dev_ctx,
-                const SelectedRows& x,
-                SelectedRows* out) {
-  out->set_rows(x.rows());
-  out->set_height(x.height());
-  phi::SqrtKernel<T, Context>(dev_ctx, x.value(), out->mutable_value());
-}
-
-}  // namespace sr
-}  // namespace phi
-
-PD_REGISTER_KERNEL(
-    square_sr, CPU, ALL_LAYOUT, phi::sr::SquareKernel, float, double) {}
-
-PD_REGISTER_KERNEL(
-    sqrt_sr, CPU, ALL_LAYOUT, phi::sr::SqrtKernel, float, double) {}
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-
-PD_REGISTER_KERNEL(square_sr,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::sr::SquareKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t) {}
-
-PD_REGISTER_KERNEL(
-    sqrt_sr, GPU, ALL_LAYOUT, phi::sr::SqrtKernel, float, double) {}
-
-#endif
--- a/paddle/phi/kernels/selected_rows/activation_kernel.h
+++ b/paddle/phi/kernels/selected_rows/activation_kernel.h
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/selected_rows.h"
-
-namespace phi {
-namespace sr {
-
-template <typename T, typename Context>
-void SquareKernel(const Context& dev_ctx,
-                  const SelectedRows& x,
-                  SelectedRows* out);
-
-template <typename T, typename Context>
-void SqrtKernel(const Context& dev_ctx,
-                const SelectedRows& x,
-                SelectedRows* out);
-
-}  // namespace sr
-}  // namespace phi
--- a/paddle/phi/kernels/temporal_shift_grad_kernel.h
+++ b/paddle/phi/kernels/temporal_shift_grad_kernel.h
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/core/dense_tensor.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void TemporalShiftGradKernel(const Context& ctx,
-                             const DenseTensor& out_grad,
-                             int seg_num,
-                             float shift_ratio,
-                             const std::string& data_format,
-                             DenseTensor* x_grad);
-
-}  // namespace phi
--- a/paddle/phi/kernels/temporal_shift_kernel.h
+++ b/paddle/phi/kernels/temporal_shift_kernel.h
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/core/dense_tensor.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void TemporalShiftKernel(const Context& ctx,
-                         const DenseTensor& x,
-                         int seg_num,
-                         float shift_ratio,
-                         const std::string& data_format,
-                         DenseTensor* out);
-
-}  // namespace phi
--- a/paddle/phi/ops/compat/activation_sig.cc
+++ b/paddle/phi/ops/compat/activation_sig.cc
@@ -43,19 +43,17 @@ namespace phi {

 #define comma ,

-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", );        // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", );        // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", );      // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", );        // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", );      // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", );      // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", );      // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", );      // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", );    // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", );    // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", );    // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Square, "square", );  // NOLINT
-
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", );      // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", );      // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", );      // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", );  // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", );  // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", );  // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(BRelu, "brelu", "t_min" comma "t_max");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LeakyRelu, "leaky_relu", "alpha");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu,
@@ -63,7 +61,6 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu,
                               "threshold");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold");
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Mish, "mish", "threshold");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", );  // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", );               // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", );   // NOLINT
@@ -77,41 +74,12 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardSwish,
                               "offset");                // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Swish, "swish", "beta");  // NOLINT

-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(STanh,
-                               "stanh",
-                               "scale_a" comma "scale_b");  // NOLINT
-
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Softplus,
-                               "softplus",
-                               "beta" comma "threshold");  // NOLINT
-
-DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", );              // NOLINT
-DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", );              // NOLINT
-DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", );        // NOLINT
-DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Exp, "exp", );                // NOLINT
-DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Expm1, "expm1", );            // NOLINT
-DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Reciprocal, "reciprocal", );  // NOLINT
-DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sqrt, "sqrt", );              // NOLINT
-DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Rsqrt, "rsqrt", );            // NOLINT
-
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", );        // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", );        // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", );  // NOLINT
 DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid,
                                 "hard_sigmoid",
                                 "slope" comma "offset");  // NOLINT
-KernelSignature SqrtActiOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  if (ctx.IsDenseTensorInput("X")) {
-    return KernelSignature("sqrt", {"X"}, {}, {"Out"});
-  } else {
-    return KernelSignature("sqrt_sr", {"X"}, {}, {"Out"});
-  }
-}
-
-KernelSignature SquareActiOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  if (ctx.IsDenseTensorInput("X")) {
-    return KernelSignature("square", {"X"}, {}, {"Out"});
-  } else {
-    return KernelSignature("square_sr", {"X"}, {}, {"Out"});
-  }
-}

 DEFINE_ACT_GRAD_NODEP_OP_ARGMAP(Round, "round", );  // NOLINT
 DEFINE_ACT_GRAD_NODEP_OP_ARGMAP(Floor, "floor", );  // NOLINT
@@ -164,11 +132,6 @@ KernelSignature EluOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("elu", {"X"}, {"alpha"}, {"Out"});
 }

-KernelSignature LogitGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature(
-      "logit_grad", {"X", GradVarName("Out")}, {"eps"}, {GradVarName("X")});
-}
-
 KernelSignature EluGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("elu_grad",
                         {"X", "Out", GradVarName("Out")},
@@ -231,18 +194,6 @@ PD_REGISTER_ARG_MAPPING_FN(asinh_grad, phi::AsinhGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(acosh_grad, phi::AcoshGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping);
-
-PD_REGISTER_ARG_MAPPING_FN(exp_grad, phi::ExpGradOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(expm1_grad, phi::Expm1GradOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(square_grad, phi::SquareGradOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(reciprocal_grad,
-                           phi::ReciprocalGradOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(sqrt_grad, phi::SqrtGradOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(rsqrt_grad, phi::RsqrtGradOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(mish_grad, phi::MishGradOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(stanh_grad, phi::STanhGradOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(softplus_grad, phi::SoftplusGradOpArgumentMapping);
-
 PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad,
                           phi::ReluDoubleGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(tanh_grad, phi::TanhGradOpArgumentMapping);
@@ -277,16 +228,11 @@ PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad,
                           phi::LogSigmoidGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad,
                           phi::HardSigmoidGradOpArgumentMapping);
-
-PD_REGISTER_ARG_MAPPING_FN(logit_grad, phi::LogitGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(log_grad, phi::LogGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(log_grad_grad, phi::LogDoubleGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(log2_grad, phi::Log2GradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(log10_grad, phi::Log10GradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(log1p_grad, phi::Log1pGradOpArgumentMapping);
-
-PD_REGISTER_ARG_MAPPING_FN(sqrt, phi::SqrtActiOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(square, phi::SquareActiOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(hard_swish_grad,
                           phi::HardSwishGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(swish_grad, phi::SwishGradOpArgumentMapping);

--- a/paddle/phi/ops/compat/temporal_shift_sig.cc
+++ b/paddle/phi/ops/compat/temporal_shift_sig.cc
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature TemporalShiftOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
-  return KernelSignature("temporal_shift",
-                         {"X"},
-                         {"seg_num", "shift_ratio", "data_format"},
-                         {"Out"});
-}
-
-KernelSignature TemporalShiftGradOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
-  return KernelSignature("temporal_shift_grad",
-                         {GradVarName("Out")},
-                         {"seg_num", "shift_ratio", "data_format"},
-                         {GradVarName("X")});
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(temporal_shift, phi::TemporalShiftOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(temporal_shift_grad,
-                           phi::TemporalShiftGradOpArgumentMapping);
--- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
@@ -342,5 +342,4 @@ class TestLogDoubleGradCheck(unittest.TestCase):


 if __name__ == "__main__":
-    paddle.enable_static()
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_activation_sparse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_sparse_op.py
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from op_test import OpTest
-import paddle
-
-
-class TestSparseSquareOp(unittest.TestCase):
-    def check_with_place(self, place):
-        scope = core.Scope()
-
-        # create and initialize Grad Variable   
-        height = 10
-        rows = [0, 4, 7]
-        self.row_numel = 12
-
-        x_selected_rows = scope.var('X').get_selected_rows()
-        x_selected_rows.set_height(height)
-        x_selected_rows.set_rows(rows)
-        np_array = np.ones((len(rows), self.row_numel)).astype("float32")
-        np_array[0, 0] = 2.0
-        np_array[2, 8] = 4.0
-
-        x_tensor = x_selected_rows.get_tensor()
-        x_tensor.set(np_array, place)
-
-        out_selected_rows = scope.var('Out').get_selected_rows()
-        # create and run sqrt operator
-        square_op = Operator("square", X='X', Out='Out')
-        square_op.run(scope, place)
-
-        # get and compare result
-        result_array = np.array(out_selected_rows.get_tensor())
-
-        self.assertTrue(np.array_equal(result_array, np.square(np_array)))
-
-    def test_sparse_acti(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            self.check_with_place(place)
-
-
-class TestSparseSqrtOp(unittest.TestCase):
-    def check_with_place(self, place):
-        scope = core.Scope()
-
-        # create and initialize Grad Variable   
-        height = 10
-        rows = [0, 4, 7]
-        self.row_numel = 12
-
-        x_selected_rows = scope.var('X1').get_selected_rows()
-        x_selected_rows.set_height(height)
-        x_selected_rows.set_rows(rows)
-        np_array = np.ones((len(rows), self.row_numel)).astype("float32")
-        np_array[0, 0] = 2.0
-        np_array[2, 8] = 4.0
-
-        x_tensor = x_selected_rows.get_tensor()
-        x_tensor.set(np_array, place)
-
-        out_selected_rows = scope.var('Out1').get_selected_rows()
-        # create and run sqrt operator
-        sqrt_op = Operator("sqrt", X='X1', Out='Out1')
-        sqrt_op.run(scope, place)
-
-        # get and compare result
-        result_array = np.array(out_selected_rows.get_tensor())
-        self.assertTrue(np.allclose(result_array, np.sqrt(np_array)))
-
-    def test_sparse_acti(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            self.check_with_place(place)
-
-
-if __name__ == "__main__":
-    paddle.enable_static()
-    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
@@ -16,7 +16,6 @@ from __future__ import print_function

 import unittest
 import numpy as np
-import paddle
 from op_test import OpTest

 import paddle.fluid as fluid
@@ -154,5 +153,4 @@ class TestClipByNormOpWithSelectedRows(unittest.TestCase):


 if __name__ == '__main__':
-    paddle.enable_static()
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
+++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
@@ -143,5 +143,4 @@ class TestTemporalShiftAPI(unittest.TestCase):


 if __name__ == "__main__":
-    paddle.enable_static()
    unittest.main()