PaddlePaddle / Paddle, commit 05f3d48e (unverified)

Revert "Move some activation to phi (#40727)" (#41056)

This reverts commit e77a947e.

Authored on Mar 29, 2022 by tianshuo78520a; committed via GitHub on Mar 29, 2022.
Parent commit: 9c0eaada
Showing 29 changed files with 931 additions and 1,908 deletions.
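Note for readers skimming the diff: the fluid-side kernels restored by this revert wrap small element-wise functors (see the Cuda*Functor structs re-added in activation_op.kps below) and register them per dtype with REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL, replacing the PD_REGISTER_KERNEL registrations of the phi kernels that are removed. A minimal standalone sketch of that functor pattern, with hypothetical names and no Paddle dependencies, might look like this:

#include <cmath>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for ops::ExpFunctor<T>: one value in, one value out.
template <typename T>
struct ExpFunctor {
  T operator()(const T x) const { return static_cast<T>(std::exp(x)); }
};

// Hypothetical stand-in for ops::ActivationKernel: apply a functor element-wise.
template <typename T, typename Functor>
void ApplyActivation(const std::vector<T>& in, std::vector<T>* out) {
  Functor f;
  out->resize(in.size());
  for (size_t i = 0; i < in.size(); ++i) (*out)[i] = f(in[i]);
}

int main() {
  std::vector<float> x = {0.f, 1.f, 2.f}, y;
  ApplyActivation<float, ExpFunctor<float>>(x, &y);
  for (float v : y) std::printf("%f\n", v);  // prints 1, e, e^2
  return 0;
}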
Changed files:

paddle/fluid/framework/new_executor/standalone_executor_test.cc (+1 / -2)
paddle/fluid/operators/activation_op.cc (+77 / -9)
paddle/fluid/operators/activation_op.h (+399 / -85)
paddle/fluid/operators/activation_op.kps (+348 / -23)
paddle/fluid/operators/math/CMakeLists.txt (+2 / -2)
paddle/fluid/operators/math/selected_rows_functor.cc (+0 / -1)
paddle/fluid/operators/temporal_shift_op.h (+89 / -1)
paddle/phi/kernels/activation_kernel.h (+0 / -22)
paddle/phi/kernels/cpu/activation_grad_kernel.cc (+0 / -54)
paddle/phi/kernels/cpu/activation_kernel.cc (+1 / -31)
paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc (+0 / -136)
paddle/phi/kernels/cpu/temporal_shift_kernel.cc (+0 / -131)
paddle/phi/kernels/funcs/activation_functor.h (+0 / -589)
paddle/phi/kernels/gpu/activation_grad_kernel.cu (+0 / -65)
paddle/phi/kernels/gpu/activation_kernel.cu (+0 / -54)
paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu (+0 / -149)
paddle/phi/kernels/gpu/temporal_shift_kernel.cu (+0 / -148)
paddle/phi/kernels/impl/activation_grad_impl.h (+0 / -18)
paddle/phi/kernels/impl/activation_impl.h (+0 / -16)
paddle/phi/kernels/selected_rows/activation_kernel.cc (+0 / -68)
paddle/phi/kernels/selected_rows/activation_kernel.h (+0 / -34)
paddle/phi/kernels/temporal_shift_grad_kernel.h (+0 / -29)
paddle/phi/kernels/temporal_shift_kernel.h (+0 / -29)
paddle/phi/ops/compat/activation_sig.cc (+14 / -68)
paddle/phi/ops/compat/temporal_shift_sig.cc (+0 / -39)
python/paddle/fluid/tests/unittests/test_activation_nn_grad.py (+0 / -1)
python/paddle/fluid/tests/unittests/test_activation_sparse_op.py (+0 / -101)
python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py (+0 / -2)
python/paddle/fluid/tests/unittests/test_temporal_shift_op.py (+0 / -1)
paddle/fluid/framework/new_executor/standalone_executor_test.cc

...
@@ -53,7 +53,7 @@ USE_OP_ITSELF(tanh_grad);
USE_OP(sum);
USE_OP_ITSELF(slice_grad);
USE_OP_ITSELF(lookup_table_grad);
USE_OP_ITSELF(sqrt);
USE_OP(sqrt);
USE_OP_ITSELF(elementwise_max);
USE_OP_ITSELF(elementwise_div);
USE_OP_ITSELF(sgd);
...
@@ -83,7 +83,6 @@ PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sgd, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(slice, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(slice_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sqrt, GPU, ALL_LAYOUT);
DECLARE_double(eager_delete_tensor_gb);
...
paddle/fluid/operators/activation_op.cc

...
@@ -1496,14 +1496,6 @@ REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,
                        HardSigmoidGradFunctor);
REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor,
                       LogSigmoidGradFunctor);
REGISTER_ACTIVATION_OP(expm1, Expm1, Expm1Functor, Expm1GradFunctor);
REGISTER_ACTIVATION_OP(softplus, Softplus, SoftplusFunctor,
                       SoftplusGradFunctor);
REGISTER_ACTIVATION_OP(mish, Mish, MishFunctor, MishGradFunctor);
REGISTER_ACTIVATION_OP(stanh, STanh, STanhFunctor, STanhGradFunctor);
REGISTER_ACTIVATION_OP(reciprocal, Reciprocal, ReciprocalFunctor,
                       ReciprocalGradFunctor);
REGISTER_ACTIVATION_OP(log2, Log2, Log2Functor, Log2GradFunctor);
REGISTER_ACTIVATION_OP(log10, Log10, Log10Functor, Log10GradFunctor);
REGISTER_ACTIVATION_OP(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);
...
@@ -1638,7 +1630,12 @@ REGISTER_OPERATOR(logit, ops::LogitOp, ops::LogitOpMaker,
                  ops::LogitGradOpMaker<paddle::framework::OpDesc>,
                  ops::LogitGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(logit_grad, ops::LogitGradOp);
REGISTER_OP_CPU_KERNEL(
    logit, ops::LogitKernel<paddle::platform::CPUDeviceContext, float>,
    ops::LogitKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
    logit_grad,
    ops::LogitGradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::LogitGradKernel<paddle::platform::CPUDeviceContext, double>);
/* ========================================================================== */
/* ======================== celu register ============================
...
@@ -1687,6 +1684,7 @@ REGISTER_OPERATOR(
    ops::ActivationOpDoubleGrad<ops::SqrtGradGradFunctor<float>::FwdDeps()>,
    ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_ACTIVATION_CPU_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor);
REGISTER_OP_CPU_KERNEL(
    sqrt_grad_grad,
    ops::SqrtDoubleGradKernel<plat::CPUDeviceContext,
                              ops::SqrtGradGradFunctor<float>>,
...
@@ -1714,6 +1712,7 @@ REGISTER_OPERATOR(
    ops::ActivationOpDoubleGrad<ops::RsqrtGradGradFunctor<float>::FwdDeps()>,
    ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_ACTIVATION_CPU_KERNEL(rsqrt, Rsqrt, RsqrtFunctor, RsqrtGradFunctor);
REGISTER_OP_CPU_KERNEL(
    rsqrt_grad_grad,
    ops::RsqrtDoubleGradKernel<plat::CPUDeviceContext,
...
@@ -1742,6 +1741,25 @@ REGISTER_OPERATOR(
    ops::ActivationOpDoubleGrad<ops::SquareGradGradFunctor<float>::FwdDeps()>,
    ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(square,
    ops::ActivationKernel<paddle::platform::CPUDeviceContext,
                          ops::SquareFunctor<float>>,
    ops::ActivationKernel<paddle::platform::CPUDeviceContext,
                          ops::SquareFunctor<double>>,
    ops::ActivationKernel<paddle::platform::CPUDeviceContext,
                          ops::SquareFunctor<int>>,
    ops::ActivationKernel<paddle::platform::CPUDeviceContext,
                          ops::SquareFunctor<int64_t>>);
REGISTER_OP_CPU_KERNEL(square_grad,
    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                              ops::SquareGradFunctor<float>>,
    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                              ops::SquareGradFunctor<double>>,
    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                              ops::SquareGradFunctor<int>>,
    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                              ops::SquareGradFunctor<int64_t>>);
REGISTER_OP_CPU_KERNEL(
    square_grad_grad,
    ops::SquareDoubleGradKernel<plat::CPUDeviceContext,
...
@@ -1780,6 +1798,54 @@ REGISTER_OPERATOR(
REGISTER_OPERATOR(exp_grad, ops::ActivationOpGrad,
                  ops::ActivationGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(exp,
    ops::ActivationKernel<paddle::platform::CPUDeviceContext,
                          ops::ExpFunctor<float>>,
    ops::ActivationKernel<paddle::platform::CPUDeviceContext,
                          ops::ExpFunctor<double>>,
    ops::ActivationKernel<paddle::platform::CPUDeviceContext,
                          ops::ExpFunctor<int>>,
    ops::ActivationKernel<paddle::platform::CPUDeviceContext,
                          ops::ExpFunctor<int64_t>>);
REGISTER_OP_CPU_KERNEL(exp_grad,
    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                              ops::ExpGradFunctor<float>>,
    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                              ops::ExpGradFunctor<double>>,
    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                              ops::ExpGradFunctor<int>>,
    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                              ops::ExpGradFunctor<int64_t>>);
/* ========================================================================== */
/* ========================== expm1 register ============================ */
REGISTER_OPERATOR(
    expm1, ops::ActivationOp, ops::Expm1OpMaker, ops::ActivationOpInferVarType,
    ops::ActivationGradOpMaker<ops::Expm1GradFunctor<float>::FwdDeps(),
                               paddle::framework::OpDesc>,
    ops::ActivationGradOpMaker<ops::Expm1GradFunctor<float>::FwdDeps(),
                               paddle::imperative::OpBase>,
    std::conditional<ops::CanInplaceAct<ops::Expm1GradFunctor<float>>(),
                     ops::ActFwdInplaceInferer, void>::type);
REGISTER_OPERATOR(expm1_grad, ops::ActivationOpGrad,
                  ops::ActivationGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(expm1,
    ops::ActivationKernel<paddle::platform::CPUDeviceContext,
                          ops::Expm1Functor<float>>,
    ops::ActivationKernel<paddle::platform::CPUDeviceContext,
                          ops::Expm1Functor<double>>,
    ops::ActivationKernel<paddle::platform::CPUDeviceContext,
                          ops::Expm1Functor<plat::float16>>);
REGISTER_OP_CPU_KERNEL(expm1_grad,
    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                              ops::Expm1GradFunctor<float>>,
    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                              ops::Expm1GradFunctor<double>>,
    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                              ops::Expm1GradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================== Log register ==================================*/
REGISTER_OPERATOR(log, ops::ActivationOp, ops::LogOpMaker,
                  ops::ActivationOpInferVarType,
...
@@ -1798,6 +1864,8 @@ REGISTER_OPERATOR(
    ops::ActivationOpDoubleGrad<ops::LogGradGradFunctor<float>::FwdDeps()>,
    ops::ActivationDoubleGradOpInplaceInferer);
/* ========================================================================== */
/* ========================== register checkpoint ===========================*/
REGISTER_OP_VERSION(leaky_relu)
    .AddCheckpoint(
...
paddle/fluid/operators/activation_op.h

(This diff is collapsed.)
paddle/fluid/operators/activation_op.kps
...
...
@@ -20,6 +20,140 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
struct CudaReciprocalFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
// reciprocal(x) = 1 / x
__device__ __forceinline__ T operator()(const T x) const { return one / x; }
};
template <typename T>
struct CudaReciprocalGradFunctor : public BaseActivationFunctor<T> {
// dx = -dout * out^2
__device__ __forceinline__ T operator()(const T dout, const T out) const {
return -dout * out * out;
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaExpFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// exp(x) = exp(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(exp(x));
}
};
template <typename T>
struct CudaExpGradFunctor : public BaseActivationFunctor<T> {
// dx = dout * out
__device__ __forceinline__ T operator()(const T dout, const T out) const {
return dout * out;
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaExpm1Functor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// expm1(x) = expm1(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(expm1(x));
}
};
template <typename T>
struct CudaExpm1GradFunctor : public BaseActivationFunctor<T> {
// dx = dout * out
__device__ __forceinline__ T operator()(const T dout, const T out) const {
return dout * out + dout;
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaSquareFunctor : public BaseActivationFunctor<T> {
// square(x) = x * x
__device__ __forceinline__ T operator()(const T x) const { return x * x; }
};
template <typename T>
struct CudaSquareGradFunctor : public BaseActivationFunctor<T> {
T two = static_cast<T>(2.0f);
// dx = dout * 2 * x
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout * two * x;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaSqrtFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// sqrt(x) = sqrt(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(sqrt(x));
}
};
template <typename T>
struct CudaSqrtGradFunctor : public BaseActivationFunctor<T> {
T one_half = static_cast<T>(0.5f);
// dx = dout * 0.5 / out
__device__ __forceinline__ T operator()(const T dout, const T out) const {
return one_half * dout / out;
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaRsqrtFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// rsqrt(x) = rsqrt(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(rsqrt(x));
}
};
template <typename T>
struct CudaRsqrtGradFunctor : public BaseActivationFunctor<T> {
T minus_one_half = static_cast<T>(-0.5f);
// dx = -0.5 * dout * out^3
__device__ __forceinline__ T operator()(const T dout, const T out) const {
return minus_one_half * dout * out * out * out;
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaSoftReluFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
...
...
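A quick check (not part of the diff): the terse "dx = ..." comments on the grad functors re-added above follow from expressing each derivative through the forward output y, which is what a kDepOut functor receives:

exp:    y = e^x,        dL/dx = d_out * e^x        = d_out * y
expm1:  y = e^x - 1,    dL/dx = d_out * e^x        = d_out * y + d_out
recip:  y = 1/x,        dL/dx = -d_out / x^2       = -d_out * y^2
sqrt:   y = sqrt(x),    dL/dx = d_out / (2 sqrt x) = 0.5 * d_out / y
rsqrt:  y = x^{-1/2},   dL/dx = -0.5 d_out x^{-3/2} = -0.5 * d_out * y^3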
@@ -67,6 +201,119 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct CudaSTanhFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
float scale_a;
float scale_b;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
}
// stanh(x) = b * tanh(a * x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
MPType a = static_cast<MPType>(scale_a);
MPType b = static_cast<MPType>(scale_b);
return static_cast<T>(b * tanh(a * x));
}
};
template <typename T>
struct CudaSTanhGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
float scale_a;
float scale_b;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
}
// dx = dout * a * b * (1 - tanh(a * x) * tanh(a * x))
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
MPType a = static_cast<MPType>(scale_a);
MPType b = static_cast<MPType>(scale_b);
MPType temp = tanh(a * x);
return static_cast<T>(dout * a * b * (one - temp * temp));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaSoftplusFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
float beta;
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"beta", &beta}, {"threshold", &threshold}};
}
// softplus(x) = beta * x > threshold ? x : log(1 + exp(beta * x)) / beta
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
MPType b = static_cast<MPType>(beta);
MPType t = static_cast<MPType>(threshold);
MPType x_beta = x * beta;
return static_cast<T>(x_beta > t ? x : log(one + exp(x_beta)) / b);
}
};
template <typename T>
struct CudaSoftplusGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
float beta;
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"beta", &beta}, {"threshold", &threshold}};
}
// dx = x * beta > threshold ? dout : dout / (1 + exp(-beta * x))
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
MPType b = static_cast<MPType>(beta);
MPType t = static_cast<MPType>(threshold);
MPType x_beta = x * beta;
return x_beta > t ? arg_dout : static_cast<T>(dout / (one + exp(-x_beta)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaSoftsignFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
// softsign(x) = x / (1 + abs(x))
__device__ __forceinline__ T operator()(const T x) const {
return x / (one + abs(x));
}
};
template <typename T>
struct CudaSoftsignGradFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
// dx = dout / (1 + abs(x))^2
__device__ __forceinline__ T operator()(const T dout, const T x) const {
T temp = one + abs(x);
return dout / (temp * temp);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaRelu6Functor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
...
...
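For reference, the piecewise form implemented by CudaSoftplusFunctor / CudaSoftplusGradFunctor in the hunk above (with its beta and threshold attributes) can be written as

\mathrm{softplus}_{\beta}(x) = \begin{cases} x, & \beta x > \text{threshold} \\ \tfrac{1}{\beta}\ln\bigl(1 + e^{\beta x}\bigr), & \text{otherwise} \end{cases}
\qquad
\frac{d}{dx}\,\mathrm{softplus}_{\beta}(x) = \begin{cases} 1, & \beta x > \text{threshold} \\ \dfrac{1}{1 + e^{-\beta x}}, & \text{otherwise} \end{cases}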
@@ -104,23 +351,49 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
};
template <typename T>
struct CudaSoftsignFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
struct CudaMishFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
float threshold;
// softsign(x) = x / (1 + abs(x))
__device__ __forceinline__ T operator()(const T x) const {
return x / (one + abs(x));
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// mish(x) = x * tanh(softplus(x))
// softplus(x) = x, if x > threshold
// = ln(1 + exp(x)), otherwise
// Inputs: args[0], the input x
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
MPType sp = (x > static_cast<MPType>(threshold)) ? x : log(one + exp(x));
return static_cast<T>(x * tanh(sp));
}
};
template <typename T>
struct CudaSoftsignGradFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
struct CudaMishGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
float threshold;
// dx = dout / (1 + abs(x))^2
__device__ __forceinline__ T operator()(const T dout, const T x) const {
T temp = one + abs(x);
return dout / (temp * temp);
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// dx = dout * (tanh(sp) + x * (1 - tanh(sp) ** 2) * (1 - exp(-sp)))
// sp = softplus(x)
// Inputs: args[0], the input dout
// args[1], the input x
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
MPType sp = (x > static_cast<MPType>(threshold)) ? x : log(one + exp(x));
MPType gsp =
(x > static_cast<MPType>(threshold)) ? one : one / (one + exp(-x));
MPType tsp = tanh(sp);
return static_cast<T>(dout * (tsp + x * (one - tsp * tsp) * gsp));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
...
...
@@ -285,16 +558,6 @@ using CudaCeilFunctor = phi::funcs::CudaCeilFunctor<T>;
template <typename T>
using CudaZeroGradFunctor = phi::funcs::CudaZeroGradFunctor<T>;
USE_PHI_FUNCTOR(CudaExp)
USE_PHI_FUNCTOR(CudaExpm1)
USE_PHI_FUNCTOR(CudaMish)
USE_PHI_FUNCTOR(CudaSTanh)
USE_PHI_FUNCTOR(CudaReciprocal)
USE_PHI_FUNCTOR(CudaSquare)
USE_PHI_FUNCTOR(CudaSqrt)
USE_PHI_FUNCTOR(CudaRsqrt)
USE_PHI_FUNCTOR(CudaSoftplus)
template <typename T>
using CudaELUGradNegativeAlphaFunctor =
phi::funcs::CudaELUGradNegativeAlphaFunctor<T>;
...
...
@@ -373,6 +636,8 @@ REGISTER_OP_CUDA_KERNEL(
/* ========================================================================== */
/* =========================== sqrt register ============================= */
REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor,
CudaSqrtGradFunctor);
REGISTER_OP_CUDA_KERNEL(
sqrt_grad_grad,
...
...
@@ -388,6 +653,8 @@ REGISTER_OP_CUDA_KERNEL(
/* =========================== rsqrt register =============================
*/
REGISTER_ACTIVATION_CUDA_KERNEL(rsqrt, Rsqrt, CudaRsqrtFunctor,
CudaRsqrtGradFunctor);
REGISTER_OP_CUDA_KERNEL(
rsqrt_grad_grad,
...
...
@@ -400,6 +667,8 @@ REGISTER_OP_CUDA_KERNEL(
/* ========================================================================== */
/* =========================== square register ============================ */
REGISTER_ACTIVATION_CUDA_KERNEL_INT(square, Square, CudaSquareFunctor,
CudaSquareGradFunctor);
REGISTER_OP_CUDA_KERNEL(
square_grad_grad,
...
...
@@ -419,19 +688,75 @@ REGISTER_OP_CUDA_KERNEL(
/* ========================== logit register ============================ */
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
logit, ops::LogitKernel<paddle::platform::CUDADeviceContext, float>,
ops::LogitKernel<paddle::platform::CUDADeviceContext, double>,
ops::LogitKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>);
REGISTER_OP_CUDA_KERNEL(
logit_grad,
ops::LogitGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LogitGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::LogitGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>);
/* ========================================================================== */
/* ========================== exp register ============================ */
REGISTER_OP_CUDA_KERNEL(
exp, ops::ActivationCudaKernel<plat::CUDADeviceContext,
ops::CudaExpFunctor<float>>,
ops::ActivationCudaKernel<plat::CUDADeviceContext,
ops::CudaExpFunctor<double>>,
ops::ActivationKernel<plat::CUDADeviceContext, ops::ExpFunctor<int>>,
ops::ActivationKernel<plat::CUDADeviceContext, ops::ExpFunctor<int64_t>>,
ops::ActivationCudaKernel<plat::CUDADeviceContext,
ops::CudaExpFunctor<plat::float16>>);
REGISTER_OP_CUDA_KERNEL(
exp_grad, ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::CudaExpGradFunctor<float>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::CudaExpGradFunctor<double>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::CudaExpGradFunctor<int>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::CudaExpGradFunctor<int64_t>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::CudaExpGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================== expm1 register ============================ */
REGISTER_OP_CUDA_KERNEL(
expm1, ops::ActivationCudaKernel<plat::CUDADeviceContext,
ops::CudaExpm1Functor<float>>,
ops::ActivationCudaKernel<plat::CUDADeviceContext,
ops::CudaExpm1Functor<double>>,
ops::ActivationCudaKernel<plat::CUDADeviceContext,
ops::CudaExpm1Functor<plat::float16>>);
REGISTER_OP_CUDA_KERNEL(
expm1_grad, ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::CudaExpm1GradFunctor<float>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::CudaExpm1GradFunctor<double>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::CudaExpm1GradFunctor<plat::float16>>);
/* ========================================================================== */
#define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \
__macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \
CudaSoftShrinkGradFunctor); \
__macro(reciprocal, Reciprocal, CudaReciprocalFunctor, \
CudaReciprocalGradFunctor); \
__macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \
__macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor); \
__macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \
__macro(softsign, Softsign, CudaSoftsignFunctor, CudaSoftsignGradFunctor); \
__macro(relu6, Relu6, CudaRelu6Functor, CudaRelu6GradFunctor); \
__macro(softsign, Softsign, CudaSoftsignFunctor, CudaSoftsignGradFunctor);
__macro(tanh_shrink, TanhShrink, CudaTanhShrinkFunctor, \
CudaTanhShrinkGradFunctor); \
__macro(hard_shrink, HardShrink, CudaHardShrinkFunctor, \
CudaHardShrinkGradFunctor); \
__macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor);
FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL)
#ifdef PADDLE_WITH_XPU_KP
...
...
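The CudaMishFunctor / CudaMishGradFunctor pair restored above encodes mish(x) = x * tanh(softplus(x)); as a sanity check (an editorial note, not diff content), the gradient it computes is

\mathrm{mish}(x) = x\,\tanh(sp(x)), \qquad sp(x) = \begin{cases} x, & x > \text{threshold} \\ \ln(1 + e^{x}), & \text{otherwise} \end{cases}

\frac{d}{dx}\,\mathrm{mish}(x) = \tanh(sp(x)) + x\bigl(1 - \tanh^{2}(sp(x))\bigr)\,sp'(x), \qquad sp'(x) = \frac{1}{1 + e^{-x}} \;(\text{or } 1 \text{ above the threshold})

which matches the functor's return value dout * (tsp + x * (1 - tsp * tsp) * gsp).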
paddle/fluid/operators/math/CMakeLists.txt

...
@@ -22,9 +22,9 @@ math_library(sampler DEPS generator)
math_library(maxouting)
if (WITH_MKLDNN)
    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler
    mixed_vector)
    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler)
else()
    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas
    mixed_vector)
    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas)
endif()
math_library(sequence_padding)
...
paddle/fluid/operators/math/selected_rows_functor.cc
...
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#ifdef PADDLE_WITH_MKLDNN
...
...
paddle/fluid/operators/temporal_shift_op.h

...
@@ -19,6 +19,56 @@ namespace operators {
using Tensor = framework::Tensor;
using DataLayout = framework::DataLayout;

template <typename T>
void TemporalShiftFwNCHW(const T* input, T* output, const int ntchw,
                         const int tchw, const int chw, const int hw,
                         const int t, const int c1, const int c2) {
  int src_it = 0;
  for (int i = 0; i < ntchw; i++) {
    int it = (i % tchw) / chw;
    int ic = (i % chw) / hw;
    if (ic < c1) {
      src_it = it - 1;
    } else if (ic < c2) {
      src_it = it + 1;
    } else {
      src_it = it;
    }
    if (src_it < 0 || src_it >= t) {
      output[i] = 0;
    } else {
      output[i] = input[i + (src_it - it) * chw];
    }
  }
}

template <typename T>
void TemporalShiftFwNHWC(const T* input, T* output, const int nthwc,
                         const int thwc, const int hwc, const int t,
                         const int c, const int c1, const int c2) {
  int src_it = 0;
  for (int i = 0; i < nthwc; i++) {
    int it = (i % thwc) / hwc;
    int ic = i % c;
    if (ic < c1) {
      src_it = it - 1;
    } else if (ic < c2) {
      src_it = it + 1;
    } else {
      src_it = it;
    }
    if (src_it < 0 || src_it >= t) {
      output[i] = 0;
    } else {
      output[i] = input[i + (src_it - it) * hwc];
    }
  }
}

template <typename T>
void TemporalShiftBwNCHW(const T* output_grad, T* input_grad, const int ntchw,
                         const int tchw, const int chw, const int hw,
...
@@ -72,7 +122,45 @@ void TemporalShiftBwNHWC(const T* output_grad, T* input_grad, const int nthwc,
template <typename T>
class TemporalShiftKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {}
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<Tensor>("X");
    auto* output = ctx.Output<Tensor>("Out");
    int t = ctx.Attr<int>("seg_num");
    float shift_ratio = ctx.Attr<float>("shift_ratio");
    const std::string data_format_str = ctx.Attr<std::string>("data_format");
    const DataLayout data_layout =
        framework::StringToDataLayout(data_format_str);

    const int nt = input->dims()[0];
    const int c =
        (data_layout == DataLayout::kNCHW ? input->dims()[1] : input->dims()[3]);
    const int h =
        (data_layout == DataLayout::kNCHW ? input->dims()[2] : input->dims()[1]);
    const int w =
        (data_layout == DataLayout::kNCHW ? input->dims()[3] : input->dims()[2]);

    const int hw = h * w;
    const int chw = c * hw;
    const int tchw = t * chw;
    const int ntchw = nt * chw;

    const int c1 = static_cast<int>(c * shift_ratio);
    const int c2 = static_cast<int>(c * 2 * shift_ratio);

    framework::DDim out_dims =
        (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
                                          : phi::make_ddim({nt, h, w, c}));
    const T* input_data = input->data<T>();
    T* output_data = output->mutable_data<T>(out_dims, ctx.GetPlace());

    if (data_layout == DataLayout::kNCHW) {
      TemporalShiftFwNCHW<T>(
          input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2);
    } else {
      TemporalShiftFwNHWC<T>(
          input_data, output_data, ntchw, tchw, chw, t, c, c1, c2);
    }
  }
};

template <typename T>
...
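An editorial aside on the indexing rule restored above: the first c1 channels read from the previous time step, the next c2 - c1 channels from the following one, the rest are copied through, and out-of-range time steps produce zeros. A minimal standalone sketch (not Paddle code) with hypothetical toy sizes n = 1, t = 4, c = 4, h = w = 1 and shift_ratio = 0.25 illustrates this:

#include <cstdio>
#include <vector>

int main() {
  const int t = 4, c = 4, hw = 1;
  const int chw = c * hw, tchw = t * chw, ntchw = 1 * tchw;
  const float shift_ratio = 0.25f;
  const int c1 = static_cast<int>(c * shift_ratio);      // channels shifted to t-1
  const int c2 = static_cast<int>(c * 2 * shift_ratio);  // channels shifted to t+1

  std::vector<float> input(ntchw), output(ntchw);
  for (int i = 0; i < ntchw; ++i) input[i] = static_cast<float>(i);

  // Same index arithmetic as TemporalShiftFwNCHW, written out for NCHW layout.
  for (int i = 0; i < ntchw; ++i) {
    int it = (i % tchw) / chw;  // time index
    int ic = (i % chw) / hw;    // channel index
    int src_it = (ic < c1) ? it - 1 : (ic < c2) ? it + 1 : it;
    output[i] = (src_it < 0 || src_it >= t) ? 0.f
                                            : input[i + (src_it - it) * chw];
  }

  for (int i = 0; i < ntchw; ++i)
    std::printf("t=%d c=%d out=%g\n", (i % tchw) / chw, (i % chw) / hw, output[i]);
  return 0;
}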
paddle/phi/kernels/activation_kernel.h

...
@@ -53,13 +53,6 @@ DECLARE_ACTIVATION_KERNEL(Acosh)
DECLARE_ACTIVATION_KERNEL(Atanh)
DECLARE_ACTIVATION_KERNEL(Relu)
DECLARE_ACTIVATION_KERNEL(Tanh)
DECLARE_ACTIVATION_KERNEL(Exp)
DECLARE_ACTIVATION_KERNEL(Expm1)
DECLARE_ACTIVATION_KERNEL(Reciprocal)
DECLARE_ACTIVATION_KERNEL(Square)
DECLARE_ACTIVATION_KERNEL(Sqrt)
DECLARE_ACTIVATION_KERNEL(Rsqrt)
DECLARE_ACTIVATION_KERNEL(TanhShrink)
DECLARE_ACTIVATION_KERNEL(Silu)
DECLARE_ACTIVATION_KERNEL(Sigmoid)
...
@@ -80,23 +73,8 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Swish, beta)
DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max)
DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b)
DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset)
DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(Softplus, beta, threshold)

template <typename T, typename Context>
void LogitKernel(const Context& dev_ctx,
                 const DenseTensor& x,
                 float eps,
                 DenseTensor* out);

template <typename T, typename Context>
void MishKernel(const Context& dev_ctx,
                const DenseTensor& x,
                float threshold,
                DenseTensor* out);

template <typename T, typename Context>
void HardSwishKernel(const Context& dev_ctx,
                     const DenseTensor& x,
...
paddle/phi/kernels/cpu/activation_grad_kernel.cc

...
@@ -129,13 +129,6 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Square, SquareGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, ExpGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, Expm1GradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, ReciprocalGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, SqrtGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, RsqrtGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, LogGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, Log2GradFunctor);
...
@@ -164,24 +157,11 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
                                               threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, SwishGradFunctor, beta);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, MishGradFunctor, threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, BReluGradFunctor, t_min, t_max);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, STanhGradFunctor, scale_a, scale_b);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus, SoftplusGradFunctor, beta, threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, HardSigmoidGradFunctor, slope,
...
@@ -267,12 +247,6 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(stanh_grad, STanhGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_grad, SoftplusGradKernel)
PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad, ReluDoubleGradKernel)
...
@@ -289,34 +263,6 @@ PD_REGISTER_KERNEL(tanh_triple_grad,
                   float,
                   double,
                   phi::dtype::float16) {}
PD_REGISTER_KERNEL(exp_grad, CPU, ALL_LAYOUT, phi::ExpGradKernel,
                   float, double, int, int64_t) {}
PD_REGISTER_KERNEL(expm1_grad, CPU, ALL_LAYOUT, phi::Expm1GradKernel,
                   float, double, phi::dtype::float16) {}
PD_REGISTER_KERNEL(logit_grad, CPU, ALL_LAYOUT, phi::LogitGradKernel,
                   float, double) {}
PD_REGISTER_KERNEL(square_grad, CPU, ALL_LAYOUT, phi::SquareGradKernel,
                   float, double, int, int64_t) {}
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
...
paddle/phi/kernels/cpu/activation_kernel.cc

...
@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/phi/kernels/activation_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/activation_functor.h"
#include "paddle/phi/kernels/impl/activation_impl.h"

namespace phi {
...
@@ -73,12 +72,6 @@ DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Exp, ExpFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Expm1, Expm1Functor)
DEFINE_CPU_ACTIVATION_KERNEL(Reciprocal, ReciprocalFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Square, SquareFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Sqrt, SqrtFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Rsqrt, RsqrtFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Log, LogFunctor)
...
@@ -90,19 +83,15 @@ DEFINE_CPU_ACTIVATION_KERNEL(Floor, FloorFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Ceil, CeilFunctor)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, ThresholdedReluFunctor, threshold)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, MishFunctor, threshold)
DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max)
DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(STanh, STanhFunctor, scale_a, scale_b)
DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, SoftplusFunctor, beta, threshold)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Swish, SwishFunctor, beta)
DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max)
DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, HardSigmoidFunctor, slope,
...
@@ -150,25 +139,6 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
PD_REGISTER_ACTIVATION_KERNEL(stanh, STanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel)
PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel)
PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel)
PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel)
PD_REGISTER_KERNEL(exp, CPU, ALL_LAYOUT, phi::ExpKernel,
                   float, double, int, int64_t) {}
PD_REGISTER_KERNEL(expm1, CPU, ALL_LAYOUT, phi::Expm1Kernel,
                   float, double, phi::dtype::float16) {}
PD_REGISTER_KERNEL(logit, CPU, ALL_LAYOUT, phi::LogitKernel, float, double) {}
PD_REGISTER_KERNEL(square, CPU, ALL_LAYOUT, phi::SquareKernel,
                   float, double, int, int64_t) {}
PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
...
paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc (deleted, 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/temporal_shift_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {

template <typename T>
void TemporalShiftBwNCHW(const T* output_grad, T* input_grad, const int ntchw,
                         const int tchw, const int chw, const int hw,
                         const int t, const int c1, const int c2) {
  int src_it = 0;
  for (int i = 0; i < ntchw; i++) {
    int it = (i % tchw) / chw;
    int ic = (i % chw) / hw;
    if (ic < c1) {
      src_it = it + 1;
    } else if (ic < c2) {
      src_it = it - 1;
    } else {
      src_it = it;
    }
    if (src_it >= 0 && src_it < t) {
      input_grad[i] = output_grad[i + (src_it - it) * chw];
    } else {
      input_grad[i] = 0;
    }
  }
}

template <typename T>
void TemporalShiftBwNHWC(const T* output_grad, T* input_grad, const int nthwc,
                         const int thwc, const int hwc, const int t,
                         const int c, const int c1, const int c2) {
  int src_it = 0;
  for (int i = 0; i < nthwc; i++) {
    int it = (i % thwc) / hwc;
    int ic = i % c;
    if (ic < c1) {
      src_it = it + 1;
    } else if (ic < c2) {
      src_it = it - 1;
    } else {
      src_it = it;
    }
    if (src_it >= 0 && src_it < t) {
      input_grad[i] = output_grad[i + (src_it - it) * hwc];
    } else {
      input_grad[i] = 0;
    }
  }
}

template <typename T, typename Context>
void TemporalShiftGradKernel(const Context& dev_ctx,
                             const DenseTensor& out_grad,
                             int seg_num,
                             float shift_ratio,
                             const std::string& data_format_str,
                             DenseTensor* x_grad) {
  auto* input_grad = x_grad;
  auto* output_grad = &out_grad;
  int t = seg_num;
  const DataLayout data_layout =
      paddle::framework::StringToDataLayout(data_format_str);

  const int nt = output_grad->dims()[0];
  const int c = (data_layout == DataLayout::kNCHW ? output_grad->dims()[1]
                                                  : output_grad->dims()[3]);
  const int h = (data_layout == DataLayout::kNCHW ? output_grad->dims()[2]
                                                  : output_grad->dims()[1]);
  const int w = (data_layout == DataLayout::kNCHW ? output_grad->dims()[3]
                                                  : output_grad->dims()[2]);

  const int hw = h * w;
  const int chw = c * hw;
  const int tchw = t * chw;
  const int ntchw = nt * chw;

  const int c1 = static_cast<int>(c * shift_ratio);
  const int c2 = static_cast<int>(c * 2 * shift_ratio);

  DDim in_grad_dims =
      (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
                                        : phi::make_ddim({nt, h, w, c}));
  const T* output_grad_data = output_grad->data<T>();
  T* input_grad_data =
      input_grad->mutable_data<T>(in_grad_dims, dev_ctx.GetPlace());

  if (data_layout == DataLayout::kNCHW) {
    TemporalShiftBwNCHW<T>(
        output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1, c2);
  } else {
    TemporalShiftBwNHWC<T>(
        output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1, c2);
  }
}

}  // namespace phi

PD_REGISTER_KERNEL(temporal_shift_grad,
                   CPU,
                   ALL_LAYOUT,
                   phi::TemporalShiftGradKernel,
                   float,
                   double) {}
paddle/phi/kernels/cpu/temporal_shift_kernel.cc (deleted, 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/temporal_shift_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {

template <typename T>
void TemporalShiftFwNCHW(const T* input, T* output, const int ntchw,
                         const int tchw, const int chw, const int hw,
                         const int t, const int c1, const int c2) {
  int src_it = 0;
  for (int i = 0; i < ntchw; i++) {
    int it = (i % tchw) / chw;
    int ic = (i % chw) / hw;
    if (ic < c1) {
      src_it = it - 1;
    } else if (ic < c2) {
      src_it = it + 1;
    } else {
      src_it = it;
    }
    if (src_it < 0 || src_it >= t) {
      output[i] = 0;
    } else {
      output[i] = input[i + (src_it - it) * chw];
    }
  }
}

template <typename T>
void TemporalShiftFwNHWC(const T* input, T* output, const int nthwc,
                         const int thwc, const int hwc, const int t,
                         const int c, const int c1, const int c2) {
  int src_it = 0;
  for (int i = 0; i < nthwc; i++) {
    int it = (i % thwc) / hwc;
    int ic = i % c;
    if (ic < c1) {
      src_it = it - 1;
    } else if (ic < c2) {
      src_it = it + 1;
    } else {
      src_it = it;
    }
    if (src_it < 0 || src_it >= t) {
      output[i] = 0;
    } else {
      output[i] = input[i + (src_it - it) * hwc];
    }
  }
}

template <typename T, typename Context>
void TemporalShiftKernel(const Context& dev_ctx,
                         const DenseTensor& x,
                         int seg_num,
                         float shift_ratio,
                         const std::string& data_format_str,
                         DenseTensor* out) {
  auto* input = &x;
  auto* output = out;
  int t = seg_num;
  const DataLayout data_layout =
      paddle::framework::StringToDataLayout(data_format_str);

  const int nt = input->dims()[0];
  const int c =
      (data_layout == DataLayout::kNCHW ? input->dims()[1] : input->dims()[3]);
  const int h =
      (data_layout == DataLayout::kNCHW ? input->dims()[2] : input->dims()[1]);
  const int w =
      (data_layout == DataLayout::kNCHW ? input->dims()[3] : input->dims()[2]);

  const int hw = h * w;
  const int chw = c * hw;
  const int tchw = t * chw;
  const int ntchw = nt * chw;

  const int c1 = static_cast<int>(c * shift_ratio);
  const int c2 = static_cast<int>(c * 2 * shift_ratio);

  DDim out_dims =
      (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
                                        : phi::make_ddim({nt, h, w, c}));
  const T* input_data = input->data<T>();
  T* output_data = output->mutable_data<T>(out_dims, dev_ctx.GetPlace());

  if (data_layout == DataLayout::kNCHW) {
    TemporalShiftFwNCHW<T>(
        input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2);
  } else {
    TemporalShiftFwNHWC<T>(
        input_data, output_data, ntchw, tchw, chw, t, c, c1, c2);
  }
}

}  // namespace phi

PD_REGISTER_KERNEL(temporal_shift,
                   CPU,
                   ALL_LAYOUT,
                   phi::TemporalShiftKernel,
                   float,
                   double) {}
paddle/phi/kernels/funcs/activation_functor.h

(This diff is collapsed.)
paddle/phi/kernels/gpu/activation_grad_kernel.cu

...
@@ -189,13 +189,6 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Square, CudaSquareGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, CudaExpGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, CudaExpm1GradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, CudaReciprocalGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, CudaSqrtGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, CudaRsqrtGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, CudaLogGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, CudaLog2GradFunctor);
...
@@ -218,24 +211,11 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish,
                                               CudaSwishGradFunctor,
                                               beta);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, CudaMishGradFunctor, threshold);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, CudaBReluGradFunctor, t_min, t_max);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, CudaSTanhGradFunctor, scale_a, scale_b);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus, CudaSoftplusGradFunctor, beta, threshold);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, CudaHardSigmoidGradFunctor, slope,
...
@@ -346,57 +326,12 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad,
                                   LeakyReluDoubleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, ThresholdedReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(stanh_grad, STanhGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_grad, SoftplusGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel)
PD_REGISTER_KERNEL(exp_grad, GPU, ALL_LAYOUT, phi::ExpGradKernel,
                   float, double, int, int64_t, phi::dtype::float16) {}
PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
PD_REGISTER_KERNEL(expm1_grad, GPU, ALL_LAYOUT, phi::Expm1GradKernel,
                   float, double, phi::dtype::float16) {}
PD_REGISTER_KERNEL(logit_grad, GPU, ALL_LAYOUT, phi::LogitGradKernel,
                   float, double, phi::dtype::float16) {}
PD_REGISTER_KERNEL(square_grad, GPU, ALL_LAYOUT, phi::SquareGradKernel,
                   float, double, int, int64_t, phi::dtype::float16,
                   phi::dtype::bfloat16) {}
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
...
paddle/phi/kernels/gpu/activation_kernel.cu

...
@@ -19,7 +19,6 @@ limitations under the License. */
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/impl/activation_grad_impl.h"
#include "paddle/phi/kernels/impl/activation_impl.h"

#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
...
@@ -92,12 +91,6 @@ DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Exp, CudaExpFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Expm1, CudaExpm1Functor)
DEFINE_GPU_ACTIVATION_KERNEL(Reciprocal, CudaReciprocalFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Square, CudaSquareFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Sqrt, CudaSqrtFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Rsqrt, CudaRsqrtFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Log, CudaLogFunctor)
...
@@ -119,14 +112,7 @@ DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Swish, CudaSwishFunctor, beta)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, CudaMishFunctor, threshold)
DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max)
DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Stanh, CudaSTanhFunctor, scale_a, scale_b)
DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, CudaSoftplusFunctor, beta, threshold)
DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, CudaHardSigmoidFunctor, slope,
...
@@ -194,46 +180,6 @@ PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
PD_REGISTER_ACTIVATION_KERNEL(stanh, StanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel)
PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel)
PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel)
PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel)
PD_REGISTER_KERNEL(exp, GPU, ALL_LAYOUT, phi::ExpKernel,
                   float, double, int, int64_t, phi::dtype::float16) {}
PD_REGISTER_KERNEL(expm1, GPU, ALL_LAYOUT, phi::Expm1Kernel,
                   float, double, phi::dtype::float16) {}
PD_REGISTER_KERNEL(logit, GPU, ALL_LAYOUT, phi::LogitKernel,
                   float, double, phi::dtype::float16) {}
PD_REGISTER_KERNEL(square, GPU, ALL_LAYOUT, phi::SquareKernel,
                   float, double, int, int64_t, phi::dtype::float16,
                   phi::dtype::bfloat16) {}
PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
...
paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu (deleted, 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/temporal_shift_grad_kernel.h"
namespace phi {

template <typename T>
__global__ void KeTemporalShiftBwNCHW(const T* output_grad,
                                      T* input_grad,
                                      const int ntchw,
                                      const int tchw,
                                      const int chw,
                                      const int hw,
                                      const int t,
                                      const int c1,
                                      const int c2) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  int src_it = 0;

  for (; tid < ntchw; tid += stride) {
    int it = (tid % tchw) / chw;
    int ic = (tid % chw) / hw;

    if (ic < c1) {
      src_it = it + 1;
    } else if (ic < c2) {
      src_it = it - 1;
    } else {
      src_it = it;
    }

    if (src_it >= 0 && src_it < t) {
      input_grad[tid] = output_grad[tid + (src_it - it) * chw];
    } else {
      input_grad[tid] = 0;
    }
  }
}

template <typename T>
__global__ void KeTemporalShiftBwNHWC(const T* output_grad,
                                      T* input_grad,
                                      const int nthwc,
                                      const int thwc,
                                      const int hwc,
                                      const int t,
                                      const int c,
                                      const int c1,
                                      const int c2) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  int src_it = 0;

  for (; tid < nthwc; tid += stride) {
    int it = (tid % thwc) / hwc;
    int ic = tid % c;

    if (ic < c1) {
      src_it = it + 1;
    } else if (ic < c2) {
      src_it = it - 1;
    } else {
      src_it = it;
    }

    if (src_it >= 0 && src_it < t) {
      input_grad[tid] = output_grad[tid + (src_it - it) * hwc];
    } else {
      input_grad[tid] = 0;
    }
  }
}

template <typename T, typename Context>
void TemporalShiftGradKernel(const Context& dev_ctx,
                             const DenseTensor& out_grad,
                             int seg_num,
                             float shift_ratio,
                             const std::string& data_format_str,
                             DenseTensor* x_grad) {
  auto* input_grad = x_grad;
  auto* output_grad = &out_grad;
  int t = seg_num;
  const DataLayout data_layout =
      paddle::framework::StringToDataLayout(data_format_str);

  const int nt = output_grad->dims()[0];
  const int c = (data_layout == DataLayout::kNCHW ? output_grad->dims()[1]
                                                  : output_grad->dims()[3]);
  const int h = (data_layout == DataLayout::kNCHW ? output_grad->dims()[2]
                                                  : output_grad->dims()[1]);
  const int w = (data_layout == DataLayout::kNCHW ? output_grad->dims()[3]
                                                  : output_grad->dims()[2]);

  const int hw = h * w;
  const int chw = c * hw;
  const int tchw = t * chw;
  const int ntchw = nt * chw;

  const int c1 = static_cast<int>(c * shift_ratio);
  const int c2 = static_cast<int>(c * 2 * shift_ratio);

  DDim in_grad_dims =
      (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
                                        : phi::make_ddim({nt, h, w, c}));
  const T* output_grad_data = output_grad->data<T>();
  T* input_grad_data =
      input_grad->mutable_data<T>(in_grad_dims, dev_ctx.GetPlace());

  int pixelNum = nt * chw;
  int threads = 1024;
  int grid = (pixelNum + threads - 1) / threads;
  int blocks_per_sm = dev_ctx.GetMaxPhysicalThreadCount() / threads;
  grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid);

  if (data_layout == DataLayout::kNCHW) {
    KeTemporalShiftBwNCHW<T><<<grid, threads, 0, dev_ctx.stream()>>>(
        output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1, c2);
  } else {
    KeTemporalShiftBwNHWC<T><<<grid, threads, 0, dev_ctx.stream()>>>(
        output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1, c2);
  }
}

}  // namespace phi

PD_REGISTER_KERNEL(temporal_shift_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::TemporalShiftGradKernel,
                   float,
                   double,
                   phi::dtype::float16) {}
paddle/phi/kernels/gpu/temporal_shift_kernel.cu (deleted, 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/temporal_shift_kernel.h"
namespace phi {

template <typename T>
__global__ void KeTemporalShiftFwNCHW(const T* input,
                                      T* output,
                                      const int ntchw,
                                      const int tchw,
                                      const int chw,
                                      const int hw,
                                      const int t,
                                      const int c1,
                                      const int c2) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  int src_it = 0;

  for (; tid < ntchw; tid += stride) {
    int it = (tid % tchw) / chw;
    int ic = (tid % chw) / hw;

    if (ic < c1) {
      src_it = it - 1;
    } else if (ic < c2) {
      src_it = it + 1;
    } else {
      src_it = it;
    }

    if (src_it < 0 || src_it >= t) {
      output[tid] = 0;
    } else {
      output[tid] = input[tid + (src_it - it) * chw];
    }
  }
}
template <typename T>
__global__ void KeTemporalShiftFwNHWC(const T* input,
                                      T* output,
                                      const int nthwc,
                                      const int thwc,
                                      const int hwc,
                                      const int t,
                                      const int c,
                                      const int c1,
                                      const int c2) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  int src_it = 0;

  for (; tid < nthwc; tid += stride) {
    int it = (tid % thwc) / hwc;
    int ic = tid % c;

    if (ic < c1) {
      src_it = it - 1;
    } else if (ic < c2) {
      src_it = it + 1;
    } else {
      src_it = it;
    }

    if (src_it < 0 || src_it >= t) {
      output[tid] = 0;
    } else {
      output[tid] = input[tid + (src_it - it) * hwc];
    }
  }
}
template <typename T, typename Context>
void TemporalShiftKernel(const Context& dev_ctx,
                         const DenseTensor& x,
                         int seg_num,
                         float shift_ratio,
                         const std::string& data_format_str,
                         DenseTensor* out) {
  auto* input = &x;
  auto* output = out;
  int t = seg_num;
  const DataLayout data_layout =
      paddle::framework::StringToDataLayout(data_format_str);

  const int nt = input->dims()[0];
  const int c =
      (data_layout == DataLayout::kNCHW ? input->dims()[1] : input->dims()[3]);
  const int h =
      (data_layout == DataLayout::kNCHW ? input->dims()[2] : input->dims()[1]);
  const int w =
      (data_layout == DataLayout::kNCHW ? input->dims()[3] : input->dims()[2]);

  const int hw = h * w;
  const int chw = c * hw;
  const int tchw = t * chw;
  const int ntchw = nt * chw;

  const int c1 = static_cast<int>(c * shift_ratio);
  const int c2 = static_cast<int>(c * 2 * shift_ratio);

  DDim out_dims =
      (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
                                        : phi::make_ddim({nt, h, w, c}));
  const T* input_data = input->data<T>();
  T* output_data = output->mutable_data<T>(out_dims, dev_ctx.GetPlace());

  int pixelNum = nt * chw;
  int threads = 1024;
  int grid = (pixelNum + threads - 1) / threads;
  int blocks_per_sm = dev_ctx.GetMaxPhysicalThreadCount() / threads;
  grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid);

  if (data_layout == DataLayout::kNCHW) {
    KeTemporalShiftFwNCHW<T><<<grid, threads, 0, dev_ctx.stream()>>>(
        input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2);
  } else {
    KeTemporalShiftFwNHWC<T><<<grid, threads, 0, dev_ctx.stream()>>>(
        input_data, output_data, ntchw, tchw, chw, t, c, c1, c2);
  }
}

}  // namespace phi
PD_REGISTER_KERNEL(temporal_shift,
                   GPU,
                   ALL_LAYOUT,
                   phi::TemporalShiftKernel,
                   float,
                   double,
                   phi::dtype::float16) {}
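For reference, KeTemporalShiftFwNCHW above shifts the first c1 = int(c * shift_ratio) channels backward in time (each frame reads them from the previous frame), the next c2 - c1 channels forward in time (read from the following frame), and copies the remaining channels in place, zero-filling wherever the source frame falls outside the segment. A minimal NumPy reference sketch of the NCHW case (the helper name and the (n * seg_num, c, h, w) layout are assumptions for illustration only):

import numpy as np

def temporal_shift_ref(x, seg_num, shift_ratio):
    # x: input of shape (n * seg_num, c, h, w), seg_num frames per sample.
    nt, c, h, w = x.shape
    n = nt // seg_num
    c1 = int(c * shift_ratio)
    c2 = int(c * 2 * shift_ratio)
    x = x.reshape(n, seg_num, c, h, w)
    out = np.zeros_like(x)
    # Channels [0, c1): frame it reads from frame it - 1 (zero at it == 0).
    out[:, 1:, :c1] = x[:, :-1, :c1]
    # Channels [c1, c2): frame it reads from frame it + 1 (zero at the end).
    out[:, :-1, c1:c2] = x[:, 1:, c1:c2]
    # Remaining channels are copied unchanged.
    out[:, :, c2:] = x[:, :, c2:]
    return out.reshape(nt, c, h, w)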
paddle/phi/kernels/impl/activation_grad_impl.h
...
@@ -222,24 +222,6 @@ void EluDoubleGradKernel(const Context& dev_ctx,
  functor(dev_ctx, &x, &ddx, ddout, &dout, dx);
}

template <typename T, typename Context>
void LogitGradKernel(const Context& dev_ctx,
                     const DenseTensor& x,
                     const DenseTensor& out_grad,
                     float eps,
                     DenseTensor* x_grad) {
  dev_ctx.template Alloc<T>(x_grad);

  auto eigen_x = EigenVector<T>::Flatten(x);
  auto eigen_dout = EigenVector<T>::Flatten(out_grad);
  auto eigen_dx = EigenVector<T>::Flatten(*x_grad);
  auto& place = *dev_ctx.eigen_device();
  auto eigen_p = EigenVector<T>::Flatten(x);

  funcs::LogitGradFunctor<T> functor;
  functor(place, eigen_x, eigen_dout, eigen_dx, eigen_p, eps);
}

template <typename T, typename Context>
void SigmoidDoubleGradKernel(const Context& dev_ctx,
                             const DenseTensor& out,
...
paddle/phi/kernels/impl/activation_impl.h
...
@@ -47,22 +47,6 @@ void ActivationImpl(const Context& dev_ctx,
  }
}

template <typename T, typename Context>
void LogitKernel(const Context& dev_ctx,
                 const DenseTensor& x,
                 float eps,
                 DenseTensor* out) {
  dev_ctx.template Alloc<T>(out);

  auto eigen_out = EigenVector<T>::Flatten(*out);
  auto eigen_in = EigenVector<T>::Flatten(x);
  auto& place = *dev_ctx.eigen_device();
  auto eigen_p = EigenVector<T>::Flatten(*out);

  funcs::LogitFunctor<T> functor;
  functor(place, eigen_in, eigen_out, eigen_p, eps);
}

template <typename T, typename Context>
void PowKernel(const Context& dev_ctx,
               const DenseTensor& x,
...
paddle/phi/kernels/selected_rows/activation_kernel.cc (deleted, 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/selected_rows/activation_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/activation_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
namespace phi {
namespace sr {

template <typename T, typename Context>
void SquareKernel(const Context& dev_ctx,
                  const SelectedRows& x,
                  SelectedRows* out) {
  out->set_rows(x.rows());
  out->set_height(x.height());
  phi::SquareKernel<T, Context>(dev_ctx, x.value(), out->mutable_value());
}

template <typename T, typename Context>
void SqrtKernel(const Context& dev_ctx,
                const SelectedRows& x,
                SelectedRows* out) {
  out->set_rows(x.rows());
  out->set_height(x.height());
  phi::SqrtKernel<T, Context>(dev_ctx, x.value(), out->mutable_value());
}

}  // namespace sr
}  // namespace phi

PD_REGISTER_KERNEL(square_sr, CPU, ALL_LAYOUT, phi::sr::SquareKernel, float, double) {}

PD_REGISTER_KERNEL(sqrt_sr, CPU, ALL_LAYOUT, phi::sr::SqrtKernel, float, double) {}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(
    square_sr, GPU, ALL_LAYOUT, phi::sr::SquareKernel, float, double, int, int64_t) {}

PD_REGISTER_KERNEL(sqrt_sr, GPU, ALL_LAYOUT, phi::sr::SqrtKernel, float, double) {}
#endif
paddle/phi/kernels/selected_rows/activation_kernel.h (deleted, 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/selected_rows.h"
namespace phi {
namespace sr {

template <typename T, typename Context>
void SquareKernel(const Context& dev_ctx,
                  const SelectedRows& x,
                  SelectedRows* out);

template <typename T, typename Context>
void SqrtKernel(const Context& dev_ctx,
                const SelectedRows& x,
                SelectedRows* out);

}  // namespace sr
}  // namespace phi
paddle/phi/kernels/temporal_shift_grad_kernel.h (deleted, 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {

template <typename T, typename Context>
void TemporalShiftGradKernel(const Context& ctx,
                             const DenseTensor& out_grad,
                             int seg_num,
                             float shift_ratio,
                             const std::string& data_format,
                             DenseTensor* x_grad);

}  // namespace phi
paddle/phi/kernels/temporal_shift_kernel.h (deleted, 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {

template <typename T, typename Context>
void TemporalShiftKernel(const Context& ctx,
                         const DenseTensor& x,
                         int seg_num,
                         float shift_ratio,
                         const std::string& data_format,
                         DenseTensor* out);

}  // namespace phi
paddle/phi/ops/compat/activation_sig.cc
...
@@ -43,19 +43,17 @@ namespace phi {
#define comma ,
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", );         // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", );         // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", );       // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", );         // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", );       // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", );       // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", );       // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", );       // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", );     // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", );     // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", );     // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Square, "square", );   // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", );         // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", );         // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", );       // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", );         // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", );       // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", );       // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", );       // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", );       // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", );     // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", );     // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", );     // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(BRelu, "brelu", "t_min" comma "t_max");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LeakyRelu, "leaky_relu", "alpha");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu,
...
@@ -63,7 +61,6 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu,
                               "threshold");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Mish, "mish", "threshold");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", );  // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", );               // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", );   // NOLINT
...
@@ -77,41 +74,12 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardSwish,
                               "offset");                       // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Swish, "swish", "beta");          // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(STanh,
                               "stanh",
                               "scale_a" comma "scale_b");       // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Softplus,
                               "softplus",
                               "beta" comma "threshold");        // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", );                // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", );                // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", );          // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Exp, "exp", );                  // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Expm1, "expm1", );              // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Reciprocal, "reciprocal", );    // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sqrt, "sqrt", );                // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Rsqrt, "rsqrt", );              // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", );                // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", );                // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", );          // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid,
                                 "hard_sigmoid",
                                 "slope" comma "offset");        // NOLINT

KernelSignature SqrtActiOpArgumentMapping(const ArgumentMappingContext& ctx) {
  if (ctx.IsDenseTensorInput("X")) {
    return KernelSignature("sqrt", {"X"}, {}, {"Out"});
  } else {
    return KernelSignature("sqrt_sr", {"X"}, {}, {"Out"});
  }
}

KernelSignature SquareActiOpArgumentMapping(const ArgumentMappingContext& ctx) {
  if (ctx.IsDenseTensorInput("X")) {
    return KernelSignature("square", {"X"}, {}, {"Out"});
  } else {
    return KernelSignature("square_sr", {"X"}, {}, {"Out"});
  }
}

DEFINE_ACT_GRAD_NODEP_OP_ARGMAP(Round, "round", );  // NOLINT
DEFINE_ACT_GRAD_NODEP_OP_ARGMAP(Floor, "floor", );  // NOLINT
...
@@ -164,11 +132,6 @@ KernelSignature EluOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("elu", {"X"}, {"alpha"}, {"Out"});
}

KernelSignature LogitGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature(
      "logit_grad", {"X", GradVarName("Out")}, {"eps"}, {GradVarName("X")});
}

KernelSignature EluGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("elu_grad",
                         {"X", "Out", GradVarName("Out")},
...
@@ -231,18 +194,6 @@ PD_REGISTER_ARG_MAPPING_FN(asinh_grad, phi::AsinhGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(acosh_grad, phi::AcoshGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(exp_grad, phi::ExpGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(expm1_grad, phi::Expm1GradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(square_grad, phi::SquareGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(reciprocal_grad, phi::ReciprocalGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(sqrt_grad, phi::SqrtGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(rsqrt_grad, phi::RsqrtGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(mish_grad, phi::MishGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(stanh_grad, phi::STanhGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(softplus_grad, phi::SoftplusGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad, phi::ReluDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(tanh_grad, phi::TanhGradOpArgumentMapping);
...
@@ -277,16 +228,11 @@ PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad,
                           phi::LogSigmoidGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad,
                           phi::HardSigmoidGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(logit_grad, phi::LogitGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(log_grad, phi::LogGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(log_grad_grad, phi::LogDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(log2_grad, phi::Log2GradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(log10_grad, phi::Log10GradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(log1p_grad, phi::Log1pGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(sqrt, phi::SqrtActiOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(square, phi::SquareActiOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(hard_swish_grad, phi::HardSwishGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(swish_grad, phi::SwishGradOpArgumentMapping);
...
paddle/phi/ops/compat/temporal_shift_sig.cc (deleted, 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {

KernelSignature TemporalShiftOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  return KernelSignature("temporal_shift",
                         {"X"},
                         {"seg_num", "shift_ratio", "data_format"},
                         {"Out"});
}

KernelSignature TemporalShiftGradOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  return KernelSignature("temporal_shift_grad",
                         {GradVarName("Out")},
                         {"seg_num", "shift_ratio", "data_format"},
                         {GradVarName("X")});
}

}  // namespace phi

PD_REGISTER_ARG_MAPPING_FN(temporal_shift, phi::TemporalShiftOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(temporal_shift_grad,
                           phi::TemporalShiftGradOpArgumentMapping);
python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
...
@@ -342,5 +342,4 @@ class TestLogDoubleGradCheck(unittest.TestCase):
if __name__ == "__main__":
    paddle.enable_static()
    unittest.main()
python/paddle/fluid/tests/unittests/test_activation_sparse_op.py (deleted, 100644 → 0)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from op_test import OpTest
import paddle


class TestSparseSquareOp(unittest.TestCase):
    def check_with_place(self, place):
        scope = core.Scope()

        # create and initialize Grad Variable
        height = 10
        rows = [0, 4, 7]
        self.row_numel = 12

        x_selected_rows = scope.var('X').get_selected_rows()
        x_selected_rows.set_height(height)
        x_selected_rows.set_rows(rows)
        np_array = np.ones((len(rows), self.row_numel)).astype("float32")
        np_array[0, 0] = 2.0
        np_array[2, 8] = 4.0

        x_tensor = x_selected_rows.get_tensor()
        x_tensor.set(np_array, place)

        out_selected_rows = scope.var('Out').get_selected_rows()

        # create and run square operator
        square_op = Operator("square", X='X', Out='Out')
        square_op.run(scope, place)

        # get and compare result
        result_array = np.array(out_selected_rows.get_tensor())
        self.assertTrue(np.array_equal(result_array, np.square(np_array)))

    def test_sparse_acti(self):
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))
        for place in places:
            self.check_with_place(place)


class TestSparseSqrtOp(unittest.TestCase):
    def check_with_place(self, place):
        scope = core.Scope()

        # create and initialize Grad Variable
        height = 10
        rows = [0, 4, 7]
        self.row_numel = 12

        x_selected_rows = scope.var('X1').get_selected_rows()
        x_selected_rows.set_height(height)
        x_selected_rows.set_rows(rows)
        np_array = np.ones((len(rows), self.row_numel)).astype("float32")
        np_array[0, 0] = 2.0
        np_array[2, 8] = 4.0

        x_tensor = x_selected_rows.get_tensor()
        x_tensor.set(np_array, place)

        out_selected_rows = scope.var('Out1').get_selected_rows()

        # create and run sqrt operator
        sqrt_op = Operator("sqrt", X='X1', Out='Out1')
        sqrt_op.run(scope, place)

        # get and compare result
        result_array = np.array(out_selected_rows.get_tensor())
        self.assertTrue(np.allclose(result_array, np.sqrt(np_array)))

    def test_sparse_acti(self):
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))
        for place in places:
            self.check_with_place(place)


if __name__ == "__main__":
    paddle.enable_static()
    unittest.main()
python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
...
@@ -16,7 +16,6 @@ from __future__ import print_function
import unittest
import numpy as np
import paddle
from op_test import OpTest
import paddle.fluid as fluid
...
@@ -154,5 +153,4 @@ class TestClipByNormOpWithSelectedRows(unittest.TestCase):
if __name__ == '__main__':
    paddle.enable_static()
    unittest.main()
python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
...
@@ -143,5 +143,4 @@ class TestTemporalShiftAPI(unittest.TestCase):
if __name__ == "__main__":
    paddle.enable_static()
    unittest.main()