Revert "Revert "Move some activation to phi (#40727)" (#41056)"

This reverts commit 05f3d48e.

Revert "Revert "Move some activation to phi (#40727)" (#41056)"
This reverts commit 05f3d48e.
07473786 · hong · GitHub · 9bb3744f · 07473786 · 07473786
29 changed file
--- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
@@ -53,7 +53,7 @@ USE_OP_ITSELF(tanh_grad);
 USE_OP(sum);
 USE_OP_ITSELF(slice_grad);
 USE_OP_ITSELF(lookup_table_grad);
-USE_OP(sqrt);
+USE_OP_ITSELF(sqrt);
 USE_OP_ITSELF(elementwise_max);
 USE_OP_ITSELF(elementwise_div);
 USE_OP_ITSELF(sgd);
@@ -83,6 +83,7 @@ PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(sgd, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(slice, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(slice_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sqrt, GPU, ALL_LAYOUT);
 DECLARE_double(eager_delete_tensor_gb);

--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -1496,6 +1496,14 @@ REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,
                       HardSigmoidGradFunctor);
 REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor,
                       LogSigmoidGradFunctor);
+REGISTER_ACTIVATION_OP(expm1, Expm1, Expm1Functor, Expm1GradFunctor);
+REGISTER_ACTIVATION_OP(softplus, Softplus, SoftplusFunctor,
+                       SoftplusGradFunctor);
+REGISTER_ACTIVATION_OP(mish, Mish, MishFunctor, MishGradFunctor);
+REGISTER_ACTIVATION_OP(stanh, STanh, STanhFunctor, STanhGradFunctor);
+REGISTER_ACTIVATION_OP(reciprocal, Reciprocal, ReciprocalFunctor,
+                       ReciprocalGradFunctor);
 REGISTER_ACTIVATION_OP(log2, Log2, Log2Functor, Log2GradFunctor);
 REGISTER_ACTIVATION_OP(log10, Log10, Log10Functor, Log10GradFunctor);
 REGISTER_ACTIVATION_OP(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);
@@ -1630,12 +1638,7 @@ REGISTER_OPERATOR(logit, ops::LogitOp, ops::LogitOpMaker,
                  ops::LogitGradOpMaker<paddle::framework::OpDesc>,
                  ops::LogitGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(logit_grad, ops::LogitGradOp);
-REGISTER_OP_CPU_KERNEL(
-    logit, ops::LogitKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LogitKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    logit_grad, ops::LogitGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LogitGradKernel<paddle::platform::CPUDeviceContext, double>);
 /* ========================================================================== */
 /* ========================    celu  register     ============================
@@ -1684,7 +1687,6 @@ REGISTER_OPERATOR(
    ops::ActivationOpDoubleGrad<ops::SqrtGradGradFunctor<float>::FwdDeps()>,
    ops::ActivationDoubleGradOpInplaceInferer);
-REGISTER_ACTIVATION_CPU_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor);
 REGISTER_OP_CPU_KERNEL(
    sqrt_grad_grad, ops::SqrtDoubleGradKernel<plat::CPUDeviceContext,
                                              ops::SqrtGradGradFunctor<float>>,
@@ -1712,7 +1714,6 @@ REGISTER_OPERATOR(
    ops::ActivationOpDoubleGrad<ops::RsqrtGradGradFunctor<float>::FwdDeps()>,
    ops::ActivationDoubleGradOpInplaceInferer);
-REGISTER_ACTIVATION_CPU_KERNEL(rsqrt, Rsqrt, RsqrtFunctor, RsqrtGradFunctor);
 REGISTER_OP_CPU_KERNEL(
    rsqrt_grad_grad,
    ops::RsqrtDoubleGradKernel<plat::CPUDeviceContext,
@@ -1741,25 +1742,6 @@ REGISTER_OPERATOR(
    ops::ActivationOpDoubleGrad<ops::SquareGradGradFunctor<float>::FwdDeps()>,
    ops::ActivationDoubleGradOpInplaceInferer);
-REGISTER_OP_CPU_KERNEL(square,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::SquareFunctor<float>>,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::SquareFunctor<double>>,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::SquareFunctor<int>>,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::SquareFunctor<int64_t>>);
-REGISTER_OP_CPU_KERNEL(
-    square_grad, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
-                                           ops::SquareGradFunctor<float>>,
-    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
-                              ops::SquareGradFunctor<double>>,
-    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
-                              ops::SquareGradFunctor<int>>,
-    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
-                              ops::SquareGradFunctor<int64_t>>);
 REGISTER_OP_CPU_KERNEL(
    square_grad_grad,
    ops::SquareDoubleGradKernel<plat::CPUDeviceContext,
@@ -1798,54 +1780,6 @@ REGISTER_OPERATOR(
 REGISTER_OPERATOR(exp_grad, ops::ActivationOpGrad,
                  ops::ActivationGradOpInplaceInferer);
-REGISTER_OP_CPU_KERNEL(exp,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::ExpFunctor<float>>,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::ExpFunctor<double>>,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::ExpFunctor<int>>,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::ExpFunctor<int64_t>>);
-REGISTER_OP_CPU_KERNEL(
-    exp_grad, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
-                                        ops::ExpGradFunctor<float>>,
-    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
-                              ops::ExpGradFunctor<double>>,
-    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
-                              ops::ExpGradFunctor<int>>,
-    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
-                              ops::ExpGradFunctor<int64_t>>);
-/* ========================================================================== */
-/* ==========================   expm1 register  ============================ */
-REGISTER_OPERATOR(
-    expm1, ops::ActivationOp, ops::Expm1OpMaker, ops::ActivationOpInferVarType,
-    ops::ActivationGradOpMaker<ops::Expm1GradFunctor<float>::FwdDeps(),
-                               paddle::framework::OpDesc>,
-    ops::ActivationGradOpMaker<ops::Expm1GradFunctor<float>::FwdDeps(),
-                               paddle::imperative::OpBase>,
-    std::conditional<ops::CanInplaceAct<ops::Expm1GradFunctor<float>>(),
-                     ops::ActFwdInplaceInferer, void>::type);
-REGISTER_OPERATOR(expm1_grad, ops::ActivationOpGrad,
-                  ops::ActivationGradOpInplaceInferer);
-REGISTER_OP_CPU_KERNEL(expm1,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::Expm1Functor<float>>,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::Expm1Functor<double>>,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::Expm1Functor<plat::float16>>);
-REGISTER_OP_CPU_KERNEL(
-    expm1_grad, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
-                                          ops::Expm1GradFunctor<float>>,
-    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
-                              ops::Expm1GradFunctor<double>>,
-    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
-                              ops::Expm1GradFunctor<plat::float16>>);
-/* ========================================================================== */
 /* ==========================  Log register ==================================*/
 REGISTER_OPERATOR(
    log, ops::ActivationOp, ops::LogOpMaker, ops::ActivationOpInferVarType,
@@ -1864,8 +1798,6 @@ REGISTER_OPERATOR(
    ops::ActivationOpDoubleGrad<ops::LogGradGradFunctor<float>::FwdDeps()>,
    ops::ActivationDoubleGradOpInplaceInferer);
-/* ========================================================================== */
 /* ==========================  register checkpoint ===========================*/
 REGISTER_OP_VERSION(leaky_relu)
    .AddCheckpoint(

--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
--- a/paddle/fluid/operators/activation_op.kps
+++ b/paddle/fluid/operators/activation_op.kps
@@ -20,140 +20,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-template <typename T>
-struct CudaReciprocalFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
-  // reciprocal(x) = 1 / x
-  __device__ __forceinline__ T operator()(const T x) const { return one / x; }
-};
-template <typename T>
-struct CudaReciprocalGradFunctor : public BaseActivationFunctor<T> {
-  // dx = -dout * out^2
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return -dout * out * out;
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-template <typename T>
-struct CudaExpFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  // exp(x) = exp(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(exp(x));
-  }
-};
-template <typename T>
-struct CudaExpGradFunctor : public BaseActivationFunctor<T> {
-  // dx = dout * out
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return dout * out;
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-template <typename T>
-struct CudaExpm1Functor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  // expm1(x) = expm1(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(expm1(x));
-  }
-};
-template <typename T>
-struct CudaExpm1GradFunctor : public BaseActivationFunctor<T> {
-  // dx = dout * out
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return dout * out + dout;
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-template <typename T>
-struct CudaSquareFunctor : public BaseActivationFunctor<T> {
-  // square(x) = x * x
-  __device__ __forceinline__ T operator()(const T x) const { return x * x; }
-};
-template <typename T>
-struct CudaSquareGradFunctor : public BaseActivationFunctor<T> {
-  T two = static_cast<T>(2.0f);
-  // dx = dout * 2 * x
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return dout * two * x;
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-template <typename T>
-struct CudaSqrtFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  // sqrt(x) = sqrt(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(sqrt(x));
-  }
-};
-template <typename T>
-struct CudaSqrtGradFunctor : public BaseActivationFunctor<T> {
-  T one_half = static_cast<T>(0.5f);
-  // dx = dout * 0.5 / out
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return one_half * dout / out;
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-template <typename T>
-struct CudaRsqrtFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  // rsqrt(x) = rsqrt(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(rsqrt(x));
-  }
-};
-template <typename T>
-struct CudaRsqrtGradFunctor : public BaseActivationFunctor<T> {
-  T minus_one_half = static_cast<T>(-0.5f);
-  // dx = -0.5 * dout * out^3
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return minus_one_half * dout * out * out * out;
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
 template <typename T>
 struct CudaSoftReluFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
@@ -201,119 +67,6 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor<T> {
  }
 };
-template <typename T>
-struct CudaSTanhFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  float scale_a;
-  float scale_b;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
-  }
-  // stanh(x) = b * tanh(a * x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    MPType a = static_cast<MPType>(scale_a);
-    MPType b = static_cast<MPType>(scale_b);
-    return static_cast<T>(b * tanh(a * x));
-  }
-};
-template <typename T>
-struct CudaSTanhGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-  float scale_a;
-  float scale_b;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
-  }
-  // dx = dout * a * b * (1 - tanh(a * x) * tanh(a * x))
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    MPType a = static_cast<MPType>(scale_a);
-    MPType b = static_cast<MPType>(scale_b);
-    MPType temp = tanh(a * x);
-    return static_cast<T>(dout * a * b * (one - temp * temp));
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-template <typename T>
-struct CudaSoftplusFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-  float beta;
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}, {"threshold", &threshold}};
-  }
-  // softplus(x) = beta * x > threshold ? x : log(1 + exp(beta * x)) / beta
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    MPType b = static_cast<MPType>(beta);
-    MPType t = static_cast<MPType>(threshold);
-    MPType x_beta = x * beta;
-    return static_cast<T>(x_beta > t ? x : log(one + exp(x_beta)) / b);
-  }
-};
-template <typename T>
-struct CudaSoftplusGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-  float beta;
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}, {"threshold", &threshold}};
-  }
-  // dx = x * beta > threshold ? dout : dout / (1 + exp(-beta * x))
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    MPType b = static_cast<MPType>(beta);
-    MPType t = static_cast<MPType>(threshold);
-    MPType x_beta = x * beta;
-    return x_beta > t ? arg_dout : static_cast<T>(dout / (one + exp(-x_beta)));
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-template <typename T>
-struct CudaSoftsignFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
-  // softsign(x) = x / (1 + abs(x))
-  __device__ __forceinline__ T operator()(const T x) const {
-    return x / (one + abs(x));
-  }
-};
-template <typename T>
-struct CudaSoftsignGradFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
-  // dx = dout / (1 + abs(x))^2
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    T temp = one + abs(x);
-    return dout / (temp * temp);
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
 template <typename T>
 struct CudaRelu6Functor : public BaseActivationFunctor<T> {
  T zero = static_cast<T>(0.0f);
@@ -351,49 +104,23 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
 };
 template <typename T>
-struct CudaMishFunctor : public BaseActivationFunctor<T> {
+struct CudaSoftsignFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
+  T one = static_cast<T>(1.0f);
-  MPType one = static_cast<MPType>(1.0f);
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-  // mish(x) = x * tanh(softplus(x))
+  // softsign(x) = x / (1 + abs(x))
-  // softplus(x) = x, if x > threshold
+  __device__ __forceinline__ T operator()(const T x) const {
-  //             = ln(1 + exp(x)), otherwise
+    return x / (one + abs(x));
-  // Inputs: args[0], the input x
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    MPType sp = (x > static_cast<MPType>(threshold)) ? x : log(one + exp(x));
-    return static_cast<T>(x * tanh(sp));
  }
 };
 template <typename T>
-struct CudaMishGradFunctor : public BaseActivationFunctor<T> {
+struct CudaSoftsignGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
+  T one = static_cast<T>(1.0f);
-  MPType one = static_cast<MPType>(1.0f);
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-  // dx = dout * (tanh(sp) + x * (1 - tanh(sp) ** 2) * (1 - exp(-sp)))
+  // dx = dout / (1 + abs(x))^2
-  // sp = softplus(x)
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-  // Inputs: args[0], the input dout
+    T temp = one + abs(x);
-  //         args[1], the input x
+    return dout / (temp * temp);
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    MPType sp = (x > static_cast<MPType>(threshold)) ? x : log(one + exp(x));
-    MPType gsp =
-        (x > static_cast<MPType>(threshold)) ? one : one / (one + exp(-x));
-    MPType tsp = tanh(sp);
-    return static_cast<T>(dout * (tsp + x * (one - tsp * tsp) * gsp));
  }
  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
@@ -558,6 +285,16 @@ using CudaCeilFunctor = phi::funcs::CudaCeilFunctor<T>;
 template <typename T>
 using CudaZeroGradFunctor = phi::funcs::CudaZeroGradFunctor<T>;
+USE_PHI_FUNCTOR(CudaExp)
+USE_PHI_FUNCTOR(CudaExpm1)
+USE_PHI_FUNCTOR(CudaMish)
+USE_PHI_FUNCTOR(CudaSTanh)
+USE_PHI_FUNCTOR(CudaReciprocal)
+USE_PHI_FUNCTOR(CudaSquare)
+USE_PHI_FUNCTOR(CudaSqrt)
+USE_PHI_FUNCTOR(CudaRsqrt)
+USE_PHI_FUNCTOR(CudaSoftplus)
 template <typename T>
 using CudaELUGradNegativeAlphaFunctor =
    phi::funcs::CudaELUGradNegativeAlphaFunctor<T>;
@@ -636,8 +373,6 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */
 /* ===========================   sqrt register  ============================= */
-REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor,
-                                CudaSqrtGradFunctor);
 REGISTER_OP_CUDA_KERNEL(
    sqrt_grad_grad,
@@ -653,8 +388,6 @@ REGISTER_OP_CUDA_KERNEL(
 /* ===========================   rsqrt register  =============================
 */
-REGISTER_ACTIVATION_CUDA_KERNEL(rsqrt, Rsqrt, CudaRsqrtFunctor,
-                                CudaRsqrtGradFunctor);
 REGISTER_OP_CUDA_KERNEL(
    rsqrt_grad_grad,
@@ -667,8 +400,6 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */
 /* ===========================  square register  ============================ */
-REGISTER_ACTIVATION_CUDA_KERNEL_INT(square, Square, CudaSquareFunctor,
-                                    CudaSquareGradFunctor);
 REGISTER_OP_CUDA_KERNEL(
    square_grad_grad,
@@ -688,75 +419,19 @@ REGISTER_OP_CUDA_KERNEL(
 /* ==========================   logit register  ============================ */
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    logit, ops::LogitKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LogitKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::LogitKernel<paddle::platform::CUDADeviceContext,
-                     paddle::platform::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    logit_grad,
-    ops::LogitGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LogitGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::LogitGradKernel<paddle::platform::CUDADeviceContext,
-                         paddle::platform::float16>);
 /* ========================================================================== */
 /* ==========================   exp register  ============================ */
-REGISTER_OP_CUDA_KERNEL(
-    exp, ops::ActivationCudaKernel<plat::CUDADeviceContext,
-                                   ops::CudaExpFunctor<float>>,
-    ops::ActivationCudaKernel<plat::CUDADeviceContext,
-                              ops::CudaExpFunctor<double>>,
-    ops::ActivationKernel<plat::CUDADeviceContext, ops::ExpFunctor<int>>,
-    ops::ActivationKernel<plat::CUDADeviceContext, ops::ExpFunctor<int64_t>>,
-    ops::ActivationCudaKernel<plat::CUDADeviceContext,
-                              ops::CudaExpFunctor<plat::float16>>);
-REGISTER_OP_CUDA_KERNEL(
-    exp_grad, ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
-                                            ops::CudaExpGradFunctor<float>>,
-    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
-                                  ops::CudaExpGradFunctor<double>>,
-    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
-                                  ops::CudaExpGradFunctor<int>>,
-    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
-                                  ops::CudaExpGradFunctor<int64_t>>,
-    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
-                                  ops::CudaExpGradFunctor<plat::float16>>);
 /* ========================================================================== */
 /* ==========================   expm1 register  ============================ */
-REGISTER_OP_CUDA_KERNEL(
-    expm1, ops::ActivationCudaKernel<plat::CUDADeviceContext,
-                                     ops::CudaExpm1Functor<float>>,
-    ops::ActivationCudaKernel<plat::CUDADeviceContext,
-                              ops::CudaExpm1Functor<double>>,
-    ops::ActivationCudaKernel<plat::CUDADeviceContext,
-                              ops::CudaExpm1Functor<plat::float16>>);
-REGISTER_OP_CUDA_KERNEL(
-    expm1_grad, ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
-                                              ops::CudaExpm1GradFunctor<float>>,
-    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
-                                  ops::CudaExpm1GradFunctor<double>>,
-    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
-                                  ops::CudaExpm1GradFunctor<plat::float16>>);
 /* ========================================================================== */
 #define FOR_EACH_ACTIVATION_CUDA_OP(__macro)                                  \
-  __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor,                      \
-          CudaSoftShrinkGradFunctor);                                         \
-  __macro(reciprocal, Reciprocal, CudaReciprocalFunctor,                      \
-          CudaReciprocalGradFunctor);                                         \
  __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \
-  __macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor);              \
-  __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor);  \
-  __macro(softsign, Softsign, CudaSoftsignFunctor, CudaSoftsignGradFunctor);  \
  __macro(relu6, Relu6, CudaRelu6Functor, CudaRelu6GradFunctor);              \
-  __macro(tanh_shrink, TanhShrink, CudaTanhShrinkFunctor,                     \
+  __macro(softsign, Softsign, CudaSoftsignFunctor, CudaSoftsignGradFunctor);
-          CudaTanhShrinkGradFunctor);                                         \
-  __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor,                     \
-          CudaHardShrinkGradFunctor);                                         \
-  __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor);
 FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL)
 #ifdef PADDLE_WITH_XPU_KP

--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -22,9 +22,9 @@ math_library(sampler DEPS generator)
 math_library(maxouting)
 if(WITH_MKLDNN)
-    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler)
+    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler mixed_vector)
 else()
-    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas)
+    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mixed_vector)
 endif()
 math_library(sequence_padding)

--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #ifdef PADDLE_WITH_MKLDNN

--- a/paddle/fluid/operators/temporal_shift_op.h
+++ b/paddle/fluid/operators/temporal_shift_op.h
@@ -19,56 +19,6 @@ namespace operators {
 using Tensor = framework::Tensor;
 using DataLayout = framework::DataLayout;
-template <typename T>
-void TemporalShiftFwNCHW(const T* input, T* output, const int ntchw,
-                         const int tchw, const int chw, const int hw,
-                         const int t, const int c1, const int c2) {
-  int src_it = 0;
-  for (int i = 0; i < ntchw; i++) {
-    int it = (i % tchw) / chw;
-    int ic = (i % chw) / hw;
-    if (ic < c1) {
-      src_it = it - 1;
-    } else if (ic < c2) {
-      src_it = it + 1;
-    } else {
-      src_it = it;
-    }
-    if (src_it < 0 || src_it >= t) {
-      output[i] = 0;
-    } else {
-      output[i] = input[i + (src_it - it) * chw];
-    }
-  }
-}
-template <typename T>
-void TemporalShiftFwNHWC(const T* input, T* output, const int nthwc,
-                         const int thwc, const int hwc, const int t,
-                         const int c, const int c1, const int c2) {
-  int src_it = 0;
-  for (int i = 0; i < nthwc; i++) {
-    int it = (i % thwc) / hwc;
-    int ic = i % c;
-    if (ic < c1) {
-      src_it = it - 1;
-    } else if (ic < c2) {
-      src_it = it + 1;
-    } else {
-      src_it = it;
-    }
-    if (src_it < 0 || src_it >= t) {
-      output[i] = 0;
-    } else {
-      output[i] = input[i + (src_it - it) * hwc];
-    }
-  }
-}
 template <typename T>
 void TemporalShiftBwNCHW(const T* output_grad, T* input_grad, const int ntchw,
                         const int tchw, const int chw, const int hw,
@@ -122,45 +72,7 @@ void TemporalShiftBwNHWC(const T* output_grad, T* input_grad, const int nthwc,
 template <typename T>
 class TemporalShiftKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {}
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-    int t = ctx.Attr<int>("seg_num");
-    float shift_ratio = ctx.Attr<float>("shift_ratio");
-    const std::string data_format_str = ctx.Attr<std::string>("data_format");
-    const DataLayout data_layout =
-        framework::StringToDataLayout(data_format_str);
-    const int nt = input->dims()[0];
-    const int c = (data_layout == DataLayout::kNCHW ? input->dims()[1]
-                                                    : input->dims()[3]);
-    const int h = (data_layout == DataLayout::kNCHW ? input->dims()[2]
-                                                    : input->dims()[1]);
-    const int w = (data_layout == DataLayout::kNCHW ? input->dims()[3]
-                                                    : input->dims()[2]);
-    const int hw = h * w;
-    const int chw = c * hw;
-    const int tchw = t * chw;
-    const int ntchw = nt * chw;
-    const int c1 = static_cast<int>(c * shift_ratio);
-    const int c2 = static_cast<int>(c * 2 * shift_ratio);
-    framework::DDim out_dims =
-        (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
-                                          : phi::make_ddim({nt, h, w, c}));
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>(out_dims, ctx.GetPlace());
-    if (data_layout == DataLayout::kNCHW) {
-      TemporalShiftFwNCHW<T>(input_data, output_data, ntchw, tchw, chw, hw, t,
-                             c1, c2);
-    } else {
-      TemporalShiftFwNHWC<T>(input_data, output_data, ntchw, tchw, chw, t, c,
-                             c1, c2);
-    }
-  }
 };
 template <typename T>

--- a/paddle/phi/kernels/activation_kernel.h
+++ b/paddle/phi/kernels/activation_kernel.h
@@ -53,6 +53,13 @@ DECLARE_ACTIVATION_KERNEL(Acosh)
 DECLARE_ACTIVATION_KERNEL(Atanh)
 DECLARE_ACTIVATION_KERNEL(Relu)
 DECLARE_ACTIVATION_KERNEL(Tanh)
+DECLARE_ACTIVATION_KERNEL(Exp)
+DECLARE_ACTIVATION_KERNEL(Expm1)
+DECLARE_ACTIVATION_KERNEL(Reciprocal)
+DECLARE_ACTIVATION_KERNEL(Square)
+DECLARE_ACTIVATION_KERNEL(Sqrt)
+DECLARE_ACTIVATION_KERNEL(Rsqrt)
 DECLARE_ACTIVATION_KERNEL(TanhShrink)
 DECLARE_ACTIVATION_KERNEL(Silu)
 DECLARE_ACTIVATION_KERNEL(Sigmoid)
@@ -73,8 +80,23 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Swish, beta)
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max)
+DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b)
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset)
+DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(Softplus, beta, threshold)
+template <typename T, typename Context>
+void LogitKernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 float eps,
+                 DenseTensor* out);
+template <typename T, typename Context>
+void MishKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                float threshold,
+                DenseTensor* out);
 template <typename T, typename Context>
 void HardSwishKernel(const Context& dev_ctx,
                     const DenseTensor& x,

--- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
@@ -129,6 +129,13 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Square, SquareGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, ExpGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, Expm1GradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, ReciprocalGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, SqrtGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, RsqrtGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, LogGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, Log2GradFunctor);
@@ -157,11 +164,24 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
                                               threshold);
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, SwishGradFunctor, beta);
+DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
+                                               MishGradFunctor,
+                                               threshold);
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
                                               BReluGradFunctor,
                                               t_min,
                                               t_max);
+DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh,
+                                               STanhGradFunctor,
+                                               scale_a,
+                                               scale_b);
+DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus,
+                                               SoftplusGradFunctor,
+                                               beta,
+                                               threshold);
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid,
                                                 HardSigmoidGradFunctor,
                                                 slope,
@@ -247,6 +267,12 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(stanh_grad, STanhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_grad, SoftplusGradKernel)
 PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad,
                                          ReluDoubleGradKernel)
@@ -263,6 +289,34 @@ PD_REGISTER_KERNEL(tanh_triple_grad,
                   float,
                   double,
                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(exp_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ExpGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+PD_REGISTER_KERNEL(expm1_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::Expm1GradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(
+    logit_grad, CPU, ALL_LAYOUT, phi::LogitGradKernel, float, double) {}
+PD_REGISTER_KERNEL(square_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SquareGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)

--- a/paddle/phi/kernels/cpu/activation_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_kernel.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/activation_kernel.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/activation_functor.h"
 #include "paddle/phi/kernels/impl/activation_impl.h"
 namespace phi {
@@ -72,6 +73,12 @@ DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Exp, ExpFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Expm1, Expm1Functor)
+DEFINE_CPU_ACTIVATION_KERNEL(Reciprocal, ReciprocalFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Square, SquareFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Sqrt, SqrtFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Rsqrt, RsqrtFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Log, LogFunctor)
@@ -83,15 +90,19 @@ DEFINE_CPU_ACTIVATION_KERNEL(Floor, FloorFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Ceil, CeilFunctor)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
                                     ThresholdedReluFunctor,
                                     threshold)
+DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, MishFunctor, threshold)
+DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max)
+DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(STanh, STanhFunctor, scale_a, scale_b)
+DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, SoftplusFunctor, beta, threshold)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Swish, SwishFunctor, beta)
-DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max)
 DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
                                     HardSigmoidFunctor,
                                     slope,
@@ -139,6 +150,25 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
+PD_REGISTER_ACTIVATION_KERNEL(stanh, STanhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel)
+PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel)
+PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel)
+PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel)
+PD_REGISTER_KERNEL(
+    exp, CPU, ALL_LAYOUT, phi::ExpKernel, float, double, int, int64_t) {}
+PD_REGISTER_KERNEL(expm1,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::Expm1Kernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(logit, CPU, ALL_LAYOUT, phi::LogitKernel, float, double) {}
+PD_REGISTER_KERNEL(
+    square, CPU, ALL_LAYOUT, phi::SquareKernel, float, double, int, int64_t) {}
 PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
 PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
 PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)

--- a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/temporal_shift_grad_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/layout.h"
+#include "paddle/phi/core/kernel_registry.h"
+namespace phi {
+template <typename T>
+void TemporalShiftBwNCHW(const T* output_grad,
+                         T* input_grad,
+                         const int ntchw,
+                         const int tchw,
+                         const int chw,
+                         const int hw,
+                         const int t,
+                         const int c1,
+                         const int c2) {
+  int src_it = 0;
+  for (int i = 0; i < ntchw; i++) {
+    int it = (i % tchw) / chw;
+    int ic = (i % chw) / hw;
+    if (ic < c1) {
+      src_it = it + 1;
+    } else if (ic < c2) {
+      src_it = it - 1;
+    } else {
+      src_it = it;
+    }
+    if (src_it >= 0 && src_it < t) {
+      input_grad[i] = output_grad[i + (src_it - it) * chw];
+    } else {
+      input_grad[i] = 0;
+    }
+  }
+}
+template <typename T>
+void TemporalShiftBwNHWC(const T* output_grad,
+                         T* input_grad,
+                         const int nthwc,
+                         const int thwc,
+                         const int hwc,
+                         const int t,
+                         const int c,
+                         const int c1,
+                         const int c2) {
+  int src_it = 0;
+  for (int i = 0; i < nthwc; i++) {
+    int it = (i % thwc) / hwc;
+    int ic = i % c;
+    if (ic < c1) {
+      src_it = it + 1;
+    } else if (ic < c2) {
+      src_it = it - 1;
+    } else {
+      src_it = it;
+    }
+    if (src_it >= 0 && src_it < t) {
+      input_grad[i] = output_grad[i + (src_it - it) * hwc];
+    } else {
+      input_grad[i] = 0;
+    }
+  }
+}
+template <typename T, typename Context>
+void TemporalShiftGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out_grad,
+                             int seg_num,
+                             float shift_ratio,
+                             const std::string& data_format_str,
+                             DenseTensor* x_grad) {
+  auto* input_grad = x_grad;
+  auto* output_grad = &out_grad;
+  int t = seg_num;
+  const DataLayout data_layout =
+      paddle::framework::StringToDataLayout(data_format_str);
+  const int nt = output_grad->dims()[0];
+  const int c = (data_layout == DataLayout::kNCHW ? output_grad->dims()[1]
+                                                  : output_grad->dims()[3]);
+  const int h = (data_layout == DataLayout::kNCHW ? output_grad->dims()[2]
+                                                  : output_grad->dims()[1]);
+  const int w = (data_layout == DataLayout::kNCHW ? output_grad->dims()[3]
+                                                  : output_grad->dims()[2]);
+  const int hw = h * w;
+  const int chw = c * hw;
+  const int tchw = t * chw;
+  const int ntchw = nt * chw;
+  const int c1 = static_cast<int>(c * shift_ratio);
+  const int c2 = static_cast<int>(c * 2 * shift_ratio);
+  DDim in_grad_dims =
+      (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
+                                        : phi::make_ddim({nt, h, w, c}));
+  const T* output_grad_data = output_grad->data<T>();
+  T* input_grad_data =
+      input_grad->mutable_data<T>(in_grad_dims, dev_ctx.GetPlace());
+  if (data_layout == DataLayout::kNCHW) {
+    TemporalShiftBwNCHW<T>(
+        output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1, c2);
+  } else {
+    TemporalShiftBwNHWC<T>(
+        output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1, c2);
+  }
+}
+}  // namespace phi
+PD_REGISTER_KERNEL(temporal_shift_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::TemporalShiftGradKernel,
+                   float,
+                   double) {}
--- a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc
+++ b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/temporal_shift_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/layout.h"
+#include "paddle/phi/core/kernel_registry.h"
+namespace phi {
+template <typename T>
+void TemporalShiftFwNCHW(const T* input,
+                         T* output,
+                         const int ntchw,
+                         const int tchw,
+                         const int chw,
+                         const int hw,
+                         const int t,
+                         const int c1,
+                         const int c2) {
+  int src_it = 0;
+  for (int i = 0; i < ntchw; i++) {
+    int it = (i % tchw) / chw;
+    int ic = (i % chw) / hw;
+    if (ic < c1) {
+      src_it = it - 1;
+    } else if (ic < c2) {
+      src_it = it + 1;
+    } else {
+      src_it = it;
+    }
+    if (src_it < 0 || src_it >= t) {
+      output[i] = 0;
+    } else {
+      output[i] = input[i + (src_it - it) * chw];
+    }
+  }
+}
+template <typename T>
+void TemporalShiftFwNHWC(const T* input,
+                         T* output,
+                         const int nthwc,
+                         const int thwc,
+                         const int hwc,
+                         const int t,
+                         const int c,
+                         const int c1,
+                         const int c2) {
+  int src_it = 0;
+  for (int i = 0; i < nthwc; i++) {
+    int it = (i % thwc) / hwc;
+    int ic = i % c;
+    if (ic < c1) {
+      src_it = it - 1;
+    } else if (ic < c2) {
+      src_it = it + 1;
+    } else {
+      src_it = it;
+    }
+    if (src_it < 0 || src_it >= t) {
+      output[i] = 0;
+    } else {
+      output[i] = input[i + (src_it - it) * hwc];
+    }
+  }
+}
+template <typename T, typename Context>
+void TemporalShiftKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         int seg_num,
+                         float shift_ratio,
+                         const std::string& data_format_str,
+                         DenseTensor* out) {
+  auto* input = &x;
+  auto* output = out;
+  int t = seg_num;
+  const DataLayout data_layout =
+      paddle::framework::StringToDataLayout(data_format_str);
+  const int nt = input->dims()[0];
+  const int c =
+      (data_layout == DataLayout::kNCHW ? input->dims()[1] : input->dims()[3]);
+  const int h =
+      (data_layout == DataLayout::kNCHW ? input->dims()[2] : input->dims()[1]);
+  const int w =
+      (data_layout == DataLayout::kNCHW ? input->dims()[3] : input->dims()[2]);
+  const int hw = h * w;
+  const int chw = c * hw;
+  const int tchw = t * chw;
+  const int ntchw = nt * chw;
+  const int c1 = static_cast<int>(c * shift_ratio);
+  const int c2 = static_cast<int>(c * 2 * shift_ratio);
+  DDim out_dims =
+      (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
+                                        : phi::make_ddim({nt, h, w, c}));
+  const T* input_data = input->data<T>();
+  T* output_data = output->mutable_data<T>(out_dims, dev_ctx.GetPlace());
+  if (data_layout == DataLayout::kNCHW) {
+    TemporalShiftFwNCHW<T>(
+        input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2);
+  } else {
+    TemporalShiftFwNHWC<T>(
+        input_data, output_data, ntchw, tchw, chw, t, c, c1, c2);
+  }
+}
+}  // namespace phi
+PD_REGISTER_KERNEL(
+    temporal_shift, CPU, ALL_LAYOUT, phi::TemporalShiftKernel, float, double) {}
--- a/paddle/phi/kernels/funcs/activation_functor.h
+++ b/paddle/phi/kernels/funcs/activation_functor.h
--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -189,6 +189,13 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Square, CudaSquareGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, CudaExpGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, CudaExpm1GradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, CudaReciprocalGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, CudaSqrtGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, CudaRsqrtGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, CudaLogGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, CudaLog2GradFunctor);
@@ -211,11 +218,24 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish,
                                               CudaSwishGradFunctor,
                                               beta);
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
+                                               CudaMishGradFunctor,
+                                               threshold);
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
                                               CudaBReluGradFunctor,
                                               t_min,
                                               t_max);
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh,
+                                               CudaSTanhGradFunctor,
+                                               scale_a,
+                                               scale_b);
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus,
+                                               CudaSoftplusGradFunctor,
+                                               beta,
+                                               threshold);
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid,
                                                 CudaHardSigmoidGradFunctor,
                                                 slope,
@@ -326,12 +346,57 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad,
                                   LeakyReluDoubleGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
                                   ThresholdedReluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(stanh_grad, STanhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_grad, SoftplusGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel)
+PD_REGISTER_KERNEL(exp_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ExpGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
+PD_REGISTER_KERNEL(expm1_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Expm1GradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(logit_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LogitGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(square_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SquareGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)

--- a/paddle/phi/kernels/gpu/activation_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/impl/activation_grad_impl.h"
 #include "paddle/phi/kernels/impl/activation_impl.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
@@ -91,6 +92,12 @@ DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Exp, CudaExpFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Expm1, CudaExpm1Functor)
+DEFINE_GPU_ACTIVATION_KERNEL(Reciprocal, CudaReciprocalFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Square, CudaSquareFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Sqrt, CudaSqrtFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Rsqrt, CudaRsqrtFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Log, CudaLogFunctor)
@@ -112,7 +119,14 @@ DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Swish, CudaSwishFunctor, beta)
+DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, CudaMishFunctor, threshold)
 DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max)
+DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Stanh, CudaSTanhFunctor, scale_a, scale_b)
+DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus,
+                                     CudaSoftplusFunctor,
+                                     beta,
+                                     threshold)
 DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
                                     CudaHardSigmoidFunctor,
                                     slope,
@@ -180,6 +194,46 @@ PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
 PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
+PD_REGISTER_ACTIVATION_KERNEL(stanh, StanhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel)
+PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel)
+PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel)
+PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel)
+PD_REGISTER_KERNEL(exp,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ExpKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(expm1,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Expm1Kernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(logit,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LogitKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(square,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SquareKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)

--- a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/layout.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/temporal_shift_grad_kernel.h"
+namespace phi {
+template <typename T>
+__global__ void KeTemporalShiftBwNCHW(const T* output_grad,
+                                      T* input_grad,
+                                      const int ntchw,
+                                      const int tchw,
+                                      const int chw,
+                                      const int hw,
+                                      const int t,
+                                      const int c1,
+                                      const int c2) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  int src_it = 0;
+  for (; tid < ntchw; tid += stride) {
+    int it = (tid % tchw) / chw;
+    int ic = (tid % chw) / hw;
+    if (ic < c1) {
+      src_it = it + 1;
+    } else if (ic < c2) {
+      src_it = it - 1;
+    } else {
+      src_it = it;
+    }
+    if (src_it >= 0 && src_it < t) {
+      input_grad[tid] = output_grad[tid + (src_it - it) * chw];
+    } else {
+      input_grad[tid] = 0;
+    }
+  }
+}
+template <typename T>
+__global__ void KeTemporalShiftBwNHWC(const T* output_grad,
+                                      T* input_grad,
+                                      const int nthwc,
+                                      const int thwc,
+                                      const int hwc,
+                                      const int t,
+                                      const int c,
+                                      const int c1,
+                                      const int c2) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  int src_it = 0;
+  for (; tid < nthwc; tid += stride) {
+    int it = (tid % thwc) / hwc;
+    int ic = tid % c;
+    if (ic < c1) {
+      src_it = it + 1;
+    } else if (ic < c2) {
+      src_it = it - 1;
+    } else {
+      src_it = it;
+    }
+    if (src_it >= 0 && src_it < t) {
+      input_grad[tid] = output_grad[tid + (src_it - it) * hwc];
+    } else {
+      input_grad[tid] = 0;
+    }
+  }
+}
+template <typename T, typename Context>
+void TemporalShiftGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out_grad,
+                             int seg_num,
+                             float shift_ratio,
+                             const std::string& data_format_str,
+                             DenseTensor* x_grad) {
+  auto* input_grad = x_grad;
+  auto* output_grad = &out_grad;
+  int t = seg_num;
+  const DataLayout data_layout =
+      paddle::framework::StringToDataLayout(data_format_str);
+  const int nt = output_grad->dims()[0];
+  const int c = (data_layout == DataLayout::kNCHW ? output_grad->dims()[1]
+                                                  : output_grad->dims()[3]);
+  const int h = (data_layout == DataLayout::kNCHW ? output_grad->dims()[2]
+                                                  : output_grad->dims()[1]);
+  const int w = (data_layout == DataLayout::kNCHW ? output_grad->dims()[3]
+                                                  : output_grad->dims()[2]);
+  const int hw = h * w;
+  const int chw = c * hw;
+  const int tchw = t * chw;
+  const int ntchw = nt * chw;
+  const int c1 = static_cast<int>(c * shift_ratio);
+  const int c2 = static_cast<int>(c * 2 * shift_ratio);
+  DDim in_grad_dims =
+      (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
+                                        : phi::make_ddim({nt, h, w, c}));
+  const T* output_grad_data = output_grad->data<T>();
+  T* input_grad_data =
+      input_grad->mutable_data<T>(in_grad_dims, dev_ctx.GetPlace());
+  int pixelNum = nt * chw;
+  int threads = 1024;
+  int grid = (pixelNum + threads - 1) / threads;
+  int blocks_per_sm = dev_ctx.GetMaxPhysicalThreadCount() / threads;
+  grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid);
+  if (data_layout == DataLayout::kNCHW) {
+    KeTemporalShiftBwNCHW<T><<<grid, threads, 0, dev_ctx.stream()>>>(
+        output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1, c2);
+  } else {
+    KeTemporalShiftBwNHWC<T><<<grid, threads, 0, dev_ctx.stream()>>>(
+        output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1, c2);
+  }
+}
+}  // namespace phi
+PD_REGISTER_KERNEL(temporal_shift_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::TemporalShiftGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
--- a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu
+++ b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/layout.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/temporal_shift_kernel.h"
+namespace phi {
+template <typename T>
+__global__ void KeTemporalShiftFwNCHW(const T* input,
+                                      T* output,
+                                      const int ntchw,
+                                      const int tchw,
+                                      const int chw,
+                                      const int hw,
+                                      const int t,
+                                      const int c1,
+                                      const int c2) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  int src_it = 0;
+  for (; tid < ntchw; tid += stride) {
+    int it = (tid % tchw) / chw;
+    int ic = (tid % chw) / hw;
+    if (ic < c1) {
+      src_it = it - 1;
+    } else if (ic < c2) {
+      src_it = it + 1;
+    } else {
+      src_it = it;
+    }
+    if (src_it < 0 || src_it >= t) {
+      output[tid] = 0;
+    } else {
+      output[tid] = input[tid + (src_it - it) * chw];
+    }
+  }
+}
+template <typename T>
+__global__ void KeTemporalShiftFwNHWC(const T* input,
+                                      T* output,
+                                      const int nthwc,
+                                      const int thwc,
+                                      const int hwc,
+                                      const int t,
+                                      const int c,
+                                      const int c1,
+                                      const int c2) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  int src_it = 0;
+  for (; tid < nthwc; tid += stride) {
+    int it = (tid % thwc) / hwc;
+    int ic = tid % c;
+    if (ic < c1) {
+      src_it = it - 1;
+    } else if (ic < c2) {
+      src_it = it + 1;
+    } else {
+      src_it = it;
+    }
+    if (src_it < 0 || src_it >= t) {
+      output[tid] = 0;
+    } else {
+      output[tid] = input[tid + (src_it - it) * hwc];
+    }
+  }
+}
+template <typename T, typename Context>
+void TemporalShiftKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         int seg_num,
+                         float shift_ratio,
+                         const std::string& data_format_str,
+                         DenseTensor* out) {
+  auto* input = &x;
+  auto* output = out;
+  int t = seg_num;
+  const DataLayout data_layout =
+      paddle::framework::StringToDataLayout(data_format_str);
+  const int nt = input->dims()[0];
+  const int c =
+      (data_layout == DataLayout::kNCHW ? input->dims()[1] : input->dims()[3]);
+  const int h =
+      (data_layout == DataLayout::kNCHW ? input->dims()[2] : input->dims()[1]);
+  const int w =
+      (data_layout == DataLayout::kNCHW ? input->dims()[3] : input->dims()[2]);
+  const int hw = h * w;
+  const int chw = c * hw;
+  const int tchw = t * chw;
+  const int ntchw = nt * chw;
+  const int c1 = static_cast<int>(c * shift_ratio);
+  const int c2 = static_cast<int>(c * 2 * shift_ratio);
+  DDim out_dims =
+      (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w})
+                                        : phi::make_ddim({nt, h, w, c}));
+  const T* input_data = input->data<T>();
+  T* output_data = output->mutable_data<T>(out_dims, dev_ctx.GetPlace());
+  int pixelNum = nt * chw;
+  int threads = 1024;
+  int grid = (pixelNum + threads - 1) / threads;
+  int blocks_per_sm = dev_ctx.GetMaxPhysicalThreadCount() / threads;
+  grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid);
+  if (data_layout == DataLayout::kNCHW) {
+    KeTemporalShiftFwNCHW<T><<<grid, threads, 0, dev_ctx.stream()>>>(
+        input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2);
+  } else {
+    KeTemporalShiftFwNHWC<T><<<grid, threads, 0, dev_ctx.stream()>>>(
+        input_data, output_data, ntchw, tchw, chw, t, c, c1, c2);
+  }
+}
+}  // namespace phi
+PD_REGISTER_KERNEL(temporal_shift,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::TemporalShiftKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
--- a/paddle/phi/kernels/impl/activation_grad_impl.h
+++ b/paddle/phi/kernels/impl/activation_grad_impl.h
@@ -222,6 +222,24 @@ void EluDoubleGradKernel(const Context& dev_ctx,
  functor(dev_ctx, &x, &ddx, ddout, &dout, dx);
 }
+template <typename T, typename Context>
+void LogitGradKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& out_grad,
+                     float eps,
+                     DenseTensor* x_grad) {
+  dev_ctx.template Alloc<T>(x_grad);
+  auto eigen_x = EigenVector<T>::Flatten(x);
+  auto eigen_dout = EigenVector<T>::Flatten(out_grad);
+  auto eigen_dx = EigenVector<T>::Flatten(*x_grad);
+  auto& place = *dev_ctx.eigen_device();
+  auto eigen_p = EigenVector<T>::Flatten(x);
+  funcs::LogitGradFunctor<T> functor;
+  functor(place, eigen_x, eigen_dout, eigen_dx, eigen_p, eps);
+}
 template <typename T, typename Context>
 void SigmoidDoubleGradKernel(const Context& dev_ctx,
                             const DenseTensor& out,

--- a/paddle/phi/kernels/impl/activation_impl.h
+++ b/paddle/phi/kernels/impl/activation_impl.h
@@ -47,6 +47,22 @@ void ActivationImpl(const Context& dev_ctx,
  }
 }
+template <typename T, typename Context>
+void LogitKernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 float eps,
+                 DenseTensor* out) {
+  dev_ctx.template Alloc<T>(out);
+  auto eigen_out = EigenVector<T>::Flatten(*out);
+  auto eigen_in = EigenVector<T>::Flatten(x);
+  auto& place = *dev_ctx.eigen_device();
+  auto eigen_p = EigenVector<T>::Flatten(*out);
+  funcs::LogitFunctor<T> functor;
+  functor(place, eigen_in, eigen_out, eigen_p, eps);
+}
 template <typename T, typename Context>
 void PowKernel(const Context& dev_ctx,
               const DenseTensor& x,

--- a/paddle/phi/kernels/selected_rows/activation_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/activation_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/selected_rows/activation_kernel.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/activation_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+namespace phi {
+namespace sr {
+template <typename T, typename Context>
+void SquareKernel(const Context& dev_ctx,
+                  const SelectedRows& x,
+                  SelectedRows* out) {
+  out->set_rows(x.rows());
+  out->set_height(x.height());
+  phi::SquareKernel<T, Context>(dev_ctx, x.value(), out->mutable_value());
+}
+template <typename T, typename Context>
+void SqrtKernel(const Context& dev_ctx,
+                const SelectedRows& x,
+                SelectedRows* out) {
+  out->set_rows(x.rows());
+  out->set_height(x.height());
+  phi::SqrtKernel<T, Context>(dev_ctx, x.value(), out->mutable_value());
+}
+}  // namespace sr
+}  // namespace phi
+PD_REGISTER_KERNEL(
+    square_sr, CPU, ALL_LAYOUT, phi::sr::SquareKernel, float, double) {}
+PD_REGISTER_KERNEL(
+    sqrt_sr, CPU, ALL_LAYOUT, phi::sr::SqrtKernel, float, double) {}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(square_sr,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sr::SquareKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+PD_REGISTER_KERNEL(
+    sqrt_sr, GPU, ALL_LAYOUT, phi::sr::SqrtKernel, float, double) {}
+#endif
--- a/paddle/phi/kernels/selected_rows/activation_kernel.h
+++ b/paddle/phi/kernels/selected_rows/activation_kernel.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/selected_rows.h"
+namespace phi {
+namespace sr {
+template <typename T, typename Context>
+void SquareKernel(const Context& dev_ctx,
+                  const SelectedRows& x,
+                  SelectedRows* out);
+template <typename T, typename Context>
+void SqrtKernel(const Context& dev_ctx,
+                const SelectedRows& x,
+                SelectedRows* out);
+}  // namespace sr
+}  // namespace phi
--- a/paddle/phi/kernels/temporal_shift_grad_kernel.h
+++ b/paddle/phi/kernels/temporal_shift_grad_kernel.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+namespace phi {
+template <typename T, typename Context>
+void TemporalShiftGradKernel(const Context& ctx,
+                             const DenseTensor& out_grad,
+                             int seg_num,
+                             float shift_ratio,
+                             const std::string& data_format,
+                             DenseTensor* x_grad);
+}  // namespace phi
--- a/paddle/phi/kernels/temporal_shift_kernel.h
+++ b/paddle/phi/kernels/temporal_shift_kernel.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+namespace phi {
+template <typename T, typename Context>
+void TemporalShiftKernel(const Context& ctx,
+                         const DenseTensor& x,
+                         int seg_num,
+                         float shift_ratio,
+                         const std::string& data_format,
+                         DenseTensor* out);
+}  // namespace phi
--- a/paddle/phi/ops/compat/activation_sig.cc
+++ b/paddle/phi/ops/compat/activation_sig.cc
@@ -43,17 +43,19 @@ namespace phi {
 #define comma ,
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", );      // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", );        // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", );      // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", );        // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", );      // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", );      // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", );        // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", );      // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", );      // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", );      // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", );      // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", );  // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", );    // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", );  // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", );    // NOLINT
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", );  // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Square, "square", );  // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(BRelu, "brelu", "t_min" comma "t_max");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LeakyRelu, "leaky_relu", "alpha");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu,
@@ -61,6 +63,7 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu,
                               "threshold");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold");
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Mish, "mish", "threshold");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", );  // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", );               // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", );   // NOLINT
@@ -74,12 +77,41 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardSwish,
                               "offset");                // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Swish, "swish", "beta");  // NOLINT
-DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", );        // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(STanh,
-DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", );        // NOLINT
+                               "stanh",
-DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", );  // NOLINT
+                               "scale_a" comma "scale_b");  // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Softplus,
+                               "softplus",
+                               "beta" comma "threshold");  // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", );              // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", );              // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", );        // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Exp, "exp", );                // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Expm1, "expm1", );            // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Reciprocal, "reciprocal", );  // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sqrt, "sqrt", );              // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Rsqrt, "rsqrt", );            // NOLINT
 DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid,
                                 "hard_sigmoid",
                                 "slope" comma "offset");  // NOLINT
+KernelSignature SqrtActiOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("X")) {
+    return KernelSignature("sqrt", {"X"}, {}, {"Out"});
+  } else {
+    return KernelSignature("sqrt_sr", {"X"}, {}, {"Out"});
+  }
+}
+KernelSignature SquareActiOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("X")) {
+    return KernelSignature("square", {"X"}, {}, {"Out"});
+  } else {
+    return KernelSignature("square_sr", {"X"}, {}, {"Out"});
+  }
+}
 DEFINE_ACT_GRAD_NODEP_OP_ARGMAP(Round, "round", );  // NOLINT
 DEFINE_ACT_GRAD_NODEP_OP_ARGMAP(Floor, "floor", );  // NOLINT
@@ -132,6 +164,11 @@ KernelSignature EluOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("elu", {"X"}, {"alpha"}, {"Out"});
 }
+KernelSignature LogitGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "logit_grad", {"X", GradVarName("Out")}, {"eps"}, {GradVarName("X")});
+}
 KernelSignature EluGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("elu_grad",
                         {"X", "Out", GradVarName("Out")},
@@ -194,6 +231,18 @@ PD_REGISTER_ARG_MAPPING_FN(asinh_grad, phi::AsinhGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(acosh_grad, phi::AcoshGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(exp_grad, phi::ExpGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(expm1_grad, phi::Expm1GradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(square_grad, phi::SquareGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reciprocal_grad,
+                           phi::ReciprocalGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(sqrt_grad, phi::SqrtGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(rsqrt_grad, phi::RsqrtGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(mish_grad, phi::MishGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(stanh_grad, phi::STanhGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(softplus_grad, phi::SoftplusGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad,
                           phi::ReluDoubleGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(tanh_grad, phi::TanhGradOpArgumentMapping);
@@ -228,11 +277,16 @@ PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad,
                           phi::LogSigmoidGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad,
                           phi::HardSigmoidGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(logit_grad, phi::LogitGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(log_grad, phi::LogGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(log_grad_grad, phi::LogDoubleGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(log2_grad, phi::Log2GradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(log10_grad, phi::Log10GradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(log1p_grad, phi::Log1pGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(sqrt, phi::SqrtActiOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(square, phi::SquareActiOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(hard_swish_grad,
                           phi::HardSwishGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(swish_grad, phi::SwishGradOpArgumentMapping);

--- a/paddle/phi/ops/compat/temporal_shift_sig.cc
+++ b/paddle/phi/ops/compat/temporal_shift_sig.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/core/compat/op_utils.h"
+namespace phi {
+KernelSignature TemporalShiftOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("temporal_shift",
+                         {"X"},
+                         {"seg_num", "shift_ratio", "data_format"},
+                         {"Out"});
+}
+KernelSignature TemporalShiftGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("temporal_shift_grad",
+                         {GradVarName("Out")},
+                         {"seg_num", "shift_ratio", "data_format"},
+                         {GradVarName("X")});
+}
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(temporal_shift, phi::TemporalShiftOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(temporal_shift_grad,
+                           phi::TemporalShiftGradOpArgumentMapping);
--- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
@@ -342,4 +342,5 @@ class TestLogDoubleGradCheck(unittest.TestCase):
 if __name__ == "__main__":
+    paddle.enable_static()
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_activation_sparse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_sparse_op.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from op_test import OpTest
+import paddle
+class TestSparseSquareOp(unittest.TestCase):
+    def check_with_place(self, place):
+        scope = core.Scope()
+        # create and initialize Grad Variable   
+        height = 10
+        rows = [0, 4, 7]
+        self.row_numel = 12
+        x_selected_rows = scope.var('X').get_selected_rows()
+        x_selected_rows.set_height(height)
+        x_selected_rows.set_rows(rows)
+        np_array = np.ones((len(rows), self.row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        np_array[2, 8] = 4.0
+        x_tensor = x_selected_rows.get_tensor()
+        x_tensor.set(np_array, place)
+        out_selected_rows = scope.var('Out').get_selected_rows()
+        # create and run sqrt operator
+        square_op = Operator("square", X='X', Out='Out')
+        square_op.run(scope, place)
+        # get and compare result
+        result_array = np.array(out_selected_rows.get_tensor())
+        self.assertTrue(np.array_equal(result_array, np.square(np_array)))
+    def test_sparse_acti(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place)
+class TestSparseSqrtOp(unittest.TestCase):
+    def check_with_place(self, place):
+        scope = core.Scope()
+        # create and initialize Grad Variable   
+        height = 10
+        rows = [0, 4, 7]
+        self.row_numel = 12
+        x_selected_rows = scope.var('X1').get_selected_rows()
+        x_selected_rows.set_height(height)
+        x_selected_rows.set_rows(rows)
+        np_array = np.ones((len(rows), self.row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        np_array[2, 8] = 4.0
+        x_tensor = x_selected_rows.get_tensor()
+        x_tensor.set(np_array, place)
+        out_selected_rows = scope.var('Out1').get_selected_rows()
+        # create and run sqrt operator
+        sqrt_op = Operator("sqrt", X='X1', Out='Out1')
+        sqrt_op.run(scope, place)
+        # get and compare result
+        result_array = np.array(out_selected_rows.get_tensor())
+        self.assertTrue(np.allclose(result_array, np.sqrt(np_array)))
+    def test_sparse_acti(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place)
+if __name__ == "__main__":
+    paddle.enable_static()
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
 from op_test import OpTest
 import paddle.fluid as fluid
@@ -153,4 +154,5 @@ class TestClipByNormOpWithSelectedRows(unittest.TestCase):
 if __name__ == '__main__':
+    paddle.enable_static()
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
+++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
@@ -143,4 +143,5 @@ class TestTemporalShiftAPI(unittest.TestCase):
 if __name__ == "__main__":
+    paddle.enable_static()
    unittest.main()