diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 526d35af6e0f4306fda247adcddd0b563ab74a87..98c73c3cb3f4f0b5fc7d761aee352f0c51e8794d 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -440,19 +440,6 @@ struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
-template <typename T>
-struct Expm1GradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * out + dout;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
 // relu(x) = max(x, 0)
 template <typename T>
diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps
index 17476c48e41ac3b28ae4a90e7f30e5ad64a42b6c..e63bf6ec47ef0838d4f42419eafc6af10518335b 100644
--- a/paddle/fluid/operators/activation_op.kps
+++ b/paddle/fluid/operators/activation_op.kps
@@ -140,18 +140,6 @@ struct CudaReciprocalGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct CudaExpm1GradFunctor : public BaseActivationFunctor<T> {
-  // dx = dout * out
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return dout * out + dout;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
 template <typename T>
 struct CudaLogFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
index 5b397e7774a255d6e1361a03a6a07d3301481908..d3d6989696b18a67e7b974bec70a80006254d51b 100644
--- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
@@ -104,6 +104,7 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, ExpGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, Expm1GradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
@@ -214,3 +215,11 @@ PD_REGISTER_KERNEL(exp_grad,
                    double,
                    int,
                    int64_t) {}
+
+PD_REGISTER_KERNEL(expm1_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::Expm1GradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc
index edad9734ccea4e0991094ee015c3632f863dec4b..46a85a2b9b2f0a0a52dc2bbf9028352cb3237011 100644
--- a/paddle/phi/kernels/cpu/activation_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_kernel.cc
@@ -120,12 +120,12 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
-// PD_REGISTER_ACTIVATION_KERNEL(mish, Mish)
+PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
 PD_REGISTER_ACTIVATION_KERNEL(stanh, STanhKernel)
 PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel)
 PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel)
 PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel)
-// PD_REGISTER_ACTIVATION_KERNEL(softplus, Softplus)
+PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel)
 PD_REGISTER_ACTIVATION_KERNEL(softsign, SoftsignKernel)
 
 PD_REGISTER_KERNEL(
diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h
index 372834d657ad48c58a2b53653143d580c29108f6..eb98ee2ea2637aeaaf5da587fad4101eeb1d2f6e 100644
--- a/paddle/phi/kernels/funcs/activation_functor.h
+++ b/paddle/phi/kernels/funcs/activation_functor.h
@@ -157,9 +157,10 @@ struct LogitFunctor {
   }
 };
 
-// mish(x) = x * tanh(softplus(x))
-// softplus(x) = x, if x > threshold
-//            = ln(1 + exp(x)), otherwise
+// // mish(x) = x * tanh(softplus(x))
+// // softplus(x) = x, if x > threshold
+// //            = ln(1 + exp(x)), otherwise
+
 template <typename T>
 struct MishFunctor : public BaseActivationFunctor<T> {
   float threshold;
@@ -168,7 +169,7 @@ struct MishFunctor : public BaseActivationFunctor<T> {
   }
 
   template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) {
+  void operator()(Device d, X x, Out out) const {
     auto sp = (x > static_cast<T>(threshold))
                   .select(x, (static_cast<T>(1) + x.exp()).log());
     out.device(d) = x * sp.tanh();
@@ -244,20 +245,41 @@ struct RsqrtFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-// For numerical stability, using the following formula instead of softplus(x) =
-// log(1 + exp(x))
-// softplus(x) = log(1 + exp(beta * x)) / beta when beta * x <= threshold(beta =
-// 1, threshold = 20 by default), otherwise x
+// // For numerical stability, using the following formula instead of
+// softplus(x) =
+// // log(1 + exp(x))
+// // softplus(x) = log(1 + exp(beta * x)) / beta when beta * x <=
+// threshold(beta =
+// // 1, threshold = 20 by default), otherwise x
+// template <typename T>
+// struct SoftplusFunctor : public BaseActivationFunctor<T> {
+//   float beta;
+//   float threshold;
+//   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+//     return {{"beta", &beta}, {"threshold", &threshold}};
+//   }
+
+//   template <typename Device, typename X, typename Out>
+//   void operator()(Device d, X x, Out out) {
+//     auto x_beta = static_cast<T>(beta) * x;
+//     out.device(d) = (x_beta > static_cast<T>(threshold))
+//                         .select(x,
+//                                 (static_cast<T>(1) + x_beta.exp()).log() /
+//                                     static_cast<T>(beta));
+//   }
+// };
+
 template <typename T>
 struct SoftplusFunctor : public BaseActivationFunctor<T> {
   float beta;
   float threshold;
+
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"beta", &beta}, {"threshold", &threshold}};
   }
 
   template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) {
+  void operator()(Device d, X x, Out out) const {
     auto x_beta = static_cast<T>(beta) * x;
     out.device(d) = (x_beta > static_cast<T>(threshold))
                         .select(x,
@@ -602,6 +624,22 @@ struct Expm1Functor : public BaseActivationFunctor<T> {
   }
 };
 
+template <typename T>
+struct Expm1GradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * out + dout;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
 // relu(x) = max(x, 0)
 template <typename T>
 struct ReluCPUFunctor : public BaseActivationFunctor<T> {
@@ -822,11 +860,10 @@ struct BReluGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
-// softsign(x) = x / (1 + |x|)
 template <typename T>
 struct SoftsignFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) {
+  void operator()(Device d, X x, Out out) const {
     out.device(d) = x / (static_cast<T>(1) + x.abs());
   }
 };
@@ -1264,6 +1301,18 @@ struct CudaExpm1Functor : public BaseActivationFunctor<T> {
   }
 };
 
+template <typename T>
+struct CudaExpm1GradFunctor : public BaseActivationFunctor<T> {
+  // dx = dout * out + dout
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return dout * out + dout;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
 template <typename T>
 struct CudaSinFunctor : public BaseActivationFunctor<T> {
   using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
index a7f89ff85d6bd017aa824aac3f3679d4e7c2264e..00c08ff497dbd8ff630f4e5b093291b91023cb2d 100644
--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -158,6 +158,7 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, CudaExpGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, CudaExpm1GradFunctor);
 
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
                                                CudaLeakyReluGradFunctor,
@@ -274,9 +275,18 @@ PD_REGISTER_KERNEL(exp_grad,
                    double,
                    int,
                    int64_t) {}
+
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
+
+PD_REGISTER_KERNEL(expm1_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Expm1GradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
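
Note (reviewer illustration, not part of the patch): the backward rule shared by Expm1GradFunctor and CudaExpm1GradFunctor, dx = dout * out + dout, follows from d/dx expm1(x) = exp(x) = expm1(x) + 1, which is why the gradient only needs the forward output (ActBwdOpFwdDeps::kDepOut). The standalone sketch below assumes nothing beyond the C++ standard library (the file name and sample points are made up for illustration) and checks that rule against a central finite difference.

// expm1_grad_check.cc -- illustrative sketch only, not PaddlePaddle code.
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  const double eps = 1e-6;
  for (double x : {-2.0, -0.5, 0.0, 0.5, 2.0}) {
    double out = std::expm1(x);          // forward output: exp(x) - 1
    double dout = 1.0;                   // upstream gradient
    double dx_rule = dout * out + dout;  // rule used by Expm1GradFunctor
    // Central finite difference of expm1 as an independent reference.
    double dx_fd = (std::expm1(x + eps) - std::expm1(x - eps)) / (2.0 * eps);
    std::printf("x=%5.2f  rule=%.8f  finite-diff=%.8f\n", x, dx_rule, dx_fd);
    assert(std::fabs(dx_rule - dx_fd) < 1e-4);
  }
  return 0;
}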