diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h
index a8c9640d479d3e92a64a5797b332621414048458..e80dfba325937796d5539022850356bd1addd3ca 100644
--- a/paddle/fluid/operators/elementwise/elementwise_functor.h
+++ b/paddle/fluid/operators/elementwise/elementwise_functor.h
@@ -54,7 +54,7 @@ using InverseDivFunctor = pten::funcs::InverseDivideFunctor<T>;
 // Floor Divide
 template <typename T>
 struct FloorDivFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const {
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
     PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO);
     return static_cast<T>(std::trunc(a / b));
   }
@@ -62,7 +62,7 @@ struct FloorDivFunctor {
 
 template <typename T>
 struct InverseFloorDivFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const {
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
     PADDLE_ENFORCE(a != 0, DIV_ERROR_INFO);
     return static_cast<T>(std::trunc(b / a));
   }
@@ -73,7 +73,7 @@ struct InverseFloorDivFunctor {
 // Maximum
 template <typename T>
 struct MaxFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const {
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
     return a > b ? a : b;
   }
 };
@@ -81,7 +81,7 @@ struct MaxFunctor {
 // Minmum
 template <typename T>
 struct MinFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const {
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
     return a < b ? a : b;
   }
 };
@@ -119,14 +119,14 @@ struct DivGradXYFunctor<Complex<InT>, Complex<OutT>> {
 // Float div grad
 template <typename T>
 struct DivGradXFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; }
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
 };
 
 // Complex div grad
 template <typename T>
 struct DivGradXFunctor<Complex<T>> {
-  inline HOSTDEVICE Complex<T> operator()(const Complex<T>& a,
-                                          const Complex<T>& b) const {
+  inline HOSTDEVICE Complex<T> operator()(const Complex<T> a,
+                                          const Complex<T> b) const {
     Complex<T> b_conj(b.real, -b.imag);
     return a / b_conj;
   }
@@ -135,7 +135,7 @@ struct DivGradXFunctor<Complex<T>> {
 // Float mul and div
 template <typename T>
 struct DivGradYFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b, const T& c) const {
+  inline HOSTDEVICE T operator()(const T a, const T b, const T c) const {
     return -a * b / c;
   }
 };
@@ -143,9 +143,9 @@ struct DivGradYFunctor {
 // Complex mul and div
 template <typename T>
 struct DivGradYFunctor<Complex<T>> {
-  inline HOSTDEVICE Complex<T> operator()(const Complex<T>& a,
-                                          const Complex<T>& b,
-                                          const Complex<T>& c) const {
+  inline HOSTDEVICE Complex<T> operator()(const Complex<T> a,
+                                          const Complex<T> b,
+                                          const Complex<T> c) const {
     Complex<T> out_div_c_conj((b / c).real, -(b / c).imag);
     return -a * out_div_c_conj;
   }
@@ -154,7 +154,7 @@ struct DivGradYFunctor<Complex<T>> {
 // Fmax
 template <typename T>
 struct FMaxFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const {
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
     return std::fmax(a, b);
   }
 };
@@ -162,8 +162,8 @@ struct FMaxFunctor {
 template <>
 struct FMaxFunctor<paddle::platform::float16> {
   inline HOSTDEVICE paddle::platform::float16 operator()(
-      const paddle::platform::float16& a,
-      const paddle::platform::float16& b) const {
+      const paddle::platform::float16 a,
+      const paddle::platform::float16 b) const {
     float float_a = static_cast<float>(a);
     float float_b = static_cast<float>(b);
     auto result = std::fmax(float_a, float_b);
@@ -173,7 +173,7 @@ struct FMaxFunctor<paddle::platform::float16> {
 
 template <>
 struct FMaxFunctor<int> {
-  inline HOSTDEVICE int operator()(const int& a, const int& b) const {
+  inline HOSTDEVICE int operator()(const int a, const int b) const {
     float float_a = static_cast<float>(a);
     float float_b = static_cast<float>(b);
     auto result = std::fmax(float_a, float_b);
@@ -183,8 +183,7 @@ struct FMaxFunctor<int> {
 
 template <>
 struct FMaxFunctor<int64_t> {
-  inline HOSTDEVICE int64_t operator()(const int64_t& a,
-                                       const int64_t& b) const {
+  inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const {
     double double_a = static_cast<double>(a);
     double double_b = static_cast<double>(b);
     auto result = std::fmax(double_a, double_b);
@@ -195,7 +194,7 @@ struct FMaxFunctor<int64_t> {
 // Fmin
 template <typename T>
 struct FMinFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const {
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
     return std::fmin(a, b);
   }
 };
@@ -203,8 +202,8 @@ struct FMinFunctor {
 template <>
 struct FMinFunctor<paddle::platform::float16> {
   inline HOSTDEVICE paddle::platform::float16 operator()(
-      const paddle::platform::float16& a,
-      const paddle::platform::float16& b) const {
+      const paddle::platform::float16 a,
+      const paddle::platform::float16 b) const {
     float float_a = static_cast<float>(a);
     float float_b = static_cast<float>(b);
     auto result = std::fmin(float_a, float_b);
@@ -214,7 +213,7 @@ struct FMinFunctor<paddle::platform::float16> {
 
 template <>
 struct FMinFunctor<int> {
-  inline HOSTDEVICE int operator()(const int& a, const int& b) const {
+  inline HOSTDEVICE int operator()(const int a, const int b) const {
     float float_a = static_cast<float>(a);
     float float_b = static_cast<float>(b);
     auto result = std::fmin(float_a, float_b);
@@ -224,8 +223,7 @@ struct FMinFunctor<int> {
 
 template <>
 struct FMinFunctor<int64_t> {
-  inline HOSTDEVICE int64_t operator()(const int64_t& a,
-                                       const int64_t& b) const {
+  inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const {
     double double_a = static_cast<double>(a);
     double double_b = static_cast<double>(b);
     auto result = std::fmin(double_a, double_b);
@@ -261,12 +259,12 @@ struct MinGradXYFunctor {
 
 template <typename T>
 struct MulGradFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a * b; }
 };
 template <typename T>
 struct MulGradFunctor<Complex<T>> {
-  inline HOSTDEVICE Complex<T> operator()(const Complex<T>& a,
-                                          const Complex<T>& b) const {
+  inline HOSTDEVICE Complex<T> operator()(const Complex<T> a,
+                                          const Complex<T> b) const {
     Complex<T> b_conj(b.real, -b.imag);
     return a * b_conj;
   }
@@ -274,9 +272,9 @@ struct MulGradFunctor<Complex<T>> {
 
 template <typename InT, typename OutT>
 struct MulGradXYFunctor {
-  inline HOSTDEVICE paddle::framework::Array<OutT, 2> operator()(const InT& a,
-                                                                 const InT& b,
-                                                                 const InT& c) {
+  inline HOSTDEVICE paddle::framework::Array<OutT, 2> operator()(const InT a,
+                                                                 const InT b,
+                                                                 const InT c) {
     paddle::framework::Array<OutT, 2> outs;
     // dx = dout * y
     outs[0] = a * b;
@@ -289,7 +287,7 @@ struct MulGradXYFunctor {
 template <typename InT, typename OutT>
 struct MulGradXYFunctor<Complex<InT>, Complex<OutT>> {
   inline HOSTDEVICE paddle::framework::Array<Complex<OutT>, 2> operator()(
-      const Complex<InT>& a, const Complex<InT>& b, const Complex<InT>& c) {
+      const Complex<InT> a, const Complex<InT> b, const Complex<InT> c) {
     paddle::framework::Array<Complex<OutT>, 2> outs;
     // dx = dout * y
     Complex<InT> b_conj(b.real, -b.imag);
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu
index d2106645a472791fe37bddda4deadd938920dd8d..4ef957c617870e67e880afcff022fbef73dc8e5b 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu
@@ -20,31 +20,6 @@ namespace plat = paddle::platform;
 namespace paddle {
 namespace operators {
 
-template <typename T, typename Enable = void>
-struct CudaModFunctor {
-  inline HOSTDEVICE T operator()(const T* args) const {
-    T res = args[0] % args[1];
-
-    // Accoding to #PR26732: in dividen % divsor
-    // remainder shall have the same sign as divsor.
-    if ((res != 0) && ((args[1] ^ res) < 0)) res += args[1];
-    return res;
-  }
-};
-
-template <typename T>
-struct CudaModFunctor<
-    T, typename std::enable_if_t<std::is_floating_point<T>::value>> {
-  inline HOSTDEVICE T operator()(const T* args) const {
-    T res = fmod(args[0], args[1]);
-
-    // Accoding to #PR26732: in dividen % divsor
-    // remainder shall have the same sign as divsor.
-    if ((res != 0) && ((res < 0) != (args[1] < 0))) res += args[1];
-    return res;
-  }
-};
-
 template <typename T>
 class ElementwiseModKernel<platform::CUDADeviceContext, T>
     : public framework::OpKernel<T> {
@@ -56,7 +31,7 @@ class ElementwiseModKernel<platform::CUDADeviceContext, T>
         ctx.template device_context<platform::CUDADeviceContext>();
     int axis = PackTensorsIntoVector<T>(ctx, &ins, &outs);
     LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
-        cuda_ctx, ins, &outs, axis, CudaModFunctor<T>());
+        cuda_ctx, ins, &outs, axis, ModFunctor<T>());
   }
 };
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
index 66c3e553c141fd925d64adb037525d016355dedd..bfb12e44b6b94cfcdbc0b2eceb03c73733ff7774 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
@@ -19,29 +19,36 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T>
+template <typename T, typename Enable = void>
 struct ModFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const {
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
     T res = a % b;
-    if ((res != 0) && ((res < 0) != (b < 0))) res += b;
+
+    // According to #PR26732: in dividend % divisor,
+    // the remainder shall have the same sign as the divisor.
+    if ((res != 0) && ((b ^ res) < 0)) res += b;
     return res;
   }
 };
 
 template <typename T>
-struct InverseModFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const {
-    T res = b % a;
-    if ((res != 0) && ((res < 0) != (a < 0))) res += a;
+struct ModFunctor<T,
+                  typename std::enable_if_t<std::is_floating_point<T>::value>> {
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
+    T res = fmod(a, b);
+
+    // According to #PR26732: in dividend % divisor,
+    // the remainder shall have the same sign as the divisor.
+    if ((res != 0) && ((res < 0) != (b < 0))) res += b;
     return res;
   }
 };
 
 template <typename T>
-struct ModFunctorFP {
+struct InverseModFunctor {
   inline HOSTDEVICE T operator()(T a, T b) const {
-    T res = fmod(a, b);
-    if ((res != 0) && ((b < 0) != (res < 0))) res += b;
+    T res = b % a;
+    if ((res != 0) && ((res < 0) != (a < 0))) res += a;
     return res;
   }
 };
@@ -79,8 +86,8 @@ void elementwise_mod_fp(const framework::ExecutionContext &ctx,
   auto x_dims = x->dims();
   auto y_dims = y->dims();
   if (x_dims.size() >= y_dims.size()) {
-    ElementwiseComputeEx<ModFunctorFP<T>, DeviceContext, T>(
-        ctx, x, y, axis, ModFunctorFP<T>(), z);
+    ElementwiseComputeEx<ModFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          ModFunctor<T>(), z);
   } else {
     ElementwiseComputeEx<InverseModFunctorFP<T>, DeviceContext, T>(
         ctx, x, y, axis, InverseModFunctorFP<T>(), z);
diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu
index 0f3aa8c3e1b9b152e8d941407841248b938bcfe3..722a53d188061b91d9c880fafac11bf70107bf6a 100644
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu
@@ -16,26 +16,6 @@ namespace ops = paddle::operators;
 namespace paddle {
 namespace operators {
 
-template <typename T, typename Enable = void>
-struct CudaPowFunctor {
-  inline HOSTDEVICE T operator()(const T args[]) const {
-    return std::pow(args[0], args[1]);
-  }
-};
-
-template <typename T>
-struct CudaPowFunctor<
-    T, typename std::enable_if<std::is_integral<T>::value>::type> {
-  // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and
-  // it will return a float number like 2.99... , which floor to 2
-  // when cast to int by default and it is wrong.
-  // Use llrint to cast it to the nearest integer, which is 3.
-  inline HOSTDEVICE T operator()(const T args[]) const {
-    return std::llrint(
-        std::pow(static_cast<double>(args[0]), static_cast<double>(args[1])));
-  }
-};
-
 template <typename T>
 class ElementwisePowKernel<platform::CUDADeviceContext, T>
     : public framework::OpKernel<T> {
@@ -48,7 +28,7 @@ class ElementwisePowKernel<platform::CUDADeviceContext, T>
 
     int axis = PackTensorsIntoVector<T>(ctx, &ins, &outs);
     LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
-        cuda_ctx, ins, &outs, axis, CudaPowFunctor<T>());
+        cuda_ctx, ins, &outs, axis, PowFunctor<T>());
   }
 };
 
diff --git a/paddle/pten/kernels/funcs/elementwise_functor.h b/paddle/pten/kernels/funcs/elementwise_functor.h
index 9b2519b0fd6b1418a9e73d3c3985bf6ce99dbbbd..7f33150739e1c50383c33d84445cff6eaa450983 100644
--- a/paddle/pten/kernels/funcs/elementwise_functor.h
+++ b/paddle/pten/kernels/funcs/elementwise_functor.h
@@ -26,31 +26,31 @@ namespace funcs {
 // Add
 template <typename T>
 struct AddFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; }
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a + b; }
 };
 template <typename T>
 struct InverseAddFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; }
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return b + a; }
 };
 
 // Subtract
 template <typename T>
 struct SubtractFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; }
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; }
 };
 template <typename T>
 struct InverseSubtractFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; }
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return b - a; }
 };
 
 // Multiply
 template <typename T>
 struct MultiplyFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a * b; }
 };
 template <typename T>
 struct InverseMultiplyFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; }
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return b * a; }
 };
 
 // Divide
@@ -60,14 +60,14 @@ struct InverseMultiplyFunctor {
 
 template <typename T, typename Enable = void>
 struct DivideFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; }
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
 };
 
 template <typename T>
 struct DivideFunctor<
     T,
     typename std::enable_if<std::is_integral<T>::value>::type> {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const {
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
     // For int32/int64, need to check whether the divison is zero.
     PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO);
     return a / b;
@@ -76,7 +76,7 @@ struct DivideFunctor<
 
 template <typename T, typename Enable = void>
 struct InverseDivideFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b / a; }
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return b / a; }
 };
 
 }  // namespace funcs