From bf30503335c2c8015dd20f991ef4480af9b5898d Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Fri, 11 Feb 2022 10:14:45 +0800 Subject: [PATCH] Support different dtypes of inputs for elementwise ops (#38859) * improve backward performance * support different dtypes for elementwise ops --- paddle/fluid/operators/abs_op.cu | 5 +- paddle/fluid/operators/activation_op.cu | 24 +-- paddle/fluid/operators/bce_loss_op.cu | 13 +- paddle/fluid/operators/clip_op.h | 7 +- .../fluid/operators/controlflow/bitwise_op.cu | 4 +- .../operators/controlflow/compare_all_op.cu | 4 +- paddle/fluid/operators/dropout_impl.cu.h | 36 ++-- .../elementwise/elementwise_functor.h | 18 +- .../elementwise/elementwise_op_impl.cu.h | 8 +- paddle/fluid/operators/label_smooth_op.cu | 8 +- paddle/fluid/operators/lgamma_op.cu | 4 +- paddle/fluid/operators/p_norm_op.cu | 3 +- paddle/fluid/operators/renorm_op.cu | 5 +- paddle/fluid/operators/viterbi_decode_op.cu | 5 +- paddle/fluid/platform/function_traits.h | 1 + paddle/pten/kernels/funcs/elementwise_base.h | 187 ++++++++++++------ paddle/pten/kernels/gpu/cast_kernel.cu | 4 +- paddle/pten/kernels/gpu/elementwise.h | 5 +- paddle/pten/kernels/gpu/full_kernel.cu | 8 +- paddle/pten/kernels/gpu/reduce.h | 2 +- paddle/pten/kernels/gpu/scale_kernel.cu | 4 +- .../kernels/primitive/datamover_primitives.h | 54 +++++ .../primitive/datamover_primitives_xpu2.h | 46 +++++ 23 files changed, 309 insertions(+), 146 deletions(-) diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu index 48e19defd03..882c8547a04 100644 --- a/paddle/fluid/operators/abs_op.cu +++ b/paddle/fluid/operators/abs_op.cu @@ -50,9 +50,8 @@ class AbsKernel std::vector ins = {x}; std::vector outs = {out}; auto functor = CudaAbsFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, math::Real>(dev_ctx, ins, &outs, - functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel>( + dev_ctx, ins, &outs, functor); } }; diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 1ee5f35883d..e578ad899e7 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -1367,14 +1367,14 @@ class ELUGradCudaKernel : public framework::OpKernel { if (alpha > 0) { CudaELUGradFunctor functor; functor.alpha = alpha; - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } else { CudaELUGradNegativeAlphaFunctor functor; functor.alpha = alpha; ins.push_back(x); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } } }; @@ -1450,8 +1450,8 @@ class ActivationCudaKernel for (auto& attr : attrs) { *attr.second = ctx.Attr(attr.first); } - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } }; @@ -1480,17 +1480,17 @@ class ActivationGradCudaKernel if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { // Only need forward output Out ins.push_back(out); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor); + 
paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } else if (static_cast(Functor::FwdDeps()) == static_cast(kDepX)) { // Only need forward input X ins.push_back(x); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } else { - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } } }; diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index 6ab2e8a6df2..80bdfc5a21a 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -31,7 +31,7 @@ struct BCELossFunctor { neg_100 = static_cast(-100.); } - HOSTDEVICE inline T operator()(const T& x, const T& label) const { + HOSTDEVICE inline T operator()(const T x, const T label) const { PADDLE_ENFORCE( (x >= static_cast(0)) && (x <= one), "Input is expected to be within the interval [0, 1], but recieved %f.", @@ -52,8 +52,7 @@ struct BCELossGradFunctor { eps = static_cast(1e-12); } - HOSTDEVICE inline T operator()(const T& x, const T& label, - const T& dout) const { + HOSTDEVICE inline T operator()(const T x, const T label, const T dout) const { T term1 = max((one - x) * x, eps); return (dout * (x - label) / term1); } @@ -73,8 +72,8 @@ class BCELossCUDAKernel : public framework::OpKernel { std::vector outs = {out}; auto& dev_ctx = ctx.template device_context(); auto functor = BCELossFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } }; @@ -91,8 +90,8 @@ class BCELossGradCUDAKernel : public framework::OpKernel { std::vector outs = {dx}; auto& dev_ctx = ctx.template device_context(); auto functor = BCELossGradFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kTernary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } }; diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h index 47bb61a77fa..dde1214679a 100644 --- a/paddle/fluid/operators/clip_op.h +++ b/paddle/fluid/operators/clip_op.h @@ -45,7 +45,7 @@ template class ClipGradFunctor { public: explicit ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {} - HOSTDEVICE T operator()(const T& x, const T& y) const { + HOSTDEVICE T operator()(const T x, const T y) const { return (y > min_ && y < max_) ? 
x : static_cast(0); } @@ -103,8 +103,7 @@ class ClipKernel : public framework::OpKernel { std::vector ins = {x}; std::vector outs = {out}; auto functor = ClipFunctor(min, max); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T>( + paddle::operators::LaunchSameDimsElementwiseCudaKernel( context.template device_context(), ins, &outs, functor); #endif @@ -177,7 +176,7 @@ class ClipGradKernel : public framework::OpKernel { std::vector outs = {d_x}; auto functor = ClipGradFunctor(min, max); d_x->mutable_data(context.GetPlace()); - LaunchSameDimsElementwiseCudaKernel( + LaunchSameDimsElementwiseCudaKernel( context.template device_context(), ins, &outs, functor); #else diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cu b/paddle/fluid/operators/controlflow/bitwise_op.cu index 3a4d5303953..5d98da2c027 100644 --- a/paddle/fluid/operators/controlflow/bitwise_op.cu +++ b/paddle/fluid/operators/controlflow/bitwise_op.cu @@ -57,8 +57,8 @@ class UnaryBitwiseOpKernel std::vector outs = {out}; const auto& cuda_ctx = ctx.template device_context(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T>(cuda_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(cuda_ctx, ins, + &outs, functor); } }; diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cu b/paddle/fluid/operators/controlflow/compare_all_op.cu index 1dfa7f44279..d96dcebe51f 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.cu +++ b/paddle/fluid/operators/controlflow/compare_all_op.cu @@ -55,8 +55,8 @@ class CompareReduceOpKernel context.template device_context(); std::vector ins = {x, y}; std::vector outs = {&tmp}; - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kBinary, T, bool>(cuda_ctx, ins, &outs, Functor()); + paddle::operators::LaunchSameDimsElementwiseCudaKernel( + cuda_ctx, ins, &outs, Functor()); // Reduce by 'bitwise and' operator std::vector reduce_dims; diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 192902902e5..d7c49466d5a 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -32,6 +32,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/dropout_impl_util.h" #include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/pten/kernels/funcs/cuda_kernel_config.h" @@ -123,6 +124,19 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, } } +template +struct CudaDropoutGradFunctor { + explicit CudaDropoutGradFunctor(const T factor) : factor_(factor) {} + + __device__ __forceinline__ T operator()(const T dout, + const MaskType mask) const { + return dout * static_cast(mask) * factor_; + } + + private: + T factor_; +}; + template __global__ void DropoutGradCUDAKernel(const T* dout, const MaskType* mask, const T factor, const int64_t size, @@ -259,21 +273,13 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, if (dropout_prob == 1.0f) { dX.device(place) = static_cast(0) * dY; } else { - int vec_size = platform::GetVectorizedSize(grad_y.data()); - if (vec_size == 4 && size % 4 == 0) { - auto factor = static_cast(1.0f / (1.0f - dropout_prob)); - auto stream = dev_ctx.stream(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, size, vec_size); - DropoutGradCUDAKernel< - T, uint8_t, - 4><<>>( - grad_y.data(), mask.data(), factor, size, - grad_x->data()); - } else { - dX.device(place) = - dY * M.cast() / static_cast(1.0f - dropout_prob); - } + auto factor = static_cast(1.0f / (1.0f - dropout_prob)); + auto stream = dev_ctx.stream(); + std::vector ins = {&grad_y, &mask}; + std::vector outs = {grad_x}; + auto functor = CudaDropoutGradFunctor(factor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); } } else { dX.device(place) = dY * M.cast(); diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 19d3a6c385c..fc6126be058 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -234,21 +234,22 @@ struct FMinFunctor { template struct MinGradXFunctor { - inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const { + inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { return dout * static_cast(x < y); } }; template struct MinGradYFunctor { - inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const { + inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { return dout * static_cast(x >= y); } }; template struct MinGradXYFunctor { - inline HOSTDEVICE pten::framework::Array operator()( - const InT& x, const InT& y, const InT& dout) { + inline HOSTDEVICE pten::framework::Array operator()(const InT x, + const InT y, + const InT dout) { pten::framework::Array outs; // dx = dout * (x < y) outs[0] = static_cast(dout * static_cast(x < y)); @@ -303,21 +304,22 @@ struct MulGradXYFunctor, Complex> { // Ternary compare template struct MaxGradXFunctor { - inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const { + inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { return dout * static_cast(x > y); } }; template struct MaxGradYFunctor { - inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const { + inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { return 
dout * static_cast(x <= y); } }; template struct MaxGradXYFunctor { - inline HOSTDEVICE pten::framework::Array operator()( - const InT& x, const InT& y, const InT& dout) { + inline HOSTDEVICE pten::framework::Array operator()(const InT x, + const InT y, + const InT dout) { pten::framework::Array outs; // dx = dout * (x > y) outs[0] = static_cast(dout * static_cast(x > y)); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index c09d2363bb5..233b0767ed6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -26,8 +26,7 @@ namespace operators { using ElementwiseType = pten::ElementwiseType; -template +template void LaunchSameDimsElementwiseCudaKernel( const KPDevice &ctx, const std::vector &ins, std::vector *outs, Functor func) { @@ -54,9 +53,8 @@ void LaunchSameDimsElementwiseCudaKernel( for (int i = 0; i < pt_outputs_tmp.size(); i++) { pt_outputs.push_back(pt_outputs_tmp[i].get()); } - pten::funcs::LaunchSameDimsElementwiseCudaKernel(ctx, pt_inputs, - &pt_outputs, func); + pten::funcs::LaunchSameDimsElementwiseCudaKernel( + ctx, pt_inputs, &pt_outputs, func); } } // namespace operators diff --git a/paddle/fluid/operators/label_smooth_op.cu b/paddle/fluid/operators/label_smooth_op.cu index 7979d3a74bb..f149e104eff 100644 --- a/paddle/fluid/operators/label_smooth_op.cu +++ b/paddle/fluid/operators/label_smooth_op.cu @@ -87,8 +87,8 @@ class LabelSmoothGPUKernel : public framework::OpKernel { std::vector ins = {in_t}; std::vector outs = {out_t}; auto functor = LabelSmoothFunctor(epsilon, label_dim); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } } }; @@ -107,8 +107,8 @@ class LabelSmoothGradGPUKernel : public framework::OpKernel { std::vector ins = {d_out_t}; std::vector outs = {d_in_t}; auto functor = LabelSmoothGradFunctor(epsilon); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } }; } // namespace operators diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu index 64d1a479627..b9f273727b0 100644 --- a/paddle/fluid/operators/lgamma_op.cu +++ b/paddle/fluid/operators/lgamma_op.cu @@ -39,8 +39,8 @@ class LgammaKernel std::vector ins = {x}; std::vector outs = {out}; auto functor = CudaLgammaFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } }; diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index abbbffb6331..7c8dfc7f647 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -126,8 +126,7 @@ class PnormCUDAKernel : public framework::OpKernel { std::vector outs = {out_norm}; const auto& cuda_ctx = ctx.template device_context(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T, UnsignedPowFunctor>( + paddle::operators::LaunchSameDimsElementwiseCudaKernel( cuda_ctx, ins, &outs, UnsignedPowFunctor(1. 
/ porder)); } } diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu index ec1b4f6d5b2..089ecccac64 100644 --- a/paddle/fluid/operators/renorm_op.cu +++ b/paddle/fluid/operators/renorm_op.cu @@ -151,9 +151,8 @@ class CUDARenormKernel : public framework::OpKernel { const auto& cuda_ctx = context.template device_context(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, MT, T, UnsignedPowFunctor>( - cuda_ctx, ins, &outs, func); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(cuda_ctx, ins, + &outs, func); std::vector reduce_axis = {0, 2}; TensorReduceImpl>( cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor(), reduce_axis, diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu index b52f46e4a8a..d527cefd724 100644 --- a/paddle/fluid/operators/viterbi_decode_op.cu +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -80,9 +80,8 @@ struct GetMask { std::vector ins = {&lhs, &rhs}; std::vector outs = {mask}; auto& dev_ctx = ctx.template device_context(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kBinary, int64_t, T>(dev_ctx, ins, &outs, - CompareFunctor()); + paddle::operators::LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, CompareFunctor()); } }; diff --git a/paddle/fluid/platform/function_traits.h b/paddle/fluid/platform/function_traits.h index eca78e03e17..662e3ac58a6 100644 --- a/paddle/fluid/platform/function_traits.h +++ b/paddle/fluid/platform/function_traits.h @@ -54,6 +54,7 @@ template struct FunctionTraits { static const size_t arity = sizeof...(Args); static const bool has_pointer_args = IsPointerArgs::value; + using ArgsTuple = std::tuple; }; } // namespace platform diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index 0f26f3d8aa6..110b405bbcb 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -438,14 +438,78 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout, #if defined(__NVCC__) || defined(__HIPCC__) -template +// static unroller +template
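
Illustrative note (not part of the patch): the call sites above drop the explicit ElementwiseType/input-dtype template arguments because, after this change, the launcher reads each input's dtype off the functor's operator() signature via platform::FunctionTraits (the newly added ArgsTuple alias), and only the output dtype remains an explicit template parameter. The standalone sketch below shows that deduction mechanism in plain C++ using a functor whose two inputs have different dtypes (float gradient, uint8 mask, mirroring the new CudaDropoutGradFunctor). The names Traits, FakeLaunch, and ScaleByMaskFunctor are invented for illustration and are not Paddle APIs; this is a minimal sketch of the idea, not the kernel implementation.

    // Sketch only: deduce per-input dtypes from a functor's operator(),
    // the role FunctionTraits::ArgsTuple plays in this patch.
    #include <cstddef>
    #include <cstdio>
    #include <tuple>
    #include <type_traits>

    // Minimal FunctionTraits-style helper: expose the argument list of a
    // functor's const operator() as a std::tuple.
    template <typename T>
    struct Traits : public Traits<decltype(&T::operator())> {};

    template <typename ClassT, typename ReturnT, typename... Args>
    struct Traits<ReturnT (ClassT::*)(Args...) const> {
      static constexpr std::size_t arity = sizeof...(Args);
      using ArgsTuple = std::tuple<Args...>;
    };

    // A functor whose inputs have different dtypes (by-value args, as in the
    // patched functors, so the traits see the element types directly).
    struct ScaleByMaskFunctor {
      float operator()(float dout, unsigned char mask) const {
        return dout * static_cast<float>(mask);
      }
    };

    // Toy "launcher": learns the number of inputs and each input's element
    // type from the functor, as the real kernel does when it loads and casts
    // raw tensor data before invoking the functor.
    template <typename OutT, typename Functor>
    void FakeLaunch(Functor func) {
      using ArgsT = typename Traits<Functor>::ArgsTuple;
      static_assert(Traits<Functor>::arity == 2, "binary functor expected");
      using Arg0 = typename std::tuple_element<0, ArgsT>::type;  // float
      using Arg1 = typename std::tuple_element<1, ArgsT>::type;  // unsigned char
      static_assert(std::is_same<Arg0, float>::value, "first input is float");
      static_assert(std::is_same<Arg1, unsigned char>::value, "second input is uint8");
      OutT out = func(static_cast<Arg0>(2.5f), static_cast<Arg1>(1));
      std::printf("out = %f\n", static_cast<double>(out));
    }

    int main() {
      // Only the output dtype is spelled out; input dtypes come from the functor.
      FakeLaunch<float>(ScaleByMaskFunctor{});
      return 0;
    }

In this scheme, call sites that produce a different output dtype (for example the compare kernels writing bool) keep a single explicit OutT template argument, while mixed input dtypes such as (T, MaskType) need no extra template plumbing at all.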