Unverified commit bf305033, authored by Zhang Ting, committed by GitHub

Support different dtypes of inputs for elementwise ops (#38859)

* improve backward performance

* support different dtypes for elementwise ops

Parent 7d6096ff
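In short, the launcher templates below drop the ElementwiseType tag and the explicit input type: arity and per-input dtypes are now deduced from the functor's operator() signature, so call sites only spell out the output type. A minimal before/after sketch of the call-site change (types and names taken from the hunks below):

    // Before: arity and the shared input dtype are passed explicitly.
    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
        ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);

    // After: only the output dtype; arity and input dtypes come from the functor.
    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
                                                              &outs, functor);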
@@ -50,9 +50,8 @@ class AbsKernel<platform::CUDADeviceContext, T>
     std::vector<const framework::Tensor*> ins = {x};
     std::vector<framework::Tensor*> outs = {out};
     auto functor = CudaAbsFunctor<T>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kUnary, T, math::Real<T>>(dev_ctx, ins, &outs,
-                                                   functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<math::Real<T>>(
+        dev_ctx, ins, &outs, functor);
   }
 };
...
@@ -1367,14 +1367,14 @@ class ELUGradCudaKernel : public framework::OpKernel<T> {
     if (alpha > 0) {
       CudaELUGradFunctor<T> functor;
       functor.alpha = alpha;
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                                &outs, functor);
     } else {
       CudaELUGradNegativeAlphaFunctor<T> functor;
       functor.alpha = alpha;
       ins.push_back(x);
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                                &outs, functor);
     }
   }
 };
@@ -1450,8 +1450,8 @@ class ActivationCudaKernel
     for (auto& attr : attrs) {
       *attr.second = ctx.Attr<float>(attr.first);
     }
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                              &outs, functor);
   }
 };
@@ -1480,17 +1480,17 @@ class ActivationGradCudaKernel
     if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
       // Only need forward output Out
       ins.push_back(out);
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                                &outs, functor);
     } else if (static_cast<int>(Functor::FwdDeps()) ==
                static_cast<int>(kDepX)) {
       // Only need forward input X
       ins.push_back(x);
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                                &outs, functor);
     } else {
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                                &outs, functor);
     }
   }
 };
...
@@ -31,7 +31,7 @@ struct BCELossFunctor {
     neg_100 = static_cast<T>(-100.);
   }
 
-  HOSTDEVICE inline T operator()(const T& x, const T& label) const {
+  HOSTDEVICE inline T operator()(const T x, const T label) const {
     PADDLE_ENFORCE(
         (x >= static_cast<T>(0)) && (x <= one),
         "Input is expected to be within the interval [0, 1], but recieved %f.",
@@ -52,8 +52,7 @@ struct BCELossGradFunctor {
     eps = static_cast<T>(1e-12);
   }
 
-  HOSTDEVICE inline T operator()(const T& x, const T& label,
-                                 const T& dout) const {
+  HOSTDEVICE inline T operator()(const T x, const T label, const T dout) const {
     T term1 = max((one - x) * x, eps);
     return (dout * (x - label) / term1);
   }
@@ -73,8 +72,8 @@ class BCELossCUDAKernel : public framework::OpKernel<T> {
     std::vector<framework::Tensor*> outs = {out};
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto functor = BCELossFunctor<T>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                              &outs, functor);
   }
 };
@@ -91,8 +90,8 @@ class BCELossGradCUDAKernel : public framework::OpKernel<T> {
     std::vector<framework::Tensor*> outs = {dx};
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto functor = BCELossGradFunctor<T>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kTernary, T, T>(dev_ctx, ins, &outs, functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                              &outs, functor);
   }
 };
...
@@ -45,7 +45,7 @@ template <typename T>
 class ClipGradFunctor {
  public:
   explicit ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {}
-  HOSTDEVICE T operator()(const T& x, const T& y) const {
+  HOSTDEVICE T operator()(const T x, const T y) const {
     return (y > min_ && y < max_) ? x : static_cast<T>(0);
   }
@@ -103,8 +103,7 @@ class ClipKernel : public framework::OpKernel<T> {
     std::vector<const framework::Tensor*> ins = {x};
     std::vector<framework::Tensor*> outs = {out};
     auto functor = ClipFunctor<T>(min, max);
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kUnary, T, T>(
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
         context.template device_context<platform::CUDADeviceContext>(), ins,
         &outs, functor);
 #endif
@@ -177,7 +176,7 @@ class ClipGradKernel : public framework::OpKernel<T> {
     std::vector<framework::Tensor*> outs = {d_x};
     auto functor = ClipGradFunctor<T>(min, max);
     d_x->mutable_data<T>(context.GetPlace());
-    LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+    LaunchSameDimsElementwiseCudaKernel<T>(
         context.template device_context<platform::CUDADeviceContext>(), ins,
         &outs, functor);
 #else
...
@@ -57,8 +57,8 @@ class UnaryBitwiseOpKernel<platform::CUDADeviceContext, Functor>
     std::vector<framework::Tensor*> outs = {out};
     const auto& cuda_ctx =
         ctx.template device_context<platform::CUDADeviceContext>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kUnary, T, T>(cuda_ctx, ins, &outs, functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(cuda_ctx, ins,
+                                                              &outs, functor);
   }
 };
...
@@ -55,8 +55,8 @@ class CompareReduceOpKernel
         context.template device_context<platform::CUDADeviceContext>();
     std::vector<const framework::Tensor*> ins = {x, y};
     std::vector<framework::Tensor*> outs = {&tmp};
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kBinary, T, bool>(cuda_ctx, ins, &outs, Functor());
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<bool>(
+        cuda_ctx, ins, &outs, Functor());
     // Reduce by 'bitwise and' operator
     std::vector<int> reduce_dims;
...
@@ -32,6 +32,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/dropout_impl_util.h"
 #include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/pten/kernels/funcs/cuda_kernel_config.h"
@@ -123,6 +124,19 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
   }
 }
 
+template <typename T, typename MaskType>
+struct CudaDropoutGradFunctor {
+  explicit CudaDropoutGradFunctor(const T factor) : factor_(factor) {}
+
+  __device__ __forceinline__ T operator()(const T dout,
+                                          const MaskType mask) const {
+    return dout * static_cast<T>(mask) * factor_;
+  }
+
+ private:
+  T factor_;
+};
+
 template <typename T, typename MaskType, int VecSize>
 __global__ void DropoutGradCUDAKernel(const T* dout, const MaskType* mask,
                                       const T factor, const int64_t size,
@@ -259,21 +273,13 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
     if (dropout_prob == 1.0f) {
       dX.device(place) = static_cast<T>(0) * dY;
     } else {
-      int vec_size = platform::GetVectorizedSize<T>(grad_y.data<T>());
-      if (vec_size == 4 && size % 4 == 0) {
-        auto factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
-        auto stream = dev_ctx.stream();
-        platform::GpuLaunchConfig config =
-            platform::GetGpuLaunchConfig1D(dev_ctx, size, vec_size);
-        DropoutGradCUDAKernel<
-            T, uint8_t,
-            4><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
-            grad_y.data<T>(), mask.data<uint8_t>(), factor, size,
-            grad_x->data<T>());
-      } else {
-        dX.device(place) =
-            dY * M.cast<T>() / static_cast<T>(1.0f - dropout_prob);
-      }
+      auto factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
+      auto stream = dev_ctx.stream();
+      std::vector<const framework::Tensor*> ins = {&grad_y, &mask};
+      std::vector<framework::Tensor*> outs = {grad_x};
+      auto functor = CudaDropoutGradFunctor<T, uint8_t>(factor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
+          dev_ctx, ins, &outs, functor);
     }
   } else {
     dX.device(place) = dY * M.cast<T>();
...
@@ -234,21 +234,22 @@ struct FMinFunctor<int64_t> {
 template <typename T>
 struct MinGradXFunctor {
-  inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const {
+  inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const {
     return dout * static_cast<T>(x < y);
   }
 };
 
 template <typename T>
 struct MinGradYFunctor {
-  inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const {
+  inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const {
     return dout * static_cast<T>(x >= y);
   }
 };
 
 template <typename InT, typename OutT>
 struct MinGradXYFunctor {
-  inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(
-      const InT& x, const InT& y, const InT& dout) {
+  inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(const InT x,
+                                                               const InT y,
+                                                               const InT dout) {
     pten::framework::Array<OutT, 2> outs;
     // dx = dout * (x < y)
     outs[0] = static_cast<OutT>(dout * static_cast<InT>(x < y));
@@ -303,21 +304,22 @@ struct MulGradXYFunctor<Complex<InT>, Complex<OutT>> {
 // Ternary compare
 template <typename T>
 struct MaxGradXFunctor {
-  inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const {
+  inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const {
    return dout * static_cast<T>(x > y);
   }
 };
 
 template <typename T>
 struct MaxGradYFunctor {
-  inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const {
+  inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const {
     return dout * static_cast<T>(x <= y);
   }
 };
 
 template <typename InT, typename OutT>
 struct MaxGradXYFunctor {
-  inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(
-      const InT& x, const InT& y, const InT& dout) {
+  inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(const InT x,
+                                                               const InT y,
+                                                               const InT dout) {
     pten::framework::Array<OutT, 2> outs;
     // dx = dout * (x > y)
     outs[0] = static_cast<OutT>(dout * static_cast<InT>(x > y));
...
@@ -26,8 +26,7 @@ namespace operators {
 
 using ElementwiseType = pten::ElementwiseType;
 
-template <ElementwiseType ET, typename InT, typename OutT, typename Functor,
-          int NumOuts = 1>
+template <typename OutT, typename Functor, int NumOuts = 1>
 void LaunchSameDimsElementwiseCudaKernel(
     const KPDevice &ctx, const std::vector<const framework::Tensor *> &ins,
     std::vector<framework::Tensor *> *outs, Functor func) {
@@ -54,9 +53,8 @@ void LaunchSameDimsElementwiseCudaKernel(
   for (int i = 0; i < pt_outputs_tmp.size(); i++) {
     pt_outputs.push_back(pt_outputs_tmp[i].get());
   }
-  pten::funcs::LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT, Functor,
-                                                   NumOuts>(ctx, pt_inputs,
-                                                            &pt_outputs, func);
+  pten::funcs::LaunchSameDimsElementwiseCudaKernel<OutT, Functor, NumOuts>(
+      ctx, pt_inputs, &pt_outputs, func);
 }
 
 }  // namespace operators
...
@@ -87,8 +87,8 @@ class LabelSmoothGPUKernel : public framework::OpKernel<T> {
       std::vector<const framework::Tensor*> ins = {in_t};
       std::vector<framework::Tensor*> outs = {out_t};
       auto functor = LabelSmoothFunctor<T>(epsilon, label_dim);
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                                &outs, functor);
     }
   }
 };
@@ -107,8 +107,8 @@ class LabelSmoothGradGPUKernel : public framework::OpKernel<T> {
     std::vector<const framework::Tensor*> ins = {d_out_t};
     std::vector<framework::Tensor*> outs = {d_in_t};
     auto functor = LabelSmoothGradFunctor<T>(epsilon);
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                              &outs, functor);
   }
 };
 
 }  // namespace operators
...
@@ -39,8 +39,8 @@ class LgammaKernel<platform::CUDADeviceContext, T>
     std::vector<const framework::Tensor*> ins = {x};
     std::vector<framework::Tensor*> outs = {out};
     auto functor = CudaLgammaFunctor<T>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                              &outs, functor);
   }
 };
...
@@ -126,8 +126,7 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
       std::vector<framework::Tensor*> outs = {out_norm};
       const auto& cuda_ctx =
           ctx.template device_context<platform::CUDADeviceContext>();
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kUnary, T, T, UnsignedPowFunctor<T>>(
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
           cuda_ctx, ins, &outs, UnsignedPowFunctor<T>(1. / porder));
     }
   }
...
@@ -151,9 +151,8 @@ class CUDARenormKernel : public framework::OpKernel<T> {
     const auto& cuda_ctx =
         context.template device_context<platform::CUDADeviceContext>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kUnary, MT, T, UnsignedPowFunctor<MT, T>>(
-        cuda_ctx, ins, &outs, func);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(cuda_ctx, ins,
+                                                              &outs, func);
     std::vector<int> reduce_axis = {0, 2};
     TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
         cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis,
...
@@ -80,9 +80,8 @@ struct GetMask<platform::CUDADeviceContext, CompareFunctor, T> {
     std::vector<const Tensor*> ins = {&lhs, &rhs};
     std::vector<Tensor*> outs = {mask};
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-        ElementwiseType::kBinary, int64_t, T>(dev_ctx, ins, &outs,
-                                              CompareFunctor<int64_t, T>());
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
+        dev_ctx, ins, &outs, CompareFunctor<int64_t, T>());
   }
 };
...
@@ -54,6 +54,7 @@ template <typename ReturnType, typename... Args>
 struct FunctionTraits<ReturnType(Args...)> {
   static const size_t arity = sizeof...(Args);
   static const bool has_pointer_args = IsPointerArgs<arity, Args...>::value;
+  using ArgsTuple = std::tuple<Args...>;
 };
 
 }  // namespace platform
...
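Why this one-line addition matters: with ArgsTuple exposed, the elementwise machinery below can recover the declared type of each functor parameter at compile time via std::tuple_element_t, which is what makes per-input dtypes possible. A minimal, self-contained sketch of the pattern, using a simplified stand-in for the traits class (the float/uint8_t signature is only an example):

    #include <cstdint>
    #include <tuple>
    #include <type_traits>

    // Simplified stand-in for paddle::platform::FunctionTraits.
    template <typename Fn>
    struct FunctionTraits;

    template <typename ReturnType, typename... Args>
    struct FunctionTraits<ReturnType(Args...)> {
      static const size_t arity = sizeof...(Args);
      using ArgsTuple = std::tuple<Args...>;
    };

    // A grad-style functor signature with mixed input dtypes.
    using Traits = FunctionTraits<float(float, std::uint8_t)>;
    static_assert(Traits::arity == 2, "two inputs");

    // The declared type of the second argument, as Loader/VecSizeGetter extract it.
    using SecondArg = std::tuple_element_t<1, Traits::ArgsTuple>;
    static_assert(std::is_same<SecondArg, std::uint8_t>::value, "uint8_t");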
@@ -438,14 +438,78 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout,
 
 #if defined(__NVCC__) || defined(__HIPCC__)
 
-template <typename InT, typename OutT>
+// static unroller
+template <template <int Index, int VecSize> typename Func,
+          int VecSize,
+          int End,
+          int Begin = 0>
+struct Unroller {
+  template <typename... Args>
+  static HOSTDEVICE inline void step(Args &&... args) {
+    Func<Begin, VecSize>::Apply(std::forward<Args>(args)...);
+    Unroller<Func, VecSize, End, Begin + 1>::step(args...);
+  }
+};
+
+template <template <int Index, int VecSize> typename Func, int VecSize, int End>
+struct Unroller<Func, VecSize, End, End> {
+  template <typename... Args>
+  static HOSTDEVICE inline void step(Args &&... args) {}
+};
+
+template <int Index, int VecSize>
+struct Loader {
+  template <typename Array, typename ArgsT>
+  static __device__ void Apply(const Array &in,
+                               ArgsT *args,
+                               int num,
+                               int data_offset,
+                               bool is_boundary) {
+    using Type = std::tuple_element_t<Index, ArgsT>;
+    kps::Init<Type, ArgsT, Index, VecSize>(args, static_cast<Type>(1.0f));
+    if (is_boundary) {
+      kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, true>(
+          args, reinterpret_cast<const Type *>(in[Index]) + data_offset, num);
+    } else {
+      kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, false>(
+          args, reinterpret_cast<const Type *>(in[Index]) + data_offset, num);
+    }
+  }
+};
+
+template <int Index, int VecSize>
+struct InputSetter {
+  template <typename Array>
+  static HOSTDEVICE void Apply(
+      const std::vector<const DenseTensor *> &ins_tensor, Array *ins_data) {
+    (*ins_data)[Index] =
+        reinterpret_cast<const _ptr_ char *>(ins_tensor[Index]->data());
+  }
+};
+
+template <int Index, int VecSize>
+struct VecSizeGetter {
+  template <typename ArgsT>
+  static HOSTDEVICE void Apply(const std::vector<const DenseTensor *> &ins,
+                               const ArgsT &args,
+                               int *vec_size) {
+    using Type = std::tuple_element_t<Index, ArgsT>;
+    *vec_size = std::min<int>(
+        *vec_size,
+        paddle::platform::GetVectorizedSize(ins[Index]->data<Type>()));
+  }
+};
+
+template <typename OutT, typename Functor>
 int GetVectorizedSizeForTensors(const std::vector<const DenseTensor *> &ins,
                                 const std::vector<DenseTensor *> &outs) {
+  using Traits = paddle::platform::FunctionTraits<Functor>;
+  using ArgsT = typename Traits::ArgsTuple;
+  const int Arity = Traits::arity;
   int vec_size = 4;
-  for (auto iter = ins.begin(); iter != ins.end(); ++iter) {
-    vec_size = std::min<int>(
-        vec_size, paddle::platform::GetVectorizedSize((*iter)->data<InT>()));
-  }
+  ArgsT arg;
+  // The Arg VecSize=1 is to match the Unroller template.
+  Unroller<VecSizeGetter, 1, Arity>::step(ins, arg, &vec_size);
   for (auto iter = outs.begin(); iter != outs.end(); ++iter) {
     vec_size = std::min<int>(
         vec_size, paddle::platform::GetVectorizedSize((*iter)->data<OutT>()));
@@ -514,6 +578,39 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 3, false> {
   }
 };
 
+namespace detail {
+template <class F, class Tuple, std::size_t... Index>
+// GCC/Clang need the decltype() return type
+HOSTDEVICE constexpr decltype(auto) ApplyImpl(F &&f,
+                                              Tuple &&t,
+                                              std::index_sequence<Index...>) {
+  return std::forward<F>(f)(std::get<Index>(std::forward<Tuple>(t))...);
+}
+}  // namespace detail
+
+template <class F, class Tuple>
+HOSTDEVICE constexpr decltype(auto) Apply(F &&f, Tuple &&t) {
+  return detail::ApplyImpl(
+      std::forward<F>(f),
+      std::forward<Tuple>(t),
+      std::make_index_sequence<
+          std::tuple_size<std::remove_reference_t<Tuple>>::value>{});
+}
+
+template <typename OutT,
+          int VecSize,
+          typename Functor,
+          typename ArgsT,
+          int Arity>
+struct SameDimsElementwisePrimitiveCaller {
+  __device__ inline void operator()(Functor func, ArgsT *args, OutT *result) {
+#pragma unroll
+    for (int idx = 0; idx < VecSize; ++idx) {
+      result[idx] = static_cast<OutT>(Apply(func, args[idx]));
+    }
+  }
+};
+
 template <typename OutT, int VecSize, bool IsBoundary, int NumOuts>
 struct ElementwiseWriteDataCaller {
   __device__ __forceinline__ void operator()(
@@ -549,8 +646,7 @@ struct ElementwiseWriteDataCaller<OutT, VecSize, IsBoundary, 1> {
   }
 };
 
-template <typename InT,
-          typename OutT,
+template <typename OutT,
           typename Functor,
           int Arity,
           int NumOuts,
@@ -558,42 +654,32 @@ template <typename InT,
           bool IsBoundary>
 __device__ void VectorizedElementwiseKernelImpl(
-    const pten::framework::Array<const _ptr_ InT *__restrict__, Arity> &in,
+    const pten::framework::Array<const _ptr_ char *__restrict__, Arity> &in,
     pten::framework::Array<_ptr_ OutT *, NumOuts> outs,
     int num,
     int data_offset,
     Functor func) {
-  InT args[Arity > 1 ? Arity : 1][VecSize];
+  using Traits = paddle::platform::FunctionTraits<Functor>;
+  using ArgsT = typename Traits::ArgsTuple;
+  ArgsT args[VecSize];
   ConditionalT<OutT, NumOuts> result[VecSize];
-#pragma unroll
-  for (int i = 0; i < Arity; i++) {
-    kps::Init<InT, VecSize>(args[i], static_cast<InT>(1.0f));
-    kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(
-        args[i], in[i] + data_offset, num);
-  }
+  Unroller<Loader, VecSize, Arity>::step(
+      in, args, num, data_offset, IsBoundary);
-  constexpr bool kCallElementwiseAny =
-      paddle::platform::FunctionTraits<Functor>::has_pointer_args;
-  ElementwisePrimitiveCaller<InT,
-                             ConditionalT<OutT, NumOuts>,
-                             VecSize,
-                             Functor,
-                             Arity,
-                             kCallElementwiseAny>()(func, args, result);
+  SameDimsElementwisePrimitiveCaller<ConditionalT<OutT, NumOuts>,
+                                     VecSize,
+                                     Functor,
+                                     ArgsT,
+                                     Arity>()(func, args, result);
   ElementwiseWriteDataCaller<OutT, VecSize, IsBoundary, NumOuts>()(
       outs, result, data_offset, num);
 }
 
-template <typename InT,
-          typename OutT,
-          typename Functor,
-          int Arity,
-          int NumOuts,
-          int VecSize>
+template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
 __global__ void VectorizedElementwiseKernel(
-    pten::framework::Array<const _ptr_ InT *__restrict__, Arity> ins,
+    pten::framework::Array<const _ptr_ char *__restrict__, Arity> ins,
     pten::framework::Array<_ptr_ OutT *, NumOuts> outs,
     int size,
     int main_offset,
@@ -601,8 +687,7 @@ __global__ void VectorizedElementwiseKernel(
   int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize;
   int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
   for (; data_offset < main_offset; data_offset += stride) {
-    VectorizedElementwiseKernelImpl<InT,
-                                    OutT,
+    VectorizedElementwiseKernelImpl<OutT,
                                     Functor,
                                     Arity,
                                     NumOuts,
@@ -613,8 +698,7 @@ __global__ void VectorizedElementwiseKernel(
   int num = size - data_offset;
   if (num > 0) {
-    VectorizedElementwiseKernelImpl<InT,
-                                    OutT,
+    VectorizedElementwiseKernelImpl<OutT,
                                     Functor,
                                     Arity,
                                     NumOuts,
@@ -623,24 +707,17 @@ __global__ void VectorizedElementwiseKernel(
   }
 }
 
-template <typename InT,
-          typename OutT,
-          typename Functor,
-          int Arity,
-          int NumOuts,
-          int VecSize>
+template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
 void ElementwiseCudaKernel(const KPDevice &ctx,
                            const std::vector<const DenseTensor *> &ins,
                            std::vector<DenseTensor *> *outs,
                            Functor func) {
   auto numel =
       (*outs)[0]->numel();  // To avoid running errors when ins.size()== 0
-  pten::framework::Array<const _ptr_ InT *__restrict__, Arity> ins_data;
+  pten::framework::Array<const _ptr_ char *__restrict__, Arity> ins_data;
   pten::framework::Array<_ptr_ OutT *, NumOuts> outs_data;
-  for (int i = 0; i < Arity; ++i) {
-    ins_data[i] = ins[i]->data<InT>();
-  }
+  Unroller<InputSetter, VecSize, Arity>::step(ins, &ins_data);
   for (int i = 0; i < NumOuts; ++i) {
     outs_data[i] = ctx.Alloc<OutT>((*outs)[i]);
   }
@@ -649,8 +726,7 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
   int grid_size = 8;
   auto stream = ctx.x_context()->xpu_stream;
   int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size;
-  VectorizedElementwiseKernel<InT,
-                              OutT,
+  VectorizedElementwiseKernel<OutT,
                               Functor,
                               Arity,
                               NumOuts,
@@ -662,7 +738,7 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
   int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
                     gpu_config.GetBlockSize();
   auto stream = ctx.stream();
-  VectorizedElementwiseKernel<InT, OutT, Functor, Arity, NumOuts, VecSize><<<
+  VectorizedElementwiseKernel<OutT, Functor, Arity, NumOuts, VecSize><<<
       gpu_config.block_per_grid,
       gpu_config.thread_per_block,
       0,
@@ -670,19 +746,14 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
 #endif
 }
 
-template <ElementwiseType ET,
-          typename InT,
-          typename OutT,
-          typename Functor,
-          int NumOuts = 1>
+template <typename OutT, typename Functor, int NumOuts = 1>
 void LaunchSameDimsElementwiseCudaKernel(
     const KPDevice &ctx,
     const std::vector<const DenseTensor *> &ins,
     std::vector<DenseTensor *> *outs,
     Functor func) {
   using Traits = paddle::platform::FunctionTraits<Functor>;
-  const int kArity =
-      Traits::has_pointer_args ? static_cast<int>(ET) : Traits::arity;
+  const int kArity = Traits::arity;
   PADDLE_ENFORCE_EQ(ins.size(),
                     kArity,
                     paddle::platform::errors::InvalidArgument(
@@ -712,18 +783,18 @@ void LaunchSameDimsElementwiseCudaKernel(
   }
   // calculate the max vec_size for all ins and outs
-  int vec_size = GetVectorizedSizeForTensors<InT, OutT>(ins, *outs);
+  int vec_size = GetVectorizedSizeForTensors<OutT, Functor>(ins, *outs);
   switch (vec_size) {
     case 4:
-      ElementwiseCudaKernel<InT, OutT, Functor, kArity, NumOuts, 4>(
+      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, 4>(
           ctx, ins, outs, func);
       break;
     case 2:
-      ElementwiseCudaKernel<InT, OutT, Functor, kArity, NumOuts, 2>(
+      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, 2>(
          ctx, ins, outs, func);
      break;
    case 1:
-      ElementwiseCudaKernel<InT, OutT, Functor, kArity, NumOuts, 1>(
+      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, 1>(
          ctx, ins, outs, func);
      break;
    default: {
...
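A note on the static-unroller idiom introduced above: Unroller<Func, VecSize, End>::step(args...) expands at compile time into Func<0, VecSize>::Apply(args...), Func<1, VecSize>::Apply(args...), ..., Func<End - 1, VecSize>::Apply(args...), which is what lets Loader and VecSizeGetter pick a different std::tuple_element_t type for every input index. A stripped-down, host-only illustration (the Printer functor is hypothetical; C++17 is assumed for the template-template parameter spelled with typename):

    #include <cstdio>
    #include <utility>

    template <template <int Index, int VecSize> typename Func,
              int VecSize,
              int End,
              int Begin = 0>
    struct Unroller {
      template <typename... Args>
      static inline void step(Args &&... args) {
        Func<Begin, VecSize>::Apply(std::forward<Args>(args)...);
        Unroller<Func, VecSize, End, Begin + 1>::step(args...);
      }
    };

    template <template <int Index, int VecSize> typename Func, int VecSize, int End>
    struct Unroller<Func, VecSize, End, End> {
      template <typename... Args>
      static inline void step(Args &&... args) {}
    };

    // Hypothetical per-index functor: prints its compile-time index.
    template <int Index, int VecSize>
    struct Printer {
      static void Apply(const char *tag) { std::printf("%s[%d]\n", tag, Index); }
    };

    int main() {
      Unroller<Printer, 1, 3>::step("in");  // prints in[0], in[1], in[2]
      return 0;
    }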
@@ -44,9 +44,7 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx,
   inputs.emplace_back(&x);
   outputs.emplace_back(out);
   dev_ctx.Alloc<OutT>(out);
-  pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary,
-                                                   InT,
-                                                   OutT>(
+  pten::funcs::LaunchSameDimsElementwiseCudaKernel<OutT>(
       dev_ctx, inputs, &outputs, CastFuctor<InT, OutT>());
 }
...
@@ -574,9 +574,8 @@ void LaunchElementwiseCudaKernel(const KPDevice &ctx,
     dims_size.emplace_back(in->dims().size());
   }
   if (no_broadcast_flag) {
-    pten::funcs::
-        LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT, Functor, NumOuts>(
-            ctx, ins, outs, func);
+    pten::funcs::LaunchSameDimsElementwiseCudaKernel<OutT, Functor, NumOuts>(
+        ctx, ins, outs, func);
   } else {
     axis = axis == -1
                ? *std::max_element(dims_size.begin(), dims_size.end()) -
...
@@ -48,9 +48,7 @@ void FullKernel(const ContextT& dev_ctx,
     // This function has no input, so the inputs.size() == 0. Use kUnary, but
     // the data will not be loaded in the kernel because the number of
     // parameters in the operator is 0
-    pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary,
-                                                     T,
-                                                     T>(
+    pten::funcs::LaunchSameDimsElementwiseCudaKernel<T>(
         dev_ctx, inputs, &outputs, FullFuctor<T>(val.to<T>()));
   }
 }
@@ -90,9 +88,7 @@ void FullLikeKernel(const ContextT& dev_ctx,
   // the operator is 0
   int numel = out->numel();
   if (numel > 0) {
-    pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary,
-                                                     T,
-                                                     T>(
+    pten::funcs::LaunchSameDimsElementwiseCudaKernel<T>(
         dev_ctx, inputs, &outputs, FullFuctor<T>(value));
   }
 }
...
@@ -1093,7 +1093,7 @@ void TensorReduceImpl(const pten::GPUContext& dev_ctx,
   if (config.reduce_num == 1) {
     std::vector<const DenseTensor*> inputs = {&x};
     std::vector<DenseTensor*> outputs = {y};
-    funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, Tx, Ty>(
+    funcs::LaunchSameDimsElementwiseCudaKernel<Ty>(
         dev_ctx, inputs, &outputs, transform);
     return;
   }
...
@@ -54,9 +54,7 @@ void ScaleKernel(const Context& dev_ctx,
   inputs.emplace_back(&x);
   outputs.emplace_back(out);
   dev_ctx.template Alloc<T>(out);
-  pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary,
-                                                   T,
-                                                   T>(
+  pten::funcs::LaunchSameDimsElementwiseCudaKernel<T>(
       dev_ctx,
       inputs,
       &outputs,
...
@@ -229,6 +229,18 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
   }
 }
 
+/**
+ * The difference from the above function is that
+ * it supports different data types of inputs.
+ */
+template <typename T, typename ArgsT, int Index, int NX>
+__device__ __forceinline__ void Init(ArgsT* dst, T init_data) {
+#pragma unroll
+  for (int i = 0; i < NX; i++) {
+    std::get<Index>(dst[i]) = init_data;
+  }
+}
+
 /**
  * @brief Read 1D data from global memory to register. When IsBoundary = true
  * and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to
@@ -282,6 +294,48 @@ __device__ __forceinline__ void ReadData(T* dst,
   }
 }
 
+/**
+ * @brief Read 1D data from global memory to register. The difference
+ * from the above function is that it supports different data types of inputs.
+ */
+template <typename T,
+          int NX,
+          int NY,
+          int BlockSize,
+          typename ArgsT,
+          int Index,
+          bool IsBoundary = false>
+__device__ __forceinline__ void ReadData(ArgsT* dst,
+                                         const T* __restrict__ src,
+                                         int num) {
+  if (IsBoundary) {  // blockDim.x * NX > num
+    int thread_offset = threadIdx.x * NX;
+#pragma unroll
+    for (int idx = 0; idx < NX; ++idx) {
+      if (idx + thread_offset < num) {
+        std::get<Index>(dst[idx]) = src[thread_offset + idx];
+      }
+    }
+  } else {  // blockDim,x * NX < num
+    constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1;
+    constexpr int kVectorsPerThread = NX / kVectorSize;
+    int thread_offset = threadIdx.x * kVectorsPerThread;
+
+    using VecType = details::VectorType<T, kVectorSize>;
+    const VecType* vec_input = reinterpret_cast<const VecType*>(src);
+    VecType vec_temp[kVectorsPerThread];
+
+#pragma unroll
+    for (int i = 0; i < kVectorsPerThread; ++i) {
+      vec_temp[i] = vec_input[thread_offset + i];
+#pragma unroll
+      for (int idx = 0; idx < NX; ++idx) {
+        std::get<Index>(dst[idx]) = *(reinterpret_cast<T*>(vec_temp) + idx);
+      }
+    }
+  }
+}
+
 /**
  * @brief Read 2D data from global memory to registers with broadcast form.
  *
...
@@ -189,6 +189,18 @@ __device__ __inline__ void Init(T* dst, T init_data) {
   }
 }
 
+/**
+ * The difference from the above function is that
+ * it supports different data types of inputs.
+ */
+template <typename T, typename ArgsT, int Index, int NX>
+__device__ __forceinline__ void Init(ArgsT* dst, T init_data) {
+#pragma unroll
+  for (int i = 0; i < NX; i++) {
+    std::get<Index>(dst[i]) = init_data;
+  }
+}
+
 /**
  * @brief Read 1D data from global memory to register. When IsBoundary = true
  * and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to
@@ -229,6 +241,40 @@ __device__ __inline__ void ReadData(T* dst,
   }
 }
 
+/**
+ * @brief Read 1D data from global memory to register. The difference
+ * from the above function is that it supports different data types of inputs.
+ */
+template <typename T,
+          int NX,
+          int NY,
+          int BlockSize,
+          typename ArgsT,
+          int Index,
+          bool IsBoundary = false>
+__device__ __forceinline__ void ReadData(ArgsT* dst,
+                                         const T* __restrict__ src,
+                                         int num) {
+  int thread_offset = core_id() * NX;
+  __local__ T in_temp[1];
+  __local__ T in_vec[NX];
+  if (IsBoundary) {  // core_num() * NX > num
+#pragma unroll
+    for (int idx = 0; idx < NX; ++idx) {
+      if (idx + thread_offset < num) {
+        GM2LM(src + thread_offset + idx, in_temp, sizeof(T));
+        std::get<Index>(dst[idx]) = in_temp[0];
+      }
+    }
+  } else {  // core_num() * NX < num
+    GM2LM(src + thread_offset, in_vec, NX * sizeof(T));
+#pragma unroll
+    for (int idx = 0; idx < NX; ++idx) {
+      std::get<Index>(dst[idx]) = in_vec[idx];
+    }
+  }
+}
+
 /**
  * @brief Read 2D data from global memory to registers with broadcast form.
  *
...