diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
index 7ca62b1969e261f9661ba0e2acaf1734ca809b37..94ce32cdaf182f281cfc666bd7d07fef38c3c167 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
@@ -19,17 +19,13 @@ namespace paddle {
 namespace operators {
 
-template
+template
 void LaunchElementwiseCudaKernel(
     const KPDevice &ctx,
     const std::vector &ins,
     std::vector *outs,
-    int axis,
-    Functor func) {
+    Functor func,
+    int axis = -1) {
   std::vector pt_inputs;
   std::vector pt_outputs;
   // TODO(YuanRisheng) *_tmp for cache DenseTensor, because the temporary
@@ -53,8 +49,8 @@ void LaunchElementwiseCudaKernel(
   for (int i = 0; i < pt_outputs_tmp.size(); i++) {
     pt_outputs.push_back(pt_outputs_tmp[i].get());
   }
-  phi::funcs::BroadcastKernel(
-      ctx, pt_inputs, &pt_outputs, axis, func);
+  phi::funcs::BroadcastKernel(
+      ctx, pt_inputs, &pt_outputs, func, axis);
 }
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
index 251df9b9d9e4e3151ebfd72c670994be75b75613..ab721d35278b68aa04d4a45e50fb935626bb3ff1 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -188,7 +188,7 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx,
   z->mutable_data(ctx.GetPlace());
   const auto &dev_ctx = ctx.template device_context();
   phi::funcs::ElementwiseCompute(
-      dev_ctx, *x, *y, axis, func, z);
+      dev_ctx, *x, *y, func, z, axis);
 }
 
 // FusedElemwiseAndAct
@@ -1596,7 +1596,7 @@ static inline std::vector GetReduceDim(const framework::DDim &in,
 
 #if defined(__NVCC__) || defined(__HIPCC__)
 
-template
+template
 void GetGradXAndYOut(const phi::GPUContext &dev_ctx,
                      const platform::Place &place,
                      int axis,
@@ -1605,11 +1605,11 @@ void GetGradXAndYOut(const phi::GPUContext &dev_ctx,
                      phi::DenseTensor *dx,
                      phi::DenseTensor *dy,
                      Functor func) {
-  phi::GetGradXAndYOut(
+  phi::GetGradXAndYOut(
       dev_ctx, place, axis, ins, *dout, dx, dy, func);
 }
 
-template
+template
 void GetGradXOrYOut(const phi::GPUContext &dev_ctx,
                     const platform::Place &place,
                     int axis,
@@ -1617,8 +1617,7 @@ void GetGradXOrYOut(const phi::GPUContext &dev_ctx,
                     const phi::DenseTensor *dout,
                     phi::DenseTensor *dxy,
                     Functor func) {
-  phi::GetGradXOrYOut(
-      dev_ctx, place, axis, ins, *dout, dxy, func);
+  phi::GetGradXOrYOut(dev_ctx, place, axis, ins, *dout, dxy, func);
 }
 #endif
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
index 2680c8ecc5e521bf8f91a83b252ed721e8bc6083..47e317831409abd10c7adb19c85a036393b55ebe 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
@@ -23,8 +23,6 @@ limitations under the License.
 */
 namespace paddle {
 namespace operators {
 
-using ElementwiseType = phi::ElementwiseType;
-
 template
 void LaunchSameDimsElementwiseCudaKernel(
     const KPDevice &ctx,
diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h
index 9709f60bbc1ce2c11694704ba8b80661dcfba434..277e29c4d59ce56cb7c3056ee85b277165a2cbdf 100644
--- a/paddle/fluid/operators/fused/attn_gemm.h
+++ b/paddle/fluid/operators/fused/attn_gemm.h
@@ -109,8 +109,8 @@ class AttnMatMul {
       // bias_out = output + bias
       std::vector ins = {output, bias};
       std::vector outs = {bias_out};
-      phi::funcs::BroadcastKernel(
-          dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor());
+      phi::funcs::BroadcastKernel(
+          dev_ctx_, ins, &outs, phi::funcs::AddFunctor());
     }
   }
diff --git a/paddle/fluid/operators/fused/attn_gemm_int8.h b/paddle/fluid/operators/fused/attn_gemm_int8.h
index c61a7f60d43599cfcc09a2dc647f934e52f7a88f..705cb8ece418e886c88a35334b8271b924a228fc 100644
--- a/paddle/fluid/operators/fused/attn_gemm_int8.h
+++ b/paddle/fluid/operators/fused/attn_gemm_int8.h
@@ -85,8 +85,8 @@ class AttnMatmulINT8 {
     // bias_out = output + bias
     std::vector ins = {output, bias};
     std::vector outs = {bias_out};
-    phi::funcs::BroadcastKernel(
-        dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor());
+    phi::funcs::BroadcastKernel(
+        dev_ctx_, ins, &outs, phi::funcs::AddFunctor());
     PADDLE_ENFORCE_EQ(cudaGetLastError(),
                       cudaSuccess,
                       platform::errors::Fatal(
@@ -139,8 +139,8 @@ class AttnMatmulINT8 {
     // bias_out = output + bias
     std::vector ins = {output, bias};
     std::vector outs = {bias_out};
-    phi::funcs::BroadcastKernel(
-        dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor());
+    phi::funcs::BroadcastKernel(
+        dev_ctx_, ins, &outs, phi::funcs::AddFunctor());
     PADDLE_ENFORCE_EQ(cudaGetLastError(),
                       cudaSuccess,
                       platform::errors::Fatal(
diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h
index 1d83c7a62b1d94031c0b6bbde81dd5504a056068..843b5009a6fccd03b708db7fc07300b8df8828ca 100644
--- a/paddle/fluid/operators/fused/fmha_ref.h
+++ b/paddle/fluid/operators/fused/fmha_ref.h
@@ -255,12 +255,11 @@ class FMHARef {
       ins.emplace_back(src_mask_tensor);
       outs.emplace_back(src_mask_out_tensor);
       int elewise_add_axis = -1;
-      phi::funcs::BroadcastKernel(
-          dev_ctx_,
-          ins,
-          &outs,
-          elewise_add_axis,
-          phi::funcs::AddFunctor());
+      phi::funcs::BroadcastKernel(dev_ctx_,
+                                  ins,
+                                  &outs,
+                                  phi::funcs::AddFunctor(),
+                                  elewise_add_axis);
 
       phi::SoftmaxForwardCUDAKernelDriver(
           dev_ctx_, *src_mask_out_tensor, softmax_axis, softmax_out_tensor);
@@ -432,12 +431,11 @@ class FMHARef {
       ins.emplace_back(src_mask_tensor);
       outs.emplace_back(src_mask_out_tensor);
       int elewise_add_axis = -1;
-      phi::funcs::BroadcastKernel(
-          dev_ctx_,
-          ins,
-          &outs,
-          elewise_add_axis,
-          phi::funcs::AddFunctor());
+      phi::funcs::BroadcastKernel(dev_ctx_,
+                                  ins,
+                                  &outs,
+                                  phi::funcs::AddFunctor(),
+                                  elewise_add_axis);
 
       phi::SoftmaxForwardCUDAKernelDriver(
           dev_ctx_, *src_mask_out_tensor, softmax_axis, softmax_out_tensor);
diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h
index c8c4733df2e2e53aa8a5e7afdb3120bde8a66d7a..105647baf1c35ff55cc24e8a82fd75f9652e32c0 100644
--- a/paddle/fluid/operators/fused/fused_gate_attention.h
+++ b/paddle/fluid/operators/fused/fused_gate_attention.h
@@ -689,13 +689,13 @@ class FMHAGateRef {
       std::vector ins = {
          qk_out, src_mask, nonbatched_bias};
       std::vector outs = {qk_out};
-      phi::funcs::BroadcastKernel(
-          dev_ctx_, ins, &outs, -1,
TernaryAddFunctor()); + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, TernaryAddFunctor()); } else { std::vector ins = {qk_out, src_mask}; std::vector outs = {qk_out}; - phi::funcs::BroadcastKernel( - dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor()); + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, phi::funcs::AddFunctor()); } phi::SoftmaxForwardCUDAKernelDriver(dev_ctx_, *qk_out, -1, softmax_out); } diff --git a/paddle/fluid/operators/fused_token_prune_op.cu b/paddle/fluid/operators/fused_token_prune_op.cu index 434c072e5aa6a3e66b6ddf96fa76f52206871999..8f0a53611f3b29f4e58b22138fac5193981865cc 100644 --- a/paddle/fluid/operators/fused_token_prune_op.cu +++ b/paddle/fluid/operators/fused_token_prune_op.cu @@ -141,8 +141,7 @@ class FusedTokenPruneOpCUDAKernel : public framework::OpKernel { ins.emplace_back(attn); ins.emplace_back(mask); outs.emplace_back(&attn_tmp); - LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, -1, AttnMaskFunctor()); + LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, AttnMaskFunctor()); // 2. Reduce sum const std::vector reduce_dims{1, 2}; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index f9b79010263088846b490425d90017caefaf82cb..aaa86f8c37f62629eb674ffe41c681c0a85c62f8 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -836,12 +836,11 @@ class ReduceCudaGradKernel : public framework::OpKernel { } using MPType = typename kps::details::MPTypeTrait::Type; - phi::ReduceGrad>( - dev_ctx, - pt_d_out.get(), - pt_d_x.get(), - pt_out_dtype, - TransformOp(reduce_num)); + phi::ReduceGrad>(dev_ctx, + pt_d_out.get(), + pt_d_x.get(), + pt_out_dtype, + TransformOp(reduce_num)); } }; diff --git a/paddle/phi/kernels/cpu/bitwise_kernel.cc b/paddle/phi/kernels/cpu/bitwise_kernel.cc index 80424ef624f61bb8f28de66cf122053ef1514c1d..a6297efd9cd3e284de3a0dcaa1e6b3007394a51a 100644 --- a/paddle/phi/kernels/cpu/bitwise_kernel.cc +++ b/paddle/phi/kernels/cpu/bitwise_kernel.cc @@ -24,15 +24,15 @@ limitations under the License. 
*/ namespace phi { -#define DEFINE_BITWISE_KERNEL(op_type) \ - template \ - void Bitwise##op_type##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - DenseTensor* out) { \ - funcs::Bitwise##op_type##Functor func; \ - funcs::ElementwiseCompute, T, T>( \ - dev_ctx, x, y, -1, func, out); \ +#define DEFINE_BITWISE_KERNEL(op_type) \ + template \ + void Bitwise##op_type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + funcs::Bitwise##op_type##Functor func; \ + funcs::ElementwiseCompute, T>( \ + dev_ctx, x, y, func, out); \ } DEFINE_BITWISE_KERNEL(And) diff --git a/paddle/phi/kernels/cpu/compare_kernel.cc b/paddle/phi/kernels/cpu/compare_kernel.cc index cf8eb47fb427f6b73fc5b96592440678609086cd..0fd1332d76c641bc7779bbc8c1fd90ab08297378 100644 --- a/paddle/phi/kernels/cpu/compare_kernel.cc +++ b/paddle/phi/kernels/cpu/compare_kernel.cc @@ -33,10 +33,10 @@ inline void CompareKernelImpl(const Context& ctx, ctx.template Alloc(out); if (x.dims().size() >= y.dims().size()) { funcs::ElementwiseCompute( - ctx, x, y, axis, Functor(), out); + ctx, x, y, Functor(), out, axis); } else { funcs::ElementwiseCompute( - ctx, x, y, axis, InverseFunctor(), out); + ctx, x, y, InverseFunctor(), out, axis); } } @@ -59,7 +59,7 @@ inline void CompareAllKernelImpl(const Context& ctx, tmp_data[0] = Functor()(x.data()[0], y.data()[0]); } else { funcs::ElementwiseCompute( - ctx, x, y, 0, Functor(), &tmp); + ctx, x, y, Functor(), &tmp, 0); } auto tmp_flat = EigenVector::Flatten(tmp); auto out_es = EigenScalar::From(*out); diff --git a/paddle/phi/kernels/cpu/dirichlet_kernel.cc b/paddle/phi/kernels/cpu/dirichlet_kernel.cc index c124920dfa0db8af9cb85e5b4b5889b664dfe989..855e6bdfe1e1ff19fcf4f6e08616e6feef368300 100644 --- a/paddle/phi/kernels/cpu/dirichlet_kernel.cc +++ b/paddle/phi/kernels/cpu/dirichlet_kernel.cc @@ -91,8 +91,8 @@ struct DirichletSampler { true, false); - funcs::ElementwiseCompute, T, T>( - dev_ctx, gamma_samples, gamma_sum, -1, funcs::DivideFunctor(), out); + funcs::ElementwiseCompute, T>( + dev_ctx, gamma_samples, gamma_sum, funcs::DivideFunctor(), out); } }; diff --git a/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc index 40d0e863ea50e92484e80e9f005f7cf2059caeeb..8dbbfc13b81e2e4d578551fe991d6965e5f4297a 100644 --- a/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc @@ -38,10 +38,10 @@ void DivideRawKernel(const Context& dev_ctx, auto y_dims = y.dims(); if (x_dims.size() >= y_dims.size()) { funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::DivideFunctor(), out); + dev_ctx, x, y, funcs::DivideFunctor(), out, axis); } else { funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); + dev_ctx, x, y, funcs::InverseDivideFunctor(), out, axis); } } } diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index 11aac8bbfe3ad37749d1098d81a977db6aaffd2e..321b439547e8d96c7fc906ba61882446199f1b05 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -30,7 +30,7 @@ void MaximumRawKernel(const Context& dev_ctx, // allocate memory for out dev_ctx.template Alloc(out); funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::MaximumFunctor(), out); + dev_ctx, x, y, funcs::MaximumFunctor(), out, axis); } template @@ -42,7 +42,7 @@ void MinimumRawKernel(const 
Context& dev_ctx, // allocate memory for out dev_ctx.template Alloc(out); funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::MinimumFunctor(), out); + dev_ctx, x, y, funcs::MinimumFunctor(), out, axis); } template @@ -57,10 +57,10 @@ void RemainderRawKernel(const Context& dev_ctx, auto y_dims = y.dims(); if (x_dims.size() >= y_dims.size()) { funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::RemainderFunctor(), out); + dev_ctx, x, y, funcs::RemainderFunctor(), out, axis); } else { funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::InverseRemainderFunctor(), out); + dev_ctx, x, y, funcs::InverseRemainderFunctor(), out, axis); } } @@ -76,10 +76,10 @@ void FloorDivideRawKernel(const Context& dev_ctx, auto y_dims = y.dims(); if (x_dims.size() >= y_dims.size()) { funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::FloorDivideFunctor(), out); + dev_ctx, x, y, funcs::FloorDivideFunctor(), out, axis); } else { funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::InverseFloorDivideFunctor(), out); + dev_ctx, x, y, funcs::InverseFloorDivideFunctor(), out, axis); } } @@ -95,10 +95,10 @@ void ElementwisePowRawKernel(const Context& dev_ctx, auto y_dims = y.dims(); if (x_dims.size() >= y_dims.size()) { funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::ElementwisePowFunctor(), out); + dev_ctx, x, y, funcs::ElementwisePowFunctor(), out, axis); } else { funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::ElementwiseInversePowFunctor(), out); + dev_ctx, x, y, funcs::ElementwiseInversePowFunctor(), out, axis); } } @@ -110,7 +110,7 @@ void HeavisideKernel(const Context& dev_ctx, // allocate memory for out dev_ctx.template Alloc(out); funcs::ElementwiseCompute, T>( - dev_ctx, x, y, -1, funcs::ElementwiseHeavisideFunctor(), out); + dev_ctx, x, y, funcs::ElementwiseHeavisideFunctor(), out); } } // namespace phi diff --git a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc index c42e423ba2d34c000e396b9f1f623f46e60f975c..630e786b571bc7cfbc38cf87c87074e27ae01546 100644 --- a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc @@ -68,20 +68,15 @@ void LayerNormGradKernel(const Context& dev_ctx, temp_norm.Resize(matrix_shape); dev_ctx.template Alloc(&temp_norm); // get x_norm - phi::funcs::ElementwiseCompute, T, T>( - dev_ctx, - x_tmp, - mean, - /*axis*/ 0, - funcs::SubtractFunctor(), - &temp_norm); - phi::funcs::ElementwiseCompute, T, T>( + phi::funcs::ElementwiseCompute, T>( + dev_ctx, x_tmp, mean, funcs::SubtractFunctor(), &temp_norm, 0); + phi::funcs::ElementwiseCompute, T>( dev_ctx, temp_norm, variance, - /*axis*/ 0, funcs::DivAndSqrtFunctor(static_cast(epsilon)), - &temp_norm); + &temp_norm, + 0); } if (d_bias) { @@ -90,8 +85,8 @@ void LayerNormGradKernel(const Context& dev_ctx, } if (d_scale) { dev_ctx.template Alloc(d_scale); - phi::funcs::ElementwiseCompute, T, T>( - dev_ctx, temp_norm, d_y, 0, funcs::MultiplyFunctor(), &temp); + phi::funcs::ElementwiseCompute, T>( + dev_ctx, temp_norm, d_y, funcs::MultiplyFunctor(), &temp, 0); colwise_sum(dev_ctx, temp, d_scale); } @@ -107,70 +102,45 @@ void LayerNormGradKernel(const Context& dev_ctx, if (d_scale) { // dy_dx - phi::funcs::ElementwiseCompute, T, T>( - dev_ctx, d_y, *scale, /*axis*/ 1, funcs::MultiplyFunctor(), &temp); + phi::funcs::ElementwiseCompute, T>( + dev_ctx, d_y, *scale, funcs::MultiplyFunctor(), &temp, 1); phi::Copy(dev_ctx, temp, dev_ctx.GetPlace(), false, d_x); // dy_dmean_dx 
row_mean(dev_ctx, temp, &temp_vec); - phi::funcs::ElementwiseCompute, T, T>( - dev_ctx, - *d_x, - temp_vec, - /*axis*/ 0, - funcs::SubtractFunctor(), - d_x); + phi::funcs::ElementwiseCompute, T>( + dev_ctx, *d_x, temp_vec, funcs::SubtractFunctor(), d_x, 0); // dy_var_dx - phi::funcs::ElementwiseCompute, T, T>( - dev_ctx, - temp, - temp_norm, - /*axis*/ 0, - funcs::MultiplyFunctor(), - &temp); + phi::funcs::ElementwiseCompute, T>( + dev_ctx, temp, temp_norm, funcs::MultiplyFunctor(), &temp, 0); } else { // dy_dx phi::Copy(dev_ctx, d_y, dev_ctx.GetPlace(), false, d_x); // dy_dmean_dx row_mean(dev_ctx, d_y, &temp_vec); - phi::funcs::ElementwiseCompute, T, T>( - dev_ctx, - *d_x, - temp_vec, - /*axis*/ 0, - funcs::SubtractFunctor(), - d_x); + phi::funcs::ElementwiseCompute, T>( + dev_ctx, *d_x, temp_vec, funcs::SubtractFunctor(), d_x, 0); // dy_var_dx - phi::funcs::ElementwiseCompute, T, T>( - dev_ctx, - d_y, - temp_norm, - /*axis*/ 0, - funcs::MultiplyFunctor(), - &temp); + phi::funcs::ElementwiseCompute, T>( + dev_ctx, d_y, temp_norm, funcs::MultiplyFunctor(), &temp, 0); } // dy_var_dx row_mean(dev_ctx, temp, &temp_vec); - phi::funcs::ElementwiseCompute, T, T>( - dev_ctx, - temp_norm, - temp_vec, - /*axis*/ 0, - funcs::MultiplyFunctor(), - &temp); - phi::funcs::ElementwiseCompute, T, T>( - dev_ctx, *d_x, temp, /*axis*/ 0, funcs::SubtractFunctor(), d_x); - - phi::funcs::ElementwiseCompute, T, T>( + phi::funcs::ElementwiseCompute, T>( + dev_ctx, temp_norm, temp_vec, funcs::MultiplyFunctor(), &temp, 0); + phi::funcs::ElementwiseCompute, T>( + dev_ctx, *d_x, temp, funcs::SubtractFunctor(), d_x, 0); + + phi::funcs::ElementwiseCompute, T>( dev_ctx, *d_x, variance, - /*axis*/ 0, funcs::DivAndSqrtFunctor(static_cast(epsilon)), - d_x); + d_x, + 0); d_x->Resize(dx_dim); } } diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc index 1c82866f0bbda06ed35a8e9390c80c3d6305015d..2a93d03b4abc15a6a544426ea12ffe3a2320a2bb 100644 --- a/paddle/phi/kernels/cpu/layer_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_kernel.cc @@ -67,30 +67,30 @@ void LayerNormKernel(const Context& dev_ctx, // get variance - phi::funcs::ElementwiseCompute, T, T>( - dev_ctx, x_tmp, *mean, 0, funcs::SubAndSquareFunctor(), &out); + phi::funcs::ElementwiseCompute, T>( + dev_ctx, x_tmp, *mean, funcs::SubAndSquareFunctor(), &out, 0); row_mean(dev_ctx, out, var); // get x_norm - phi::funcs::ElementwiseCompute, T, T>( - dev_ctx, x_tmp, *mean, 0, funcs::SubtractFunctor(), &out); + phi::funcs::ElementwiseCompute, T>( + dev_ctx, x_tmp, *mean, funcs::SubtractFunctor(), &out, 0); - phi::funcs::ElementwiseCompute, T, T>( + phi::funcs::ElementwiseCompute, T>( dev_ctx, out, *var, - 0, funcs::DivAndSqrtFunctor(static_cast(epsilon)), - &out); + &out, + 0); if (scale) { - phi::funcs::ElementwiseCompute, T, T>( - dev_ctx, out, *scale, 1, funcs::MultiplyFunctor(), &out); + phi::funcs::ElementwiseCompute, T>( + dev_ctx, out, *scale, funcs::MultiplyFunctor(), &out, 1); } if (bias) { - phi::funcs::ElementwiseCompute, T, T>( - dev_ctx, out, *bias, 1, funcs::AddFunctor(), &out); + phi::funcs::ElementwiseCompute, T>( + dev_ctx, out, *bias, funcs::AddFunctor(), &out, 1); } #else PADDLE_ENFORCE_EQ(mean->numel(), diff --git a/paddle/phi/kernels/cpu/logical_kernel.cc b/paddle/phi/kernels/cpu/logical_kernel.cc index 3a669883f78378db76d19103336870c3171b6126..38c927e976f8c874e2c18f17d8b0d6c09e57a9af 100644 --- a/paddle/phi/kernels/cpu/logical_kernel.cc +++ b/paddle/phi/kernels/cpu/logical_kernel.cc @@ -32,7 
+32,7 @@ namespace phi { DenseTensor* out) { \ funcs::Logical##type##Functor binary_func; \ funcs::ElementwiseCompute, T, bool>( \ - dev_ctx, x, y, -1, binary_func, out); \ + dev_ctx, x, y, binary_func, out); \ } DEFINE_LOGICAL_BINARY_KERNEL(And) diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index fbb16138567d154360acb33bb94ba695dd48ed0b..e1188fda486c77cdb6b588e857377e34b8d3b967 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -132,11 +132,10 @@ void MatrixRankTolKernel(const Context& dev_ctx, DenseTensor tol_tensor; tol_tensor.Resize(dim_out); dev_ctx.template Alloc(&tol_tensor); - funcs::ElementwiseCompute, T, T>( + funcs::ElementwiseCompute, T>( dev_ctx, atol_tensor, rtol_tensor, - -1, GreaterElementFunctor(), &tol_tensor); @@ -151,17 +150,17 @@ void MatrixRankTolKernel(const Context& dev_ctx, dev_ctx, eigenvalue_tensor, tol_tensor, - axis, funcs::GreaterThanFunctor(), - &compare_result); + &compare_result, + axis); } else { funcs::ElementwiseCompute, T, int>( dev_ctx, eigenvalue_tensor, tol_tensor, - axis, funcs::LessThanFunctor(), - &compare_result); + &compare_result, + axis); } phi::SumKernel(dev_ctx, diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index f96a1764c24a5e63db1fbf8e40e79535a5c4f309..e754ce3bf49e4659f885e9d94a116bc98ef0aa26 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -31,20 +31,49 @@ namespace funcs { enum BroadcastLoadType { kMixed = 1, kBroadcast = 2, kElementwise = 3 }; -template +template +struct UseBroadcast { + template + static HOSTDEVICE void Apply( + const std::vector &ins_tensor, + const ArgsT &args, + int64_t numel, + Array1 *ins_data, + Array2 *use_broadcast, + int *broadcast_num, + bool *all_elementwise) { + (*ins_data)[Index] = (const _ptr_ char *)(ins_tensor[Index]->data()); + bool is_same_dim = ins_tensor[Index]->numel() == numel; + if (is_same_dim) { + (*use_broadcast)[Index] = false; + } else { + (*use_broadcast)[Index] = true; + (*broadcast_num)++; + } + *all_elementwise &= is_same_dim; + } +}; + +template struct LoaderTypeClassifier { public: int64_t numel{0}; - int vec_size{1}; + int vec_size{4}; int broadcast_num{0}; bool all_elementwise{true}; - phi::Array use_broadcast; - phi::Array ins_data; + phi::Array use_broadcast; + phi::Array ins_data; LoaderTypeClassifier() {} LoaderTypeClassifier(const std::vector &ins, std::vector *outs) { + using Traits = phi::funcs::FunctionTraits; + using ArgsT = typename Traits::ArgsTuple; + ArgsT arg; uint64_t out_addr = reinterpret_cast((*outs)[0]->data()); + + UnrollerWithoutVecSize::step(ins, arg, &vec_size); + for (auto i = 1; i < outs->size(); ++i) { PADDLE_ENFORCE_EQ( (*outs)[i]->dims(), @@ -56,165 +85,191 @@ struct LoaderTypeClassifier { out_addr = (out_addr | reinterpret_cast((*outs)[i]->data())); } - int out_vec_size = - phi::GetVectorizedSize(reinterpret_cast(out_addr)); - uint64_t in_addr = static_cast(0); + vec_size = std::min( + vec_size, + phi::GetVectorizedSize(reinterpret_cast(out_addr))); numel = (*outs)[0]->numel(); - for (int i = 0; i < Arity; ++i) { - auto in_data = ins[i]->data(); - ins_data[i] = (const _ptr_ InT *)(in_data); - - bool is_same_dim = ins[i]->numel() == numel; - if (is_same_dim) { - use_broadcast[i] = false; - in_addr = (in_addr | reinterpret_cast(in_data)); - } else { - use_broadcast[i] = true; - broadcast_num++; - } - 
all_elementwise &= is_same_dim; - } - int in_vec_size = std::min( - 4, phi::GetVectorizedSize(reinterpret_cast(in_addr))); - vec_size = std::min(out_vec_size, in_vec_size); + UnrollerWithoutVecSize::step(ins, + arg, + numel, + &ins_data, + &use_broadcast, + &broadcast_num, + &all_elementwise); } }; -#ifndef PADDLE_WITH_XPU_KP // Common broadcast/elementwise Loader. -template +template struct BroadcastDataLoader { - __device__ __forceinline__ void operator()( - T args[Arity][VecSize], - const phi::Array &ins, - const phi::Array &configs, - const phi::Array &use_broadcast, - const int block_offset, - const int num, - const uint32_t numel) { -#pragma unroll - for (int i = 0; i < Arity; ++i) { - kps::Init(args[i], static_cast(1.0f)); - if (use_broadcast[i]) { - kps::ReadDataBc( - args[i], ins[i], block_offset, configs[i], numel, VecSize); - } else { - kps::ReadData( - args[i], ins[i] + block_offset, num, VecSize); - } + template + static __device__ __forceinline__ void Apply(const Array1 &ins, + ArgsT *args, + const Array2 &configs, + const Array3 &use_broadcast, + const int block_offset, + const int num, + const uint32_t numel, + int read_lens) { + using Type = std::tuple_element_t; +#ifdef PADDLE_WITH_XPU_KP + kps::Init( + args, static_cast(1.0f), read_lens); + if (use_broadcast[Index]) { + kps::ReadDataBc( + args, + reinterpret_cast(ins[Index]), + block_offset, + configs[Index], + numel, + read_lens); + } else { + kps::ReadData( + args, + reinterpret_cast(ins[Index]) + block_offset, + num, + read_lens); } +#else + kps::Init(args, static_cast(1.0f)); + if (use_broadcast[Index]) { + kps::ReadDataBc( + args, + reinterpret_cast(ins[Index]), + block_offset, + configs[Index], + numel, + VecSize); + } + // NOTE: If use if...else... with condition `use_broadcast[Index]` here, + // there will be some errs with clang12 while compiling in ROCm. + // When the compiler is upgraded, if...else... may be used. + if (!use_broadcast[Index]) { + kps::ReadData( + args, + reinterpret_cast(ins[Index]) + block_offset, + num, + VecSize); + } +#endif } }; +/* BroadcastDataLoaders Partial specialization */ +#ifndef PADDLE_WITH_XPU_KP // Scalar elementwise Loader with consideration of IsBoundary. -template -struct BroadcastDataLoader { - __device__ __forceinline__ void operator()( - T args[Arity][VecSize], - const phi::Array &ins, - const phi::Array &configs, - const phi::Array &use_broadcast, - const int block_offset, - const int num, - const uint32_t numel) { +template +struct BroadcastDataLoader { + template + static __device__ __forceinline__ void Apply(const Array1 &ins, + ArgsT *args, + const Array2 &configs, + const Array3 &use_broadcast, + const int block_offset, + const int num, + const uint32_t numel, + int read_lens) { + using Type = std::tuple_element_t; int thread_offset = threadIdx.x * VecSize + block_offset; #pragma unroll - for (int i = 0; i < Arity; ++i) { -#pragma unroll - for (int idx = 0; idx < VecSize; ++idx) { - args[i][idx] = static_cast(1); - int index = thread_offset + idx; - if (index < numel) { - args[i][idx] = ins[i][index]; - } + for (int idx = 0; idx < VecSize; ++idx) { + std::get(args[idx]) = static_cast(1); + int index = thread_offset + idx; + if (index < numel) { + std::get(args[idx]) = + reinterpret_cast(ins[Index])[index]; } } } }; // Vectorized elementwise Loader without consideration of IsBoundary. 
-template -struct BroadcastDataLoader { - __device__ __forceinline__ void operator()( - T args[Arity][VecSize], - const phi::Array &ins, - const phi::Array &configs, - const phi::Array &use_broadcast, - const int block_offset, - const int num, - const uint32_t numel) { - using VecType = phi::kps::details::VectorType; - VecType vec_temp[Arity]; +template +struct BroadcastDataLoader { + template + static __device__ __forceinline__ void Apply(const Array1 &ins, + ArgsT *args, + const Array2 &configs, + const Array3 &use_broadcast, + const int block_offset, + const int num, + const uint32_t numel, + int read_lens) { + using Type = std::tuple_element_t; + using VecType = phi::kps::details::VectorType; + VecType vec_temp; int thread_offset = threadIdx.x + blockIdx.x * blockDim.x; + const VecType *__restrict__ vec_input = + reinterpret_cast(ins[Index]); + vec_temp = vec_input[thread_offset]; #pragma unroll - for (int i = 0; i < Arity; ++i) { - const VecType *__restrict__ vec_input = - reinterpret_cast(ins[i]); - vec_temp[i] = vec_input[thread_offset]; -#pragma unroll - for (int idx = 0; idx < VecSize; ++idx) { - args[i][idx] = vec_temp[i].val[idx]; - } + for (int idx = 0; idx < VecSize; ++idx) { + std::get(args[idx]) = vec_temp.val[idx]; } } }; -// Common broadcast data loader. -template -struct BroadcastDataLoader { - __device__ __forceinline__ void operator()( - T args[Arity][VecSize], - const phi::Array &ins, - const phi::Array &configs, - const phi::Array &use_broadcast, - const int block_offset, - const int num, - const uint32_t numel) { - uint32_t index_bc[Arity][VecSize]; -#pragma unroll - for (int j = 0; j < Arity; ++j) { -#pragma unroll - for (int k = 0; k < VecSize; ++k) { - index_bc[j][k] = 0; - args[j][k] = static_cast(1); - } - } - - uint32_t thread_offset = block_offset + threadIdx.x * VecSize; +template +struct BroadcastDataInit { + template + static __device__ __forceinline__ void Apply(ArgsT *args) { + using Type = std::tuple_element_t; #pragma unroll for (int k = 0; k < VecSize; ++k) { - uint32_t idx = thread_offset + k; - if (IsBoundary) { - if (idx == numel) break; - } - -#pragma unroll - for (int i = 0; i < phi::DDim::kMaxRank; ++i) { - if (i == configs[0].rank) break; - auto fast_divmoder = configs[0].divmoders[i].Divmod(idx); - idx = fast_divmoder.val[0]; -#pragma unroll - for (int j = 0; j < Arity; ++j) { - index_bc[j][k] += fast_divmoder.val[1] * configs[j].strides[i]; - } - } + std::get(args[k]) = static_cast(1); } + } +}; +template +struct BroadcastDataSetter { + template + static __device__ __forceinline__ void Apply(const Array &ins, + ArgsT *args, + uint32_t index_bc[][VecSize]) { + using Type = std::tuple_element_t; #pragma unroll - for (int j = 0; j < Arity; ++j) { -#pragma unroll - for (int k = 0; k < VecSize; ++k) { - args[j][k] = ins[j][index_bc[j][k]]; - } + for (int k = 0; k < VecSize; ++k) { + std::get(args[k]) = + reinterpret_cast(ins[Index])[index_bc[Index][k]]; } } }; + #endif -template + typename Func, + bool IsBoundary, + int LoadType, + int VecSize, + int End, + int Begin = 0> +struct BcUnroller { + template + static HOSTDEVICE inline void step(Args &&...args) { + Func::Apply( + std::forward(args)...); + BcUnroller::step( + args...); + } +}; + +template
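Note for readers of this diff: the churn above is two mechanical patterns applied everywhere. First, phi::funcs::BroadcastKernel, phi::funcs::ElementwiseCompute, and LaunchElementwiseCudaKernel now take the functor before the output argument and take axis as a trailing parameter defaulting to -1, so call sites that used to pass -1 simply drop it. Second, in broadcast_function.h the loaders no longer fill a homogeneous T args[Arity][VecSize] array inside a runtime loop over Arity; instead BcUnroller / UnrollerWithoutVecSize instantiate BroadcastDataLoader, BroadcastDataInit, and BroadcastDataSetter once per input index and write into the functor's argument tuple (ArgsT from FunctionTraits), so each input can keep its own element type. The sketch below is a small, self-contained analogue of both patterns; Tensor, Context, BroadcastKernelLike, Unroller, and LoadArg are hypothetical names used only for illustration, not Paddle APIs.

// Standalone analogue (not Paddle code) of the two refactor patterns above.
#include <cstddef>
#include <cstdio>
#include <tuple>
#include <utility>
#include <vector>

struct Tensor { std::vector<float> data; };  // stand-in for phi::DenseTensor
struct Context {};                           // stand-in for the device context

// Pattern 1: the functor moves ahead of the remaining arguments and `axis`
// becomes a trailing default, so call sites that used to pass -1 drop it.
template <typename Functor>
void BroadcastKernelLike(const Context &ctx,
                         const std::vector<const Tensor *> &ins,
                         std::vector<Tensor *> *outs,
                         Functor func,
                         int axis = -1) {
  // Minimal binary, same-shape fallback; real broadcasting is omitted.
  Tensor *out = (*outs)[0];
  for (std::size_t i = 0; i < out->data.size(); ++i) {
    out->data[i] = func(ins[0]->data[i], ins[1]->data[i]);
  }
  (void)ctx;
  (void)axis;
}

// Pattern 2: a compile-time unroller that instantiates Func<Index>::Apply once
// per input, instead of a runtime `for (i < Arity)` loop over one shared type.
template <template <int> class Func, int End, int Begin = 0>
struct Unroller {
  template <typename... Args>
  static void Step(Args &&...args) {
    Func<Begin>::Apply(std::forward<Args>(args)...);
    Unroller<Func, End, Begin + 1>::Step(args...);
  }
};
template <template <int> class Func, int End>
struct Unroller<Func, End, End> {
  template <typename... Args>
  static void Step(Args &&...) {}
};

// Per-argument loader: writes element `i` of input `Index` into slot `Index`
// of the argument tuple, so each input may use its own element type.
template <int Index>
struct LoadArg {
  template <typename ArgsT>
  static void Apply(const std::vector<const Tensor *> &ins,
                    ArgsT *args,
                    std::size_t i) {
    std::get<Index>(*args) = ins[Index]->data[i];
  }
};

int main() {
  Tensor a{{1, 2, 3}}, b{{10, 20, 30}}, c;
  c.data.resize(3);
  std::vector<const Tensor *> ins{&a, &b};
  std::vector<Tensor *> outs{&c};

  // `axis` is omitted, as at most of the call sites touched by this diff.
  BroadcastKernelLike(
      Context{}, ins, &outs, [](float x, float y) { return x + y; });

  // Fill both argument slots for element 1 via the compile-time unroller.
  std::tuple<float, float> args;
  Unroller<LoadArg, 2>::Step(ins, &args, std::size_t{1});
  std::printf("out[0] = %g, args = (%g, %g)\n",
              c.data[0], std::get<0>(args), std::get<1>(args));
  return 0;
}

Because each loader step is indexed, the element type of input Index can be recovered from the functor's argument tuple instead of being fixed once for every input, which is also why LoaderTypeClassifier above now derives vec_size through UnrollerWithoutVecSize over that tuple rather than from a single shared input pointer type.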