diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index f3ddcd4e5c23c75f9715f6624cdbdf420309dd7e..adea0c04df59cd1715905e75dbbe9012add7d134 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -36,7 +36,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/pten/kernels/funcs/cuda_kernel_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index e3d4607b7130c5fc6abeafb9a617589a8c2370cc..f568ee191239d205fcf5eef77cb4a53959ae1952 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -15,45 +15,13 @@ #pragma once #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" + +// only can include the headers in paddle/top/api dirs +#include "paddle/pten/kernels/gpu/elementwise.h" namespace paddle { namespace operators { -namespace kps = paddle::operators::kernel_primitives; - -template -void LaunchBroadcastElementwiseCudaKernel( - const KPDevice &ctx, const std::vector &ins, - std::vector *outs, int axis, Functor func) { - std::vector pt_inputs; - std::vector pt_outputs; - // TODO(YuanRisheng) *_tmp for cache DenseTensor, because the temporary - // DenseTensor obj - // generated by MakePtenDenseTensor can be destroyed when exits loop. *_tmp - // can be deleted - // when DenseTensor support copy constructor. - std::vector> pt_inputs_tmp; - std::vector> pt_outputs_tmp; - for (auto in : ins) { - pt_inputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*in))); - } - for (auto out : *outs) { - pt_outputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*out))); - } - for (int i = 0; i < pt_inputs_tmp.size(); i++) { - pt_inputs.push_back(pt_inputs_tmp[i].get()); - } - for (int i = 0; i < pt_outputs_tmp.size(); i++) { - pt_outputs.push_back(pt_outputs_tmp[i].get()); - } - pten::LaunchBroadcastElementwiseCudaKernel( - ctx, pt_inputs, &pt_outputs, axis, func); -} - template void LaunchElementwiseCudaKernel( @@ -82,7 +50,7 @@ void LaunchElementwiseCudaKernel( for (int i = 0; i < pt_outputs_tmp.size(); i++) { pt_outputs.push_back(pt_outputs_tmp[i].get()); } - pten::LaunchElementwiseCudaKernel( + pten::funcs::BroadcastKernel( ctx, pt_inputs, &pt_outputs, axis, func); } diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 233b0767ed624489c478b953e26ed43c91d5b4fc..88c4a266cd9a108cc597e81d281cf5b02299fce6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ // only can include the headers in paddle/top/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/kernels/gpu/elementwise.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" namespace paddle { namespace operators { @@ -53,8 +53,8 @@ void LaunchSameDimsElementwiseCudaKernel( for (int i = 0; i < pt_outputs_tmp.size(); i++) { pt_outputs.push_back(pt_outputs_tmp[i].get()); } - pten::funcs::LaunchSameDimsElementwiseCudaKernel( - ctx, pt_inputs, &pt_outputs, func); + pten::funcs::ElementwiseKernel(ctx, pt_inputs, + &pt_outputs, func); } } // namespace operators diff --git a/paddle/pten/kernels/funcs/broadcast_function.h b/paddle/pten/kernels/funcs/broadcast_function.h new file mode 100644 index 0000000000000000000000000000000000000000..0823fa32b775c500f984287e676d2ad4e6421b43 --- /dev/null +++ b/paddle/pten/kernels/funcs/broadcast_function.h @@ -0,0 +1,585 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/kernels/funcs/elementwise_base.h" + +#if defined(__NVCC__) || defined(__HIPCC__) + +namespace kps = pten::kps; + +#endif + +namespace pten { +namespace funcs { + +struct DimensionsTransform { + using DimVector = std::vector; + typedef void (*MergeFunctor)( + bool &, std::vector &, DimVector &, int, int); + int64_t dim_size; + DimVector out_dims; + std::vector in_dims; + + private: + // To compensate the lackage of input_tensors` dimension with input variable + // 'axis' + void InputDimensionsExtend(int N, int axis) { + for (auto &in_dim : in_dims) { + int64_t in_idx = 0; + if (in_dim.size() < dim_size) { + DimVector tmp_dim(dim_size, 1); + do { + if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) { + tmp_dim[axis] = in_dim[in_idx]; + in_idx++; + axis++; + } else { + PADDLE_THROW(pten::errors::InvalidArgument( + "The %d-th dimension of input tensor is expected to be equal " + "with the %d-th dimension of output tensor %d or 1, but " + "recieved %d.", + in_idx + 1, + axis + 1, + out_dims[axis], + in_dim[in_idx])); + } + } while (in_idx < in_dim.size()); + in_dim.resize(dim_size); + std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin()); + } else { + do { + if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) { + in_idx++; + } else { + PADDLE_THROW(pten::errors::InvalidArgument( + "The %d-th dimension of input tensor is expected to be equal " + "with the %d-th dimension of output tensor %d or 1, but " + "recieved %d.", + in_idx + 1, + in_idx + 1, + out_dims[in_idx], + in_dim[in_idx])); + } + } while (in_idx < dim_size); + } + std::reverse(in_dim.begin(), in_dim.end()); + } + std::reverse(out_dims.begin(), out_dims.end()); + } + + template + __inline__ void MergeDimensions(MergeFunctor merge_func, int N) { + auto VectorReorganise = [](DimVector *vec, int l_idx, int m_idx) { + (*vec)[m_idx - 1] = std::accumulate(vec->begin() + l_idx, + vec->begin() + m_idx, + 1, + std::multiplies()); + vec->erase(vec->begin() + l_idx, vec->begin() + m_idx - 1); + }; 
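For readers following the new DimensionsTransform in broadcast_function.h: InputDimensionsExtend above pads every input shape with 1s until it reaches the output rank (starting at `axis`), and MergeDimensions, whose loop continues below, then fuses runs of consecutive dimensions that share one broadcast pattern, so the kernel only has to index the smallest possible rank. The following standalone sketch shows the effect on concrete shapes; AlignToOutputRank and MergeConsecutive are hypothetical illustrative helpers, not Paddle APIs, and the real code additionally reverses the dims and routes the pattern test through a MergeFunctor.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

using Shape = std::vector<int64_t>;

// Pad a lower-rank input shape with 1s so it matches the output rank,
// placing its dims starting at `axis` (what InputDimensionsExtend does
// before the merge passes run). Assumes 0 <= axis and the shape fits.
Shape AlignToOutputRank(const Shape &in, std::size_t out_rank, int axis) {
  Shape aligned(out_rank, 1);
  for (std::size_t i = 0; i < in.size(); ++i) aligned[axis + i] = in[i];
  return aligned;
}

// Collapse runs of consecutive dimensions whose broadcast pattern is the
// same for every input (each input either matches the output or is 1 across
// the whole run), so the merged shapes have the lowest rank possible.
void MergeConsecutive(std::vector<Shape> *ins, Shape *out) {
  std::vector<Shape> m_ins(ins->size());
  Shape m_out;
  for (std::size_t d = 0; d < out->size(); ++d) {
    bool fuse = !m_out.empty();
    for (std::size_t j = 0; j < ins->size() && fuse; ++j) {
      bool was_bcast = m_ins[j].back() == 1 && m_out.back() != 1;
      bool is_bcast = (*ins)[j][d] == 1 && (*out)[d] != 1;
      fuse = (was_bcast == is_bcast);  // only fuse dims with equal patterns
    }
    if (fuse) {
      m_out.back() *= (*out)[d];
      for (std::size_t j = 0; j < ins->size(); ++j)
        m_ins[j].back() *= (*ins)[j][d];
    } else {
      m_out.push_back((*out)[d]);
      for (std::size_t j = 0; j < ins->size(); ++j)
        m_ins[j].push_back((*ins)[j][d]);
    }
  }
  *ins = m_ins;
  *out = m_out;
}

int main() {
  Shape out = {2, 3, 4, 5};
  std::vector<Shape> ins = {
      AlignToOutputRank({3, 4}, out.size(), 1),  // -> {1, 3, 4, 1}
      out};                                      // already full rank
  MergeConsecutive(&ins, &out);
  auto print = [](const char *name, const Shape &s) {
    std::printf("%s:", name);
    for (auto v : s) std::printf(" %lld", static_cast<long long>(v));
    std::printf("\n");
  };
  print("out", out);     // out: 2 12 5
  print("in0", ins[0]);  // in0: 1 12 1
  print("in1", ins[1]);  // in1: 2 12 5
  return 0;
}

With out = {2, 3, 4, 5} and an input of shape {3, 4} aligned at axis 1, the merged shapes drop from rank 4 to rank 3, which is the rank that the CALL_BROADCAST_FOR_DIM_SIZE switch later in this file dispatches on.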
+ + int64_t i = 0; + while (i < dim_size) { + int cnt = 0; + int low_idx = i; + bool equal = true; + do { + merge_func(equal, in_dims, out_dims, i, N); + if (equal) { + i++; + cnt++; + } else { + break; + } + } while (i < dim_size); + + if (cnt > 1) { + for (auto &in_dim : in_dims) { + VectorReorganise(&in_dim, low_idx, i); + } + VectorReorganise(&out_dims, low_idx, i); + dim_size -= --cnt; + i -= cnt; + } else if (cnt < 1) { + i++; + } + } + } + + public: + explicit DimensionsTransform(const std::vector &ins, + const pten::DDim &dims, + int axis) { + const int N = max(static_cast(ins.size()), 2); + dim_size = dims.size(); + out_dims = pten::vectorize(dims); + in_dims.resize(N); + if (ins.size() == 1) { + // when ins.size() = 1, broadcast input to output + in_dims[0] = pten::vectorize(ins[0]->dims()); + in_dims[1] = out_dims; + // Add out_dims to in_dims to avoid errors in dims merging + } else { + for (int j = 0; j < N; ++j) { + in_dims[j] = pten::vectorize(ins[j]->dims()); + } + } + InputDimensionsExtend(N, axis); + + auto merge_sequential_dims = [](bool &equal, + std::vector &in_dims, + DimVector &out, + int i, + int num) { + for (int j = 1; j < num; ++j) { + equal &= (in_dims[0][i] == in_dims[j][i]) ? true : false; + } + }; + auto merge_sequential_one_dims = [](bool &equal, + std::vector &in_dims, + DimVector &out, + int i, + int num) { + equal = in_dims[0][i] == 1; + if (equal) { + for (int j = 1; j < num; ++j) { + equal &= in_dims[j][i] == out[i]; + } + } + }; + // To Merge the dimensions of input_tensors while the consequtive + // equal-dimensions appears. + MergeFunctor merge_ptr = merge_sequential_dims; + MergeDimensions(merge_ptr, N); + + int min_idx = 0; + int min_val = std::accumulate( + in_dims[0].begin(), in_dims[0].end(), 1, std::multiplies()); + for (int j = 1; j < N; ++j) { + int temp = std::accumulate( + in_dims[j].begin(), in_dims[j].end(), 1, std::multiplies()); + min_val = min_val > temp ? temp : min_val; + min_idx = min_val == temp ? j : min_idx; + } + std::swap(in_dims[0], in_dims[min_idx]); + + // To Merge the dimension of input_tensors while the consequtive + // 1-value-dimensions appears. 
+ merge_ptr = merge_sequential_one_dims; + MergeDimensions(merge_ptr, N); + std::swap(in_dims[min_idx], in_dims[0]); + } +}; + +#if defined(__NVCC__) || defined(__HIPCC__) + +template +__device__ __forceinline__ void LoadData( + T *dst, + const _ptr_ T *src, + uint32_t block_offset, + const kps::details::BroadcastConfig &config, + int numel, + int num, + int need_broadcast) { + // numel : whole num of output + // num: how many data will be deal with in this time + if (need_broadcast) { + kps::ReadDataBc( + dst, src, block_offset, config, numel); + } else { + kps::ReadData(dst, src + block_offset, num); + } +} + +template +__device__ void VectorizedBroadcastKernelImpl( + const pten::Array &ins, + pten::Array<_ptr_ OutT *, NumOuts> outs, + const pten::Array &use_broadcast, + uint32_t numel, + const pten::Array, Arity> &configs, + int num, + int block_offset, + Functor func) { + InT args[Arity][VecSize]; + ConditionalT result[VecSize]; + +#pragma unroll + for (int i = 0; i < Arity; i++) { + kps::Init(args[i], static_cast(1.0f)); + LoadData(args[i], + ins[i], + block_offset, + configs[i], + numel, + num, + use_broadcast[i]); + } + constexpr bool kCallElementwiseAny = + paddle::platform::FunctionTraits::has_pointer_args; + pten::funcs::ElementwisePrimitiveCaller, + VecSize, + Functor, + Arity, + kCallElementwiseAny>()( + func, args, result); + + pten::funcs::ElementwiseWriteDataCaller()( + outs, result, block_offset, num); +} + +template +__global__ void VectorizedBroadcastKernel( + pten::Array ins, + pten::Array<_ptr_ OutT *, NumOuts> outs, + pten::Array use_broadcast, + uint32_t numel, + pten::Array, Arity> configs, + int main_offset, + int tail_tid, + Functor func) { + int block_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + +#ifdef PADDLE_WITH_XPU2 + for (; block_offset < main_offset; block_offset += stride) { + VectorizedBroadcastKernelImpl(ins, + outs, + use_broadcast, + numel, + configs, + BLOCK_NUM_X * VecSize, + block_offset, + func); + } + int num = numel - block_offset; + if (num > 0) { + VectorizedBroadcastKernelImpl( + ins, outs, use_broadcast, numel, configs, num, block_offset, func); + } +#else + if (block_offset < main_offset) { + VectorizedBroadcastKernelImpl(ins, + outs, + use_broadcast, + numel, + configs, + BLOCK_NUM_X * VecSize, + block_offset, + func); + } else { + VectorizedBroadcastKernelImpl( + ins, outs, use_broadcast, numel, configs, tail_tid, block_offset, func); + } +#endif +} + +template +void LaunchBroadcastKernel(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + Functor func, + DimensionsTransform merge_dims) { + int numel = (*outs)[0]->numel(); + pten::Array, Arity> configs; + pten::Array use_broadcast; + pten::Array ins_data; + pten::Array<_ptr_ OutT *, NumOuts> outs_data; + + for (int i = 0; i < NumOuts; ++i) { + outs_data[i] = ctx.Alloc((*outs)[i]); + } + + for (int i = 0; i < Arity; i++) { + use_broadcast[i] = (ins[i]->numel() != numel); + ins_data[i] = (_ptr_ InT *)(ins[i]->data()); + if (use_broadcast[i]) { + // get the broadcast config, + // if data shape is[m, n], then you should set data_dim = {n, m} + // eg: out's shape [3, 45, 1]. 
then out_dims = {1, 45, 3} + configs[i] = kps::details::BroadcastConfig( + merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); + } + } + +#ifdef PADDLE_WITH_XPU2 + const int threads = 64; + const int blocks = 8; + int main_offset = (numel / (VecSize * threads)) * VecSize * threads; + int tail_tid = numel % (VecSize * threads); + auto stream = ctx.x_context()->xpu_stream; + VectorizedBroadcastKernel<<>>(ins_data, + outs_data, + use_broadcast, + numel, + configs, + main_offset, + tail_tid, + func); +#else + const int threads = 256; + int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; + int main_offset = (numel / (VecSize * threads)) * VecSize * threads; + int tail_tid = numel % (VecSize * threads); + auto stream = ctx.stream(); + VectorizedBroadcastKernel<<>>(ins_data, + outs_data, + use_broadcast, + numel, + configs, + main_offset, + tail_tid, + func); +#endif +} + +template +void BroadcastKernelForDifferentDimSize( + const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis); + +#define CALL_BROADCAST_FOR_DIM_SIZE(rank) \ + case rank: { \ + LaunchBroadcastKernel( \ + ctx, ins, outs, func, merge_dims); \ + } break; + + switch (merge_dims.dim_size) { + CALL_BROADCAST_FOR_DIM_SIZE(1); + CALL_BROADCAST_FOR_DIM_SIZE(2); + CALL_BROADCAST_FOR_DIM_SIZE(3); + CALL_BROADCAST_FOR_DIM_SIZE(4); + CALL_BROADCAST_FOR_DIM_SIZE(5); + CALL_BROADCAST_FOR_DIM_SIZE(6); + CALL_BROADCAST_FOR_DIM_SIZE(7); + CALL_BROADCAST_FOR_DIM_SIZE(8); + default: { + PADDLE_THROW(pten::errors::InvalidArgument( + "The maximum dimension of input tensor is expected to be less than " + "%d, but recieved %d.", + merge_dims.dim_size, + pten::DDim::kMaxRank)); + } + } +#undef CALL_BROADCAST_FOR_DIM_SIZE +} + +template +void BroadcastKernelForDifferentVecSize( + const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + using Traits = paddle::platform::FunctionTraits; + const int kArity = + Traits::has_pointer_args ? static_cast(ET) : Traits::arity; + PADDLE_ENFORCE_EQ(ins.size(), + kArity, + pten::errors::InvalidArgument( + "The number of inputs is expected to be equal to the " + "arity of functor. But recieved: the number of inputs " + "is %d, the arity of functor is %d.", + ins.size(), + kArity)); + PADDLE_ENFORCE_LE(kArity, + 3, + pten::errors::InvalidArgument( + "Currently only broadcast of ternary is supported " + "and verified, but received %d.", + kArity)); + PADDLE_ENFORCE_EQ(outs->size(), + NumOuts, + pten::errors::InvalidArgument( + "Number of outputs shall equal to number of functions, " + "but number of outputs is %d, of functions is %d.", + outs->size(), + NumOuts)); + int in_vec_size = 4; + int out_vec_size = 4; + if (NumOuts > 1) { + for (int i = 0; i < NumOuts; ++i) { + PADDLE_ENFORCE_EQ( + (*outs)[i]->dims(), + (*outs)[0]->dims(), + pten::errors::InvalidArgument( + "The shape of each output tensor shall be identical yet, but " + "%d-th output tensor`s shape is not.", + i)); + out_vec_size = std::min( + paddle::platform::GetVectorizedSize((*outs)[i]->data()), + out_vec_size); + } + } else { + out_vec_size = + paddle::platform::GetVectorizedSize((*outs)[0]->data()); + } + + for (auto *in : ins) { + auto temp_size = paddle::platform::GetVectorizedSize(in->data()); + in_vec_size = in->dims() == (*outs)[0]->dims() + ? 
std::min(temp_size, in_vec_size) + : in_vec_size; + } + int vec_size = std::min(out_vec_size, in_vec_size); + + switch (vec_size) { + case 4: { + BroadcastKernelForDifferentDimSize(ctx, ins, outs, axis, func); + break; + } + case 2: { + BroadcastKernelForDifferentDimSize(ctx, ins, outs, axis, func); + break; + } + case 1: { + BroadcastKernelForDifferentDimSize(ctx, ins, outs, axis, func); + break; + } + default: { + PADDLE_THROW(pten::errors::Unimplemented( + "Unsupported vectorized size: %d!", vec_size)); + break; + } + } +} + +template +void BroadcastKernel(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + std::vector dims_size; + bool no_broadcast_flag = true; + for (auto *in : ins) { + no_broadcast_flag &= ins[0]->dims() == in->dims(); + dims_size.emplace_back(in->dims().size()); + } + + if (ins.size() > 0 && outs->size() > 0) { + no_broadcast_flag &= outs->at(0)->dims() == ins[0]->dims(); + } + + if (no_broadcast_flag) { + pten::funcs::ElementwiseKernel( + ctx, ins, outs, func); + } else { + axis = axis == -1 + ? *std::max_element(dims_size.begin(), dims_size.end()) - + *std::min_element(dims_size.begin(), dims_size.end()) + : axis; + BroadcastKernelForDifferentVecSize( + ctx, ins, outs, axis, func); + } +} + +#endif + +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/funcs/cuda_kernel_config.h b/paddle/pten/kernels/funcs/cuda_kernel_config.h deleted file mode 100644 index 483e58eedb4d13e9b45b9dd7065974bdd9606085..0000000000000000000000000000000000000000 --- a/paddle/pten/kernels/funcs/cuda_kernel_config.h +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/pten/backends/gpu/gpu_context.h" - -#ifdef __HIPCC__ -#define ELEMENTWISE_BLOCK_SIZE 256 -#else -#define ELEMENTWISE_BLOCK_SIZE 512 -#endif - -namespace pten { -namespace funcs { -/* -* According to NVIDIA, if number of threads per block is 64/128/256/512, -* cuda performs better. And number of blocks should be greater (at least -* 2x~4x) than number of SMs. Hence, SM count is took into account within -* this function to determine the right number of threads per block. -*/ -inline int GetThreadsConfig(const pten::GPUContext &ctx, - int64_t numel, - int vec_size) { - int threads = ELEMENTWISE_BLOCK_SIZE; - int sm_count = ctx.GetSMCount(); - int active_threads_num = numel / vec_size; - if (active_threads_num / (sm_count << 1) < ELEMENTWISE_BLOCK_SIZE) { - // Round up threads number into an exponential multiple of 2, while number - // of acitve blocks is about twice of SM, to acquire better performance. 
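For context on the helper being deleted here: GetThreadsConfig picked the per-block thread count by aiming for roughly two (or, failing that, four) resident blocks per SM and rounding the result up to a power of two, never dropping below 64 threads. A minimal self-contained sketch of that heuristic follows; RoundUpToPowerOfTwo and ThreadsPerBlockSketch are illustrative names, not Paddle APIs, and the removed code used paddle::platform::RoundToPowerOfTwo with sm_count taken from the device context.

#include <algorithm>
#include <cstdint>

// Round n up to the next power of two (n <= 1 maps to 1).
static inline int RoundUpToPowerOfTwo(int n) {
  int p = 1;
  while (p < n) p <<= 1;
  return p;
}

// Choose threads per block so that about 2x (or 4x) sm_count blocks cover
// all numel / vec_size active threads; max_threads is 512 for CUDA and 256
// for HIP in the removed header, and the result never goes below 64.
inline int ThreadsPerBlockSketch(int64_t numel, int vec_size, int sm_count,
                                 int max_threads) {
  int threads = max_threads;
  int active = static_cast<int>(numel / vec_size);
  if (active / (sm_count << 1) < max_threads) {
    threads = RoundUpToPowerOfTwo(active / (sm_count << 1));
  } else if (active / (sm_count << 2) < max_threads) {
    threads = RoundUpToPowerOfTwo(active / (sm_count << 2));
  }
  return std::max(64, threads);
}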
- threads = paddle::platform::RoundToPowerOfTwo(active_threads_num / - (sm_count << 1)); - } else if (active_threads_num / (sm_count << 2) < ELEMENTWISE_BLOCK_SIZE) { - // Round up threads number into an exponential multiple of 2, while number - // of acitve blocks is about 4 times of SM, to acquire better performance. - threads = paddle::platform::RoundToPowerOfTwo(active_threads_num / - (sm_count << 2)); - } - // Number of threads per block shall be larger than 64. - return std::max(64, threads); -} - -} // namespace funcs -} // namespace pten diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index fd89c55f68c59807ab1e0fde97f7196ea305fda3..438b6e8ffbe8e793a29baf13fd33c7283c28cdf0 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -746,11 +746,10 @@ void ElementwiseCudaKernel(const KPDevice &ctx, } template -void LaunchSameDimsElementwiseCudaKernel( - const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - Functor func) { +void ElementwiseKernel(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + Functor func) { using Traits = paddle::platform::FunctionTraits; const int kArity = Traits::arity; PADDLE_ENFORCE_EQ(ins.size(), diff --git a/paddle/pten/kernels/gpu/abs_kernel.cu b/paddle/pten/kernels/gpu/abs_kernel.cu index 06eff050674c3670a2aa07cb43d0baea82fe7202..d6fa5c8d34c990ee9264c62309b6e6b3b4327405 100644 --- a/paddle/pten/kernels/gpu/abs_kernel.cu +++ b/paddle/pten/kernels/gpu/abs_kernel.cu @@ -47,8 +47,7 @@ void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { std::vector outs = {out}; auto functor = CudaAbsFunctor(); - funcs::LaunchSameDimsElementwiseCudaKernel>( - ctx, ins, &outs, functor); + funcs::ElementwiseKernel>(ctx, ins, &outs, functor); } } // namespace pten diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu index 006b4f3687c44bcefe5f710a113abb9cd5a3ffc9..b47871aadcb790401faa193f1bb19664da8efdbb 100644 --- a/paddle/pten/kernels/gpu/cast_kernel.cu +++ b/paddle/pten/kernels/gpu/cast_kernel.cu @@ -44,7 +44,7 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx, inputs.emplace_back(&x); outputs.emplace_back(out); dev_ctx.Alloc(out); - pten::funcs::LaunchSameDimsElementwiseCudaKernel( + pten::funcs::ElementwiseKernel( dev_ctx, inputs, &outputs, CastFuctor()); } diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index d01102f2b331b77c0555deef0273a69dcec726ad..1b6df97d5e3d5f81346e8834076af109f77e17f1 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -15,9 +15,8 @@ limitations under the License. 
*/ #pragma once #include "paddle/pten/kernels/copy_kernel.h" +#include "paddle/pten/kernels/funcs/broadcast_function.h" #include "paddle/pten/kernels/funcs/common_shape.h" -#include "paddle/pten/kernels/funcs/cuda_kernel_config.h" -#include "paddle/pten/kernels/funcs/elementwise_base.h" #include "paddle/pten/kernels/gpu/reduce.h" #ifdef __HIPCC__ @@ -36,555 +35,8 @@ constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; } while (0) namespace pten { -// FORWARD CODE -struct DimensionsTransform { - using DimVector = std::vector; - typedef void (*MergeFunctor)( - bool &, std::vector &, DimVector &, int, int); - int64_t dim_size; - DimVector out_dims; - std::vector in_dims; - - private: - // To compensate the lackage of input_tensors` dimension with input variable - // 'axis' - void InputDimensionsExtend(int N, int axis) { - for (auto &in_dim : in_dims) { - int64_t in_idx = 0; - if (in_dim.size() < dim_size) { - DimVector tmp_dim(dim_size, 1); - do { - if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) { - tmp_dim[axis] = in_dim[in_idx]; - in_idx++; - axis++; - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "The %d-th dimension of input tensor is expected to be equal " - "with the %d-th dimension of output tensor %d or 1, but " - "recieved %d.", - in_idx + 1, - axis + 1, - out_dims[axis], - in_dim[in_idx])); - } - } while (in_idx < in_dim.size()); - in_dim.resize(dim_size); - std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin()); - } else { - do { - if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) { - in_idx++; - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "The %d-th dimension of input tensor is expected to be equal " - "with the %d-th dimension of output tensor %d or 1, but " - "recieved %d.", - in_idx + 1, - in_idx + 1, - out_dims[in_idx], - in_dim[in_idx])); - } - } while (in_idx < dim_size); - } - std::reverse(in_dim.begin(), in_dim.end()); - } - std::reverse(out_dims.begin(), out_dims.end()); - } - - template - __inline__ void MergeDimensions(MergeFunctor merge_func, int N) { - auto VectorReorganise = [](DimVector *vec, int l_idx, int m_idx) { - (*vec)[m_idx - 1] = std::accumulate(vec->begin() + l_idx, - vec->begin() + m_idx, - 1, - std::multiplies()); - vec->erase(vec->begin() + l_idx, vec->begin() + m_idx - 1); - }; - - int64_t i = 0; - while (i < dim_size) { - int cnt = 0; - int low_idx = i; - bool equal = true; - do { - merge_func(equal, in_dims, out_dims, i, N); - if (equal) { - i++; - cnt++; - } else { - break; - } - } while (i < dim_size); - - if (cnt > 1) { - for (auto &in_dim : in_dims) { - VectorReorganise(&in_dim, low_idx, i); - } - VectorReorganise(&out_dims, low_idx, i); - dim_size -= --cnt; - i -= cnt; - } else if (cnt < 1) { - i++; - } - } - } - - public: - explicit DimensionsTransform(const std::vector &ins, - const pten::DDim &dims, - int axis) { - const int N = max(static_cast(ins.size()), 2); - dim_size = dims.size(); - out_dims = pten::vectorize(dims); - in_dims.resize(N); - if (ins.size() == 1) { - // when ins.size() = 1, broadcast input to output - in_dims[0] = pten::vectorize(ins[0]->dims()); - in_dims[1] = out_dims; - // Add out_dims to in_dims to avoid errors in dims merging - } else { - for (int j = 0; j < N; ++j) { - in_dims[j] = pten::vectorize(ins[j]->dims()); - } - } - InputDimensionsExtend(N, axis); - - auto merge_sequential_dims = [](bool &equal, - std::vector &in_dims, - DimVector &out, - int i, - int num) { - for (int j = 1; j < num; ++j) { - equal &= (in_dims[0][i] == in_dims[j][i]) ? 
true : false; - } - }; - auto merge_sequential_one_dims = [](bool &equal, - std::vector &in_dims, - DimVector &out, - int i, - int num) { - equal = in_dims[0][i] == 1; - if (equal) { - for (int j = 1; j < num; ++j) { - equal &= in_dims[j][i] == out[i]; - } - } - }; - // To Merge the dimensions of input_tensors while the consequtive - // equal-dimensions appears. - MergeFunctor merge_ptr = merge_sequential_dims; - MergeDimensions(merge_ptr, N); - - int min_idx = 0; - int min_val = std::accumulate( - in_dims[0].begin(), in_dims[0].end(), 1, std::multiplies()); - for (int j = 1; j < N; ++j) { - int temp = std::accumulate( - in_dims[j].begin(), in_dims[j].end(), 1, std::multiplies()); - min_val = min_val > temp ? temp : min_val; - min_idx = min_val == temp ? j : min_idx; - } - std::swap(in_dims[0], in_dims[min_idx]); - - // To Merge the dimension of input_tensors while the consequtive - // 1-value-dimensions appears. - merge_ptr = merge_sequential_one_dims; - MergeDimensions(merge_ptr, N); - std::swap(in_dims[min_idx], in_dims[0]); - } -}; - -template -__device__ __forceinline__ void LoadData( - T *dst, - const _ptr_ T *src, - uint32_t block_offset, - const kps::details::BroadcastConfig &config, - int numel, - int num, - int need_broadcast) { - // numel : whole num of output - // num: how many data will be deal with in this time - if (need_broadcast) { - kps::ReadDataBc( - dst, src, block_offset, config, numel); - } else { - kps::ReadData(dst, src + block_offset, num); - } -} - -template -__device__ void ElementwiseBroadcastKernelImpl( - const pten::Array &ins, - pten::Array<_ptr_ OutT *, NumOuts> outs, - const pten::Array &use_broadcast, - uint32_t numel, - const pten::Array, Arity> &configs, - int num, - int block_offset, - Functor func) { - InT args[Arity][VecSize]; - ConditionalT result[VecSize]; - -#pragma unroll - for (int i = 0; i < Arity; i++) { - kps::Init(args[i], static_cast(1.0f)); - LoadData(args[i], - ins[i], - block_offset, - configs[i], - numel, - num, - use_broadcast[i]); - } - constexpr bool kCallElementwiseAny = - paddle::platform::FunctionTraits::has_pointer_args; - pten::funcs::ElementwisePrimitiveCaller, - VecSize, - Functor, - Arity, - kCallElementwiseAny>()( - func, args, result); - - pten::funcs::ElementwiseWriteDataCaller()( - outs, result, block_offset, num); -} - -template -__global__ void ElementwiseBroadcastKernel( - pten::Array ins, - pten::Array<_ptr_ OutT *, NumOuts> outs, - pten::Array use_broadcast, - uint32_t numel, - pten::Array, Arity> configs, - int main_offset, - int tail_tid, - Functor func) { - int block_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; - int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; - -#ifdef PADDLE_WITH_XPU2 - for (; block_offset < main_offset; block_offset += stride) { - ElementwiseBroadcastKernelImpl(ins, - outs, - use_broadcast, - numel, - configs, - BLOCK_NUM_X * VecSize, - block_offset, - func); - } - int num = numel - block_offset; - if (num > 0) { - ElementwiseBroadcastKernelImpl( - ins, outs, use_broadcast, numel, configs, num, block_offset, func); - } -#else - if (block_offset < main_offset) { - ElementwiseBroadcastKernelImpl(ins, - outs, - use_broadcast, - numel, - configs, - BLOCK_NUM_X * VecSize, - block_offset, - func); - } else { - ElementwiseBroadcastKernelImpl( - ins, outs, use_broadcast, numel, configs, tail_tid, block_offset, func); - } -#endif -} - -template -void LaunchKernel(const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - Functor func, - DimensionsTransform merge_dims) { - int numel = 
(*outs)[0]->numel(); - pten::Array, Arity> configs; - pten::Array use_broadcast; - pten::Array ins_data; - pten::Array<_ptr_ OutT *, NumOuts> outs_data; - - for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = ctx.Alloc((*outs)[i]); - } - - for (int i = 0; i < Arity; i++) { - use_broadcast[i] = (ins[i]->numel() != numel); - ins_data[i] = (_ptr_ InT *)(ins[i]->data()); - if (use_broadcast[i]) { - // get the broadcast config, - // if data shape is[m, n], then you should set data_dim = {n, m} - // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3} - configs[i] = kps::details::BroadcastConfig( - merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); - } - } - -#ifdef PADDLE_WITH_XPU2 - const int threads = 64; - const int blocks = 8; - int main_offset = (numel / (VecSize * threads)) * VecSize * threads; - int tail_tid = numel % (VecSize * threads); - auto stream = ctx.x_context()->xpu_stream; - ElementwiseBroadcastKernel<<>>(ins_data, - outs_data, - use_broadcast, - numel, - configs, - main_offset, - tail_tid, - func); -#else - const int threads = 256; - int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; - int main_offset = (numel / (VecSize * threads)) * VecSize * threads; - int tail_tid = numel % (VecSize * threads); - auto stream = ctx.stream(); - ElementwiseBroadcastKernel<<>>( - ins_data, - outs_data, - use_broadcast, - numel, - configs, - main_offset, - tail_tid, - func); -#endif -} - -template -void LaunchBroadcastKernelForDifferentVecSize( - const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - int axis, - Functor func) { - const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis); - -#define CALL_BROADCAST_FOR_DIM_SIZE(rank) \ - case rank: { \ - LaunchKernel( \ - ctx, ins, outs, func, merge_dims); \ - } break; - - switch (merge_dims.dim_size) { - CALL_BROADCAST_FOR_DIM_SIZE(1); - CALL_BROADCAST_FOR_DIM_SIZE(2); - CALL_BROADCAST_FOR_DIM_SIZE(3); - CALL_BROADCAST_FOR_DIM_SIZE(4); - CALL_BROADCAST_FOR_DIM_SIZE(5); - CALL_BROADCAST_FOR_DIM_SIZE(6); - CALL_BROADCAST_FOR_DIM_SIZE(7); - CALL_BROADCAST_FOR_DIM_SIZE(8); - default: { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "The maximum dimension of input tensor is expected to be less than " - "%d, but recieved %d.\n", - merge_dims.dim_size, - pten::DDim::kMaxRank)); - } - } -#undef CALL_BROADCAST_FOR_DIM_SIZE -} - -template -void LaunchBroadcastElementwiseCudaKernel( - const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - int axis, - Functor func) { - using Traits = paddle::platform::FunctionTraits; - const int kArity = - Traits::has_pointer_args ? static_cast(ET) : Traits::arity; - PADDLE_ENFORCE_EQ(ins.size(), - kArity, - paddle::platform::errors::InvalidArgument( - "The number of inputs is expected to be equal to the " - "arity of functor. 
But recieved: the number of inputs " - "is %d, the arity of functor is %d.", - ins.size(), - kArity)); - PADDLE_ENFORCE_LE(kArity, - 3, - paddle::platform::errors::InvalidArgument( - "Currently only broadcast of ternary is supported " - "and verified, but received %d.", - kArity)); - PADDLE_ENFORCE_EQ(outs->size(), - NumOuts, - paddle::platform::errors::InvalidArgument( - "Number of outputs shall equal to number of functions, " - "but number of outputs is %d, of functions is %d.", - outs->size(), - NumOuts)); - int in_vec_size = 4; - int out_vec_size = 4; - if (NumOuts > 1) { - for (int i = 0; i < NumOuts; ++i) { - PADDLE_ENFORCE_EQ( - (*outs)[i]->dims(), - (*outs)[0]->dims(), - paddle::platform::errors::InvalidArgument( - "The shape of each output tensor shall be identical yet, but " - "%dth output tensor`s shape is not.", - i)); - out_vec_size = std::min( - paddle::platform::GetVectorizedSize((*outs)[i]->data()), - out_vec_size); - } - } else { - out_vec_size = - paddle::platform::GetVectorizedSize((*outs)[0]->data()); - } - - for (auto *in : ins) { - auto temp_size = paddle::platform::GetVectorizedSize(in->data()); - in_vec_size = in->dims() == (*outs)[0]->dims() - ? std::min(temp_size, in_vec_size) - : in_vec_size; - } - int vec_size = std::min(out_vec_size, in_vec_size); - - switch (vec_size) { - case 4: { - LaunchBroadcastKernelForDifferentVecSize(ctx, ins, outs, axis, func); - break; - } - case 2: { - LaunchBroadcastKernelForDifferentVecSize(ctx, ins, outs, axis, func); - break; - } - case 1: { - LaunchBroadcastKernelForDifferentVecSize(ctx, ins, outs, axis, func); - break; - } - default: { - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Unsupported vectorized size: %d !", vec_size)); - break; - } - } -} - -template -void LaunchElementwiseCudaKernel(const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - int axis, - Functor func) { - std::vector dims_size; - bool no_broadcast_flag = true; - for (auto *in : ins) { - no_broadcast_flag &= ins[0]->dims() == in->dims(); - dims_size.emplace_back(in->dims().size()); - } - if (no_broadcast_flag) { - pten::funcs::LaunchSameDimsElementwiseCudaKernel( - ctx, ins, outs, func); - } else { - axis = axis == -1 - ? *std::max_element(dims_size.begin(), dims_size.end()) - - *std::min_element(dims_size.begin(), dims_size.end()) - : axis; - pten::LaunchBroadcastElementwiseCudaKernel( - ctx, ins, outs, axis, func); - } -} +// General binary elementwise comutaion with the support of broadcast. template void ElementwiseCompute(const GPUContext &dev_ctx, const DenseTensor &x, @@ -595,12 +47,10 @@ void ElementwiseCompute(const GPUContext &dev_ctx, std::vector ins = {&x, &y}; std::vector outs = {z}; z->mutable_data(dev_ctx.GetPlace()); - pten::LaunchElementwiseCudaKernel( + pten::funcs::BroadcastKernel( dev_ctx, ins, &outs, axis, func); } -// BACKWARD CODE - // Suppose only has contiguous dims static inline bool CheckContiguousDims(const std::vector &broadcast_pos) { for (int i = 1; i < broadcast_pos.size(); ++i) { diff --git a/paddle/pten/kernels/gpu/full_kernel.cu b/paddle/pten/kernels/gpu/full_kernel.cu index 937d398be769b33215a6bce5c8531d0db17f5cb4..2a7ec387c8175229505c0d82ce4156c91d514344 100644 --- a/paddle/pten/kernels/gpu/full_kernel.cu +++ b/paddle/pten/kernels/gpu/full_kernel.cu @@ -49,7 +49,7 @@ void FullKernel(const Context& dev_ctx, // This function has no input, so the inputs.size() == 0. 
Use kUnary, but // the data will not be loaded in the kernel because the number of // parameters in the operator is 0 - pten::funcs::LaunchSameDimsElementwiseCudaKernel( + pten::funcs::ElementwiseKernel( dev_ctx, inputs, &outputs, FullFuctor(val.to())); } } @@ -91,7 +91,7 @@ void FullLikeKernel(const Context& dev_ctx, // the operator is 0 int numel = out->numel(); if (numel > 0) { - pten::funcs::LaunchSameDimsElementwiseCudaKernel( + pten::funcs::ElementwiseKernel( dev_ctx, inputs, &outputs, FullFuctor(value)); } } diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 3b7122ba1b9164c19b98eaf4b9707301234fb54b..a066866c431597f3e7aebbeefaa6c3a84b8bae29 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -48,7 +48,7 @@ namespace pten { inputs.emplace_back(&y); \ outputs.emplace_back(out); \ dev_ctx.template Alloc(out); \ - LaunchElementwiseCudaKernel( \ + funcs::BroadcastKernel( \ dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ } diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index f5296477ceb2a0f346821227e47338b2bfdfd0df..43c19176493d90f700807d9d6b2b5fd607727e48 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -1091,8 +1091,7 @@ void TensorReduceImpl(const pten::GPUContext& dev_ctx, if (config.reduce_num == 1) { std::vector inputs = {&x}; std::vector outputs = {y}; - funcs::LaunchSameDimsElementwiseCudaKernel( - dev_ctx, inputs, &outputs, transform); + funcs::ElementwiseKernel(dev_ctx, inputs, &outputs, transform); return; } diff --git a/paddle/pten/kernels/gpu/reduce_grad.h b/paddle/pten/kernels/gpu/reduce_grad.h index a626f2f70e7fb1ea0dc1d8ea419abf8022758459..517192f0f93a5cb8b11da7f00d85d24fa78c1e81 100644 --- a/paddle/pten/kernels/gpu/reduce_grad.h +++ b/paddle/pten/kernels/gpu/reduce_grad.h @@ -22,8 +22,10 @@ #include #include #include -#include "paddle/pten/kernels/gpu/elementwise.h" +#include "paddle/pten/kernels/funcs/broadcast_function.h" + namespace pten { + template void ReduceGrad(const GPUContext& dev_ctx, DenseTensor* d_out, @@ -33,12 +35,11 @@ void ReduceGrad(const GPUContext& dev_ctx, std::vector inputs = {d_out}; std::vector outputs = {d_x}; PD_VISIT_ALL_TYPES( - out_dtype, "LaunchBroadcastElementwiseCudaKernel", ([&] { - LaunchBroadcastElementwiseCudaKernel( + out_dtype, "BroadcastKernel", ([&] { + funcs::BroadcastKernel( dev_ctx, inputs, &outputs, 0, functor); })); } + } // namespace pten #endif diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index e1cf78224a19dcb8d40dd08b836d4e2b2fa85480..edb9a35efbde645811637f6a44018ddc412a8624 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -54,7 +54,7 @@ void ScaleKernel(const Context& dev_ctx, inputs.emplace_back(&x); outputs.emplace_back(out); dev_ctx.template Alloc(out); - pten::funcs::LaunchSameDimsElementwiseCudaKernel( + pten::funcs::ElementwiseKernel( dev_ctx, inputs, &outputs,
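Both of the surviving entry points, pten::funcs::ElementwiseKernel and pten::funcs::BroadcastKernel, choose their vectorization width through paddle::platform::GetVectorizedSize and then dispatch on widths 4, 2, or 1. Assuming that helper's rule is based on pointer alignment (a simplification; VectorizedSizeSketch below is illustrative and not Paddle code), the selection behaves roughly as follows.

#include <cstdint>
#include <cstdio>

// Widest of 4/2/1 elements for which the pointer is aligned to the
// corresponding vector type, so the kernel can issue vectorized loads
// and stores safely.
template <typename T>
int VectorizedSizeSketch(const T *ptr) {
  auto addr = reinterpret_cast<std::uintptr_t>(ptr);
  if (addr % (sizeof(T) * 4) == 0) return 4;  // float4-style access
  if (addr % (sizeof(T) * 2) == 0) return 2;  // float2-style access
  return 1;                                   // scalar fallback
}

int main() {
  alignas(16) float buf[8] = {};
  std::printf("%d %d %d\n",
              VectorizedSizeSketch(buf),       // 4: 16-byte aligned
              VectorizedSizeSketch(buf + 2),   // 2: only 8-byte aligned
              VectorizedSizeSketch(buf + 1));  // 1: only 4-byte aligned
  return 0;
}

In the patch itself, the final vec_size is the minimum over every output and every input whose shape matches the output, so a single less-aligned tensor pulls the whole launch down to a narrower width before the 4/2/1 switch runs.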