diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu
index 4e5be2e53503fde5168de6f59959b2f41a4bcd6d..a81e4abd45e56f3e13bcfe5585ad8906dccd8bdb 100644
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -23,6 +23,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 
+#include "paddle/fluid/framework/pten_utils.h"
+#include "paddle/pten/include/core.h"
+
 namespace paddle {
 namespace framework {
 
@@ -73,9 +76,12 @@ class TestKernel : public OpKernel<float> {
     output->Resize(input->dims());
     output->mutable_data<T>(ctx.GetPlace());
 
-    operators::TransformFunctor<AddFunctor<T>, T, DeviceContext> functor(
-        input, input, output, ctx.template device_context<DeviceContext>(),
-        AddFunctor<T>());
+    auto pt_input = paddle::experimental::MakePtenDenseTensor(*input);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*output);
+
+    pten::funcs::TransformFunctor<AddFunctor<T>, T, DeviceContext> functor(
+        *pt_input, *pt_input, pt_out.get(),
+        ctx.template device_context<DeviceContext>(), AddFunctor<T>());
     functor.Run();
   }
 };
diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h
index 6e53af41b657c8bebacc78d1fbbf0fe7137f0997..7ff8e6a1543af14d31fae4a02039aca95c163e9a 100644
--- a/paddle/fluid/operators/elementwise/elementwise_functor.h
+++ b/paddle/fluid/operators/elementwise/elementwise_functor.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/kernels/funcs/elementwise_functor.h"
 
 namespace paddle {
 namespace operators {
@@ -25,58 +26,31 @@ namespace operators {
 
 // Add
 template <typename T>
-struct AddFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; }
-};
+using AddFunctor = pten::funcs::AddFunctor<T>;
+
 template <typename T>
-struct InverseAddFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; }
-};
+using InverseAddFunctor = pten::funcs::InverseAddFunctor<T>;
 
 // Subtract
 template <typename T>
-struct SubFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; }
-};
+using SubFunctor = pten::funcs::SubtractFunctor<T>;
+
 template <typename T>
-struct InverseSubFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; }
-};
+using InverseSubFunctor = pten::funcs::InverseSubtractFunctor<T>;
 
 // Multiply
 template <typename T>
-struct MulFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
-};
+using MulFunctor = pten::funcs::MultiplyFunctor<T>;
+
 template <typename T>
-struct InverseMulFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; }
-};
+using InverseMulFunctor = pten::funcs::InverseMultiplyFunctor<T>;
 
 // Divide
-#define DIV_ERROR_INFO                                             \
-  "InvalidArgumentError: Integer division by zero encountered in " \
-  "(floor) divide. Please check the input value."
-
-template <typename T, typename Enable = void>
-struct DivFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; }
-};
-
 template <typename T>
-struct DivFunctor<T,
-                  typename std::enable_if<std::is_integral<T>::value>::type> {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const {
-    // For int32/int64, need to check whether the divison is zero.
- PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO); - return a / b; - } -}; +using DivFunctor = pten::funcs::DivideFunctor; -template -struct InverseDivFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b / a; } -}; +template +using InverseDivFunctor = pten::funcs::InverseDivideFunctor; // Floor Divide template diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 9700ca3584de8d153f2164abe06482adac36601d..6f3e17ea4d43470ac9c25b83bbea82a7b0287f5e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -31,8 +31,7 @@ limitations under the License. */ // only can include the headers in paddle/pten/include dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/kernels/hybird/cpu/elementwise.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" +#include "paddle/pten/kernels/cpu/elementwise_impl.h" #if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ @@ -151,9 +150,9 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, int *x_dims_array, int *y_dims_array, int *out_dims_array, const int max_dim, const int axis) { - pten::general::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array, - y_dims_array, out_dims_array, max_dim, - axis); + pten::funcs::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array, + y_dims_array, out_dims_array, max_dim, + axis); } template @@ -1073,71 +1072,9 @@ void CommonGradBroadcastCUDA( inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { - return pten::general::trim_trailing_singular_dims(dims); + return pten::funcs::trim_trailing_singular_dims(dims); } -template -class TransformFunctor { - public: - TransformFunctor(const framework::Tensor *x, const framework::Tensor *y, - framework::Tensor *z, const DeviceContext &ctx, Functor func, - const bool is_xsize_larger = true) - : x_(x->data()), - y_(y->data()), - z_(z->mutable_data(ctx.GetPlace())), - nx_(x->numel()), - ctx_(ctx), - func_(func), - is_xsize_larger_(is_xsize_larger) { - if (is_xsize_larger_ == false) { - nx_ = y->numel(); - } - } - - inline void Run() const { - platform::Transform trans; - trans(ctx_, x_, x_ + nx_, y_, z_, func_); - } - - inline void RunRowWise(int n, int pre) const { - platform::Transform trans; - if (is_xsize_larger_) { - trans(ctx_, x_, x_ + nx_, - pten::general::RowwiseTransformIterator(y_, n), - z_, func_); - } else { - trans(ctx_, y_, y_ + nx_, - pten::general::RowwiseTransformIterator(x_, n), - z_, func_); - } - } - - inline void RunMidWise(int n, int pre, int post) const { - platform::Transform trans; - if (is_xsize_larger_) { - trans(ctx_, x_, x_ + nx_, - pten::general::MidWiseTransformIterator(y_, n, - post), - z_, func_); - } else { - trans(ctx_, y_, y_ + nx_, - pten::general::MidWiseTransformIterator(x_, n, - post), - z_, func_); - } - } - - private: - const T *x_; - const T *y_; - OutType *z_; - int64_t nx_; - const DeviceContext &ctx_; - Functor func_; - bool is_xsize_larger_; -}; - template struct ElemwiseGradNoBroadcast { const T *x_; @@ -1457,13 +1394,13 @@ void ElemwiseGradComputeWithBroadcast( if (is_xsize_larger) { auto y_dims_trimed = trim_trailing_singular_dims(y_dims); axis_trim = (y_dims_trimed.size() == 0) ? 
x_dims.size() : axis; - pten::general::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, - &post, &is_run_common_broadcast); + pten::funcs::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, &post, + &is_run_common_broadcast); } else { auto x_dims_trimed = trim_trailing_singular_dims(x_dims); axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - pten::general::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, - &post, &is_run_common_broadcast); + pten::funcs::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, &post, + &is_run_common_broadcast); } // special case for common backward implementation. if (is_run_common_broadcast) { @@ -1861,8 +1798,8 @@ void FusedElemwiseAndActComputeWithBroadcast( axis = (y_dim.size() == 0) ? x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - pten::general::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + pten::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); if (post == 1) { int h = pre; int w = n; @@ -2409,8 +2346,8 @@ void FusedElemwiseAndActGradComputeWithBroadcast( axis = (y_dim.size() == 0) ? x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - pten::general::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + pten::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); const T *x_data = nullptr; const T *y_data = nullptr; if (x->IsInitialized()) x_data = x->data(); diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h deleted file mode 100644 index 4d3143ef09ccc62be069f2971ad0a17666547105..0000000000000000000000000000000000000000 --- a/paddle/pten/api/lib/kernel_declare.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/core/kernel_registry.h" - -// TODO(chenweihang) After the kernel is split into a single file, -// the kernel declare statement is automatically generated according to the -// file name of the kernel, and this header file will be removed diff --git a/paddle/pten/infermeta/binary.cc b/paddle/pten/infermeta/binary.cc index 5d3844a1dec3d51959f5cc81d2fd3358d782fa1d..944c64ecd75e228fcea5ab014659a9b88b05aa1d 100644 --- a/paddle/pten/infermeta/binary.cc +++ b/paddle/pten/infermeta/binary.cc @@ -14,7 +14,7 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? 
] #include "paddle/pten/infermeta/binary.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" namespace pten { @@ -162,13 +162,13 @@ DenseTensorMeta ElementwiseInferMeta(const DenseTensorMeta& x_meta, std::vector x_dims_array(max_dim); std::vector y_dims_array(max_dim); std::vector out_dims_array(max_dim); - general::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); return_meta.dims = paddle::framework::make_ddim(out_dims_array); } return_meta.lod = x_meta.lod; diff --git a/paddle/pten/kernels/cpu/elementwise_impl.h b/paddle/pten/kernels/cpu/elementwise_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..d3687b22fb392eb052af6a0d8a865cf48d4b1876 --- /dev/null +++ b/paddle/pten/kernels/cpu/elementwise_impl.h @@ -0,0 +1,392 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" + +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" + +namespace pten { + +// Add +template +struct SameDimsAddFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsAddFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VADD(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +template +struct SameDimsAddFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + z->mutable_data(); + auto eigen_x = pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + auto eigen_z = pten::EigenVector::Flatten(*z); + auto& place = *dev_ctx.eigen_device(); + eigen_z.device(place) = eigen_x + eigen_y; + } +}; + +// Subtract +template +struct SameDimsSubtractFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsSubtractFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VSUB(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +template +struct SameDimsSubtractFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const 
DenseTensor& y, + DenseTensor* z) { + auto eigen_x = pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + auto eigen_z = pten::EigenVector::Flatten(*z); + auto& place = *dev_ctx.eigen_device(); + eigen_z.device(place) = eigen_x - eigen_y; + } +}; + +// Divide +template +struct SameDimsDivideFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsDivideFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + paddle::platform::errors::InvalidArgument( + "If use SameDimsDivideFunctor, template args(T) must be floating " + "point. "); + } +}; + +template +struct SameDimsDivideFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VDIV(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +// Multiply +template +struct SameDimsMultiplyFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsMultiplyFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VMUL(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +template +struct SameDimsMultiplyFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto eigen_x = pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + auto eigen_z = pten::EigenVector::Flatten(*z); + auto& place = *dev_ctx.eigen_device(); + eigen_z.device(place) = eigen_x * eigen_y; + } +}; + +inline void UpdateElementwiseIndexArray(const int* out_dims_array, + const int max_dim, + int* index_array) { + for (int i = max_dim - 1; i >= 0; --i) { + ++index_array[i]; + if (index_array[i] >= out_dims_array[i]) { + index_array[i] -= out_dims_array[i]; + } else { + break; + } + } +} + +inline int GetElementwiseIndex(const int* x_dims_array, + const int max_dim, + const int* index_array) { + int index_ = 0; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] > 1) { + index_ = index_ * x_dims_array[i] + index_array[i]; + } + } + return index_; +} + +template +void CommonForwardBroadcastCPU(const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z, + int* x_dims_array, + int* y_dims_array, + int* out_dims_array, + int max_dim, + const paddle::platform::CPUDeviceContext& ctx, + Functor func, + const bool is_xsize_larger = true) { + std::vector index_array(max_dim, 0); + const T* x_data = x.data(); + const T* y_data = y.data(); + PADDLE_ENFORCE_NOT_NULL(x_data, + paddle::platform::errors::InvalidArgument( + "The input X should not be empty.")); + PADDLE_ENFORCE_NOT_NULL(y_data, + paddle::platform::errors::InvalidArgument( + "The input Y should not be empty.")); + OutType* out_data = z->mutable_data(); + + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = 
GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (is_xsize_larger) { + out_data[out_index] = func(x_data[x_index], y_data[y_index]); + } else { + out_data[out_index] = func(y_data[y_index], x_data[x_index]); + } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); + } +} + +template +void CommonElementwiseBroadcastForward( + const paddle::platform::CPUDeviceContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z, + const DDim& x_dims, + const DDim& y_dims, + Functor func, + int axis, + const bool is_xsize_larger = true) { + int max_dim = (std::max)(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + + CommonForwardBroadcastCPU(x, + y, + z, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + dev_ctx, + func, + is_xsize_larger); +} + +// It is a common CPU implementation to compute binary calculation with the +// support of broadcast. Note: +// 1. CPU implementation cannot support the case when x needs broadcast, thus +// this function need to be called with XxxFunctor and XxxInverseFunctor, +// like AddFunctor and InverseAddFunctor. +// 2. The corresponding GPU implementation supports all the broadcast cases, +// thus there is no need to define and call with XxxInverseFunctor. +// TODO(liuyiqun): optimize the CPU implementation to support all broadcast +// cases and avoid the need of XxxInverseFunctor. +template +void ElementwiseCompute(const paddle::platform::CPUDeviceContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + Functor func, + DenseTensor* z) { + z->mutable_data(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + bool is_xsize_larger = true; + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + funcs:: + TransformFunctor + functor(x, y, z, dev_ctx, func, is_xsize_larger); + if (x_dims == y_dims) { + functor.Run(); + return; + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + funcs::get_mid_dims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? 
y_dims.size() : axis; + funcs::get_mid_dims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common implementation. + // case 1: x=[2,3,1,5], y=[2,1,4,1] + // case 2: x=[2,3,4], y=[1,1,4] + if (is_run_common_broadcast == 1) { + CommonElementwiseBroadcastForward( + dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); + return; + } + + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; + } +} + +template +struct SameDimsElementwiseCompute { + void operator()(const paddle::platform::CPUDeviceContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + Functor()(dev_ctx, x, y, z); + } +}; + +} // namespace pten diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index 152d945144f6cd2723a4a1eaa0c94037dfade3ca..c022dd08bbe40c22ae392341ac1d2953e389c7be 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -18,9 +18,11 @@ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/common/scalar.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/hybird/cpu/elementwise.h" + +#include "paddle/pten/kernels/cpu/elementwise_impl.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" + #include "paddle/pten/kernels/hybird/eigen/reduce.h" -#include "paddle/pten/kernels/hybird/general/elementwise_functor.h" #include "paddle/pten/kernels/hybird/general/reduce_impl.h" // See Note [ Why still include the fluid headers? ] @@ -30,29 +32,28 @@ namespace pten { -#define DEFINE_CPU_ELEMENTWISE_OP(name) \ - template \ - void name##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - out->mutable_data(); \ - if (x.dims() == y.dims()) { \ - SameDimsElementwiseCompute< \ - general::SameDims##name##Functor>()( \ - dev_ctx, x, y, out); \ - } else { \ - auto x_dims = x.dims(); \ - auto y_dims = y.dims(); \ - if (x_dims.size() >= y_dims.size()) { \ - ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, general::name##Functor(), out); \ - } else { \ - ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, general::Inverse##name##Functor(), out); \ - } \ - } \ +#define DEFINE_CPU_ELEMENTWISE_OP(name) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + out->mutable_data(); \ + if (x.dims() == y.dims()) { \ + SameDimsElementwiseCompute>()( \ + dev_ctx, x, y, out); \ + } else { \ + auto x_dims = x.dims(); \ + auto y_dims = y.dims(); \ + if (x_dims.size() >= y_dims.size()) { \ + ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::name##Functor(), out); \ + } else { \ + ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ + } \ + } \ } template @@ -76,17 +77,17 @@ void DivideKernel(const Context& dev_ctx, // allocate memory for out out->mutable_data(); if (x.dims() == y.dims() && std::is_floating_point::value) { - SameDimsElementwiseCompute>()( + SameDimsElementwiseCompute>()( dev_ctx, x, y, out); } else { auto x_dims = x.dims(); auto y_dims = y.dims(); if (x_dims.size() >= y_dims.size()) { - ElementwiseCompute, T>( - dev_ctx, x, y, axis, general::DivideFunctor(), out); + ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::DivideFunctor(), out); } else { - ElementwiseCompute, T>( - dev_ctx, x, y, axis, 
general::InverseDivideFunctor(), out); + ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); } } } diff --git a/paddle/pten/kernels/hybird/general/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h similarity index 99% rename from paddle/pten/kernels/hybird/general/elementwise_base.h rename to paddle/pten/kernels/funcs/elementwise_base.h index 20154a8744f3d28e98a824f79d14d7863680a5b6..a0c6d5ba5701168858740002f57c9e24dbda43d0 100644 --- a/paddle/pten/kernels/hybird/general/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/pten/core/dense_tensor.h" namespace pten { -namespace general { +namespace funcs { using DDim = paddle::framework::DDim; @@ -378,6 +378,5 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, } } } - -} // namespace general +} // namespace funcs } // namespace pten diff --git a/paddle/pten/kernels/funcs/elementwise_functor.h b/paddle/pten/kernels/funcs/elementwise_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..9b2519b0fd6b1418a9e73d3c3985bf6ce99dbbbd --- /dev/null +++ b/paddle/pten/kernels/funcs/elementwise_functor.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace pten { +namespace funcs { + +// Define the binary functors used in elementwise ops. + +// Add +template +struct AddFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; } +}; +template +struct InverseAddFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; } +}; + +// Subtract +template +struct SubtractFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } +}; +template +struct InverseSubtractFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; } +}; + +// Multiply +template +struct MultiplyFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } +}; +template +struct InverseMultiplyFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; } +}; + +// Divide +#define DIV_ERROR_INFO \ + "InvalidArgumentError: Integer division by zero encountered in " \ + "(floor) divide. Please check the input value." + +template +struct DivideFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } +}; + +template +struct DivideFunctor< + T, + typename std::enable_if::value>::type> { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { + // For int32/int64, need to check whether the divison is zero. 
+    PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO);
+    return a / b;
+  }
+};
+
+template <typename T>
+struct InverseDivideFunctor {
+  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b / a; }
+};
+
+}  // namespace funcs
+}  // namespace pten
diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu
index 636d0f16b0d711328eb7f384722f60a178b11e60..760bebe687841b663b9baae3e89788c57de063d4 100644
--- a/paddle/pten/kernels/gpu/math_kernel.cu
+++ b/paddle/pten/kernels/gpu/math_kernel.cu
@@ -15,9 +15,9 @@ limitations under the License. */
 #include "paddle/pten/kernels/math_kernel.h"
 
 #include "paddle/pten/backends/gpu/gpu_context.h"
+#include "paddle/pten/kernels/funcs/elementwise_functor.h"
 #include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h"
 #include "paddle/pten/kernels/hybird/cuda/reduce/reduce.h"
-#include "paddle/pten/kernels/hybird/general/elementwise_functor.h"
 #include "paddle/pten/kernels/hybird/general/reduce_impl.h"
 
 #ifdef __NVCC__
@@ -39,21 +39,21 @@ namespace kps = paddle::operators::kernel_primitives;
 
 namespace pten {
 
-#define DEFINE_CUDA_ELEMENTWISE_OP(name)                               \
-  template <typename T, typename Context>                              \
-  void name##Kernel(const Context& dev_ctx,                            \
-                    const DenseTensor& x,                              \
-                    const DenseTensor& y,                              \
-                    int axis,                                          \
-                    DenseTensor* out) {                                \
-    std::vector<const DenseTensor*> inputs;                            \
-    std::vector<DenseTensor*> outputs;                                 \
-    inputs.emplace_back(&x);                                           \
-    inputs.emplace_back(&y);                                           \
-    outputs.emplace_back(out);                                         \
-    out->mutable_data<T>();                                            \
-    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(       \
-        dev_ctx, inputs, &outputs, axis, general::name##Functor<T>()); \
+#define DEFINE_CUDA_ELEMENTWISE_OP(name)                             \
+  template <typename T, typename Context>                            \
+  void name##Kernel(const Context& dev_ctx,                          \
+                    const DenseTensor& x,                            \
+                    const DenseTensor& y,                            \
+                    int axis,                                        \
+                    DenseTensor* out) {                              \
+    std::vector<const DenseTensor*> inputs;                          \
+    std::vector<DenseTensor*> outputs;                               \
+    inputs.emplace_back(&x);                                         \
+    inputs.emplace_back(&y);                                         \
+    outputs.emplace_back(out);                                       \
+    out->mutable_data<T>();                                          \
+    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(     \
+        dev_ctx, inputs, &outputs, axis, funcs::name##Functor<T>()); \
   }
 
 /**
diff --git a/paddle/pten/kernels/hybird/CMakeLists.txt b/paddle/pten/kernels/hybird/CMakeLists.txt
index 1304aa1798c0caffdd8bac3facdaa3a6e89c1012..5d04bae2eae82014f3147d1c1710f110a5bd530b 100644
--- a/paddle/pten/kernels/hybird/CMakeLists.txt
+++ b/paddle/pten/kernels/hybird/CMakeLists.txt
@@ -1,5 +1,4 @@
 add_subdirectory(eigen)
-add_subdirectory(blas)
 add_subdirectory(general)
 
 cc_library(pten_transpose_cpu SRCS transpose.cc DEPS dense_tensor pten_context)
diff --git a/paddle/pten/kernels/hybird/blas/CMakeLists.txt b/paddle/pten/kernels/hybird/blas/CMakeLists.txt
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/paddle/pten/kernels/hybird/blas/elementwise.h b/paddle/pten/kernels/hybird/blas/elementwise.h
deleted file mode 100644
index 1a530c9f8e940dc8d48ef79a821ab6534a261d7c..0000000000000000000000000000000000000000
--- a/paddle/pten/kernels/hybird/blas/elementwise.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/pten/core/dense_tensor.h" - -namespace pten { -namespace blas { - -template -void ElementwiseAdd(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VADD(x.numel(), x.data(), y.data(), out->mutable_data()); -} - -template -void ElementwiseSub(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VSUB(x.numel(), x.data(), y.data(), out->mutable_data()); -} - -template -void ElementwiseDiv(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VDIV(x.numel(), x.data(), y.data(), out->mutable_data()); -} - -template -void ElementwiseMul(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VMUL(x.numel(), x.data(), y.data(), out->mutable_data()); -} -} // namespace blas -} // namespace pten diff --git a/paddle/pten/kernels/hybird/cpu/CMakeLists.txt b/paddle/pten/kernels/hybird/cpu/CMakeLists.txt deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/paddle/pten/kernels/hybird/cpu/elementwise.h b/paddle/pten/kernels/hybird/cpu/elementwise.h deleted file mode 100644 index d503957a7626203aa930eba62bf4fee6d5b0cdb1..0000000000000000000000000000000000000000 --- a/paddle/pten/kernels/hybird/cpu/elementwise.h +++ /dev/null @@ -1,230 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" - -namespace pten { - -inline void UpdateElementwiseIndexArray(const int *out_dims_array, - const int max_dim, - int *index_array) { - for (int i = max_dim - 1; i >= 0; --i) { - ++index_array[i]; - if (index_array[i] >= out_dims_array[i]) { - index_array[i] -= out_dims_array[i]; - } else { - break; - } - } -} - -inline int GetElementwiseIndex(const int *x_dims_array, - const int max_dim, - const int *index_array) { - int index_ = 0; - for (int i = 0; i < max_dim; i++) { - if (x_dims_array[i] > 1) { - index_ = index_ * x_dims_array[i] + index_array[i]; - } - } - return index_; -} - -template -void CommonForwardBroadcastCPU(const DenseTensor &x, - const DenseTensor &y, - DenseTensor *z, - int *x_dims_array, - int *y_dims_array, - int *out_dims_array, - int max_dim, - const paddle::platform::CPUDeviceContext &ctx, - Functor func, - const bool is_xsize_larger = true) { - std::vector index_array(max_dim, 0); - const T *x_data = x.data(); - const T *y_data = y.data(); - PADDLE_ENFORCE_NOT_NULL(x_data, - paddle::platform::errors::InvalidArgument( - "The input X should not be empty.")); - PADDLE_ENFORCE_NOT_NULL(y_data, - paddle::platform::errors::InvalidArgument( - "The input Y should not be empty.")); - OutType *out_data = z->mutable_data(); - - const int out_size = std::accumulate( - out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); - int x_index, y_index; - for (int out_index = 0; out_index < out_size; ++out_index) { - x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); - y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); - if (is_xsize_larger) { - out_data[out_index] = func(x_data[x_index], y_data[y_index]); - } else { - out_data[out_index] = func(y_data[y_index], x_data[x_index]); - } - - UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); - } -} - -template -void CommonElementwiseBroadcastForward( - const paddle::platform::CPUDeviceContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - DenseTensor *z, - const DDim &x_dims, - const DDim &y_dims, - Functor func, - int axis, - const bool is_xsize_larger = true) { - int max_dim = (std::max)(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - paddle::platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - paddle::platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - general::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - - CommonForwardBroadcastCPU(x, - y, - z, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - dev_ctx, - func, - is_xsize_larger); -} - -// It is a common CPU implementation to compute binary calculation with the -// support of broadcast. Note: -// 1. CPU implementation cannot support the case when x needs broadcast, thus -// this function need to be called with XxxFunctor and XxxInverseFunctor, -// like AddFunctor and InverseAddFunctor. -// 2. 
The corresponding GPU implementation supports all the broadcast cases, -// thus there is no need to define and call with XxxInverseFunctor. -// TODO(liuyiqun): optimize the CPU implementation to support all broadcast -// cases and avoid the need of XxxInverseFunctor. -template -void ElementwiseCompute(const paddle::platform::CPUDeviceContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - int axis, - Functor func, - DenseTensor *z) { - z->mutable_data(); - auto x_dims = x.dims(); - auto y_dims = y.dims(); - bool is_xsize_larger = true; - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - general:: - TransformFunctor - functor(x, y, z, dev_ctx, func, is_xsize_larger); - if (x_dims == y_dims) { - functor.Run(); - return; - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - paddle::platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - paddle::platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = general::trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - general::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = general::trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - general::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } - // special case for common implementation. - // case 1: x=[2,3,1,5], y=[2,1,4,1] - // case 2: x=[2,3,4], y=[1,1,4] - if (is_run_common_broadcast == 1) { - CommonElementwiseBroadcastForward( - dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); - return; - } - - if (post == 1) { - functor.RunRowWise(n, pre); - return; - } else { - functor.RunMidWise(n, pre, post); - return; - } -} - -template -struct SameDimsElementwiseCompute { - void operator()(const paddle::platform::CPUDeviceContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - DenseTensor *z) { - Functor()(dev_ctx, x, y, z); - } -}; - -} // namespace pten diff --git a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h index 7c5f3a9778404e706ee706ec1e29076add24c8c1..ae384693249a48ab5576042d88dc3a3f546c154d 100644 --- a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h +++ b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/function_traits.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" namespace pten { namespace kps = paddle::operators::kernel_primitives; diff --git a/paddle/pten/kernels/hybird/eigen/elementwise.h b/paddle/pten/kernels/hybird/eigen/elementwise.h deleted file mode 100644 index e67cce63d461f461c98258ca770ce09600a5ddba..0000000000000000000000000000000000000000 --- a/paddle/pten/kernels/hybird/eigen/elementwise.h +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" - -namespace pten { -namespace eigen { - -template -void ElementwiseAdd(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - out->mutable_data(); - auto eigen_x = pten::EigenVector::Flatten(x); - auto eigen_y = pten::EigenVector::Flatten(y); - auto eigen_z = pten::EigenVector::Flatten(*out); - auto& place = *dev_ctx.eigen_device(); - eigen_z.device(place) = eigen_x + eigen_y; -} - -template -void ElementwiseSub(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto eigen_x = pten::EigenVector::Flatten(x); - auto eigen_y = pten::EigenVector::Flatten(y); - auto eigen_z = pten::EigenVector::Flatten(*out); - auto& place = *dev_ctx.eigen_device(); - eigen_z.device(place) = eigen_x - eigen_y; -} - -template -void ElementwiseMul(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto eigen_x = pten::EigenVector::Flatten(x); - auto eigen_y = pten::EigenVector::Flatten(y); - auto eigen_z = pten::EigenVector::Flatten(*out); - auto& place = *dev_ctx.eigen_device(); - eigen_z.device(place) = eigen_x * eigen_y; -} - -} // namespace eigen -} // namespace pten diff --git a/paddle/pten/kernels/hybird/general/elementwise_functor.h b/paddle/pten/kernels/hybird/general/elementwise_functor.h deleted file mode 100644 index 62b422f4ae414c00bfc5590743c594d32080f752..0000000000000000000000000000000000000000 --- a/paddle/pten/kernels/hybird/general/elementwise_functor.h +++ /dev/null @@ -1,223 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/hostdevice.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/blas/elementwise.h" -#include "paddle/pten/kernels/hybird/eigen/elementwise.h" - -namespace pten { -namespace general { - -// Define the binary functors used in elementwise ops. - -// Add -template -struct SameDimsAddFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsAddFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - blas::ElementwiseAdd(dev_ctx, x, y, z); - } -}; - -template -struct SameDimsAddFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - eigen::ElementwiseAdd(dev_ctx, x, y, z); - } -}; - -template -struct AddFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; } -}; -template -struct InverseAddFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; } -}; - -// Subtract -template -struct SameDimsSubtractFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsSubtractFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - blas::ElementwiseSub(dev_ctx, x, y, z); - } -}; - -template -struct SameDimsSubtractFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - eigen::ElementwiseSub(dev_ctx, x, y, z); - } -}; - -template -struct SubtractFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } -}; -template -struct InverseSubtractFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; } -}; - -// Divide -template -struct SameDimsDivideFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsDivideFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - paddle::platform::errors::InvalidArgument( - "If use SameDimsDivideFunctor, template args(T) must be floating " - "point. "); - } -}; - -template -struct SameDimsDivideFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - blas::ElementwiseDiv(dev_ctx, x, y, z); - } -}; - -#define DIV_ERROR_INFO \ - "InvalidArgumentError: Integer division by zero encountered in " \ - "(floor) divide. Please check the input value." - -template -struct DivideFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } -}; - -template -struct DivideFunctor< - T, - typename std::enable_if::value>::type> { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { - // For int32/int64, need to check whether the divison is zero. 
-    PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO);
-    return a / b;
-  }
-};
-
-template <typename T>
-struct InverseDivideFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b / a; }
-};
-
-// Multiply
-template <typename DevCtx, typename T, class Enable = void>
-struct SameDimsMultiplyFunctor {
-  void operator()(const DevCtx& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* z);
-};
-
-template <typename DevCtx, typename T>
-struct SameDimsMultiplyFunctor<
-    DevCtx,
-    T,
-    typename std::enable_if<std::is_floating_point<T>::value>::type> {
-  void operator()(const DevCtx& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* z) {
-    blas::ElementwiseMul<DevCtx, T>(dev_ctx, x, y, z);
-  }
-};
-
-template <typename DevCtx, typename T>
-struct SameDimsMultiplyFunctor<
-    DevCtx,
-    T,
-    typename std::enable_if<!std::is_floating_point<T>::value>::type> {
-  void operator()(const DevCtx& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* z) {
-    eigen::ElementwiseMul<DevCtx, T>(dev_ctx, x, y, z);
-  }
-};
-template <typename T>
-struct MultiplyFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
-};
-template <typename T>
-struct InverseMultiplyFunctor {
-  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; }
-};
-
-}  // namespace general
-}  // namespace pten
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
index 72bf26c57dd5ad475e3ac9a10121d14a895828f7..35720ae32fe3891b6f9ac30face388fc1c053b4b 100644
--- a/python/paddle/utils/code_gen/api_gen.py
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -342,7 +342,6 @@ def source_include(header_file_path):
 #include "paddle/pten/api/include/kernel_signature.h"
 #include "paddle/pten/api/lib/api_registry.h"
-#include "paddle/pten/api/lib/kernel_declare.h"
 #include "paddle/pten/api/lib/kernel_dispatch.h"
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/core/kernel_registry.h"
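
For reviewers trying out the relocated helpers, here is a minimal usage sketch. It is not part of the patch: the kernel name AddExampleKernel and the surrounding includes are illustrative assumptions, but the dispatch pattern (same-dims fast path, plain functor when x has the larger rank, Inverse functor otherwise) mirrors DEFINE_CPU_ELEMENTWISE_OP and DivideKernel in paddle/pten/kernels/cpu/math_kernel.cc above.

// Illustrative sketch only; not part of the patch. AddExampleKernel is a
// hypothetical name used for demonstration.
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/kernels/cpu/elementwise_impl.h"
#include "paddle/pten/kernels/funcs/elementwise_functor.h"

namespace pten {

template <typename T>
void AddExampleKernel(const paddle::platform::CPUDeviceContext& dev_ctx,
                      const DenseTensor& x,
                      const DenseTensor& y,
                      int axis,
                      DenseTensor* out) {
  out->mutable_data<T>();
  if (x.dims() == y.dims()) {
    // Same-shape fast path: the SameDims functor picks the blas path for
    // floating-point types and the eigen path otherwise.
    SameDimsElementwiseCompute<
        SameDimsAddFunctor<paddle::platform::CPUDeviceContext, T>>()(
        dev_ctx, x, y, out);
  } else if (x.dims().size() >= y.dims().size()) {
    // On CPU only y may be broadcast, so pass the plain functor here ...
    ElementwiseCompute<funcs::AddFunctor<T>, T>(
        dev_ctx, x, y, axis, funcs::AddFunctor<T>(), out);
  } else {
    // ... and swap the operands via the Inverse functor when x is smaller.
    ElementwiseCompute<funcs::InverseAddFunctor<T>, T>(
        dev_ctx, x, y, axis, funcs::InverseAddFunctor<T>(), out);
  }
}

}  // namespace pten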