diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h
index f3b0056165426c7cc46409abece3589edad96269..94b68bff8f446200de563897eb9e9d10b433987a 100644
--- a/paddle/fluid/operators/cholesky_solve_op.h
+++ b/paddle/fluid/operators/cholesky_solve_op.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/svd_helper.h"
 #include "paddle/fluid/operators/triangular_solve_op.h"
 #include "paddle/fluid/platform/complex.h"
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 
 namespace paddle {
 namespace operators {  // namespace operators
@@ -205,7 +205,7 @@ class CholeskySolveGradKernel : public framework::OpKernel<T> {
       auto pt_x = paddle::experimental::MakePtenDenseTensor(commonterm);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(commonterm_conj);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(commonterm);
-      pten::Add<T>(dev_ctx, *pt_x.get(), *pt_y.get(), -1, pt_z.get());
+      pten::AddKernel<T>(dev_ctx, *pt_x.get(), *pt_y.get(), -1, pt_z.get());
 
       auto mat_dim_u = math::CreateMatrixDescriptor(u_bst.dims(), 0, false);
       auto mat_dim_c =
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index a4567beeb4f3d9a1e913e0853bfaee19a1e9124f..d6d79d166d00a3610af7d22aa845a753437626ad 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -25,7 +25,7 @@ limitations under the License. */
 // only can include the headers in paddle/pten/include dirs
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
 #include "paddle/pten/include/core.h"
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 
 namespace paddle {
 namespace operators {
@@ -68,7 +68,7 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
     auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
     auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
     auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::Add<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
+    pten::AddKernel<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
   }
 };
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h
index f3ba5050c4f53695baa92c07f7bb93eb59871de5..c886644bbdd1b63139f4cd98890b84fd63bce330 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h
@@ -28,7 +28,7 @@ limitations under the License. */
 // only can include the headers in paddle/pten/include dirs
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
 #include "paddle/pten/include/core.h"
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 namespace paddle {
 namespace operators {
 
@@ -62,7 +62,7 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
     auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
     auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
     auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::Divide<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
+    pten::DivideKernel<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
   }
 };
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
index e131bc4974661c613928ac8fee807841bb85ba58..12e0062a698be6d386893c5c1fb09719ceb1c609 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -57,7 +57,8 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
       auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
-      pten::Multiply<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
+      pten::MultiplyKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
+                              pt_z.get());
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
           "X's type[%s] is not supported by elementwise_op. X's type should be "
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
index 8b43f82e6b6a1092565331af2bc050e23a3f8ca8..3b0f0725722101ae3248e651c8279d3de464a728 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -24,7 +24,7 @@ limitations under the License. */
 // only can include the headers in paddle/pten/include dirs
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
 #include "paddle/pten/include/core.h"
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 namespace paddle {
 namespace operators {
 
@@ -129,7 +129,8 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
       auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
-      pten::Multiply<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
+      pten::MultiplyKernel<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis,
+                              pt_z.get());
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
           "X's type[%s] is not supported by elementwise_op. X's type should be "
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h
index 09a33788d4133f6466e03b4b4e619f94401a5345..6a51d7c2a45ad5d31fe58f1b1c60a2f2541c3ce1 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h
@@ -22,7 +22,7 @@ limitations under the License. */
 // only can include the headers in paddle/pten/include dirs
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
 #include "paddle/pten/include/core.h"
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 namespace paddle {
 namespace operators {
 
@@ -56,7 +56,8 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
     auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
     auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
     auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::Subtract<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
+    pten::SubtractKernel<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis,
+                            pt_z.get());
   }
 };
 
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 97be4c19c970b4e08416108096e4f9a093f08796..05b321c50c1c4e07be3fd16bdb9ca860448384b0 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -28,9 +28,5 @@ get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
 # keep this message for debug, remove it later if needless
 message(STATUS "All standard pten kernels: ${pten_kernels}")
 set(PTEN_DEPS ${PTEN_DEPS} ${pten_kernels})
-set(PTEN_DEPS ${PTEN_DEPS} math_cpu)
-if(WITH_GPU OR WITH_ROCM)
-  set(PTEN_DEPS ${PTEN_DEPS} math_gpu)
-endif()
 
 cc_library(pten SRCS all.cc DEPS ${PTEN_DEPS})
diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h
index 484063df478aaf4d8245e157fce640967bb6d5d6..4d3143ef09ccc62be069f2971ad0a17666547105 100644
--- a/paddle/pten/api/lib/kernel_declare.h
+++ b/paddle/pten/api/lib/kernel_declare.h
@@ -19,9 +19,3 @@ limitations under the License. */
 // TODO(chenweihang) After the kernel is split into a single file,
 // the kernel declare statement is automatically generated according to the
 // file name of the kernel, and this header file will be removed
-
-PT_DECLARE_KERNEL(mean, CPU, ALL_LAYOUT);
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT);
-#endif
diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h
index 876834cea7806daf04e381551e8a4d24a4b74bd6..9abfa297a9452026918667f5f4fb9f00aec962d3 100644
--- a/paddle/pten/include/math.h
+++ b/paddle/pten/include/math.h
@@ -18,8 +18,7 @@ limitations under the License. */
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/include/infermeta.h"
 #include "paddle/pten/kernels/complex_kernel.h"
-#include "paddle/pten/kernels/cpu/math.h"
-#include "paddle/pten/kernels/gpu/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 #include "paddle/pten/kernels/scale_kernel.h"
 
 namespace pten {
@@ -46,7 +45,7 @@ DenseTensor Mean(const ContextT& dev_ctx,
           dev_ctx.GetPlace()),
       std::move(out_meta));
   bool reduce_all = false;
-  Mean<T>(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out);
+  Mean<T, ContextT>(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out);
   return dense_out;
 }
 
@@ -66,7 +65,8 @@ DenseTensor Sum(const ContextT& dev_ctx,
   // so use default value(false) is OK.
   bool reduce_all = false;
 
-  Sum<T>(dev_ctx, x, axis, keep_dim, reduce_all, out_meta.dtype, &dense_out);
+  Sum<T, ContextT>(
+      dev_ctx, x, axis, keep_dim, reduce_all, out_meta.dtype, &dense_out);
   return dense_out;
 }
 
@@ -85,62 +85,6 @@ DenseTensor Scale(const ContextT& dev_ctx,
   return dense_out;
 }
 
-template <typename T, typename ContextT>
-DenseTensor Add(const ContextT& dev_ctx,
-                const DenseTensor& x,
-                const DenseTensor& y,
-                int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  Add<T>(dev_ctx, x, y, axis, &dense_out);
-  return dense_out;
-}
-
-template <typename T, typename ContextT>
-DenseTensor Subtract(const ContextT& dev_ctx,
-                     const DenseTensor& x,
-                     const DenseTensor& y,
-                     int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  Subtract<T>(dev_ctx, x, y, axis, &dense_out);
-  return dense_out;
-}
-
-template <typename T, typename ContextT>
-DenseTensor Divide(const ContextT& dev_ctx,
-                   const DenseTensor& x,
-                   const DenseTensor& y,
-                   int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  Divide<T>(dev_ctx, x, y, axis, &dense_out);
-  return dense_out;
-}
-
-template <typename T, typename ContextT>
-DenseTensor Multiply(const ContextT& dev_ctx,
-                     const DenseTensor& x,
-                     const DenseTensor& y,
-                     int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  Multiply<T>(dev_ctx, x, y, axis, &dense_out);
-  return dense_out;
-}
-
 template <typename T, typename ContextT>
 DenseTensor Conj(const ContextT& dev_ctx, const DenseTensor& x) {
   auto out_meta = UnchangedInferMeta(x.meta());
diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt
index 7a785d86921aa3ca48b15b6824f7d9066c0a9aff..4c705767f4c2ff298ce648a63fdebb78d0f69f18 100644
--- a/paddle/pten/kernels/CMakeLists.txt
+++ b/paddle/pten/kernels/CMakeLists.txt
@@ -24,11 +24,17 @@ endif()
 # pten depends all pten kernel targets
 set_property(GLOBAL PROPERTY PTEN_KERNELS "")
 
-set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory)
-set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function)
+set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory convert_utils)
+set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas)
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta)
 
+set(MATH_KERNEL_DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel pten_transpose_cpu)
+if(WITH_GPU OR WITH_ROCM)
+  set(MATH_KERNEL_DEPS ${MATH_KERNEL_DEPS} pten_transpose_gpu)
+endif()
+
 # auto build kernel targets by cmake
-register_kernels(DEPS ${COMMON_KERNEL_DEPS})
+register_kernels(EXCLUDES math_kernel DEPS ${COMMON_KERNEL_DEPS})
+kernel_library(math_kernel DEPS ${MATH_KERNEL_DEPS})
 
 copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc
index ee01cf65d02e54d417f9f7536c8e935fcadb2dd4..b4642d475d56639aff2fde22e3b4ff8c37515d57 100644
--- a/paddle/pten/kernels/cpu/math.cc
+++ b/paddle/pten/kernels/cpu/math.cc
@@ -1,4 +1,4 @@
-//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,138 +12,4 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/pten/kernels/cpu/math.h"
-
-#include "paddle/pten/api/ext/dispatch.h"
-#include "paddle/pten/kernels/hybird/cpu/elementwise.h"
-#include "paddle/pten/kernels/hybird/eigen/reduce.h"
-#include "paddle/pten/kernels/hybird/general/elementwise_functor.h"
-#include "paddle/pten/kernels/hybird/general/reduce_impl.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex.h"
-
-namespace pten {
-
-template <typename T>
-void Mean(const CPUContext& dev_ctx,
-          const DenseTensor& x,
-          const std::vector<int64_t>& dims,
-          bool keep_dim,
-          bool reduce_all,
-          DenseTensor* out) {
-  auto out_dtype = x.dtype();
-  pten::general::Reduce<CPUContext, T, pten::eigen::MeanFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
-template <typename T>
-void Divide(const CPUContext& dev_ctx,
-            const DenseTensor& x,
-            const DenseTensor& y,
-            int axis,
-            DenseTensor* out) {
-  // allocate memory for out
-  out->mutable_data<T>();
-  if (x.dims() == y.dims() && std::is_floating_point<T>::value) {
-    SameDimsElementwiseCompute<general::SameDimsDivideFunctor<CPUContext, T>>()(
-        dev_ctx, x, y, out);
-  } else {
-    auto x_dims = x.dims();
-    auto y_dims = y.dims();
-    if (x_dims.size() >= y_dims.size()) {
-      ElementwiseCompute<general::DivideFunctor<T>, T>(
-          dev_ctx, x, y, axis, general::DivideFunctor<T>(), out);
-    } else {
-      ElementwiseCompute<general::InverseDivideFunctor<T>, T>(
-          dev_ctx, x, y, axis, general::InverseDivideFunctor<T>(), out);
-    }
-  }
-}
-
-template <typename T>
-void Sum(const CPUContext& dev_ctx,
-         const DenseTensor& x,
-         const std::vector<int64_t>& dims,
-         bool keep_dim,
-         bool reduce_all,
-         DataType out_dtype,
-         DenseTensor* out) {
-  pten::general::Reduce<CPUContext, T, pten::eigen::SumFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
-// Create the definition of Add
-DEFINE_CPU_ELEMENTWISE_OP(Add)
-
-// Create the definition of Subtract
-DEFINE_CPU_ELEMENTWISE_OP(Subtract)
-
-// Create the definition of Multiply
-DEFINE_CPU_ELEMENTWISE_OP(Multiply)
-
-}  // namespace pten
-
-using complex64 = ::paddle::platform::complex<float>;
-using complex128 = ::paddle::platform::complex<double>;
-
-// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
-// using bfloat16 = ::paddle::platform::bfloat16;
-PT_REGISTER_KERNEL(mean, CPU, ALL_LAYOUT, pten::Mean, float, double, bool) {}
-PT_REGISTER_KERNEL(add,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::Add,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(subtract,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::Subtract,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(divide,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::Divide,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(multiply,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::Multiply,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   bool,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(sum,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::Sum,
-                   bool,
-                   float,
-                   double,
-                   paddle::platform::float16,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {
-  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
-}
+namespace pten {}  // namespace pten
diff --git a/paddle/pten/kernels/cpu/math.h b/paddle/pten/kernels/cpu/math.h
deleted file mode 100644
index 1a179218b4c4c92022f798a670b8327a8fb78c57..0000000000000000000000000000000000000000
--- a/paddle/pten/kernels/cpu/math.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/pten/backends/cpu/cpu_context.h"
-#include "paddle/pten/common/scalar.h"
-#include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/core/kernel_registry.h"
-
-namespace pten {
-
-template <typename T>
-void Mean(const CPUContext& dev_ctx,
-          const DenseTensor& x,
-          const std::vector<int64_t>& dims,
-          bool keep_dim,
-          bool reduce_all,
-          DenseTensor* out);
-
-template <typename T>
-void Add(const CPUContext& dev_ctx,
-         const DenseTensor& x,
-         const DenseTensor& y,
-         int axis,
-         DenseTensor* out);
-
-template <typename T>
-void Subtract(const CPUContext& dev_ctx,
-              const DenseTensor& x,
-              const DenseTensor& y,
-              int axis,
-              DenseTensor* out);
-
-template <typename T>
-void Divide(const CPUContext& dev_ctx,
-            const DenseTensor& x,
-            const DenseTensor& y,
-            int axis,
-            DenseTensor* out);
-
-template <typename T>
-void Multiply(const CPUContext& dev_ctx,
-              const DenseTensor& x,
-              const DenseTensor& y,
-              int axis,
-              DenseTensor* out);
-template <typename T>
-void Sum(const CPUContext& dev_ctx,
-         const DenseTensor& x,
-         const std::vector<int64_t>& dims,
-         bool keep_dim,
-         bool reduce_all,
-         DataType out_dtype,
-         DenseTensor* out);
-
-}  // namespace pten
-
-#define DEFINE_CPU_ELEMENTWISE_OP(name)                                      \
-  template <typename T>                                                      \
-  void name(const CPUContext& dev_ctx,                                       \
-            const DenseTensor& x,                                            \
-            const DenseTensor& y,                                            \
-            int axis,                                                        \
-            DenseTensor* out) {                                              \
-    out->mutable_data<T>();                                                  \
-    if (x.dims() == y.dims()) {                                              \
-      SameDimsElementwiseCompute<                                            \
-          general::SameDims##name##Functor<CPUContext, T>>()(                \
-          dev_ctx, x, y, out);                                               \
-    } else {                                                                 \
-      auto x_dims = x.dims();                                                \
-      auto y_dims = y.dims();                                                \
-      if (x_dims.size() >= y_dims.size()) {                                  \
-        ElementwiseCompute<general::name##Functor<T>, T>(                    \
-            dev_ctx, x, y, axis, general::name##Functor<T>(), out);          \
-      } else {                                                               \
-        ElementwiseCompute<general::Inverse##name##Functor<T>, T>(           \
-            dev_ctx, x, y, axis, general::Inverse##name##Functor<T>(), out); \
-      }                                                                      \
-    }                                                                        \
-  }
diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..152d945144f6cd2723a4a1eaa0c94037dfade3ca
--- /dev/null
+++ b/paddle/pten/kernels/cpu/math_kernel.cc
@@ -0,0 +1,178 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/math_kernel.h"
+
+#include "paddle/pten/api/ext/dispatch.h"
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/hybird/cpu/elementwise.h"
+#include "paddle/pten/kernels/hybird/eigen/reduce.h"
+#include "paddle/pten/kernels/hybird/general/elementwise_functor.h"
+#include "paddle/pten/kernels/hybird/general/reduce_impl.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/complex.h"
+
+namespace pten {
+
+#define DEFINE_CPU_ELEMENTWISE_OP(name)                                      \
+  template <typename T, typename Context>                                    \
+  void name##Kernel(const Context& dev_ctx,                                  \
+                    const DenseTensor& x,                                    \
+                    const DenseTensor& y,                                    \
+                    int axis,                                                \
+                    DenseTensor* out) {                                      \
+    out->mutable_data<T>();                                                  \
+    if (x.dims() == y.dims()) {                                              \
+      SameDimsElementwiseCompute<                                            \
+          general::SameDims##name##Functor<CPUContext, T>>()(                \
+          dev_ctx, x, y, out);                                               \
+    } else {                                                                 \
+      auto x_dims = x.dims();                                                \
+      auto y_dims = y.dims();                                                \
+      if (x_dims.size() >= y_dims.size()) {                                  \
+        ElementwiseCompute<general::name##Functor<T>, T>(                    \
+            dev_ctx, x, y, axis, general::name##Functor<T>(), out);          \
+      } else {                                                               \
+        ElementwiseCompute<general::Inverse##name##Functor<T>, T>(           \
+            dev_ctx, x, y, axis, general::Inverse##name##Functor<T>(), out); \
+      }                                                                      \
+    }                                                                        \
+  }
+
+template <typename T, typename Context>
+void Mean(const Context& dev_ctx,
+          const DenseTensor& x,
+          const std::vector<int64_t>& dims,
+          bool keep_dim,
+          bool reduce_all,
+          DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  pten::general::Reduce<CPUContext, T, pten::eigen::MeanFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void DivideKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& y,
+                  int axis,
+                  DenseTensor* out) {
+  // allocate memory for out
+  out->mutable_data<T>();
+  if (x.dims() == y.dims() && std::is_floating_point<T>::value) {
+    SameDimsElementwiseCompute<general::SameDimsDivideFunctor<CPUContext, T>>()(
+        dev_ctx, x, y, out);
+  } else {
+    auto x_dims = x.dims();
+    auto y_dims = y.dims();
+    if (x_dims.size() >= y_dims.size()) {
+      ElementwiseCompute<general::DivideFunctor<T>, T>(
+          dev_ctx, x, y, axis, general::DivideFunctor<T>(), out);
+    } else {
+      ElementwiseCompute<general::InverseDivideFunctor<T>, T>(
+          dev_ctx, x, y, axis, general::InverseDivideFunctor<T>(), out);
+    }
+  }
+}
+
+template <typename T, typename Context>
+void Sum(const Context& dev_ctx,
+         const DenseTensor& x,
+         const std::vector<int64_t>& dims,
+         bool keep_dim,
+         bool reduce_all,
+         DataType out_dtype,
+         DenseTensor* out) {
+  pten::general::Reduce<CPUContext, T, pten::eigen::SumFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+// Create the definition of Add
+DEFINE_CPU_ELEMENTWISE_OP(Add)
+
+// Create the definition of Subtract
+DEFINE_CPU_ELEMENTWISE_OP(Subtract)
+
+// Create the definition of Multiply
+DEFINE_CPU_ELEMENTWISE_OP(Multiply)
+
+}  // namespace pten
+
+using complex64 = ::paddle::platform::complex<float>;
+using complex128 = ::paddle::platform::complex<double>;
+
+// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
+// using bfloat16 = ::paddle::platform::bfloat16;
+PT_REGISTER_CTX_KERNEL(mean, CPU, ALL_LAYOUT, pten::Mean, float, double, bool) {
+}
+PT_REGISTER_CTX_KERNEL(add,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::AddKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(subtract,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::SubtractKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(divide,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::DivideKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(multiply,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::MultiplyKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       bool,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(sum,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::Sum,
+                       bool,
+                       float,
+                       double,
+                       paddle::platform::float16,
+                       int,
+                       int64_t,
+                       complex64,
+                       complex128) {
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
+}
diff --git a/paddle/pten/kernels/gpu/CMakeLists.txt b/paddle/pten/kernels/gpu/CMakeLists.txt
index 51c666947b2f2cbc36a56584b97b4ad471ffb7a9..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644
--- a/paddle/pten/kernels/gpu/CMakeLists.txt
+++ b/paddle/pten/kernels/gpu/CMakeLists.txt
@@ -1,5 +0,0 @@
-if(WITH_GPU)
-  nv_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu cast_kernel copy_kernel)
-elseif(WITH_ROCM)
-  hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu cast_kernel copy_kernel)
-endif()
diff --git a/paddle/pten/kernels/gpu/math.cu b/paddle/pten/kernels/gpu/math.cu
deleted file mode 100644
index e02403ac426f2c9544ac7300aec4099d0d010417..0000000000000000000000000000000000000000
--- a/paddle/pten/kernels/gpu/math.cu
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/pten/kernels/gpu/math.h"
-
-#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h"
-#include "paddle/pten/kernels/hybird/cuda/reduce/reduce.h"
-#include "paddle/pten/kernels/hybird/general/elementwise_functor.h"
-#include "paddle/pten/kernels/hybird/general/reduce_impl.h"
-
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/pten/api/lib/utils/tensor_utils.h"
-#include "paddle/pten/core/convert_utils.h"
-#include "paddle/pten/core/kernel_registry.h"
-
-namespace kps = paddle::operators::kernel_primitives;
-
-namespace pten {
-
-/**
- * Util Functors
- */
-
-template <typename T>
-struct DivideFunctor {
-  HOSTDEVICE explicit inline DivideFunctor(int n)
-      : n_inv(static_cast<T>(1.0 / n)) {}
-
-  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
-
- private:
-  T n_inv;
-};
-
-/**
- * Kernels
- */
-
-template <typename T>
-void Mean(const GPUContext& dev_ctx,
-          const DenseTensor& x,
-          const std::vector<int64_t>& dims,
-          bool keep_dim,
-          bool reduce_all,
-          DenseTensor* out) {
-  auto out_dtype = x.dtype();
-  pten::Reduce<T, kps::AddFunctor, kps::DivideFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
-// Create the definition of Add
-DEFINE_CUDA_ELEMENTWISE_OP(Add)
-// Create the definition of Subtract
-DEFINE_CUDA_ELEMENTWISE_OP(Subtract)
-// Create the definition of Multiply
-DEFINE_CUDA_ELEMENTWISE_OP(Multiply)
-// Create the definition of Divide
-DEFINE_CUDA_ELEMENTWISE_OP(Divide)
-
-template <typename T>
-void Sum(const GPUContext& dev_ctx,
-         const DenseTensor& x,
-         const std::vector<int64_t>& dims,
-         bool keep_dim,
-         bool reduce_all,
-         DataType out_dtype,
-         DenseTensor* out) {
-  pten::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
-}  // namespace pten
-
-using float16 = paddle::platform::float16;
-using complex64 = ::paddle::platform::complex<float>;
-using complex128 = ::paddle::platform::complex<double>;
-
-PT_REGISTER_KERNEL(
-    mean, GPU, ALL_LAYOUT, pten::Mean, float, double, bool, float16) {}
-PT_REGISTER_KERNEL(add,
-                   GPU,
-                   ALL_LAYOUT,
-                   pten::Add,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   float16,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(subtract,
-                   GPU,
-                   ALL_LAYOUT,
-                   pten::Subtract,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   float16,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(divide,
-                   GPU,
-                   ALL_LAYOUT,
-                   pten::Divide,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   float16,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(multiply,
-                   GPU,
-                   ALL_LAYOUT,
-                   pten::Multiply,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   bool,
-                   float16,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(sum,
-                   GPU,
-                   ALL_LAYOUT,
-                   pten::Sum,
-                   bool,
-                   float,
-                   double,
-                   float16,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {
-  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
-}
diff --git a/paddle/pten/kernels/gpu/math.h b/paddle/pten/kernels/gpu/math.h
deleted file mode 100644
index c1d33a0fcdd092226d1aac89327dcb13ac88d0f6..0000000000000000000000000000000000000000
--- a/paddle/pten/kernels/gpu/math.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-// CUDA and HIP use same api
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-
-#include "paddle/pten/backends/gpu/gpu_context.h"
-#include "paddle/pten/common/scalar.h"
-#include "paddle/pten/core/dense_tensor.h"
-
-namespace pten {
-
-template <typename T>
-void Mean(const GPUContext& dev_ctx,
-          const DenseTensor& x,
-          const std::vector<int64_t>& dims,
-          bool keep_dim,
-          bool reduce_all,
-          DenseTensor* out);
-
-template <typename T>
-void Add(const GPUContext& dev_ctx,
-         const DenseTensor& x,
-         const DenseTensor& y,
-         int axis,
-         DenseTensor* out);
-
-template <typename T>
-void Subtract(const GPUContext& dev_ctx,
-              const DenseTensor& x,
-              const DenseTensor& y,
-              int axis,
-              DenseTensor* out);
-
-template <typename T>
-void Divide(const GPUContext& dev_ctx,
-            const DenseTensor& x,
-            const DenseTensor& y,
-            int axis,
-            DenseTensor* out);
-
-template <typename T>
-void Multiply(const GPUContext& dev_ctx,
-              const DenseTensor& x,
-              const DenseTensor& y,
-              int axis,
-              DenseTensor* out);
-
-template <typename T>
-void Sum(const GPUContext& dev_ctx,
-         const DenseTensor& x,
-         const std::vector<int64_t>& dims,
-         bool keep_dim,
-         bool reduce_all,
-         DataType out_dtype,
-         DenseTensor* out);
-
-}  // namespace pten
-
-#define DEFINE_CUDA_ELEMENTWISE_OP(name)                               \
-  template <typename T>                                                \
-  void name(const GPUContext& dev_ctx,                                 \
-            const DenseTensor& x,                                      \
-            const DenseTensor& y,                                      \
-            int axis,                                                  \
-            DenseTensor* out) {                                        \
-    std::vector<const DenseTensor*> inputs;                            \
-    std::vector<DenseTensor*> outputs;                                 \
-    inputs.emplace_back(&x);                                           \
-    inputs.emplace_back(&y);                                           \
-    outputs.emplace_back(out);                                         \
-    out->mutable_data<T>();                                            \
-    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(       \
-        dev_ctx, inputs, &outputs, axis, general::name##Functor<T>()); \
-  }
-
-#endif
diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..636d0f16b0d711328eb7f384722f60a178b11e60
--- /dev/null
+++ b/paddle/pten/kernels/gpu/math_kernel.cu
@@ -0,0 +1,177 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/kernels/math_kernel.h"
+
+#include "paddle/pten/backends/gpu/gpu_context.h"
+#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h"
+#include "paddle/pten/kernels/hybird/cuda/reduce/reduce.h"
+#include "paddle/pten/kernels/hybird/general/elementwise_functor.h"
+#include "paddle/pten/kernels/hybird/general/reduce_impl.h"
+
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+namespace kps = paddle::operators::kernel_primitives;
+
+namespace pten {
+
+#define DEFINE_CUDA_ELEMENTWISE_OP(name)                               \
+  template <typename T, typename Context>                              \
+  void name##Kernel(const Context& dev_ctx,                            \
+                    const DenseTensor& x,                              \
+                    const DenseTensor& y,                              \
+                    int axis,                                          \
+                    DenseTensor* out) {                                \
+    std::vector<const DenseTensor*> inputs;                            \
+    std::vector<DenseTensor*> outputs;                                 \
+    inputs.emplace_back(&x);                                           \
+    inputs.emplace_back(&y);                                           \
+    outputs.emplace_back(out);                                         \
+    out->mutable_data<T>();                                            \
+    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(       \
+        dev_ctx, inputs, &outputs, axis, general::name##Functor<T>()); \
+  }
+
+/**
+ * Util Functors
+ */
+
+template <typename T>
+struct DivideFunctor {
+  HOSTDEVICE explicit inline DivideFunctor(int n)
+      : n_inv(static_cast<T>(1.0 / n)) {}
+
+  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
+
+ private:
+  T n_inv;
+};
+
+/**
+ * Kernels
+ */
+
+template <typename T, typename Context>
+void Mean(const Context& dev_ctx,
+          const DenseTensor& x,
+          const std::vector<int64_t>& dims,
+          bool keep_dim,
+          bool reduce_all,
+          DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  pten::Reduce<T, kps::AddFunctor, kps::DivideFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+// Create the definition of Add
+DEFINE_CUDA_ELEMENTWISE_OP(Add)
+// Create the definition of Subtract
+DEFINE_CUDA_ELEMENTWISE_OP(Subtract)
+// Create the definition of Multiply
+DEFINE_CUDA_ELEMENTWISE_OP(Multiply)
+// Create the definition of Divide
+DEFINE_CUDA_ELEMENTWISE_OP(Divide)
+
+template <typename T, typename Context>
+void Sum(const Context& dev_ctx,
+         const DenseTensor& x,
+         const std::vector<int64_t>& dims,
+         bool keep_dim,
+         bool reduce_all,
+         DataType out_dtype,
+         DenseTensor* out) {
+  pten::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+}  // namespace pten
+
+using float16 = paddle::platform::float16;
+using complex64 = ::paddle::platform::complex<float>;
+using complex128 = ::paddle::platform::complex<double>;
+
+PT_REGISTER_CTX_KERNEL(
+    mean, GPU, ALL_LAYOUT, pten::Mean, float, double, bool, float16) {}
+PT_REGISTER_CTX_KERNEL(add,
+                       GPU,
+                       ALL_LAYOUT,
+                       pten::AddKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       float16,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(subtract,
+                       GPU,
+                       ALL_LAYOUT,
+                       pten::SubtractKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       float16,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(divide,
+                       GPU,
+                       ALL_LAYOUT,
+                       pten::DivideKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       float16,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(multiply,
+                       GPU,
+                       ALL_LAYOUT,
+                       pten::MultiplyKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       bool,
+                       float16,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(sum,
+                       GPU,
+                       ALL_LAYOUT,
+                       pten::Sum,
+                       bool,
+                       float,
+                       double,
+                       float16,
+                       int,
+                       int64_t,
+                       complex64,
+                       complex128) {
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
+}
diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..2968aa3524a9f29086b108482b60b4ab3eecb3b6
--- /dev/null
+++ b/paddle/pten/kernels/math_kernel.h
@@ -0,0 +1,124 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/api/lib/utils/storage.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/include/infermeta.h"
+
+namespace pten {
+
+template <typename T, typename Context>
+void Mean(const Context& dev_ctx,
+          const DenseTensor& x,
+          const std::vector<int64_t>& dims,
+          bool keep_dim,
+          bool reduce_all,
+          DenseTensor* out);
+
+template <typename T, typename Context>
+void AddKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const DenseTensor& y,
+               int axis,
+               DenseTensor* out);
+
+template <typename T, typename Context>
+void SubtractKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    int axis,
+                    DenseTensor* out);
+
+template <typename T, typename Context>
+void DivideKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& y,
+                  int axis,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void MultiplyKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    int axis,
+                    DenseTensor* out);
+
+template <typename T, typename Context>
+void Sum(const Context& dev_ctx,
+         const DenseTensor& x,
+         const std::vector<int64_t>& dims,
+         bool keep_dim,
+         bool reduce_all,
+         DataType out_dtype,
+         DenseTensor* out);
+
+template <typename T, typename ContextT>
+DenseTensor Add(const ContextT& dev_ctx,
+                const DenseTensor& x,
+                const DenseTensor& y,
+                int axis) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
+  pten::DenseTensor dense_out(
+      pten::make_intrusive<paddle::experimental::SharedStorage>(
+          dev_ctx.GetPlace()),
+      std::move(out_meta));
+  AddKernel<T, ContextT>(dev_ctx, x, y, axis, &dense_out);
+  return dense_out;
+}
+
+template <typename T, typename ContextT>
+DenseTensor Subtract(const ContextT& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y,
+                     int axis) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
+  pten::DenseTensor dense_out(
+      pten::make_intrusive<paddle::experimental::SharedStorage>(
+          dev_ctx.GetPlace()),
+      std::move(out_meta));
+  SubtractKernel<T, ContextT>(dev_ctx, x, y, axis, &dense_out);
+  return dense_out;
+}
+
+template <typename T, typename ContextT>
+DenseTensor Divide(const ContextT& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   int axis) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
+  pten::DenseTensor dense_out(
+      pten::make_intrusive<paddle::experimental::SharedStorage>(
+          dev_ctx.GetPlace()),
+      std::move(out_meta));
+  DivideKernel<T, ContextT>(dev_ctx, x, y, axis, &dense_out);
+  return dense_out;
+}
+
+template <typename T, typename ContextT>
+DenseTensor Multiply(const ContextT& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y,
+                     int axis) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
+  pten::DenseTensor dense_out(
+      pten::make_intrusive<paddle::experimental::SharedStorage>(
+          dev_ctx.GetPlace()),
+      std::move(out_meta));
+  MultiplyKernel<T, ContextT>(dev_ctx, x, y, axis, &dense_out);
+  return dense_out;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
index f12a2d48e6b2b876fe0c8bba7de6461aa41313bd..bd09ecb770a5d59b114e2ba883d3a8e6107121cd 100644
--- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"