From e76087adbc62b645b44e9eec994749c26a5fd8f0 Mon Sep 17 00:00:00 2001
From: YuanRisheng <yuanrisheng@baidu.com>
Date: Fri, 31 Dec 2021 10:09:59 +0800
Subject: [PATCH] =?UTF-8?q?[Pten]Move=20math=20to=20new=20directory=20and?=
 =?UTF-8?q?=20change=20=E3=80=8Cmath=E3=80=8D=20to=20=E3=80=8Cmath=5Fkerne?=
 =?UTF-8?q?l=E3=80=8D=20(#38604)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* change 'math' to 'math_kernel'

* fix compile bugs

* merge develop

* fix compile bugs
---
 paddle/fluid/operators/cholesky_solve_op.h    |   4 +-
 .../elementwise/elementwise_add_op.h          |   4 +-
 .../elementwise/elementwise_div_op.h          |   4 +-
 .../elementwise/elementwise_mul_op.cu         |   3 +-
 .../elementwise/elementwise_mul_op.h          |   5 +-
 .../elementwise/elementwise_sub_op.h          |   5 +-
 paddle/pten/CMakeLists.txt                    |   4 -
 paddle/pten/api/lib/kernel_declare.h          |   6 -
 paddle/pten/include/math.h                    |  64 +------
 paddle/pten/kernels/CMakeLists.txt            |  12 +-
 paddle/pten/kernels/cpu/math.cc               | 138 +-------------
 paddle/pten/kernels/cpu/math.h                |  93 ---------
 paddle/pten/kernels/cpu/math_kernel.cc        | 178 ++++++++++++++++++
 paddle/pten/kernels/gpu/CMakeLists.txt        |   5 -
 paddle/pten/kernels/gpu/math.cu               | 159 ----------------
 paddle/pten/kernels/gpu/math.h                |  90 ---------
 paddle/pten/kernels/gpu/math_kernel.cu        | 177 +++++++++++++++++
 paddle/pten/kernels/math_kernel.h             | 124 ++++++++++++
 .../tests/kernels/test_elementwise_dev_api.cc |   2 +-
 19 files changed, 509 insertions(+), 568 deletions(-)
 delete mode 100644 paddle/pten/kernels/cpu/math.h
 create mode 100644 paddle/pten/kernels/cpu/math_kernel.cc
 delete mode 100644 paddle/pten/kernels/gpu/math.cu
 delete mode 100644 paddle/pten/kernels/gpu/math.h
 create mode 100644 paddle/pten/kernels/gpu/math_kernel.cu
 create mode 100644 paddle/pten/kernels/math_kernel.h
diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h
index f3b0056165..94b68bff8f 100644
--- a/paddle/fluid/operators/cholesky_solve_op.h
+++ b/paddle/fluid/operators/cholesky_solve_op.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/svd_helper.h"
 #include "paddle/fluid/operators/triangular_solve_op.h"
 #include "paddle/fluid/platform/complex.h"
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 
 namespace paddle {
 namespace operators {  // namespace operators
@@ -205,7 +205,7 @@ class CholeskySolveGradKernel : public framework::OpKernel<T> {
       auto pt_x = paddle::experimental::MakePtenDenseTensor(commonterm);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(commonterm_conj);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(commonterm);
-      pten::Add<T>(dev_ctx, *pt_x.get(), *pt_y.get(), -1, pt_z.get());
+      pten::AddKernel<T>(dev_ctx, *pt_x.get(), *pt_y.get(), -1, pt_z.get());
 
       auto mat_dim_u = math::CreateMatrixDescriptor(u_bst.dims(), 0, false);
       auto mat_dim_c =
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index a4567beeb4..d6d79d166d 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -25,7 +25,7 @@ limitations under the License. */
 // only can include the headers in paddle/pten/include dirs
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
 #include "paddle/pten/include/core.h"
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 
 namespace paddle {
 namespace operators {
@@ -68,7 +68,7 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
     auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
     auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
     auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::Add<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
+    pten::AddKernel<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
   }
 };
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h
index f3ba5050c4..c886644bbd 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h
@@ -28,7 +28,7 @@ limitations under the License. */
 // only can include the headers in paddle/pten/include dirs
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
 #include "paddle/pten/include/core.h"
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 namespace paddle {
 namespace operators {
 
@@ -62,7 +62,7 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
     auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
     auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
     auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::Divide<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
+    pten::DivideKernel<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
   }
 };
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
index e131bc4974..12e0062a69 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -57,7 +57,8 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
       auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
-      pten::Multiply<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
+      pten::MultiplyKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
+                              pt_z.get());
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
           "X's type[%s] is not supported by elementwise_op. X's type should be "
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
index 8b43f82e6b..3b0f072572 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -24,7 +24,7 @@ limitations under the License. */
 // only can include the headers in paddle/pten/include dirs
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
 #include "paddle/pten/include/core.h"
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 namespace paddle {
 namespace operators {
 
@@ -129,7 +129,8 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
       auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
-      pten::Multiply<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
+      pten::MultiplyKernel<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis,
+                              pt_z.get());
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
           "X's type[%s] is not supported by elementwise_op. X's type should be "
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h
index 09a33788d4..6a51d7c2a4 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h
@@ -22,7 +22,7 @@ limitations under the License. */
 // only can include the headers in paddle/pten/include dirs
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
 #include "paddle/pten/include/core.h"
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 namespace paddle {
 namespace operators {
 
@@ -56,7 +56,8 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
     auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
     auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
     auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::Subtract<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
+    pten::SubtractKernel<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis,
+                            pt_z.get());
   }
 };
 
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 97be4c19c9..05b321c50c 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -28,9 +28,5 @@ get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
 # keep this message for debug, remove it later if needless
 message(STATUS "All standard pten kernels: ${pten_kernels}")
 set(PTEN_DEPS ${PTEN_DEPS} ${pten_kernels})
-set(PTEN_DEPS ${PTEN_DEPS} math_cpu)
-if(WITH_GPU OR WITH_ROCM)
-  set(PTEN_DEPS ${PTEN_DEPS} math_gpu)
-endif()
 
 cc_library(pten SRCS all.cc DEPS ${PTEN_DEPS})
diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h
index 484063df47..4d3143ef09 100644
--- a/paddle/pten/api/lib/kernel_declare.h
+++ b/paddle/pten/api/lib/kernel_declare.h
@@ -19,9 +19,3 @@ limitations under the License. */
 // TODO(chenweihang) After the kernel is split into a single file,
 // the kernel declare statement is automatically generated according to the
 // file name of the kernel, and this header file will be removed
-
-PT_DECLARE_KERNEL(mean, CPU, ALL_LAYOUT);
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT);
-#endif
diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h
index 876834cea7..9abfa297a9 100644
--- a/paddle/pten/include/math.h
+++ b/paddle/pten/include/math.h
@@ -18,8 +18,7 @@ limitations under the License. */
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/include/infermeta.h"
 #include "paddle/pten/kernels/complex_kernel.h"
-#include "paddle/pten/kernels/cpu/math.h"
-#include "paddle/pten/kernels/gpu/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 #include "paddle/pten/kernels/scale_kernel.h"
 
 namespace pten {
@@ -46,7 +45,7 @@ DenseTensor Mean(const ContextT& dev_ctx,
           dev_ctx.GetPlace()),
       std::move(out_meta));
   bool reduce_all = false;
-  Mean<T>(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out);
+  Mean<T, ContextT>(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out);
   return dense_out;
 }
 
@@ -66,7 +65,8 @@ DenseTensor Sum(const ContextT& dev_ctx,
   // so use default value(false) is OK.
   bool reduce_all = false;
 
-  Sum<T>(dev_ctx, x, axis, keep_dim, reduce_all, out_meta.dtype, &dense_out);
+  Sum<T, ContextT>(
+      dev_ctx, x, axis, keep_dim, reduce_all, out_meta.dtype, &dense_out);
   return dense_out;
 }
 
@@ -85,62 +85,6 @@ DenseTensor Scale(const ContextT& dev_ctx,
   return dense_out;
 }
 
-template <typename T, typename ContextT>
-DenseTensor Add(const ContextT& dev_ctx,
-                const DenseTensor& x,
-                const DenseTensor& y,
-                int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  Add<T>(dev_ctx, x, y, axis, &dense_out);
-  return dense_out;
-}
-
-template <typename T, typename ContextT>
-DenseTensor Subtract(const ContextT& dev_ctx,
-                     const DenseTensor& x,
-                     const DenseTensor& y,
-                     int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  Subtract<T>(dev_ctx, x, y, axis, &dense_out);
-  return dense_out;
-}
-
-template <typename T, typename ContextT>
-DenseTensor Divide(const ContextT& dev_ctx,
-                   const DenseTensor& x,
-                   const DenseTensor& y,
-                   int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  Divide<T>(dev_ctx, x, y, axis, &dense_out);
-  return dense_out;
-}
-
-template <typename T, typename ContextT>
-DenseTensor Multiply(const ContextT& dev_ctx,
-                     const DenseTensor& x,
-                     const DenseTensor& y,
-                     int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  Multiply<T>(dev_ctx, x, y, axis, &dense_out);
-  return dense_out;
-}
-
 template <typename T, typename ContextT>
 DenseTensor Conj(const ContextT& dev_ctx, const DenseTensor& x) {
   auto out_meta = UnchangedInferMeta(x.meta());
diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt
index 7a785d8692..4c705767f4 100644
--- a/paddle/pten/kernels/CMakeLists.txt
+++ b/paddle/pten/kernels/CMakeLists.txt
@@ -24,11 +24,17 @@ endif()
 # pten depends all pten kernel targets
 set_property(GLOBAL PROPERTY PTEN_KERNELS "")
 
-set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory)
-set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function)
+set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory convert_utils)
+set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas)
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta)
 
+set(MATH_KERNEL_DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel pten_transpose_cpu)
+if(WITH_GPU OR WITH_ROCM)
+  set(MATH_KERNEL_DEPS ${MATH_KERNEL_DEPS} pten_transpose_gpu)
+endif()
+
 # auto build kernel targets by cmake
-register_kernels(DEPS ${COMMON_KERNEL_DEPS})
+register_kernels(EXCLUDES math_kernel DEPS ${COMMON_KERNEL_DEPS})
+kernel_library(math_kernel DEPS ${MATH_KERNEL_DEPS})
 
 copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc
index ee01cf65d0..b4642d475d 100644
--- a/paddle/pten/kernels/cpu/math.cc
+++ b/paddle/pten/kernels/cpu/math.cc
@@ -1,4 +1,4 @@
-//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,138 +12,4 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/pten/kernels/cpu/math.h"
-
-#include "paddle/pten/api/ext/dispatch.h"
-#include "paddle/pten/kernels/hybird/cpu/elementwise.h"
-#include "paddle/pten/kernels/hybird/eigen/reduce.h"
-#include "paddle/pten/kernels/hybird/general/elementwise_functor.h"
-#include "paddle/pten/kernels/hybird/general/reduce_impl.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex.h"
-
-namespace pten {
-
-template <typename T>
-void Mean(const CPUContext& dev_ctx,
-          const DenseTensor& x,
-          const std::vector<int64_t>& dims,
-          bool keep_dim,
-          bool reduce_all,
-          DenseTensor* out) {
-  auto out_dtype = x.dtype();
-  pten::general::Reduce<CPUContext, T, pten::eigen::MeanFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
-template <typename T>
-void Divide(const CPUContext& dev_ctx,
-            const DenseTensor& x,
-            const DenseTensor& y,
-            int axis,
-            DenseTensor* out) {
-  // allocate memory for out
-  out->mutable_data<T>();
-  if (x.dims() == y.dims() && std::is_floating_point<T>::value) {
-    SameDimsElementwiseCompute<general::SameDimsDivideFunctor<CPUContext, T>>()(
-        dev_ctx, x, y, out);
-  } else {
-    auto x_dims = x.dims();
-    auto y_dims = y.dims();
-    if (x_dims.size() >= y_dims.size()) {
-      ElementwiseCompute<general::DivideFunctor<T>, T>(
-          dev_ctx, x, y, axis, general::DivideFunctor<T>(), out);
-    } else {
-      ElementwiseCompute<general::InverseDivideFunctor<T>, T>(
-          dev_ctx, x, y, axis, general::InverseDivideFunctor<T>(), out);
-    }
-  }
-}
-
-template <typename T>
-void Sum(const CPUContext& dev_ctx,
-         const DenseTensor& x,
-         const std::vector<int64_t>& dims,
-         bool keep_dim,
-         bool reduce_all,
-         DataType out_dtype,
-         DenseTensor* out) {
-  pten::general::Reduce<CPUContext, T, pten::eigen::SumFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
-// Create the definition of Add
-DEFINE_CPU_ELEMENTWISE_OP(Add)
-
-// Create the definition of Subtract
-DEFINE_CPU_ELEMENTWISE_OP(Subtract)
-
-// Create the definition of Multiply
-DEFINE_CPU_ELEMENTWISE_OP(Multiply)
-
-}  // namespace pten
-
-using complex64 = ::paddle::platform::complex<float>;
-using complex128 = ::paddle::platform::complex<double>;
-
-// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
-// using bfloat16 = ::paddle::platform::bfloat16;
-PT_REGISTER_KERNEL(mean, CPU, ALL_LAYOUT, pten::Mean, float, double, bool) {}
-PT_REGISTER_KERNEL(add,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::Add,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(subtract,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::Subtract,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(divide,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::Divide,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(multiply,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::Multiply,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   bool,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(sum,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::Sum,
-                   bool,
-                   float,
-                   double,
-                   paddle::platform::float16,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {
-  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
-}
+namespace pten {}  // namespace pten
diff --git a/paddle/pten/kernels/cpu/math.h b/paddle/pten/kernels/cpu/math.h
deleted file mode 100644
index 1a179218b4..0000000000
--- a/paddle/pten/kernels/cpu/math.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/pten/backends/cpu/cpu_context.h"
-#include "paddle/pten/common/scalar.h"
-#include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/core/kernel_registry.h"
-
-namespace pten {
-
-template <typename T>
-void Mean(const CPUContext& dev_ctx,
-          const DenseTensor& x,
-          const std::vector<int64_t>& dims,
-          bool keep_dim,
-          bool reduce_all,
-          DenseTensor* out);
-
-template <typename T>
-void Add(const CPUContext& dev_ctx,
-         const DenseTensor& x,
-         const DenseTensor& y,
-         int axis,
-         DenseTensor* out);
-
-template <typename T>
-void Subtract(const CPUContext& dev_ctx,
-              const DenseTensor& x,
-              const DenseTensor& y,
-              int axis,
-              DenseTensor* out);
-
-template <typename T>
-void Divide(const CPUContext& dev_ctx,
-            const DenseTensor& x,
-            const DenseTensor& y,
-            int axis,
-            DenseTensor* out);
-
-template <typename T>
-void Multiply(const CPUContext& dev_ctx,
-              const DenseTensor& x,
-              const DenseTensor& y,
-              int axis,
-              DenseTensor* out);
-template <typename T>
-void Sum(const CPUContext& dev_ctx,
-         const DenseTensor& x,
-         const std::vector<int64_t>& dims,
-         bool keep_dim,
-         bool reduce_all,
-         DataType out_dtype,
-         DenseTensor* out);
-
-}  // namespace pten
-
-#define DEFINE_CPU_ELEMENTWISE_OP(name)                                      \
-  template <typename T>                                                      \
-  void name(const CPUContext& dev_ctx,                                       \
-            const DenseTensor& x,                                            \
-            const DenseTensor& y,                                            \
-            int axis,                                                        \
-            DenseTensor* out) {                                              \
-    out->mutable_data<T>();                                                  \
-    if (x.dims() == y.dims()) {                                              \
-      SameDimsElementwiseCompute<                                            \
-          general::SameDims##name##Functor<CPUContext, T>>()(                \
-          dev_ctx, x, y, out);                                               \
-    } else {                                                                 \
-      auto x_dims = x.dims();                                                \
-      auto y_dims = y.dims();                                                \
-      if (x_dims.size() >= y_dims.size()) {                                  \
-        ElementwiseCompute<general::name##Functor<T>, T>(                    \
-            dev_ctx, x, y, axis, general::name##Functor<T>(), out);          \
-      } else {                                                               \
-        ElementwiseCompute<general::Inverse##name##Functor<T>, T>(           \
-            dev_ctx, x, y, axis, general::Inverse##name##Functor<T>(), out); \
-      }                                                                      \
-    }                                                                        \
-  }
diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc
new file mode 100644
index 0000000000..152d945144
--- /dev/null
+++ b/paddle/pten/kernels/cpu/math_kernel.cc
@@ -0,0 +1,178 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/math_kernel.h"
+
+#include "paddle/pten/api/ext/dispatch.h"
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/hybird/cpu/elementwise.h"
+#include "paddle/pten/kernels/hybird/eigen/reduce.h"
+#include "paddle/pten/kernels/hybird/general/elementwise_functor.h"
+#include "paddle/pten/kernels/hybird/general/reduce_impl.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/complex.h"
+
+namespace pten {
+
+#define DEFINE_CPU_ELEMENTWISE_OP(name)                                      \
+  template <typename T, typename Context>                                    \
+  void name##Kernel(const Context& dev_ctx,                                  \
+                    const DenseTensor& x,                                    \
+                    const DenseTensor& y,                                    \
+                    int axis,                                                \
+                    DenseTensor* out) {                                      \
+    out->mutable_data<T>();                                                  \
+    if (x.dims() == y.dims()) {                                              \
+      SameDimsElementwiseCompute<                                            \
+          general::SameDims##name##Functor<CPUContext, T>>()(                \
+          dev_ctx, x, y, out);                                               \
+    } else {                                                                 \
+      auto x_dims = x.dims();                                                \
+      auto y_dims = y.dims();                                                \
+      if (x_dims.size() >= y_dims.size()) {                                  \
+        ElementwiseCompute<general::name##Functor<T>, T>(                    \
+            dev_ctx, x, y, axis, general::name##Functor<T>(), out);          \
+      } else {                                                               \
+        ElementwiseCompute<general::Inverse##name##Functor<T>, T>(           \
+            dev_ctx, x, y, axis, general::Inverse##name##Functor<T>(), out); \
+      }                                                                      \
+    }                                                                        \
+  }
+
+template <typename T, typename Context>
+void Mean(const Context& dev_ctx,
+          const DenseTensor& x,
+          const std::vector<int64_t>& dims,
+          bool keep_dim,
+          bool reduce_all,
+          DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  pten::general::Reduce<CPUContext, T, pten::eigen::MeanFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void DivideKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& y,
+                  int axis,
+                  DenseTensor* out) {
+  // Allocate out; the same-dims fast path is only taken for floating-point T.
+  out->mutable_data<T>();
+  if (x.dims() == y.dims() && std::is_floating_point<T>::value) {
+    SameDimsElementwiseCompute<general::SameDimsDivideFunctor<CPUContext, T>>()(
+        dev_ctx, x, y, out);
+  } else {
+    auto x_dims = x.dims();
+    auto y_dims = y.dims();
+    if (x_dims.size() >= y_dims.size()) {
+      ElementwiseCompute<general::DivideFunctor<T>, T>(
+          dev_ctx, x, y, axis, general::DivideFunctor<T>(), out);
+    } else {
+      ElementwiseCompute<general::InverseDivideFunctor<T>, T>(
+          dev_ctx, x, y, axis, general::InverseDivideFunctor<T>(), out);
+    }
+  }
+}
+
+template <typename T, typename Context>
+void Sum(const Context& dev_ctx,
+         const DenseTensor& x,
+         const std::vector<int64_t>& dims,
+         bool keep_dim,
+         bool reduce_all,
+         DataType out_dtype,
+         DenseTensor* out) {
+  pten::general::Reduce<CPUContext, T, pten::eigen::SumFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+// Create the definition of AddKernel
+DEFINE_CPU_ELEMENTWISE_OP(Add)
+
+// Create the definition of SubtractKernel
+DEFINE_CPU_ELEMENTWISE_OP(Subtract)
+
+// Create the definition of MultiplyKernel
+DEFINE_CPU_ELEMENTWISE_OP(Multiply)
+
+}  // namespace pten
+
+using complex64 = ::paddle::platform::complex<float>;
+using complex128 = ::paddle::platform::complex<double>;
+
+// NOTE(chenweihang): a bfloat16 alias here would redefine the XPU bfloat16
+// using bfloat16 = ::paddle::platform::bfloat16;
+PT_REGISTER_CTX_KERNEL(mean, CPU, ALL_LAYOUT, pten::Mean, float, double, bool) {
+}
+PT_REGISTER_CTX_KERNEL(add,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::AddKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(subtract,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::SubtractKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(divide,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::DivideKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(multiply,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::MultiplyKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       bool,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(sum,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::Sum,
+                       bool,
+                       float,
+                       double,
+                       paddle::platform::float16,
+                       int,
+                       int64_t,
+                       complex64,
+                       complex128) {
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
+}
diff --git a/paddle/pten/kernels/gpu/CMakeLists.txt b/paddle/pten/kernels/gpu/CMakeLists.txt
index 51c666947b..e69de29bb2 100644
--- a/paddle/pten/kernels/gpu/CMakeLists.txt
+++ b/paddle/pten/kernels/gpu/CMakeLists.txt
@@ -1,5 +0,0 @@
-if(WITH_GPU)
-  nv_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu cast_kernel copy_kernel)
-elseif(WITH_ROCM)
-  hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu cast_kernel copy_kernel)
-endif()
diff --git a/paddle/pten/kernels/gpu/math.cu b/paddle/pten/kernels/gpu/math.cu
deleted file mode 100644
index e02403ac42..0000000000
--- a/paddle/pten/kernels/gpu/math.cu
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/pten/kernels/gpu/math.h"
-
-#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h"
-#include "paddle/pten/kernels/hybird/cuda/reduce/reduce.h"
-#include "paddle/pten/kernels/hybird/general/elementwise_functor.h"
-#include "paddle/pten/kernels/hybird/general/reduce_impl.h"
-
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/pten/api/lib/utils/tensor_utils.h"
-#include "paddle/pten/core/convert_utils.h"
-#include "paddle/pten/core/kernel_registry.h"
-
-namespace kps = paddle::operators::kernel_primitives;
-
-namespace pten {
-
-/**
- * Util Functors
- */
-
-template <typename T>
-struct DivideFunctor {
-  HOSTDEVICE explicit inline DivideFunctor(int n)
-      : n_inv(static_cast<T>(1.0 / n)) {}
-
-  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
-
- private:
-  T n_inv;
-};
-
-/**
- * Kernels
- */
-
-template <typename T>
-void Mean(const GPUContext& dev_ctx,
-          const DenseTensor& x,
-          const std::vector<int64_t>& dims,
-          bool keep_dim,
-          bool reduce_all,
-          DenseTensor* out) {
-  auto out_dtype = x.dtype();
-  pten::Reduce<T, kps::AddFunctor, kps::DivideFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
-// Create the definition of Add
-DEFINE_CUDA_ELEMENTWISE_OP(Add)
-// Create the definition of Subtract
-DEFINE_CUDA_ELEMENTWISE_OP(Subtract)
-// Create the definition of Multiply
-DEFINE_CUDA_ELEMENTWISE_OP(Multiply)
-// Create the definition of Divide
-DEFINE_CUDA_ELEMENTWISE_OP(Divide)
-
-template <typename T>
-void Sum(const GPUContext& dev_ctx,
-         const DenseTensor& x,
-         const std::vector<int64_t>& dims,
-         bool keep_dim,
-         bool reduce_all,
-         DataType out_dtype,
-         DenseTensor* out) {
-  pten::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
-}  // namespace pten
-
-using float16 = paddle::platform::float16;
-using complex64 = ::paddle::platform::complex<float>;
-using complex128 = ::paddle::platform::complex<double>;
-
-PT_REGISTER_KERNEL(
-    mean, GPU, ALL_LAYOUT, pten::Mean, float, double, bool, float16) {}
-PT_REGISTER_KERNEL(add,
-                   GPU,
-                   ALL_LAYOUT,
-                   pten::Add,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   float16,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(subtract,
-                   GPU,
-                   ALL_LAYOUT,
-                   pten::Subtract,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   float16,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(divide,
-                   GPU,
-                   ALL_LAYOUT,
-                   pten::Divide,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   float16,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(multiply,
-                   GPU,
-                   ALL_LAYOUT,
-                   pten::Multiply,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   bool,
-                   float16,
-                   complex64,
-                   complex128) {}
-PT_REGISTER_KERNEL(sum,
-                   GPU,
-                   ALL_LAYOUT,
-                   pten::Sum,
-                   bool,
-                   float,
-                   double,
-                   float16,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {
-  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
-}
diff --git a/paddle/pten/kernels/gpu/math.h b/paddle/pten/kernels/gpu/math.h
deleted file mode 100644
index c1d33a0fcd..0000000000
--- a/paddle/pten/kernels/gpu/math.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-// CUDA and HIP use same api
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-
-#include "paddle/pten/backends/gpu/gpu_context.h"
-#include "paddle/pten/common/scalar.h"
-#include "paddle/pten/core/dense_tensor.h"
-
-namespace pten {
-
-template <typename T>
-void Mean(const GPUContext& dev_ctx,
-          const DenseTensor& x,
-          const std::vector<int64_t>& dims,
-          bool keep_dim,
-          bool reduce_all,
-          DenseTensor* out);
-
-template <typename T>
-void Add(const GPUContext& dev_ctx,
-         const DenseTensor& x,
-         const DenseTensor& y,
-         int axis,
-         DenseTensor* out);
-
-template <typename T>
-void Subtract(const GPUContext& dev_ctx,
-              const DenseTensor& x,
-              const DenseTensor& y,
-              int axis,
-              DenseTensor* out);
-
-template <typename T>
-void Divide(const GPUContext& dev_ctx,
-            const DenseTensor& x,
-            const DenseTensor& y,
-            int axis,
-            DenseTensor* out);
-
-template <typename T>
-void Multiply(const GPUContext& dev_ctx,
-              const DenseTensor& x,
-              const DenseTensor& y,
-              int axis,
-              DenseTensor* out);
-
-template <typename T>
-void Sum(const GPUContext& dev_ctx,
-         const DenseTensor& x,
-         const std::vector<int64_t>& dims,
-         bool keep_dim,
-         bool reduce_all,
-         DataType out_dtype,
-         DenseTensor* out);
-
-}  // namespace pten
-
-#define DEFINE_CUDA_ELEMENTWISE_OP(name)                               \
-  template <typename T>                                                \
-  void name(const GPUContext& dev_ctx,                                 \
-            const DenseTensor& x,                                      \
-            const DenseTensor& y,                                      \
-            int axis,                                                  \
-            DenseTensor* out) {                                        \
-    std::vector<const DenseTensor*> inputs;                            \
-    std::vector<DenseTensor*> outputs;                                 \
-    inputs.emplace_back(&x);                                           \
-    inputs.emplace_back(&y);                                           \
-    outputs.emplace_back(out);                                         \
-    out->mutable_data<T>();                                            \
-    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(       \
-        dev_ctx, inputs, &outputs, axis, general::name##Functor<T>()); \
-  }
-
-#endif
diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu
new file mode 100644
index 0000000000..636d0f16b0
--- /dev/null
+++ b/paddle/pten/kernels/gpu/math_kernel.cu
@@ -0,0 +1,179 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/kernels/math_kernel.h"
+
+#include <vector>
+
+#include "paddle/pten/backends/gpu/gpu_context.h"
+#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h"
+#include "paddle/pten/kernels/hybird/cuda/reduce/reduce.h"
+#include "paddle/pten/kernels/hybird/general/elementwise_functor.h"
+#include "paddle/pten/kernels/hybird/general/reduce_impl.h"
+
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+namespace kps = paddle::operators::kernel_primitives;
+
+namespace pten {
+
+// Generates the GPU definition of the `<name>Kernel` template declared in
+// math_kernel.h: it calls out->mutable_data<T>() and then runs
+// LaunchElementwiseCudaKernel with general::<name>Functor<T> on {x, y}.
+#define DEFINE_CUDA_ELEMENTWISE_OP(name)                               \
+  template <typename T, typename Context>                              \
+  void name##Kernel(const Context& dev_ctx,                            \
+                    const DenseTensor& x,                              \
+                    const DenseTensor& y,                              \
+                    int axis,                                          \
+                    DenseTensor* out) {                                \
+    std::vector<const DenseTensor*> inputs;                            \
+    std::vector<DenseTensor*> outputs;                                 \
+    inputs.emplace_back(&x);                                           \
+    inputs.emplace_back(&y);                                           \
+    outputs.emplace_back(out);                                         \
+    out->mutable_data<T>();                                            \
+    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(       \
+        dev_ctx, inputs, &outputs, axis, general::name##Functor<T>()); \
+  }
+
+/**
+ * Kernels
+ */
+
+// Mean of `x` over the requested dims: pten::Reduce combines elements with
+// kps::AddFunctor and applies kps::DivideFunctor (presumably the division by
+// the reduced element count -- confirm). The output keeps the dtype of `x`.
+template <typename T, typename Context>
+void Mean(const Context& dev_ctx,
+          const DenseTensor& x,
+          const std::vector<int64_t>& dims,
+          bool keep_dim,
+          bool reduce_all,
+          DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  pten::Reduce<T, kps::AddFunctor, kps::DivideFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+// Create the definition of Add
+DEFINE_CUDA_ELEMENTWISE_OP(Add)
+// Create the definition of Subtract
+DEFINE_CUDA_ELEMENTWISE_OP(Subtract)
+// Create the definition of Multiply
+DEFINE_CUDA_ELEMENTWISE_OP(Multiply)
+// Create the definition of Divide
+DEFINE_CUDA_ELEMENTWISE_OP(Divide)
+
+// The helper macro is only needed above; do not let it leak any further.
+#undef DEFINE_CUDA_ELEMENTWISE_OP
+
+// Sum of `x` via pten::Reduce with kps::AddFunctor / kps::IdentityFunctor;
+// the caller-provided `out_dtype` is forwarded to pten::Reduce unchanged.
+template <typename T, typename Context>
+void Sum(const Context& dev_ctx,
+         const DenseTensor& x,
+         const std::vector<int64_t>& dims,
+         bool keep_dim,
+         bool reduce_all,
+         DataType out_dtype,
+         DenseTensor* out) {
+  pten::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+}  // namespace pten
+
+using float16 = paddle::platform::float16;
+using complex64 = ::paddle::platform::complex<float>;
+using complex128 = ::paddle::platform::complex<double>;
+
+// Register the GPU kernels under their public names for every supported
+// element type. An empty `{}` body adds no per-kernel customization.
+PT_REGISTER_CTX_KERNEL(
+    mean, GPU, ALL_LAYOUT, pten::Mean, float, double, bool, float16) {}
+PT_REGISTER_CTX_KERNEL(add,
+                       GPU,
+                       ALL_LAYOUT,
+                       pten::AddKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       float16,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(subtract,
+                       GPU,
+                       ALL_LAYOUT,
+                       pten::SubtractKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       float16,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(divide,
+                       GPU,
+                       ALL_LAYOUT,
+                       pten::DivideKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       float16,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(multiply,
+                       GPU,
+                       ALL_LAYOUT,
+                       pten::MultiplyKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       bool,
+                       float16,
+                       complex64,
+                       complex128) {}
+PT_REGISTER_CTX_KERNEL(sum,
+                       GPU,
+                       ALL_LAYOUT,
+                       pten::Sum,
+                       bool,
+                       float,
+                       double,
+                       float16,
+                       int,
+                       int64_t,
+                       complex64,
+                       complex128) {
+  // Output 0 is registered with dtype UNDEFINED; presumably because Sum picks
+  // the result dtype at run time from `out_dtype` -- TODO confirm.
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
+}
diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h
new file mode 100644
index 0000000000..2968aa3524
--- /dev/null
+++ b/paddle/pten/kernels/math_kernel.h
@@ -0,0 +1,148 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <utility>
+#include <vector>
+
+#include "paddle/pten/api/lib/utils/storage.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/include/infermeta.h"
+
+namespace pten {
+
+// Kernel declarations. Only the templates are declared here; the CPU and GPU
+// definitions live in cpu/math_kernel.cc and gpu/math_kernel.cu respectively.
+
+// Mean reduction of `x` written into `out`.
+// NOTE(review): `dims`, `keep_dim` and `reduce_all` presumably select the
+// reduced axes and whether they are kept in the output shape -- confirm.
+template <typename T, typename Context>
+void Mean(const Context& dev_ctx,
+          const DenseTensor& x,
+          const std::vector<int64_t>& dims,
+          bool keep_dim,
+          bool reduce_all,
+          DenseTensor* out);
+
+// Elementwise add of `x` and `y` written into `out`.
+// NOTE(review): `axis` presumably picks the broadcast alignment -- confirm.
+template <typename T, typename Context>
+void AddKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const DenseTensor& y,
+               int axis,
+               DenseTensor* out);
+
+// Elementwise subtract of `x` and `y` into `out`; parameters as for AddKernel.
+template <typename T, typename Context>
+void SubtractKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    int axis,
+                    DenseTensor* out);
+
+// Elementwise divide of `x` by `y` into `out`; parameters as for AddKernel.
+template <typename T, typename Context>
+void DivideKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& y,
+                  int axis,
+                  DenseTensor* out);
+
+// Elementwise multiply of `x` and `y` into `out`; parameters as for AddKernel.
+template <typename T, typename Context>
+void MultiplyKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    int axis,
+                    DenseTensor* out);
+
+// Sum reduction of `x` written into `out`; unlike Mean, the caller passes the
+// output dtype explicitly through `out_dtype`.
+template <typename T, typename Context>
+void Sum(const Context& dev_ctx,
+         const DenseTensor& x,
+         const std::vector<int64_t>& dims,
+         bool keep_dim,
+         bool reduce_all,
+         DataType out_dtype,
+         DenseTensor* out);
+
+// Value-returning helpers. Each one infers the output meta with
+// ElementwiseInferMeta, builds a DenseTensor over a SharedStorage created for
+// dev_ctx.GetPlace(), runs the matching *Kernel on it and returns the tensor.
+
+// Returns the tensor filled by AddKernel<T, ContextT> for `x`, `y`, `axis`.
+template <typename T, typename ContextT>
+DenseTensor Add(const ContextT& dev_ctx,
+                const DenseTensor& x,
+                const DenseTensor& y,
+                int axis) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
+  pten::DenseTensor dense_out(
+      pten::make_intrusive<paddle::experimental::SharedStorage>(
+          dev_ctx.GetPlace()),
+      std::move(out_meta));
+  AddKernel<T, ContextT>(dev_ctx, x, y, axis, &dense_out);
+  return dense_out;
+}
+
+// Returns the tensor filled by SubtractKernel<T, ContextT> for `x`, `y`, `axis`.
+template <typename T, typename ContextT>
+DenseTensor Subtract(const ContextT& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y,
+                     int axis) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
+  pten::DenseTensor dense_out(
+      pten::make_intrusive<paddle::experimental::SharedStorage>(
+          dev_ctx.GetPlace()),
+      std::move(out_meta));
+  SubtractKernel<T, ContextT>(dev_ctx, x, y, axis, &dense_out);
+  return dense_out;
+}
+
+// Returns the tensor filled by DivideKernel<T, ContextT> for `x`, `y`, `axis`.
+template <typename T, typename ContextT>
+DenseTensor Divide(const ContextT& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   int axis) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
+  pten::DenseTensor dense_out(
+      pten::make_intrusive<paddle::experimental::SharedStorage>(
+          dev_ctx.GetPlace()),
+      std::move(out_meta));
+  DivideKernel<T, ContextT>(dev_ctx, x, y, axis, &dense_out);
+  return dense_out;
+}
+
+// Returns the tensor filled by MultiplyKernel<T, ContextT> for `x`, `y`, `axis`.
+template <typename T, typename ContextT>
+DenseTensor Multiply(const ContextT& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y,
+                     int axis) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
+  pten::DenseTensor dense_out(
+      pten::make_intrusive<paddle::experimental::SharedStorage>(
+          dev_ctx.GetPlace()),
+      std::move(out_meta));
+  MultiplyKernel<T, ContextT>(dev_ctx, x, y, axis, &dense_out);
+  return dense_out;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
index f12a2d48e6..bd09ecb770 100644
--- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
-- 
GitLab