diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h
index 4f2160a761836410661408ea5a09215865988e3f..6ca6495e2c1bc1a1f00f4519a7ad42672c78df21 100644
--- a/paddle/pten/api/lib/kernel_declare.h
+++ b/paddle/pten/api/lib/kernel_declare.h
@@ -20,13 +20,13 @@ limitations under the License. */
 // the kernel declare statement is automatically generated according to the
 // file name of the kernel, and this header file will be removed
 
-PT_DECLARE_KERNEL(dot, CPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(cast, CPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(sign, CPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(conj, CPU, ALL_LAYOUT);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_KERNEL(dot, GPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(cast, GPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(sign, GPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(conj, GPU, ALL_LAYOUT);
diff --git a/paddle/pten/include/linalg.h b/paddle/pten/include/linalg.h
index 8f627f5fc8b0afe56b1e10788fae5ef9b78e6c68..34b0183778125596d79cbd6e2249944be5b025e5 100644
--- a/paddle/pten/include/linalg.h
+++ b/paddle/pten/include/linalg.h
@@ -18,6 +18,7 @@
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/include/infermeta.h"
 #include "paddle/pten/kernels/cpu/linalg.h"
+#include "paddle/pten/kernels/dot_kernel.h"
 #include "paddle/pten/kernels/gpu/linalg.h"
 
 namespace pten {
@@ -31,7 +32,7 @@ DenseTensor Dot(const ContextT& dev_ctx,
       pten::make_intrusive<paddle::experimental::SharedStorage>(
           dev_ctx.GetPlace()),
       std::move(out_meta));
-  Dot<T>(dev_ctx, x, y, &dense_out);
+  Dot<T, ContextT>(dev_ctx, x, y, &dense_out);
   return dense_out;
 }
 
diff --git a/paddle/pten/kernels/cpu/dot_kernel.cc b/paddle/pten/kernels/cpu/dot_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c485cc8ac5672ae5c258fefce78743fe9db06abc
--- /dev/null
+++ b/paddle/pten/kernels/cpu/dot_kernel.cc
@@ -0,0 +1,61 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/dot_kernel.h"
+
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/complex.h"
+
+namespace pten {
+
+// CPU dot product: x and y are read in lockstep as contiguous rows whose
+// length is x's last dimension; out[j] gets the sum of products of row j.
+template <typename T, typename ContextT>
+void Dot(const ContextT& dev_ctx,
+         const DenseTensor& x,
+         const DenseTensor& y,
+         DenseTensor* out) {
+  const T* x_it = x.data<T>();
+  const T* y_it = y.data<T>();
+  auto* out_data = out->mutable_data<T>();
+  const auto& dims = x.dims();
+  const int64_t row_len = dims[dims.size() - 1];
+  // NOTE(review): divides by zero if the last axis is empty -- confirm.
+  const int64_t rows = x.numel() / row_len;
+  // 64-bit counters: an int index overflows once rows or row_len > INT_MAX.
+  for (int64_t j = 0; j < rows; ++j) {
+    T acc = 0;
+    for (int64_t i = 0; i < row_len; ++i) acc += (*x_it++) * (*y_it++);
+    out_data[j] = acc;
+  }
+}
+
+}  // namespace pten
+
+using complex64 = ::paddle::platform::complex<float>;
+using complex128 = ::paddle::platform::complex<double>;
+// Registers pten::Dot as the CPU "dot" kernel for the element types below.
+PT_REGISTER_CTX_KERNEL(dot,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::Dot,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       complex64,
+                       complex128) {}
diff --git a/paddle/pten/kernels/cpu/linalg.cc b/paddle/pten/kernels/cpu/linalg.cc
index 87c4078896a18b66bfdd01ac5f2d5d7535fcb533..0b58b36c596465248a4c0dad59f0fff31b63eb8e 100644
--- a/paddle/pten/kernels/cpu/linalg.cc
+++ b/paddle/pten/kernels/cpu/linalg.cc
@@ -25,28 +25,6 @@
 
 namespace pten {
 
-template <typename T>
-void Dot(const CPUContext& dev_ctx,
-         const DenseTensor& x,
-         const DenseTensor& y,
-         DenseTensor* out) {
-  auto const *x_ptr = x.data<T>(), *x_ptr_ = &x_ptr[0];
-  auto const *y_ptr = y.data<T>(), *y_ptr_ = &y_ptr[0];
-  auto* z = out->mutable_data<T>();
-
-  // Loop over the total N elements of both operands while sum-reducing every
-  // B pairs along the way where B is the dimension of the least ordered axis
-  auto&& d = x.dims();
-  auto const N = x.numel();
-  auto const B = d[d.size() - 1];
-
-  for (int j = 0; j < N / B; j++) {
-    T ss = 0;
-    for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++);
-    z[j] = ss;
-  }
-}
-
 template <typename T>
 void Matmul(const CPUContext& dev_ctx,
             const DenseTensor& x,
@@ -73,17 +51,6 @@ void Matmul(const CPUContext& dev_ctx,
 using complex64 = ::paddle::platform::complex<float>;
 using complex128 = ::paddle::platform::complex<double>;
 
-PT_REGISTER_KERNEL(dot,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::Dot,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {}
-
 PT_REGISTER_KERNEL(matmul,
                    CPU,
                    ALL_LAYOUT,
diff --git a/paddle/pten/kernels/cpu/linalg.h b/paddle/pten/kernels/cpu/linalg.h
index 29c6cd16cf81a7c12850a31ba46fe4508b881c90..d9fc391996e19893d0ca71ee502f1a65695592b1 100644
--- a/paddle/pten/kernels/cpu/linalg.h
+++ b/paddle/pten/kernels/cpu/linalg.h
@@ -22,12 +22,6 @@
 
 namespace pten {
 
-template <typename T>
-void Dot(const CPUContext& dev_ctx,
-         const DenseTensor& x,
-         const DenseTensor& y,
-         DenseTensor* out);
-
 template <typename T>
 void Matmul(const CPUContext& dev_ctx,
             const DenseTensor& x,
diff --git a/paddle/pten/kernels/dot_kernel.h b/paddle/pten/kernels/dot_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6db41cbbe238b5c12c9423eafebb372428f63b9
--- /dev/null
+++ b/paddle/pten/kernels/dot_kernel.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace pten {
+// Dot kernel declaration, templated on element type T and device context.
+template <typename T, typename ContextT>
+void Dot(const ContextT& dev_ctx,
+         const DenseTensor& x,
+         const DenseTensor& y,
+         DenseTensor* out);
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7742e57a026539d9169ff7bfe991c50b76ea4b23
--- /dev/null
+++ b/paddle/pten/kernels/gpu/dot_kernel.cu
@@ -0,0 +1,64 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/dot_kernel.h"
+
+#include "paddle/pten/backends/gpu/gpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/hybird/eigen/common.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/fluid/platform/complex.h"
+
+namespace pten {
+
+template <typename T, typename ContextT>
+void Dot(const ContextT& dev_ctx,
+         const DenseTensor& x,
+         const DenseTensor& y,
+         DenseTensor* out) {
+  out->mutable_data<T>();
+  // Both branches evaluate their Eigen expression on the context's device.
+  auto& device = *dev_ctx.eigen_device();
+  if (out->dims().size() == 1) {
+    // Rank-1 output: flatten both operands and reduce the whole product.
+    auto result = pten::EigenScalar<T>::From(*out);
+    auto lhs = pten::EigenVector<T>::Flatten(x);
+    auto rhs = pten::EigenVector<T>::Flatten(y);
+    result.device(device) = (lhs * rhs).sum();
+  } else {
+    // Otherwise use matrix views and reduce the product along dimension 1.
+    auto result = pten::EigenMatrix<T>::From(*out);
+    auto lhs = pten::EigenMatrix<T>::From(x);
+    auto rhs = pten::EigenMatrix<T>::From(y);
+    result.device(device) = (lhs * rhs).sum(Eigen::DSizes<int, 1>(1));
+  }
+}
+
+}  // namespace pten
+
+using complex64 = ::paddle::platform::complex<float>;
+using complex128 = ::paddle::platform::complex<double>;
+// Registers pten::Dot as the GPU "dot" kernel for the element types below.
+PT_REGISTER_CTX_KERNEL(dot,
+                       GPU,
+                       ALL_LAYOUT,
+                       pten::Dot,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       complex64,
+                       complex128) {}
diff --git a/paddle/pten/kernels/gpu/linalg.cu b/paddle/pten/kernels/gpu/linalg.cu
index c9bc4cbd07962e1c81ed0180868d184883f4e1c1..e4a69b28e6158b767e915bb809d6f6cc3712c684 100644
--- a/paddle/pten/kernels/gpu/linalg.cu
+++ b/paddle/pten/kernels/gpu/linalg.cu
@@ -15,7 +15,6 @@
 #include "paddle/pten/kernels/gpu/linalg.h"
 
 #include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/kernels/hybird/eigen/dot.h"
 #include "paddle/pten/kernels/hybird/math/matmul_func.h"
 
 // See Note [ Why still include the fluid headers? ]
@@ -23,14 +22,6 @@
 
 namespace pten {
 
-template <typename T>
-void Dot(const GPUContext& dev_ctx,
-         const DenseTensor& x,
-         const DenseTensor& y,
-         DenseTensor* out) {
-  eigen::Dot<GPUContext, T>(dev_ctx, x, y, out);
-}
-
 template <typename T>
 void Matmul(const GPUContext& dev_ctx,
             const DenseTensor& x,
@@ -58,17 +49,6 @@ using float16 = paddle::platform::float16;
 using complex64 = ::paddle::platform::complex<float>;
 using complex128 = ::paddle::platform::complex<double>;
 
-PT_REGISTER_KERNEL(dot,
-                   GPU,
-                   ALL_LAYOUT,
-                   pten::Dot,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {}
-
 PT_REGISTER_KERNEL(matmul,
                    GPU,
                    ALL_LAYOUT,
diff --git a/paddle/pten/kernels/gpu/linalg.h b/paddle/pten/kernels/gpu/linalg.h
index a848f55c7b9f0c05dc03b39a6f52b21ca733a988..a0f7c0c0aae229acae882c06aa3505e8762e4217 100644
--- a/paddle/pten/kernels/gpu/linalg.h
+++ b/paddle/pten/kernels/gpu/linalg.h
@@ -22,12 +22,6 @@
 
 namespace pten {
 
-template <typename T>
-void Dot(const GPUContext& dev_ctx,
-         const DenseTensor& x,
-         const DenseTensor& y,
-         DenseTensor* out);
-
 template <typename T>
 void Matmul(const GPUContext& dev_ctx,
             const DenseTensor& x,
diff --git a/paddle/pten/kernels/hybird/eigen/dot.h b/paddle/pten/kernels/hybird/eigen/dot.h
deleted file mode 100644
index eb089037fa3f30efd566678e5e55f9e86666209d..0000000000000000000000000000000000000000
--- a/paddle/pten/kernels/hybird/eigen/dot.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/kernels/hybird/eigen/common.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-
-namespace pten {
-namespace eigen {
-
-template <typename DevCtx, typename T>
-void Dot(const DevCtx& dev_ctx,
-         const DenseTensor& x,
-         const DenseTensor& y,
-         DenseTensor* out) {
-  out->mutable_data<T>();
-  if (1 == out->dims().size()) {
-    auto eigen_out = pten::EigenScalar<T>::From(*out);
-    auto eigen_x = pten::EigenVector<T>::Flatten(x);
-    auto eigen_y = pten::EigenVector<T>::Flatten(y);
-
-    auto& dev = *dev_ctx.eigen_device();
-    eigen_out.device(dev) = (eigen_x * eigen_y).sum();
-  } else {
-    auto eigen_out = pten::EigenMatrix<T>::From(*out);
-    auto eigen_x = pten::EigenMatrix<T>::From(x);
-    auto eigen_y = pten::EigenMatrix<T>::From(y);
-
-    auto& dev = *dev_ctx.eigen_device();
-    eigen_out.device(dev) = (eigen_x * eigen_y).sum(Eigen::DSizes<int, 1>(1));
-  }
-}
-
-}  // namespace eigen
-}  // namespace pten