diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h
index f8874cfb17fef0a2dd2a77c6e1d0c5fd9c682aae..a5d9537097a97bbe007a44716446f36a1eac98eb 100644
--- a/paddle/pten/api/lib/kernel_declare.h
+++ b/paddle/pten/api/lib/kernel_declare.h
@@ -21,12 +21,12 @@ limitations under the License. */
 
 // file name of the kernel, and this header file will be removed
 PT_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
-PT_DECLARE_KERNEL(cast, CPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(reshape, CPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(mean, CPU, ALL_LAYOUT);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PT_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
-PT_DECLARE_KERNEL(cast, GPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(reshape, GPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT);
 #endif
 
diff --git a/paddle/pten/include/manipulation.h b/paddle/pten/include/manipulation.h
index 80bfcef89f73a270b513fbb7515319f55d2c0e10..2d429c060f4498fa492f3751ceeb8123bb9b48d6 100644
--- a/paddle/pten/include/manipulation.h
+++ b/paddle/pten/include/manipulation.h
@@ -17,6 +17,7 @@
 // See Note: [ How do we organize the kernel directory ]
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/include/infermeta.h"
+#include "paddle/pten/kernels/cast_kernel.h"
 #include "paddle/pten/kernels/cpu/manipulation.h"
 #include "paddle/pten/kernels/flatten_kernel.h"
 #include "paddle/pten/kernels/gpu/manipulation.h"
@@ -48,7 +49,7 @@ DenseTensor Cast(const ContextT& dev_ctx,
       pten::make_intrusive<paddle::experimental::SharedStorage>(
           dev_ctx.GetPlace()),
       std::move(out_meta));
-  Cast<T>(dev_ctx, x, out_dtype, in_dtype, &dense_out);
+  Cast<T, ContextT>(dev_ctx, x, out_dtype, in_dtype, &dense_out);
   return dense_out;
 }
 
diff --git a/paddle/pten/kernels/cast_kernel.h b/paddle/pten/kernels/cast_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..968139f6f17636abe303502035dd8e15c0b48dff
--- /dev/null
+++ b/paddle/pten/kernels/cast_kernel.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace pten {
+
+template <typename T, typename ContextT>
+void Cast(const ContextT& dev_ctx,
+          const DenseTensor& x,
+          DataType out_dtype,
+          DataType in_dtype,
+          DenseTensor* out);
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/cpu/CMakeLists.txt b/paddle/pten/kernels/cpu/CMakeLists.txt
index f7dabf47eb68f929cb733d572dac2931a44fa366..b67bb6296bc98b4f77857b2fa9984746b34c3a4f 100644
--- a/paddle/pten/kernels/cpu/CMakeLists.txt
+++ b/paddle/pten/kernels/cpu/CMakeLists.txt
@@ -1,3 +1,3 @@
-cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function blas pten_transpose_cpu)
+cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function blas pten_transpose_cpu cast_kernel)
 cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
 cc_library(manipulation_cpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
diff --git a/paddle/pten/kernels/cpu/cast_kernel.cc b/paddle/pten/kernels/cpu/cast_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..db57da1d41d339d97559648b3077f912e9b441a7
--- /dev/null
+++ b/paddle/pten/kernels/cpu/cast_kernel.cc
@@ -0,0 +1,78 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/cast_kernel.h"
+
+#include "paddle/pten/api/ext/dispatch.h"
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/transform.h"
+
+namespace pten {
+
+template <typename InT, typename OutT>
+struct CastOpTransformFunctor {
+  HOSTDEVICE OutT operator()(InT in) const { return static_cast<OutT>(in); }
+};
+
+template <typename InT, typename OutT>
+void CastKernelImpl(const CPUContext& dev_ctx,
+                    const DenseTensor& x,
+                    DenseTensor* out) {
+  auto* in_begin = x.data<InT>();
+  auto numel = x.numel();
+  auto* in_end = in_begin + numel;
+
+  auto* out_begin = out->mutable_data<OutT>();
+
+  paddle::platform::Transform<CPUContext> trans;
+  trans(dev_ctx,
+        in_begin,
+        in_end,
+        out_begin,
+        CastOpTransformFunctor<InT, OutT>());
+}
+
+template <typename T, typename ContextT>
+void Cast(const ContextT& dev_ctx,
+          const DenseTensor& x,
+          DataType out_dtype,
+          DataType in_dtype,
+          DenseTensor* out) {
+  PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
+                       CastKernelImpl<T, data_t>(dev_ctx, x, out);
+                     }));
+}
+
+}  // namespace pten
+
+PT_REGISTER_CTX_KERNEL(cast,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::Cast,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       int16_t,
+                       bool,
+                       uint8_t,
+                       paddle::platform::float16,
+                       paddle::platform::bfloat16,
+                       paddle::platform::complex<float>,
+                       paddle::platform::complex<double>) {
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
+}
diff --git a/paddle/pten/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc
index 3dba89ea19620b4c8ea41e725e03c512062a0e6c..e55eb316325107db9142fcb36ae3e1935acf2e82 100644
--- a/paddle/pten/kernels/cpu/manipulation.cc
+++ b/paddle/pten/kernels/cpu/manipulation.cc
@@ -17,7 +17,6 @@
 #include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/copy_kernel.h"
 #include "paddle/pten/kernels/hybird/general/manipulation.h"
-#include "paddle/pten/kernels/hybird/math/cast_func.h"
 
 namespace pten {
 
@@ -44,38 +43,8 @@ void ReshapeWithXShape(const CPUContext& dev_ctx,
   Reshape(dev_ctx, x, shape, out);
 }
 
-template <typename T>
-void Cast(const CPUContext& dev_ctx,
-          const DenseTensor& x,
-          DataType out_dtype,
-          DataType in_dtype,
-          DenseTensor* out) {
-  PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
-                       math::CastKernelImpl<CPUContext, T, data_t>(
-                           dev_ctx, x, out);
-                     }));
-}
-
 }  // namespace pten
 
-PT_REGISTER_KERNEL(cast,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::Cast,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   int16_t,
-                   bool,
-                   uint8_t,
-                   paddle::platform::float16,
-                   paddle::platform::bfloat16,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>) {
-  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
-}
-
 PT_REGISTER_NO_TEMPLATE_KERNEL(
     reshape, CPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {}
 PT_REGISTER_NO_TEMPLATE_KERNEL(
diff --git a/paddle/pten/kernels/cpu/manipulation.h b/paddle/pten/kernels/cpu/manipulation.h
index 5aa5344b192dd3ca5b792b26571e303a979a9523..7eaa430f5f6652c309a4a246305fcb29a781ba6c 100644
--- a/paddle/pten/kernels/cpu/manipulation.h
+++ b/paddle/pten/kernels/cpu/manipulation.h
@@ -21,13 +21,6 @@ limitations under the License. */
 
 namespace pten {
 
-template <typename T>
-void Cast(const CPUContext& dev_ctx,
-          const DenseTensor& x,
-          DataType out_dtype,
-          DataType in_dtype,
-          DenseTensor* out);
-
 void Reshape(const CPUContext& dev_ctx,
              const DenseTensor& x,
              const ScalarArray& shape,
diff --git a/paddle/pten/kernels/gpu/CMakeLists.txt b/paddle/pten/kernels/gpu/CMakeLists.txt
index 5fdb5359e1fd757311ebc8dbcaa094fd84ff583a..99e3871044f2ff158d0c9df4f10a528852ba1b4e 100644
--- a/paddle/pten/kernels/gpu/CMakeLists.txt
+++ b/paddle/pten/kernels/gpu/CMakeLists.txt
@@ -1,9 +1,9 @@
 if(WITH_GPU)
-  nv_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
+  nv_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu cast_kernel)
   nv_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
   nv_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
 elseif(WITH_ROCM)
-  hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
+  hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu cast_kernel)
   hip_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
   hip_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
 endif()
diff --git a/paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h b/paddle/pten/kernels/gpu/cast_kernel.cu
similarity index 60%
rename from paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h
rename to paddle/pten/kernels/gpu/cast_kernel.cu
index d8c58448c9867232d5a9512d6070a821a69104ed..011fc9077dca9041c7828a69fc996757042124ba 100644
--- a/paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h
+++ b/paddle/pten/kernels/gpu/cast_kernel.cu
@@ -13,15 +13,21 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/fluid/platform/device/gpu/gpu_helper.h"
-#include "paddle/fluid/platform/float16.h"
+
+#include "paddle/pten/kernels/cast_kernel.h"
+
+#include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/backends/gpu/gpu_context.h"
-#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
 
+// See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/aligned_vector.h"
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/device/gpu/gpu_helper.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/fluid/platform/float16.h"
+
 namespace pten {
-namespace detail {
 
 template <typename InT, typename OutT, int VecSize>
 __global__ void VecCastCUDAKernel(const InT* in, const int64_t N, OutT* out) {
@@ -74,6 +80,41 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx,
   }
 }
 
-}  // namespace detail
+template <typename T, typename ContextT>
+void Cast(const ContextT& dev_ctx,
+          const DenseTensor& x,
+          DataType out_dtype,
+          DataType in_dtype,
+          DenseTensor* out) {
+  PD_VISIT_ALL_TYPES(out_dtype, "CastCUDAKernelImpl", ([&] {
+                       CastCUDAKernelImpl<T, data_t>(dev_ctx, x, out);
+                     }));
+}
 
 }  // namespace pten
+
+#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...)     \
+  PT_REGISTER_CTX_KERNEL(cast,                              \
+                         GPU,                               \
+                         ALL_LAYOUT,                        \
+                         pten::Cast,                        \
+                         float,                             \
+                         double,                            \
+                         int,                               \
+                         int64_t,                           \
+                         int16_t,                           \
+                         bool,                              \
+                         uint8_t,                           \
+                         paddle::platform::float16,         \
+                         paddle::platform::complex<float>,  \
+                         paddle::platform::complex<double>, \
+                         ##__VA_ARGS__) {                   \
+    kernel->OutputAt(0).SetDataType(                        \
+        paddle::experimental::DataType::UNDEFINED);         \
+  }
+
+#if !defined(PADDLE_WITH_HIP)
+PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16)
+#else
+PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
+#endif
diff --git a/paddle/pten/kernels/gpu/manipulation.cu b/paddle/pten/kernels/gpu/manipulation.cu
index 4dbf8e69b450e2ed085f94cc3bfe39dbcd806023..7e99510c015d07395778e238c041890546a49796 100644
--- a/paddle/pten/kernels/gpu/manipulation.cu
+++ b/paddle/pten/kernels/gpu/manipulation.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/pten/api/ext/dispatch.h"
+#include "paddle/pten/kernels/gpu/manipulation.h"
+
 #include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/copy_kernel.h"
-#include "paddle/pten/kernels/gpu/manipulation.h"
-#include "paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h"
 #include "paddle/pten/kernels/hybird/general/manipulation.h"
 
 namespace pten {
@@ -44,47 +43,8 @@ void ReshapeWithXShape(const GPUContext& dev_ctx,
   Reshape(dev_ctx, x, shape, out);
 }
 
-template <typename T>
-void Cast(const GPUContext& dev_ctx,
-          const DenseTensor& x,
-          DataType out_dtype,
-          DataType in_dtype,
-          DenseTensor* out) {
-  PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
-                       detail::CastCUDAKernelImpl<T, data_t>(dev_ctx, x, out);
-                     }));
-}
-
 }  // namespace pten
 
-using float16 = paddle::platform::float16;
-
-#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...)   \
-  PT_REGISTER_KERNEL(cast,                                \
-                     GPU,                                 \
-                     ALL_LAYOUT,                          \
-                     pten::Cast,                          \
-                     float,                               \
-                     double,                              \
-                     int,                                 \
-                     int64_t,                             \
-                     int16_t,                             \
-                     bool,                                \
-                     uint8_t,                             \
-                     paddle::platform::float16,           \
-                     paddle::platform::complex<float>,    \
-                     paddle::platform::complex<double>,   \
-                     ##__VA_ARGS__) {                     \
-    kernel->OutputAt(0).SetDataType(                      \
-        paddle::experimental::DataType::UNDEFINED);       \
-  }
-
-#if !defined(PADDLE_WITH_HIP)
-PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16)
-#else
-PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
-#endif
-
 PT_REGISTER_NO_TEMPLATE_KERNEL(
     reshape, GPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {}
 PT_REGISTER_NO_TEMPLATE_KERNEL(
diff --git a/paddle/pten/kernels/gpu/manipulation.h b/paddle/pten/kernels/gpu/manipulation.h
index af49932c2e5201a92f4f3b55f09ee5a137d92c51..a099763706b2d081a54de0484c91321919310ee3 100644
--- a/paddle/pten/kernels/gpu/manipulation.h
+++ b/paddle/pten/kernels/gpu/manipulation.h
@@ -24,13 +24,6 @@
 
 namespace pten {
 
-template <typename T>
-void Cast(const GPUContext& dev_ctx,
-          const DenseTensor& x,
-          DataType out_dtype,
-          DataType in_dtype,
-          DenseTensor* out);
-
 void Reshape(const GPUContext& dev_ctx,
              const DenseTensor& x,
              const ScalarArray& shape,
diff --git a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
index a2faf2a5416942900ee8ab6ec895286a06996d50..2680f2d3fddca53d56ee1f8931aeac2e13dcc66e 100644
--- a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
+++ b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
@@ -41,8 +41,8 @@ namespace cub = hipcub;
 
 #include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/cast_kernel.h"
 #include "paddle/pten/kernels/copy_kernel.h"
-#include "paddle/pten/kernels/hybird/math/cast_func.h"
 
 // Reduce split or not, Whether to use ReduceHigherDim
 #define REDUCE_SPLIT_BOUNDARY 512
@@ -1112,12 +1112,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
       AsyncCopy(x, y);
       y->Resize(out_dims);
     } else {
-      PD_VISIT_ALL_TYPES(
-          y->dtype(), "CastKernelImpl", ([&] {
-            pten::math::CastKernelImpl<GPUContext, Tx, data_t>(*dev_ctx, x, y);
-          }));
+      pten::Cast<Tx>(*dev_ctx, x, y->dtype(), x.dtype(), y);
     }
     return;
   }
diff --git a/paddle/pten/kernels/hybird/general/reduce_impl.h b/paddle/pten/kernels/hybird/general/reduce_impl.h
index 52bdf18ad5a31d87f53fb48e983f6b07539f7044..a8e0bc5de2196ea8890d299a5065af9d7cb93aef 100644
--- a/paddle/pten/kernels/hybird/general/reduce_impl.h
+++ b/paddle/pten/kernels/hybird/general/reduce_impl.h
@@ -16,8 +16,9 @@
 #include "paddle/fluid/platform/transform.h"
 #include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/cast_kernel.h"
 #include "paddle/pten/kernels/hybird/eigen/reduce.h"
-#include "paddle/pten/kernels/hybird/math/cast_func.h"
+
 
 namespace pten {
 namespace general {
@@ -57,11 +58,8 @@ void Reduce(const DeviceContext& dev_ctx,
       pten::make_intrusive<paddle::experimental::SharedStorage>(x.place()),
       pten::DenseTensorMeta(out_dtype, x.dims(), x.layout()));
 
-  // cast x tensor to out_dtype first
-  PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
-                       math::CastKernelImpl<DeviceContext, T, data_t>(
-                           dev_ctx, x, &tmp_tensor);
-                     }));
+  // cast x tensor to out_dtype
+  pten::Cast<T>(dev_ctx, x, out_dtype, x.dtype(), &tmp_tensor);
 
   // do reduce sum
   PD_VISIT_ALL_TYPES(
diff --git a/paddle/pten/kernels/hybird/math/cast_func.h b/paddle/pten/kernels/hybird/math/cast_func.h
deleted file mode 100644
index 0a67736dbb27b66eaf6daa482a0bbd4f07db892b..0000000000000000000000000000000000000000
--- a/paddle/pten/kernels/hybird/math/cast_func.h
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/platform/transform.h"
-#include "paddle/pten/core/dense_tensor.h"
-
-namespace pten {
-namespace math {
-
-template <typename InT, typename OutT>
-struct CastOpTransformFunctor {
-  HOSTDEVICE OutT operator()(InT in) const { return static_cast<OutT>(in); }
-};
-
-template <typename DeviceContext, typename InT, typename OutT>
-void CastKernelImpl(const DeviceContext& dev_ctx,
-                    const DenseTensor& x,
-                    DenseTensor* out) {
-  auto* in_begin = x.data<InT>();
-  auto numel = x.numel();
-  auto* in_end = in_begin + numel;
-
-  auto* out_begin = out->mutable_data<OutT>();
-
-  paddle::platform::Transform<DeviceContext> trans;
-  trans(dev_ctx,
-        in_begin,
-        in_end,
-        out_begin,
-        CastOpTransformFunctor<InT, OutT>());
-}
-
-}  // namespace math
-
-}  // namespace pten
diff --git a/paddle/pten/kernels/hybird/transpose.cu b/paddle/pten/kernels/hybird/transpose.cu
index 195277c216fe9127f3664c543c7a7e69fac93e46..6ea5e36e106d54fba6e72ce9edfd2fb74f4ed813 100644
--- a/paddle/pten/kernels/hybird/transpose.cu
+++ b/paddle/pten/kernels/hybird/transpose.cu
@@ -16,7 +16,6 @@
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/pten/backends/gpu/gpu_context.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/kernels/hybird/math/cast_func.h"
 #include "paddle/pten/kernels/hybird/transpose.h"
 
 // See Note [ Why still include the fluid headers? ]
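
Usage sketch (editor's note, not part of the diff): after this change, callers go through the single pten::Cast template declared in paddle/pten/kernels/cast_kernel.h instead of the per-backend Cast functions removed from cpu/manipulation.h and gpu/manipulation.h. The snippet below is a minimal illustration under stated assumptions: the CastFloatToDouble helper and the float-to-double conversion are invented for the example, and the explicit Cast<T> call mirrors the call sites visible in reduce_impl.h above; only the Cast signature itself comes from cast_kernel.h.

#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/kernels/cast_kernel.h"

// Hypothetical helper: cast a float32 DenseTensor to float64 on CPU.
// T (the source element type) cannot be deduced from the arguments, so it is
// passed explicitly; ContextT is deduced from dev_ctx.
void CastFloatToDouble(const pten::CPUContext& dev_ctx,
                       const pten::DenseTensor& x,  // float32 input
                       pten::DenseTensor* out) {    // meta prepared for float64
  pten::Cast<float>(dev_ctx,
                    x,
                    pten::DataType::FLOAT64,  // out_dtype
                    pten::DataType::FLOAT32,  // in_dtype
                    out);
}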