diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps
index 0ce55b7cf7331889addd238322977c3d51d4e0ca..9a522359628eb16d306d81ca9b39e648c4f9085a 100644
--- a/paddle/fluid/operators/activation_op.kps
+++ b/paddle/fluid/operators/activation_op.kps
@@ -13,7 +13,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/kernels/funcs/activation_functor.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
index 7bcd336732960e1013e5fab5a6c85e11c88b5d26..5f2097f3330507e621a0e56485b9fdc73d880bab 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -42,7 +42,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/gpu/elementwise_grad.h"
 
@@ -982,7 +982,7 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel(
 #pragma unroll
     for (int i = BLOCK_X >> 1; i > 0; i >>= 1) {
       // reduce sum with wrap
-      val += platform::CudaShuffleXorSync(0xFFFFFFFF, val, i);
+      val += phi::backends::gpu::CudaShuffleXorSync(0xFFFFFFFF, val, i);
     }
 
     size_t idx_j = j + threadIdx.y;
@@ -1004,7 +1004,8 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel(
 #pragma unroll
         for (int i = BLOCK_X >> 1; i > 0; i >>= 1) {
           // reduce sum with wrap
-          inter_val += platform::CudaShuffleXorSync(0xFFFFFFFF, inter_val, i);
+          inter_val +=
+              phi::backends::gpu::CudaShuffleXorSync(0xFFFFFFFF, inter_val, i);
         }
         if (threadIdx.x == 0 && (idx_j < w)) d_intermediate[idx_j] = inter_val;
       }
@@ -1160,14 +1161,14 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel(
   h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
   if (BcastY) {
     if (dy) {
-      val = paddle::platform::reduceSum(val, tid, h);
+      val = phi::backends::gpu::reduceSum(val, tid, h);
       if (threadIdx.x == 0) {
         dy[j] = val;
       }
     }
   } else {
     if (dx) {
-      val = paddle::platform::reduceSum(val, tid, h);
+      val = phi::backends::gpu::reduceSum(val, tid, h);
       if (threadIdx.x == 0) {
         dx[j] = val;
       }
@@ -1175,7 +1176,7 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel(
   }
   if (!SameShapeOfIntermediateOutAndOut) {
     if (d_intermediate) {
-      inter_val = paddle::platform::reduceSum(inter_val, tid, h);
+      inter_val = phi::backends::gpu::reduceSum(inter_val, tid, h);
       if (threadIdx.x == 0) {
         d_intermediate[j] = inter_val;
       }
diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu
index a13bfcf12ea8d2dc6e09339388aff1f017f13098..ef5087f0534e1f6b5d5e5f3c40b77d85c413f825 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_attention_op.cu
@@ -22,9 +22,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/fused/attn_gemm.h"
 #include "paddle/fluid/operators/fused/fmha_ref.h"
 #include "paddle/fluid/operators/fused/fused_dropout_helper.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/phi/api/include/tensor.h"
+#include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu
index 6da533aa77f3c097c1da9acca8e0257a42bf2661..664e20b686d7ed26eacfd062d4c7de337124107c 100644
--- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu
+++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu
@@ -19,8 +19,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/fused/fused_dropout_helper.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/backends/gpu/gpu_device_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h
index 1b8dc4bb324ca768648df16a0b60157e17aa2c56..0fbc14436e9146f6474ef3fda4f4fb876f9de4a9 100644
--- a/paddle/fluid/operators/fused/fused_dropout_common.h
+++ b/paddle/fluid/operators/fused/fused_dropout_common.h
@@ -22,10 +22,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
 #include "paddle/fluid/operators/layer_norm_kernel.cu.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
 #include "paddle/phi/kernels/funcs/functors.h"
 
diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
index 74ba0b54afd4550fb8d407d5c692a72b38d9680f..c6cfc8698351196b9559b1aa4d66885064373879 100644
--- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
+++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
@@ -25,8 +25,8 @@ namespace cub = hipcub;
 #endif
 
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu
index 8f13424ce49b54a495d753dc84baa2fb06ea791c..9cb3f19ab17409b2bf25c0148b428530ba9d7766 100644
--- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/fused/attn_gemm.h"
 #include "paddle/fluid/operators/fused/fused_gate_attention.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
index 777ee83c38dc6d4aabba4294a92c9a1b68f5ba39..79fc561698989babed35887840362d51f4343603 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
@@ -26,9 +26,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/fused/attn_gemm.h"
 #include "paddle/fluid/operators/fused/fmha_ref.h"
 #include "paddle/fluid/operators/fused/fused_dropout_helper.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/phi/api/include/tensor.h"
+#include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu
index d0a2935197a8cd87360c4ab643e520407e4389fc..6b2ba1670a3b72be947496609476cf673ca94962 100644
--- a/paddle/fluid/operators/group_norm_op.cu
+++ b/paddle/fluid/operators/group_norm_op.cu
@@ -21,7 +21,7 @@ namespace cub = hipcub;
 #endif
 
 #include "paddle/fluid/operators/group_norm_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h
index 86d01f6dece4c2371e7369b856f7db5485bf5354..3d1bd7490795dd40b82ba6629a314ee02201eb6a 100644
--- a/paddle/fluid/operators/layer_norm_kernel.cu.h
+++ b/paddle/fluid/operators/layer_norm_kernel.cu.h
@@ -25,8 +25,8 @@ namespace cub = hipcub;
 #include <iostream>
 
 #include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
 
@@ -55,7 +55,7 @@ static __forceinline__ __device__ U WarpReduceSum(U val) {
   unsigned mask = 0u;
   CREATE_SHFL_MASK(mask, true);
   for (int offset = warpSize / 2; offset > 0; offset /= 2) {
-    val += paddle::platform::CudaShuffleDownSync(mask, val, offset);
+    val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset);
   }
   return val;
 }
diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu
index 696ddb5a059ed1ada75228281a501629bda5ff64..400f10558e15515cf02ede27da965654162e0e20 100644
--- a/paddle/fluid/operators/math/beam_search.cu
+++ b/paddle/fluid/operators/math/beam_search.cu
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/beam_search.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/phi/backends/gpu/gpu_device_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu
index 3c4253ef800aaad5f9e9637b2adc50257e34a636..34595180c9d721a6ddea7f0d4f0ba66ca02236a5 100644
--- a/paddle/fluid/operators/row_conv_op.cu
+++ b/paddle/fluid/operators/row_conv_op.cu
@@ -12,7 +12,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/row_conv_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
@@ -242,7 +242,7 @@ __global__ void RowConvGradFilterImproved(const T *in,
 
         for (int offset = 16; offset > 0;
              offset = offset / 2) {  // blockDim.x is 32.
-          val += platform::CudaShuffleDownSync(mask, val, offset);
+          val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset);
         }
         __syncthreads();
 
@@ -307,7 +307,7 @@ __global__ void RowConvGradFilter(const T *in,
 
         for (int offset = 16; offset > 0;
              offset = offset / 2) {  // blockDim.x is 32.
-          val += platform::CudaShuffleDownSync(mask, val, offset);
+          val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset);
         }
         __syncthreads();
 
diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h
index e95bca3c2791ef0d8758b087d5badd4e296308c9..f210f46ea4376bb0e23ad277ce7b8586beb71ede 100644
--- a/paddle/fluid/operators/top_k_function_cuda.h
+++ b/paddle/fluid/operators/top_k_function_cuda.h
@@ -26,9 +26,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 #include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
 #include "paddle/fluid/operators/top_k_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 
 #define FINAL_MASK 0xffffffff
@@ -283,8 +283,10 @@ __forceinline__ __device__ Pair<T> WarpReduce(Pair<T> input,
   if (largest) {
 #pragma unroll
     for (int offset = 16; offset > 0; offset >>= 1) {
-      T tmp_val = platform::CudaShuffleDownSync(FINAL_MASK, input.v, offset);
-      int tmp_id = platform::CudaShuffleDownSync(FINAL_MASK, input.id, offset);
+      T tmp_val =
+          phi::backends::gpu::CudaShuffleDownSync(FINAL_MASK, input.v, offset);
+      int tmp_id =
+          phi::backends::gpu::CudaShuffleDownSync(FINAL_MASK, input.id, offset);
       if (input.v < tmp_val || (input.v == tmp_val && input.id > tmp_id)) {
         input.v = tmp_val;
         input.id = tmp_id;
@@ -293,8 +295,10 @@ __forceinline__ __device__ Pair<T> WarpReduce(Pair<T> input,
   } else {
 #pragma unroll
     for (int offset = 16; offset > 0; offset >>= 1) {
-      T tmp_val = platform::CudaShuffleDownSync(FINAL_MASK, input.v, offset);
-      int tmp_id = platform::CudaShuffleDownSync(FINAL_MASK, input.id, offset);
+      T tmp_val =
+          phi::backends::gpu::CudaShuffleDownSync(FINAL_MASK, input.v, offset);
+      int tmp_id =
+          phi::backends::gpu::CudaShuffleDownSync(FINAL_MASK, input.id, offset);
       if (input.v > tmp_val || (input.v == tmp_val && input.id > tmp_id)) {
         input.v = tmp_val;
         input.id = tmp_id;
@@ -357,7 +361,8 @@ __device__ __forceinline__ void BlockReduce(Pair<T> shared_max[],
     unsigned mask = 0u;
     CREATE_SHFL_MASK(mask, true);
     if (tid_max / 32 == wid) {
-      if (platform::CudaShuffleSync(mask, *beam, tid_max % 32, 32) == MaxLength)
+      if (phi::backends::gpu::CudaShuffleSync(mask, *beam, tid_max % 32, 32) ==
+          MaxLength)
         break;
     }
   }
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h
deleted file mode 100644
index c1db9c6770c2464e6b65b756602beb9b10549885..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-// NOTE(): support float16 to half in header file.
-#define PADDLE_CUDA_FP16
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/phi/core/enforce.h"
-
-namespace paddle {
-namespace platform {
-
-#define FULL_WARP_MASK 0xFFFFFFFF
-#define CREATE_SHFL_MASK(mask, predicate) \
-  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
-
-#define CUDA_LAUNCH_KERNEL_BASE(dim, ...)  \
-  case (dim): {                            \
-    constexpr auto kPowerOfTwoDim = (dim); \
-    __VA_ARGS__;                           \
-  } break
-
-#define CUDA_LAUNCH_KERNEL_HELPER(...)          \
-  CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \
-  CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__);  \
-  CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__);  \
-  CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__);  \
-  CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__);   \
-  CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__);
-
-template <typename T>
-__forceinline__ __device__ T
-CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) {
-  return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);
-}
-
-template <typename T>
-__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask,
-                                                T val,
-                                                int width = warpSize) {
-  return __shfl_xor_sync(mask, val, width);
-}
-
-template <>
-__forceinline__ __device__ float16
-CudaShuffleDownSync(unsigned mask, float16 val, int delta, int width) {
-  return float16(__shfl_down_sync(
-      mask, val.to_half(), static_cast<unsigned>(delta), width));
-}
-
-template <>
-__forceinline__ __device__ bfloat16
-CudaShuffleDownSync(unsigned mask, bfloat16 val, int delta, int width) {
-#if defined(PADDLE_CUDA_BF16)
-  return bfloat16(__shfl_down_sync(mask,
-                                   static_cast<nv_bfloat16>(val),
-                                   static_cast<unsigned>(delta),
-                                   width));
-#else
-  PADDLE_ENFORCE(
-      false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11.");
-#endif
-}
-
-template <>
-__forceinline__ __device__ paddle::platform::complex<float> CudaShuffleDownSync(
-    unsigned mask, paddle::platform::complex<float> val, int delta, int width) {
-  float real = static_cast<float>(__shfl_down_sync(
-      mask, static_cast<float>(val.real), static_cast<unsigned>(delta), width));
-  float imag = static_cast<float>(__shfl_down_sync(
-      mask, static_cast<float>(val.imag), static_cast<unsigned>(delta), width));
-  return paddle::platform::complex<float>(real, imag);
-}
-
-template <>
-__forceinline__ __device__ paddle::platform::complex<double>
-CudaShuffleDownSync(unsigned mask,
-                    paddle::platform::complex<double> val,
-                    int delta,
-                    int width) {
-  double real =
-      static_cast<double>(__shfl_down_sync(mask,
-                                           static_cast<double>(val.real),
-                                           static_cast<unsigned>(delta),
-                                           width));
-  double imag =
-      static_cast<double>(__shfl_down_sync(mask,
-                                           static_cast<double>(val.imag),
-                                           static_cast<unsigned>(delta),
-                                           width));
-  return paddle::platform::complex<double>(real, imag);
-}
-
-template <>
-__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
-                                                      float16 val,
-                                                      int width) {
-  return float16(__shfl_xor_sync(mask, val.to_half(), width));
-}
-
-template <>
-__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask,
-                                                       bfloat16 val,
-                                                       int width) {
-#if defined(PADDLE_CUDA_BF16)
-  return bfloat16(__shfl_xor_sync(mask, static_cast<nv_bfloat16>(val), width));
-#else
-  PADDLE_ENFORCE(
-      false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11.");
-#endif
-}
-
-template <>
-__forceinline__ __device__ paddle::platform::complex<float> CudaShuffleXorSync(
-    unsigned mask, paddle::platform::complex<float> val, int width) {
-  float real = static_cast<float>(
-      __shfl_xor_sync(mask, static_cast<float>(val.real), width));
-  float imag = static_cast<float>(
-      __shfl_xor_sync(mask, static_cast<float>(val.imag), width));
-  return paddle::platform::complex<float>(real, imag);
-}
-
-template <>
-__forceinline__ __device__ paddle::platform::complex<double> CudaShuffleXorSync(
-    unsigned mask, paddle::platform::complex<double> val, int width) {
-  double real = static_cast<double>(
-      __shfl_xor_sync(mask, static_cast<double>(val.real), width));
-  double imag = static_cast<double>(
-      __shfl_xor_sync(mask, static_cast<double>(val.imag), width));
-  return paddle::platform::complex<double>(real, imag);
-}
-
-template <typename T>
-__forceinline__ __device__ T
-CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) {
-  return __shfl_sync(mask, val, src_line, width);
-}
-
-template <typename T>
-HOSTDEVICE T Infinity() {
-  return INFINITY;
-}
-
-template <typename T>
-__device__ T reduceSum(T val, int tid, int len) {
-  // NOTE(zcd): The warp size should be taken from the
-  // parameters of the GPU but not specified as 32 simply.
-  // To make the reduceSum more efficiently,
-  // I use Warp-Level Parallelism and assume the Warp size
-  // is 32 which may be different for different GPU,
-  // but most card's warp size is 32.
-  const int warpSize = 32;
-  __shared__ T shm[warpSize];
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, tid < len);
-
-  for (int offset = warpSize / 2; offset > 0; offset /= 2)
-    val += platform::CudaShuffleDownSync(mask, val, offset);
-
-  if (tid < warpSize) shm[tid] = 0;
-  __syncthreads();
-
-  if (tid % warpSize == 0) {
-    shm[tid / warpSize] = val;
-  }
-  __syncthreads();
-
-  CREATE_SHFL_MASK(mask, tid < warpSize);
-
-  if (tid < warpSize) {
-    val = shm[tid];
-    for (int offset = warpSize / 2; offset > 0; offset /= 2)
-      val += platform::CudaShuffleDownSync(mask, val, offset);
-  }
-  return val;
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu
index a3fff0dbed8e2e38e3f97026ad5ff04a85df6bc7..f20c89f97a4f5251de46bbb4649bf8e254f80462 100644
--- a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu
+++ b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu
@@ -22,9 +22,9 @@
 #include <random>
 
 #define PADDLE_CUDA_FP16
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_helper.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 
 using paddle::platform::float16;
@@ -214,7 +214,7 @@ static __forceinline__ __device__ T WarpReduceSum(T val) {
   unsigned mask = 0u;
   CREATE_SHFL_MASK(mask, true);
   for (int offset = warpSize / 2; offset > 0; offset /= 2) {
-    val += paddle::platform::CudaShuffleDownSync(mask, val, offset);
+    val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset);
   }
   return val;
 }
diff --git a/paddle/fluid/platform/device/gpu/gpu_device_function.h b/paddle/fluid/platform/device/gpu/gpu_device_function.h
deleted file mode 100644
index a8daa5e87fdc38ebe3efd4de767d35869e607eff..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/device/gpu/gpu_device_function.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-
-#ifdef PADDLE_WITH_HIP
-#include "paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h"
-#else
-#include "paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h"
-#endif
-
-#endif
diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
deleted file mode 100644
index a8ce5f1a1827bba7983bb32a1b463c5a91cdf521..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-// NOTE(): support float16 to half in header file.
-#define PADDLE_CUDA_FP16
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace platform {
-
-#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate))
-
-#define CUDA_LAUNCH_KERNEL_BASE(dim, ...)  \
-  case (dim): {                            \
-    constexpr auto kPowerOfTwoDim = (dim); \
-    __VA_ARGS__;                           \
-  } break
-
-#define CUDA_LAUNCH_KERNEL_HELPER(...)          \
-  CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \
-  CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__);  \
-  CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__);  \
-  CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__);  \
-  CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__);   \
-  CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__);
-
-template <typename T>
-__forceinline__ __device__ T
-CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) {
-  return __shfl_down(val, delta, width);
-}
-
-template <typename T>
-__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask,
-                                                T val,
-                                                int width = warpSize) {
-  return __shfl_xor(val, width);
-}
-
-template <>
-__forceinline__ __device__ float16
-CudaShuffleDownSync(unsigned mask, float16 val, int delta, int width) {
-  return float16(__shfl_down(
-      static_cast<float>(val), static_cast<unsigned>(delta), width));
-}
-
-template <>
-__forceinline__ __device__ bfloat16
-CudaShuffleDownSync(unsigned mask, bfloat16 val, int delta, int width) {
-  return bfloat16(__shfl_down(
-      static_cast<float>(val), static_cast<unsigned>(delta), width));
-}
-
-template <>
-__forceinline__ __device__ paddle::platform::complex<float> CudaShuffleDownSync(
-    unsigned mask, paddle::platform::complex<float> val, int delta, int width) {
-  float real = __shfl_down(val.real, delta, width);
-  float imag = __shfl_down(val.imag, delta, width);
-  return paddle::platform::complex<float>(real, imag);
-}
-
-template <>
-__forceinline__ __device__ paddle::platform::complex<double>
-CudaShuffleDownSync(unsigned mask,
-                    paddle::platform::complex<double> val,
-                    int delta,
-                    int width) {
-  double real = __shfl_down(val.real, delta, width);
-  double imag = __shfl_down(val.imag, delta, width);
-  return paddle::platform::complex<double>(real, imag);
-}
-
-template <>
-__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
-                                                      float16 val,
-                                                      int width) {
-  return float16(__shfl_xor(static_cast<float>(val), width));
-}
-
-template <>
-__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask,
-                                                       bfloat16 val,
-                                                       int width) {
-  return bfloat16(__shfl_xor(static_cast<float>(val), width));
-}
-
-template <>
-__forceinline__ __device__ paddle::platform::complex<float> CudaShuffleXorSync(
-    unsigned mask, paddle::platform::complex<float> val, int width) {
-  float real = __shfl_xor(val.real, width);
-  float imag = __shfl_xor(val.imag, width);
-  return paddle::platform::complex<float>(real, imag);
-}
-
-template <>
-__forceinline__ __device__ paddle::platform::complex<double> CudaShuffleXorSync(
-    unsigned mask, paddle::platform::complex<double> val, int width) {
-  double real = __shfl_xor(val.real, width);
-  double imag = __shfl_xor(val.imag, width);
-  return paddle::platform::complex<double>(real, imag);
-}
-
-template <typename T>
-__forceinline__ __device__ T
-CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) {
-  return __shfl(val, src_line, width);
-}
-
-template <typename T>
-HOSTDEVICE T Infinity() {
-  return INFINITY;
-}
-
-template <typename T>
-__device__ T reduceSum(T val, int tid, int len) {
-  // NOTE(zcd): The warp size should be taken from the
-  // parameters of the GPU but not specified as 32 simply.
-  // To make the reduceSum more efficiently,
-  // I use Warp-Level Parallelism and assume the Warp size
-  // is 32 which may be different for different GPU,
-  // but most card's warp size is 32.
-#ifdef PADDLE_WITH_HIP
-  const int warpSize = 64;
-#else
-  const int warpSize = 32;
-#endif
-  __shared__ T shm[warpSize];
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, tid < len);
-
-  for (int offset = warpSize / 2; offset > 0; offset /= 2)
-    val += platform::CudaShuffleDownSync(mask, val, offset);
-
-  if (tid < warpSize) shm[tid] = 0;
-  __syncthreads();
-
-  if (tid % warpSize == 0) {
-    shm[tid / warpSize] = val;
-  }
-  __syncthreads();
-
-  CREATE_SHFL_MASK(mask, tid < warpSize);
-
-  if (tid < warpSize) {
-    val = shm[tid];
-    for (int offset = warpSize / 2; offset > 0; offset /= 2)
-      val += platform::CudaShuffleDownSync(mask, val, offset);
-  }
-  return val;
-}
-
-}  // namespace platform
-}  // namespace paddle