diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h
index e52c669c48d9897b8387cd086b6118e16bd6f8bf..f8007319d697ce12d6087993e41806760f1a5606 100644
--- a/paddle/phi/kernels/funcs/elementwise_grad_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h
@@ -25,7 +25,7 @@ limitations under the License. */
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
 
 #endif
@@ -982,7 +982,7 @@ static void ElemwiseGradBroadcast1CUDA(gpuStream_t stream,
     auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
     auto *ctx = static_cast<GPUContext *>(
         paddle::platform::DeviceContextPool::Instance().Get(gplace));
-    paddle::platform::LimitGridDim(*ctx, &grid_size);
+    phi::backends::gpu::LimitGridDim(*ctx, &grid_size);
     FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>(
         x, y, out, dout, h, w, is_xsize_larger, dx_op, dy_op, dx, dy);
   }
@@ -1007,7 +1007,7 @@ static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream,
   auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
   auto *ctx = static_cast<GPUContext *>(
       paddle::platform::DeviceContextPool::Instance().Get(gplace));
-  paddle::platform::LimitGridDim(*ctx, &grid_size);
+  phi::backends::gpu::LimitGridDim(*ctx, &grid_size);
   ElemwiseGradBroadcast2CUDAKernel<<<grid_size, block_size, 0, stream>>>(
       x, y, out, dout, pre, n, post, is_xsize_larger, dx_op, dy_op, dx, dy);
 }
@@ -1210,7 +1210,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
       } else {
         dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
         dim3 grid_size = dim3((w + BLOCK_X - 1) / BLOCK_X);
-        paddle::platform::LimitGridDim(ctx, &grid_size);
+        phi::backends::gpu::LimitGridDim(ctx, &grid_size);
         FastCommonGradBroadcastCUDAKernelHeight<<<grid_size,
                                                   block_size,
                                                   0,
@@ -1247,7 +1247,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
       } else {
         dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
         dim3 grid_size = dim3((w + BLOCK_X - 1) / BLOCK_X);
-        paddle::platform::LimitGridDim(ctx, &grid_size);
+        phi::backends::gpu::LimitGridDim(ctx, &grid_size);
         FastCommonGradBroadcastCUDAKernelHeight<<<grid_size,
                                                   block_size,
                                                   0,
@@ -1345,7 +1345,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
 
     int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
     dim3 grid_size = dim3(pre * post);
-    paddle::platform::LimitGridDim(ctx, &grid_size);
+    phi::backends::gpu::LimitGridDim(ctx, &grid_size);
 
     FastCommonGradBroadcastAllCUDAKernel<<<grid_size, block_size, 0, stream>>>(
         x_data,
@@ -1387,7 +1387,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
                                    std::multiplies<int>());
           int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
           dim3 grid_size = dim3(pre * post);
-          paddle::platform::LimitGridDim(ctx, &grid_size);
+          phi::backends::gpu::LimitGridDim(ctx, &grid_size);
           // we need to calc y offset with blockid, so do x_pre/y_pre to get
           // left size.
           if (k_pre != pre) k_pre = pre / k_pre;
@@ -1418,7 +1418,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
                                    std::multiplies<int>());
           int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
           dim3 grid_size = dim3(pre * post);
-          paddle::platform::LimitGridDim(ctx, &grid_size);
+          phi::backends::gpu::LimitGridDim(ctx, &grid_size);
           if (k_pre != pre) k_pre = pre / k_pre;
 
           FastCommonGradBroadcastOneCUDAKernel<<<grid_size,
diff --git a/paddle/phi/kernels/funcs/gather.cu.h b/paddle/phi/kernels/funcs/gather.cu.h
index fbffd0c1e2b8a4efdc96a88c5c18357d5596f21c..f660560fceb5e1f868ac85b4eeadbd2cddc0a157 100644
--- a/paddle/phi/kernels/funcs/gather.cu.h
+++ b/paddle/phi/kernels/funcs/gather.cu.h
@@ -18,7 +18,6 @@ limitations under the License. */
 
 #include "paddle/fluid/memory/memcpy.h"
 // TODO(paddle-dev): move gpu_primitives.h to phi
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/place.h"
@@ -113,7 +112,7 @@ void GPUGather(const phi::GPUContext& ctx,
   int block = 512;
   int64_t n = slice_size * index_size;
   dim3 grid = dim3((n + block - 1) / block);
-  paddle::platform::LimitGridDim(ctx, &grid);
+  phi::backends::gpu::LimitGridDim(ctx, &grid);
 
   GatherCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
       p_src, p_index, p_output, index_size, slice_size);
@@ -155,7 +154,7 @@ void GPUGatherNd(const phi::GPUContext& ctx,
   int block = 512;
   int64_t n = slice_size * remain_numel;
   dim3 grid = dim3((n + block - 1) / block);
-  paddle::platform::LimitGridDim(ctx, &grid);
+  phi::backends::gpu::LimitGridDim(ctx, &grid);
 
   GatherNdCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(p_input,
                                                                   g_input_dims,
diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h
index 9138fd85e65aa4ad91d5847024a22c14adfe7465..9719fbd88160e1b94cb74ad9ec5b387612fd8433 100644
--- a/paddle/phi/kernels/funcs/reduce_function.h
+++ b/paddle/phi/kernels/funcs/reduce_function.h
@@ -34,9 +34,9 @@ namespace cub = hipcub;
 
 #ifndef PADDLE_WITH_XPU_KP
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #endif
 
 #include "paddle/phi/kernels/cast_kernel.h"
@@ -337,7 +337,7 @@ struct ReduceConfig {
     SetBlockDim();
 #ifndef PADDLE_WITH_XPU_KP
     // step5: limit the grid to prevent thead overflow
-    paddle::platform::LimitGridDim(dev_ctx, &grid);
+    phi::backends::gpu::LimitGridDim(dev_ctx, &grid);
 #endif
   }
 
diff --git a/paddle/phi/kernels/funcs/scatter.cu.h b/paddle/phi/kernels/funcs/scatter.cu.h
index e10ae3951ae2f0d3b98c1ceed9aa42438127c1e1..d42538edb7561e49a3ac4ce471252bb48f6b6929 100644
--- a/paddle/phi/kernels/funcs/scatter.cu.h
+++ b/paddle/phi/kernels/funcs/scatter.cu.h
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
 
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -158,7 +158,7 @@ void GPUScatterAssign(const phi::GPUContext& ctx,
   int block = 512;
   int64_t n = slice_size * index_size;
   dim3 grid = dim3((n + block - 1) / block);
-  paddle::platform::LimitGridDim(ctx, &grid);
+  phi::backends::gpu::LimitGridDim(ctx, &grid);
 
   // if not overwrite mode, init data
   if (!overwrite) {
@@ -190,7 +190,7 @@ void GPUScatterGradForX(const phi::GPUContext& ctx,
   int64_t n = slice_size * index_size;
   int64_t height = (n + block - 1) / block;
   dim3 grid = dim3((n + block - 1) / block);
-  paddle::platform::LimitGridDim(ctx, &grid);
+  phi::backends::gpu::LimitGridDim(ctx, &grid);
 
   ScatterInitCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
       p_index, p_output, index_size, slice_size);
@@ -231,7 +231,7 @@ void GPUScatterNdAdd(const phi::GPUContext& ctx,
   int block = 512;
   int64_t n = slice_size * remain_numel;
   dim3 grid = dim3((n + block - 1) / block);
-  paddle::platform::LimitGridDim(ctx, &grid);
+  phi::backends::gpu::LimitGridDim(ctx, &grid);
 
   ScatterNdCUDAKernel<T, IndexT>
       <<<grid, block, 0, ctx.stream()>>>(p_update,
diff --git a/paddle/phi/kernels/gpu/histogram_kernel.cu b/paddle/phi/kernels/gpu/histogram_kernel.cu
index cd89bf98a305bba23563f83c3110e1a162611afd..4cc6bc35578d19c54821dc06924232342a78e1bb 100644
--- a/paddle/phi/kernels/gpu/histogram_kernel.cu
+++ b/paddle/phi/kernels/gpu/histogram_kernel.cu
@@ -14,9 +14,9 @@
 
 #include "paddle/phi/kernels/histogram_kernel.h"
 
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
diff --git a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu
index c868843925aa75118a95710edc4d687bc11155f8..ddc8a65ad51edd0874914c6d10c0031e234808a8 100644
--- a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu
@@ -14,9 +14,9 @@
 
 #include "paddle/phi/kernels/index_add_grad_kernel.h"
 
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -71,7 +71,7 @@ void IndexAddGradKernel(const Context& ctx,
   // get add_value_grad: index_select(out_grad, index, axis)
   unsigned int block_dim = PADDLE_CUDA_NUM_THREADS;
   dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim);
-  paddle::platform::LimitGridDim(ctx, &grid_dim);
+  phi::backends::gpu::LimitGridDim(ctx, &grid_dim);
 
   if (index_type == phi::DataType::INT64) {
     const int64_t* index_data = index.data<int64_t>();
diff --git a/paddle/phi/kernels/gpu/index_add_kernel.cu b/paddle/phi/kernels/gpu/index_add_kernel.cu
index ff8fb1702075cbad562130b93091c03b886ce411..047e54b99aa3b9b78516e1b35e6583b1a5184878 100644
--- a/paddle/phi/kernels/gpu/index_add_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_add_kernel.cu
@@ -14,9 +14,9 @@
 
 #include "paddle/phi/kernels/index_add_kernel.h"
 
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 
@@ -75,7 +75,7 @@ void IndexAddKernel(const Context& ctx,
 
   unsigned int block_dim = PADDLE_CUDA_NUM_THREADS;
   dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim);
-  paddle::platform::LimitGridDim(ctx, &grid_dim);
+  phi::backends::gpu::LimitGridDim(ctx, &grid_dim);
 
   // copy input to output.
   // todo(@limin29): inplace do not need copy.
diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
index d2671dff7b0184f50345bdeeadb59d018a3badf0..5368d98c56a95edd95aee0a308bd924fe4f64cd7 100644
--- a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
@@ -18,9 +18,9 @@
 #include <vector>
 
 #include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
@@ -92,16 +92,16 @@ void IndexSampleGradKernel(const Context& ctx,
   size_t index_length = index_dim[1];
   bool same_data_in_index_row = index_length == 1 ? false : true;
 
-  auto block_width = paddle::platform::RoundToPowerOfTwo(index_length);
+  auto block_width = phi::backends::gpu::RoundToPowerOfTwo(index_length);
   block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
   auto block_height =
-      paddle::platform::RoundToPowerOfTwo(index_length * batch_size) /
+      phi::backends::gpu::RoundToPowerOfTwo(index_length * batch_size) /
       block_width;
   block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
   dim3 block_dim(block_width, block_height);
   dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
                 (batch_size + block_dim.y - 1) / block_dim.y);
-  paddle::platform::LimitGridDim(ctx, &grid_dim);
+  phi::backends::gpu::LimitGridDim(ctx, &grid_dim);
 
   phi::funcs::SetConstant<Context, T> set_zero;
   set_zero(ctx, x_grad, static_cast<T>(0));
diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu
index 9b95d761fcbad475d142553741ffd658f0688884..5e6bd8701a9dc7b39f6c0266e53ea49ef0de45ec 100644
--- a/paddle/phi/kernels/gpu/index_sample_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu
@@ -18,8 +18,8 @@
 #include <vector>
 
 #include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
@@ -80,16 +80,16 @@ void IndexSampleKernel(const Context& ctx,
   size_t input_length = input_dim[1];
   size_t index_length = index_dim[1];
 
-  auto block_width = paddle::platform::RoundToPowerOfTwo(index_length);
+  auto block_width = phi::backends::gpu::RoundToPowerOfTwo(index_length);
   block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
   int block_height =
-      paddle::platform::RoundToPowerOfTwo(index_length * batch_size) /
+      phi::backends::gpu::RoundToPowerOfTwo(index_length * batch_size) /
       block_width;
   block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
   dim3 block_dim(block_width, block_height);
   dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
                 (batch_size + block_dim.y - 1) / block_dim.y);
-  paddle::platform::LimitGridDim(ctx, &grid_dim);
+  phi::backends::gpu::LimitGridDim(ctx, &grid_dim);
 
   if (index_type == DataType::INT64) {
     const int64_t* index_data = index.data<int64_t>();
diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu
index 8561744e87b1f85312867447c5a9672280744d6b..fb9157db557e65062afd27d2da562ab232529585 100644
--- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu
@@ -14,9 +14,9 @@
 
 #include "paddle/phi/kernels/index_select_grad_kernel.h"
 
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -87,7 +87,7 @@ void IndexSelectGradKernel(const Context& ctx,
 
   unsigned int block_dim = PADDLE_CUDA_NUM_THREADS;
   dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim);
-  paddle::platform::LimitGridDim(ctx, &grid_dim);
+  phi::backends::gpu::LimitGridDim(ctx, &grid_dim);
 
   phi::funcs::SetConstant<phi::GPUContext, T> index_select_grad_init;
   index_select_grad_init(ctx, x_grad, static_cast<T>(0));
diff --git a/paddle/phi/kernels/gpu/index_select_impl.h b/paddle/phi/kernels/gpu/index_select_impl.h
index fc631b651540fb1f002c533da8e5b142cdc30ddc..da9cdbf52783b2822f167c264f7be3bd48741847 100644
--- a/paddle/phi/kernels/gpu/index_select_impl.h
+++ b/paddle/phi/kernels/gpu/index_select_impl.h
@@ -14,9 +14,9 @@
 
 #pragma once
 
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 
diff --git a/paddle/phi/kernels/gpu/index_select_kernel.cu b/paddle/phi/kernels/gpu/index_select_kernel.cu
index e9228b54edf7c171f115a13fe4954d639db862ac..135ae5265189753d4281de71a61aba3eb063f1f2 100644
--- a/paddle/phi/kernels/gpu/index_select_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_select_kernel.cu
@@ -14,9 +14,9 @@
 
 #include "paddle/phi/kernels/index_select_kernel.h"
 
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/gpu/index_select_impl.h"
@@ -62,7 +62,7 @@ void IndexSelectKernel(const Context& ctx,
 
   unsigned int block_dim = PADDLE_CUDA_NUM_THREADS;
   dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim);
-  paddle::platform::LimitGridDim(ctx, &grid_dim);
+  phi::backends::gpu::LimitGridDim(ctx, &grid_dim);
 
   if (index_type == phi::DataType::INT64) {
     const int64_t* index_data = index.data<int64_t>();
diff --git a/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu
index b6333110485aff6d9cef572de0840a84e29c3684..30fd93533ed54ee80f61acc1e0dc900c442c5809 100644
--- a/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu
@@ -14,9 +14,9 @@
 
 #include "paddle/phi/kernels/nanmedian_grad_kernel.h"
 
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/gpu/nanmedian_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_kernel.cu
index f61e413c9887d1d31df6686da4d2e120fc400c5c..132b9fa10b7c6fe9e38a2647b8950192f1fe6226 100644
--- a/paddle/phi/kernels/gpu/nanmedian_kernel.cu
+++ b/paddle/phi/kernels/gpu/nanmedian_kernel.cu
@@ -15,9 +15,9 @@
 #include "paddle/phi/kernels/nanmedian_kernel.h"
 
 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/impl/nanmedian_kernel_impl.h"
diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu
index 9fc21b19a156c2587df02b3fbcde47a26fe59383..ecca63e52f05d2a59040d574748040bce614dc8d 100644
--- a/paddle/phi/kernels/gpu/top_k_kernel.cu
+++ b/paddle/phi/kernels/gpu/top_k_kernel.cu
@@ -173,8 +173,8 @@ void TopkKernel(const Context& dev_ctx,
     // NOTE: old matrix implementation of stride is different to eigen.
     const int kMaxHeight = 2048;
     int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
-    paddle::platform::GpuLaunchConfig config =
-        paddle::platform::GetGpuLaunchConfig1D(dev_ctx, input_width);
+    auto config =
+        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, input_width);
     switch (config.thread_per_block.x) {
 #ifdef PADDLE_WITH_HIP
       FIXED_BLOCK_DIM(
@@ -282,8 +282,8 @@ void TopkKernel(const Context& dev_ctx,
 
     const int kMaxHeight = 2048;
     int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
-    paddle::platform::GpuLaunchConfig config =
-        paddle::platform::GetGpuLaunchConfig1D(dev_ctx, input_width);
+    auto config =
+        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, input_width);
     switch (config.thread_per_block.x) {
 #ifdef PADDLE_WITH_HIP
       FIXED_BLOCK_DIM(