[PHI decoupling] remove "paddle/fluid/platform/device/gpu/gpu_launch_config.h" in phi (#47808)

* rm fluid gpu_launch_config
* fix type

[PHI decoupling] remove "paddle/fluid/platform/device/gpu/gpu_launch_config.h" in phi (#47808)
* rm fluid gpu_launch_config
* fix type
40a9b488 · PuQing · GitHub · 0f3fb562 · 40a9b488 · 40a9b488
15 changed files
--- a/paddle/phi/kernels/funcs/elementwise_grad_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h
@@ -25,7 +25,7 @@ limitations under the License. */
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"

 #endif
@@ -982,7 +982,7 @@ static void ElemwiseGradBroadcast1CUDA(gpuStream_t stream,
    auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
    auto *ctx = static_cast<GPUContext *>(
        paddle::platform::DeviceContextPool::Instance().Get(gplace));
-    paddle::platform::LimitGridDim(*ctx, &grid_size);
+    phi::backends::gpu::LimitGridDim(*ctx, &grid_size);
    FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>(
        x, y, out, dout, h, w, is_xsize_larger, dx_op, dy_op, dx, dy);
  }
@@ -1007,7 +1007,7 @@ static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream,
  auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
  auto *ctx = static_cast<GPUContext *>(
      paddle::platform::DeviceContextPool::Instance().Get(gplace));
-  paddle::platform::LimitGridDim(*ctx, &grid_size);
+  phi::backends::gpu::LimitGridDim(*ctx, &grid_size);
  ElemwiseGradBroadcast2CUDAKernel<<<grid_size, block_size, 0, stream>>>(
      x, y, out, dout, pre, n, post, is_xsize_larger, dx_op, dy_op, dx, dy);
 }
@@ -1210,7 +1210,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
      } else {
        dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
        dim3 grid_size = dim3((w + BLOCK_X - 1) / BLOCK_X);
-        paddle::platform::LimitGridDim(ctx, &grid_size);
+        phi::backends::gpu::LimitGridDim(ctx, &grid_size);
        FastCommonGradBroadcastCUDAKernelHeight<<<grid_size,
                                                  block_size,
                                                  0,
@@ -1247,7 +1247,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
      } else {
        dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
        dim3 grid_size = dim3((w + BLOCK_X - 1) / BLOCK_X);
-        paddle::platform::LimitGridDim(ctx, &grid_size);
+        phi::backends::gpu::LimitGridDim(ctx, &grid_size);
        FastCommonGradBroadcastCUDAKernelHeight<<<grid_size,
                                                  block_size,
                                                  0,
@@ -1345,7 +1345,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,

    int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
    dim3 grid_size = dim3(pre * post);
-    paddle::platform::LimitGridDim(ctx, &grid_size);
+    phi::backends::gpu::LimitGridDim(ctx, &grid_size);

    FastCommonGradBroadcastAllCUDAKernel<<<grid_size, block_size, 0, stream>>>(
        x_data,
@@ -1387,7 +1387,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
                                   std::multiplies<int>());
          int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
          dim3 grid_size = dim3(pre * post);
-          paddle::platform::LimitGridDim(ctx, &grid_size);
+          phi::backends::gpu::LimitGridDim(ctx, &grid_size);
          // we need to calc y offset with blockid, so do x_pre/y_pre to get
          // left size.
          if (k_pre != pre) k_pre = pre / k_pre;
@@ -1418,7 +1418,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
                                   std::multiplies<int>());
          int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
          dim3 grid_size = dim3(pre * post);
-          paddle::platform::LimitGridDim(ctx, &grid_size);
+          phi::backends::gpu::LimitGridDim(ctx, &grid_size);
          if (k_pre != pre) k_pre = pre / k_pre;

          FastCommonGradBroadcastOneCUDAKernel<<<grid_size,

--- a/paddle/phi/kernels/funcs/gather.cu.h
+++ b/paddle/phi/kernels/funcs/gather.cu.h
@@ -18,7 +18,6 @@ limitations under the License. */

 #include "paddle/fluid/memory/memcpy.h"
 // TODO(paddle-dev): move gpu_primitives.h to phi
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/place.h"
@@ -113,7 +112,7 @@ void GPUGather(const phi::GPUContext& ctx,
  int block = 512;
  int64_t n = slice_size * index_size;
  dim3 grid = dim3((n + block - 1) / block);
-  paddle::platform::LimitGridDim(ctx, &grid);
+  phi::backends::gpu::LimitGridDim(ctx, &grid);

  GatherCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
      p_src, p_index, p_output, index_size, slice_size);
@@ -155,7 +154,7 @@ void GPUGatherNd(const phi::GPUContext& ctx,
  int block = 512;
  int64_t n = slice_size * remain_numel;
  dim3 grid = dim3((n + block - 1) / block);
-  paddle::platform::LimitGridDim(ctx, &grid);
+  phi::backends::gpu::LimitGridDim(ctx, &grid);

  GatherNdCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(p_input,
                                                                  g_input_dims,

--- a/paddle/phi/kernels/funcs/reduce_function.h
+++ b/paddle/phi/kernels/funcs/reduce_function.h
@@ -34,9 +34,9 @@ namespace cub = hipcub;

 #ifndef PADDLE_WITH_XPU_KP
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #endif

 #include "paddle/phi/kernels/cast_kernel.h"
@@ -337,7 +337,7 @@ struct ReduceConfig {
    SetBlockDim();
 #ifndef PADDLE_WITH_XPU_KP
    // step5: limit the grid to prevent thead overflow
-    paddle::platform::LimitGridDim(dev_ctx, &grid);
+    phi::backends::gpu::LimitGridDim(dev_ctx, &grid);
 #endif
  }


--- a/paddle/phi/kernels/funcs/scatter.cu.h
+++ b/paddle/phi/kernels/funcs/scatter.cu.h
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>

-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -158,7 +158,7 @@ void GPUScatterAssign(const phi::GPUContext& ctx,
  int block = 512;
  int64_t n = slice_size * index_size;
  dim3 grid = dim3((n + block - 1) / block);
-  paddle::platform::LimitGridDim(ctx, &grid);
+  phi::backends::gpu::LimitGridDim(ctx, &grid);

  // if not overwrite mode, init data
  if (!overwrite) {
@@ -190,7 +190,7 @@ void GPUScatterGradForX(const phi::GPUContext& ctx,
  int64_t n = slice_size * index_size;
  int64_t height = (n + block - 1) / block;
  dim3 grid = dim3((n + block - 1) / block);
-  paddle::platform::LimitGridDim(ctx, &grid);
+  phi::backends::gpu::LimitGridDim(ctx, &grid);

  ScatterInitCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
      p_index, p_output, index_size, slice_size);
@@ -231,7 +231,7 @@ void GPUScatterNdAdd(const phi::GPUContext& ctx,
  int block = 512;
  int64_t n = slice_size * remain_numel;
  dim3 grid = dim3((n + block - 1) / block);
-  paddle::platform::LimitGridDim(ctx, &grid);
+  phi::backends::gpu::LimitGridDim(ctx, &grid);

  ScatterNdCUDAKernel<T, IndexT>
      <<<grid, block, 0, ctx.stream()>>>(p_update,

--- a/paddle/phi/kernels/gpu/histogram_kernel.cu
+++ b/paddle/phi/kernels/gpu/histogram_kernel.cu
@@ -14,9 +14,9 @@

 #include "paddle/phi/kernels/histogram_kernel.h"

-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"

--- a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu
@@ -14,9 +14,9 @@

 #include "paddle/phi/kernels/index_add_grad_kernel.h"

-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -71,7 +71,7 @@ void IndexAddGradKernel(const Context& ctx,
  // get add_value_grad: index_select(out_grad, index, axis)
  unsigned int block_dim = PADDLE_CUDA_NUM_THREADS;
  dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim);
-  paddle::platform::LimitGridDim(ctx, &grid_dim);
+  phi::backends::gpu::LimitGridDim(ctx, &grid_dim);

  if (index_type == phi::DataType::INT64) {
    const int64_t* index_data = index.data<int64_t>();

--- a/paddle/phi/kernels/gpu/index_add_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_add_kernel.cu
@@ -14,9 +14,9 @@

 #include "paddle/phi/kernels/index_add_kernel.h"

-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"

@@ -75,7 +75,7 @@ void IndexAddKernel(const Context& ctx,

  unsigned int block_dim = PADDLE_CUDA_NUM_THREADS;
  dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim);
-  paddle::platform::LimitGridDim(ctx, &grid_dim);
+  phi::backends::gpu::LimitGridDim(ctx, &grid_dim);

  // copy input to output.
  // todo(@limin29): inplace do not need copy.

--- a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
@@ -18,9 +18,9 @@
 #include <vector>

 #include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

@@ -92,16 +92,16 @@ void IndexSampleGradKernel(const Context& ctx,
  size_t index_length = index_dim[1];
  bool same_data_in_index_row = index_length == 1 ? false : true;

-  auto block_width = paddle::platform::RoundToPowerOfTwo(index_length);
+  auto block_width = phi::backends::gpu::RoundToPowerOfTwo(index_length);
  block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
  auto block_height =
-      paddle::platform::RoundToPowerOfTwo(index_length * batch_size) /
+      phi::backends::gpu::RoundToPowerOfTwo(index_length * batch_size) /
      block_width;
  block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
  dim3 block_dim(block_width, block_height);
  dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
                (batch_size + block_dim.y - 1) / block_dim.y);
-  paddle::platform::LimitGridDim(ctx, &grid_dim);
+  phi::backends::gpu::LimitGridDim(ctx, &grid_dim);

  phi::funcs::SetConstant<Context, T> set_zero;
  set_zero(ctx, x_grad, static_cast<T>(0));

--- a/paddle/phi/kernels/gpu/index_sample_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu
@@ -18,8 +18,8 @@
 #include <vector>

 #include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

@@ -80,16 +80,16 @@ void IndexSampleKernel(const Context& ctx,
  size_t input_length = input_dim[1];
  size_t index_length = index_dim[1];

-  auto block_width = paddle::platform::RoundToPowerOfTwo(index_length);
+  auto block_width = phi::backends::gpu::RoundToPowerOfTwo(index_length);
  block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
  int block_height =
-      paddle::platform::RoundToPowerOfTwo(index_length * batch_size) /
+      phi::backends::gpu::RoundToPowerOfTwo(index_length * batch_size) /
      block_width;
  block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
  dim3 block_dim(block_width, block_height);
  dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
                (batch_size + block_dim.y - 1) / block_dim.y);
-  paddle::platform::LimitGridDim(ctx, &grid_dim);
+  phi::backends::gpu::LimitGridDim(ctx, &grid_dim);

  if (index_type == DataType::INT64) {
    const int64_t* index_data = index.data<int64_t>();

--- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu
@@ -14,9 +14,9 @@

 #include "paddle/phi/kernels/index_select_grad_kernel.h"

-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -87,7 +87,7 @@ void IndexSelectGradKernel(const Context& ctx,

  unsigned int block_dim = PADDLE_CUDA_NUM_THREADS;
  dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim);
-  paddle::platform::LimitGridDim(ctx, &grid_dim);
+  phi::backends::gpu::LimitGridDim(ctx, &grid_dim);

  phi::funcs::SetConstant<phi::GPUContext, T> index_select_grad_init;
  index_select_grad_init(ctx, x_grad, static_cast<T>(0));

--- a/paddle/phi/kernels/gpu/index_select_impl.h
+++ b/paddle/phi/kernels/gpu/index_select_impl.h
@@ -14,9 +14,9 @@

 #pragma once

-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"


--- a/paddle/phi/kernels/gpu/index_select_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_select_kernel.cu
@@ -14,9 +14,9 @@

 #include "paddle/phi/kernels/index_select_kernel.h"

-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/gpu/index_select_impl.h"
@@ -62,7 +62,7 @@ void IndexSelectKernel(const Context& ctx,

  unsigned int block_dim = PADDLE_CUDA_NUM_THREADS;
  dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim);
-  paddle::platform::LimitGridDim(ctx, &grid_dim);
+  phi::backends::gpu::LimitGridDim(ctx, &grid_dim);

  if (index_type == phi::DataType::INT64) {
    const int64_t* index_data = index.data<int64_t>();

--- a/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu
@@ -14,9 +14,9 @@

 #include "paddle/phi/kernels/nanmedian_grad_kernel.h"

-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

--- a/paddle/phi/kernels/gpu/nanmedian_kernel.cu
+++ b/paddle/phi/kernels/gpu/nanmedian_kernel.cu
@@ -15,9 +15,9 @@
 #include "paddle/phi/kernels/nanmedian_kernel.h"

 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/impl/nanmedian_kernel_impl.h"

--- a/paddle/phi/kernels/gpu/top_k_kernel.cu
+++ b/paddle/phi/kernels/gpu/top_k_kernel.cu
@@ -173,8 +173,8 @@ void TopkKernel(const Context& dev_ctx,
    // NOTE: old matrix implementation of stride is different to eigen.
    const int kMaxHeight = 2048;
    int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
-    paddle::platform::GpuLaunchConfig config =
-        paddle::platform::GetGpuLaunchConfig1D(dev_ctx, input_width);
+    auto config =
+        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, input_width);
    switch (config.thread_per_block.x) {
 #ifdef PADDLE_WITH_HIP
      FIXED_BLOCK_DIM(
@@ -282,8 +282,8 @@ void TopkKernel(const Context& dev_ctx,

    const int kMaxHeight = 2048;
    int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
-    paddle::platform::GpuLaunchConfig config =
-        paddle::platform::GetGpuLaunchConfig1D(dev_ctx, input_width);
+    auto config =
+        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, input_width);
    switch (config.thread_per_block.x) {
 #ifdef PADDLE_WITH_HIP
      FIXED_BLOCK_DIM(