Unverified commit 40a9b488, authored by PuQing, committed by GitHub

[PHI decoupling] remove "paddle/fluid/platform/device/gpu/gpu_launch_config.h" in phi (#47808)

* rm fluid gpu_launch_config

* fix type
Parent 0f3fb562
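
The change is mechanical and repeats in every file below: each fluid launch-config include and symbol moves to its phi equivalent. The full correspondence, as evidenced by the hunks in this commit:

    #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
        -> #include "paddle/phi/backends/gpu/gpu_launch_config.h"
    paddle::platform::LimitGridDim(...)         -> phi::backends::gpu::LimitGridDim(...)
    paddle::platform::RoundToPowerOfTwo(...)    -> phi::backends::gpu::RoundToPowerOfTwo(...)
    paddle::platform::GetGpuLaunchConfig1D(...) -> phi::backends::gpu::GetGpuLaunchConfig1D(...)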
@@ -25,7 +25,7 @@ limitations under the License. */
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
 #endif
@@ -982,7 +982,7 @@ static void ElemwiseGradBroadcast1CUDA(gpuStream_t stream,
 auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
 auto *ctx = static_cast<GPUContext *>(
 paddle::platform::DeviceContextPool::Instance().Get(gplace));
-paddle::platform::LimitGridDim(*ctx, &grid_size);
+phi::backends::gpu::LimitGridDim(*ctx, &grid_size);
 FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>(
 x, y, out, dout, h, w, is_xsize_larger, dx_op, dy_op, dx, dy);
 }
@@ -1007,7 +1007,7 @@ static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream,
 auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
 auto *ctx = static_cast<GPUContext *>(
 paddle::platform::DeviceContextPool::Instance().Get(gplace));
-paddle::platform::LimitGridDim(*ctx, &grid_size);
+phi::backends::gpu::LimitGridDim(*ctx, &grid_size);
 ElemwiseGradBroadcast2CUDAKernel<<<grid_size, block_size, 0, stream>>>(
 x, y, out, dout, pre, n, post, is_xsize_larger, dx_op, dy_op, dx, dy);
 }
@@ -1210,7 +1210,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
 } else {
 dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
 dim3 grid_size = dim3((w + BLOCK_X - 1) / BLOCK_X);
-paddle::platform::LimitGridDim(ctx, &grid_size);
+phi::backends::gpu::LimitGridDim(ctx, &grid_size);
 FastCommonGradBroadcastCUDAKernelHeight<<<grid_size,
 block_size,
 0,
@@ -1247,7 +1247,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
 } else {
 dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
 dim3 grid_size = dim3((w + BLOCK_X - 1) / BLOCK_X);
-paddle::platform::LimitGridDim(ctx, &grid_size);
+phi::backends::gpu::LimitGridDim(ctx, &grid_size);
 FastCommonGradBroadcastCUDAKernelHeight<<<grid_size,
 block_size,
 0,
@@ -1345,7 +1345,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
 int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
 dim3 grid_size = dim3(pre * post);
-paddle::platform::LimitGridDim(ctx, &grid_size);
+phi::backends::gpu::LimitGridDim(ctx, &grid_size);
 FastCommonGradBroadcastAllCUDAKernel<<<grid_size, block_size, 0, stream>>>(
 x_data,
@@ -1387,7 +1387,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
 std::multiplies<int>());
 int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
 dim3 grid_size = dim3(pre * post);
-paddle::platform::LimitGridDim(ctx, &grid_size);
+phi::backends::gpu::LimitGridDim(ctx, &grid_size);
 // we need to calc y offset with blockid, so do x_pre/y_pre to get
 // left size.
 if (k_pre != pre) k_pre = pre / k_pre;
@@ -1418,7 +1418,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
 std::multiplies<int>());
 int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
 dim3 grid_size = dim3(pre * post);
-paddle::platform::LimitGridDim(ctx, &grid_size);
+phi::backends::gpu::LimitGridDim(ctx, &grid_size);
 if (k_pre != pre) k_pre = pre / k_pre;
 FastCommonGradBroadcastOneCUDAKernel<<<grid_size,
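Every LimitGridDim call site in this commit follows the same pattern: compute a grid from the problem size, then clamp it against the device's maximum grid extents so that oversized tensors cannot request an illegal launch. A minimal standalone sketch of the idea, assuming only that the helper clamps each grid dimension (the real phi::backends::gpu::LimitGridDim takes the limits from the GPUContext; this version queries the CUDA runtime directly for illustration):

    // Sketch only: clamp a proposed grid to the device's maximum grid extents.
    #include <algorithm>
    #include <cuda_runtime.h>

    void LimitGridDimSketch(int device_id, dim3* grid) {
      cudaDeviceProp prop;
      cudaGetDeviceProperties(&prop, device_id);
      grid->x = std::min<unsigned int>(grid->x, prop.maxGridSize[0]);
      grid->y = std::min<unsigned int>(grid->y, prop.maxGridSize[1]);
      grid->z = std::min<unsigned int>(grid->z, prop.maxGridSize[2]);
    }

A clamped grid is only correct if the kernel iterates with a grid-stride loop rather than assuming one thread per element; see the sketch after the gather hunks below.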
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 // TODO(paddle-dev): move gpu_primitives.h to phi
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/place.h"
@@ -113,7 +112,7 @@ void GPUGather(const phi::GPUContext& ctx,
 int block = 512;
 int64_t n = slice_size * index_size;
 dim3 grid = dim3((n + block - 1) / block);
-paddle::platform::LimitGridDim(ctx, &grid);
+phi::backends::gpu::LimitGridDim(ctx, &grid);
 GatherCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
 p_src, p_index, p_output, index_size, slice_size);
@@ -155,7 +154,7 @@ void GPUGatherNd(const phi::GPUContext& ctx,
 int block = 512;
 int64_t n = slice_size * remain_numel;
 dim3 grid = dim3((n + block - 1) / block);
-paddle::platform::LimitGridDim(ctx, &grid);
+phi::backends::gpu::LimitGridDim(ctx, &grid);
 GatherNdCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(p_input,
 g_input_dims,
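The gather launches size their grids by ceiling division, dim3((n + block - 1) / block), so the final partial block is still launched: with n = 1000 and block = 512, (1000 + 511) / 512 = 2 blocks. Since LimitGridDim may then shrink that grid, the kernels cannot map one thread to one element; a grid-stride loop covers all n elements under any grid size. A sketch of the pattern (not Paddle's actual gather kernel, whose body computes gather indices instead of a plain copy):

    // Sketch: a grid-stride loop lets a clamped grid still cover all n elements.
    #include <cstdint>

    __global__ void GatherLikeSketch(const float* src, float* dst, int64_t n) {
      for (int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
           i < n;
           i += static_cast<int64_t>(gridDim.x) * blockDim.x) {
        dst[i] = src[i];  // real kernels do their indexed reads/writes here
      }
    }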
@@ -34,9 +34,9 @@ namespace cub = hipcub;
 #ifndef PADDLE_WITH_XPU_KP
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #endif
 #include "paddle/phi/kernels/cast_kernel.h"
@@ -337,7 +337,7 @@ struct ReduceConfig {
 SetBlockDim();
 #ifndef PADDLE_WITH_XPU_KP
 // step5: limit the grid to prevent thread overflow
-paddle::platform::LimitGridDim(dev_ctx, &grid);
+phi::backends::gpu::LimitGridDim(dev_ctx, &grid);
 #endif
 }
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -158,7 +158,7 @@ void GPUScatterAssign(const phi::GPUContext& ctx,
 int block = 512;
 int64_t n = slice_size * index_size;
 dim3 grid = dim3((n + block - 1) / block);
-paddle::platform::LimitGridDim(ctx, &grid);
+phi::backends::gpu::LimitGridDim(ctx, &grid);
 // if not overwrite mode, init data
 if (!overwrite) {
@@ -190,7 +190,7 @@ void GPUScatterGradForX(const phi::GPUContext& ctx,
 int64_t n = slice_size * index_size;
 int64_t height = (n + block - 1) / block;
 dim3 grid = dim3((n + block - 1) / block);
-paddle::platform::LimitGridDim(ctx, &grid);
+phi::backends::gpu::LimitGridDim(ctx, &grid);
 ScatterInitCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
 p_index, p_output, index_size, slice_size);
@@ -231,7 +231,7 @@ void GPUScatterNdAdd(const phi::GPUContext& ctx,
 int block = 512;
 int64_t n = slice_size * remain_numel;
 dim3 grid = dim3((n + block - 1) / block);
-paddle::platform::LimitGridDim(ctx, &grid);
+phi::backends::gpu::LimitGridDim(ctx, &grid);
 ScatterNdCUDAKernel<T, IndexT>
 <<<grid, block, 0, ctx.stream()>>>(p_update,
@@ -14,9 +14,9 @@
 #include "paddle/phi/kernels/histogram_kernel.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
@@ -14,9 +14,9 @@
 #include "paddle/phi/kernels/index_add_grad_kernel.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -71,7 +71,7 @@ void IndexAddGradKernel(const Context& ctx,
 // get add_value_grad: index_select(out_grad, index, axis)
 unsigned int block_dim = PADDLE_CUDA_NUM_THREADS;
 dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim);
-paddle::platform::LimitGridDim(ctx, &grid_dim);
+phi::backends::gpu::LimitGridDim(ctx, &grid_dim);
 if (index_type == phi::DataType::INT64) {
 const int64_t* index_data = index.data<int64_t>();
@@ -14,9 +14,9 @@
 #include "paddle/phi/kernels/index_add_kernel.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
@@ -75,7 +75,7 @@ void IndexAddKernel(const Context& ctx,
 unsigned int block_dim = PADDLE_CUDA_NUM_THREADS;
 dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim);
-paddle::platform::LimitGridDim(ctx, &grid_dim);
+phi::backends::gpu::LimitGridDim(ctx, &grid_dim);
 // copy input to output.
 // todo(@limin29): inplace do not need copy.
@@ -18,9 +18,9 @@
 #include <vector>
 #include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -92,16 +92,16 @@ void IndexSampleGradKernel(const Context& ctx,
 size_t index_length = index_dim[1];
 bool same_data_in_index_row = index_length == 1 ? false : true;
-auto block_width = paddle::platform::RoundToPowerOfTwo(index_length);
+auto block_width = phi::backends::gpu::RoundToPowerOfTwo(index_length);
 block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
 auto block_height =
-paddle::platform::RoundToPowerOfTwo(index_length * batch_size) /
+phi::backends::gpu::RoundToPowerOfTwo(index_length * batch_size) /
 block_width;
 block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
 dim3 block_dim(block_width, block_height);
 dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
 (batch_size + block_dim.y - 1) / block_dim.y);
-paddle::platform::LimitGridDim(ctx, &grid_dim);
+phi::backends::gpu::LimitGridDim(ctx, &grid_dim);
 phi::funcs::SetConstant<Context, T> set_zero;
 set_zero(ctx, x_grad, static_cast<T>(0));
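The index_sample kernels shape a 2-D block around the index matrix: the width comes from the row length, the height from the total work divided by that width, each capped by a predefined limit. A worked example, assuming RoundToPowerOfTwo rounds its argument up to the next power of two (the real helper may clamp differently) and illustrative caps PREDEFINED_BLOCK_SIZE_X = 512 and PREDEFINED_BLOCK_SIZE = 1024:

    // Sketch of the 2-D block sizing above; the helper and caps are assumptions.
    #include <cstdint>

    inline int64_t RoundToPowerOfTwoSketch(int64_t n) {
      int64_t p = 1;
      while (p < n) p <<= 1;  // smallest power of two >= n
      return p;
    }

    // With index_length = 20 and batch_size = 8:
    //   block_width  = min(RoundToPowerOfTwoSketch(20), 512)             = 32
    //   block_height = min(RoundToPowerOfTwoSketch(160) / 32, 1024 / 32) = 8
    // i.e. a 32 x 8 = 256-thread block shaped to the index matrix.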
@@ -18,8 +18,8 @@
 #include <vector>
 #include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -80,16 +80,16 @@ void IndexSampleKernel(const Context& ctx,
 size_t input_length = input_dim[1];
 size_t index_length = index_dim[1];
-auto block_width = paddle::platform::RoundToPowerOfTwo(index_length);
+auto block_width = phi::backends::gpu::RoundToPowerOfTwo(index_length);
 block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
 int block_height =
-paddle::platform::RoundToPowerOfTwo(index_length * batch_size) /
+phi::backends::gpu::RoundToPowerOfTwo(index_length * batch_size) /
 block_width;
 block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
 dim3 block_dim(block_width, block_height);
 dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
 (batch_size + block_dim.y - 1) / block_dim.y);
-paddle::platform::LimitGridDim(ctx, &grid_dim);
+phi::backends::gpu::LimitGridDim(ctx, &grid_dim);
 if (index_type == DataType::INT64) {
 const int64_t* index_data = index.data<int64_t>();
@@ -14,9 +14,9 @@
 #include "paddle/phi/kernels/index_select_grad_kernel.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -87,7 +87,7 @@ void IndexSelectGradKernel(const Context& ctx,
 unsigned int block_dim = PADDLE_CUDA_NUM_THREADS;
 dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim);
-paddle::platform::LimitGridDim(ctx, &grid_dim);
+phi::backends::gpu::LimitGridDim(ctx, &grid_dim);
 phi::funcs::SetConstant<phi::GPUContext, T> index_select_grad_init;
 index_select_grad_init(ctx, x_grad, static_cast<T>(0));
@@ -14,9 +14,9 @@
 #pragma once
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
@@ -14,9 +14,9 @@
 #include "paddle/phi/kernels/index_select_kernel.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/gpu/index_select_impl.h"
@@ -62,7 +62,7 @@ void IndexSelectKernel(const Context& ctx,
 unsigned int block_dim = PADDLE_CUDA_NUM_THREADS;
 dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim);
-paddle::platform::LimitGridDim(ctx, &grid_dim);
+phi::backends::gpu::LimitGridDim(ctx, &grid_dim);
 if (index_type == phi::DataType::INT64) {
 const int64_t* index_data = index.data<int64_t>();
@@ -14,9 +14,9 @@
 #include "paddle/phi/kernels/nanmedian_grad_kernel.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -15,9 +15,9 @@
 #include "paddle/phi/kernels/nanmedian_kernel.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/impl/nanmedian_kernel_impl.h"
@@ -173,8 +173,8 @@ void TopkKernel(const Context& dev_ctx,
 // NOTE: old matrix implementation of stride is different to eigen.
 const int kMaxHeight = 2048;
 int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
-paddle::platform::GpuLaunchConfig config =
-paddle::platform::GetGpuLaunchConfig1D(dev_ctx, input_width);
+auto config =
+phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, input_width);
 switch (config.thread_per_block.x) {
 #ifdef PADDLE_WITH_HIP
 FIXED_BLOCK_DIM(
@@ -282,8 +282,8 @@ void TopkKernel(const Context& dev_ctx,
 const int kMaxHeight = 2048;
 int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
-paddle::platform::GpuLaunchConfig config =
-paddle::platform::GetGpuLaunchConfig1D(dev_ctx, input_width);
+auto config =
+phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, input_width);
 switch (config.thread_per_block.x) {
 #ifdef PADDLE_WITH_HIP
 FIXED_BLOCK_DIM(
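The TopK hunks swap the launch-config factory as well. GetGpuLaunchConfig1D chooses a block size and grid size for a 1-D workload, and the following switch (config.thread_per_block.x) works because the chosen block size is always one of a small set of powers of two, which lets FIXED_BLOCK_DIM instantiate a matching fixed-block-dim template. A sketch of what such a factory computes, with field names mirroring the usage above (the real phi helper also accounts for occupancy and device limits):

    // Sketch only: a 1-D launch-config factory with a power-of-two block size.
    #include <cstdint>
    #include <cuda_runtime.h>

    struct GpuLaunchConfigSketch {
      dim3 thread_per_block;
      dim3 block_per_grid;
    };

    inline GpuLaunchConfigSketch GetGpuLaunchConfig1DSketch(int64_t numel,
                                                            int max_threads = 1024) {
      int threads = 1;
      while (threads < numel && threads < max_threads) threads <<= 1;
      const int64_t blocks = (numel + threads - 1) / threads;  // ceiling division
      return {dim3(threads), dim3(static_cast<unsigned int>(blocks))};
    }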