From bc01242b87897046ac540166b9182b1d0b2b1fcf Mon Sep 17 00:00:00 2001 From: FlyingQianMM <245467267@qq.com> Date: Tue, 12 Apr 2022 10:12:45 +0800 Subject: [PATCH] add a inner loop for index_select_grad_init() in index_select op when dealing with large-shape data (#41563) * replace for with CUDA_KERNEL_LOOP for index_select_grad_init() in index_select op * use CUDA_KERNEL_LOOP_TYPE * fix code style * replace index_select_grad_init with SetConstant --- paddle/phi/kernels/funcs/gather.cu.h | 15 +++++---------- paddle/phi/kernels/funcs/scatter.cu.h | 16 +++++++--------- .../phi/kernels/gpu/index_select_grad_kernel.cu | 16 ++++------------ paddle/phi/kernels/gpu/index_select_kernel.cu | 2 +- 4 files changed, 17 insertions(+), 32 deletions(-) diff --git a/paddle/phi/kernels/funcs/gather.cu.h b/paddle/phi/kernels/funcs/gather.cu.h index 59c8c9f3b8f..617d249308c 100644 --- a/paddle/phi/kernels/funcs/gather.cu.h +++ b/paddle/phi/kernels/funcs/gather.cu.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/memory/memcpy.h" // TODO(paddle-dev): move gpu_primitives.h to phi +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/common/place.h" @@ -110,11 +111,8 @@ void GPUGather(const phi::GPUContext& ctx, int block = 512; int64_t n = slice_size * index_size; - int64_t grid = (n + block - 1) / block; - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - if (grid > maxGridDimX) { - grid = maxGridDimX; - } + dim3 grid = dim3((n + block - 1) / block); + paddle::platform::LimitGridDim(ctx, &grid); GatherCUDAKernel<<>>( p_src, p_index, p_output, index_size, slice_size); @@ -155,11 +153,8 @@ void GPUGatherNd(const phi::GPUContext& ctx, int block = 512; int64_t n = slice_size * remain_numel; - int64_t grid = (n + block - 1) / block; - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - if (grid > maxGridDimX) { - grid = maxGridDimX; - } + dim3 grid = dim3((n + block - 1) / block); + paddle::platform::LimitGridDim(ctx, &grid); GatherNdCUDAKernel<<>>(p_input, g_input_dims, diff --git a/paddle/phi/kernels/funcs/scatter.cu.h b/paddle/phi/kernels/funcs/scatter.cu.h index 254dd45edb5..87083af3bc6 100644 --- a/paddle/phi/kernels/funcs/scatter.cu.h +++ b/paddle/phi/kernels/funcs/scatter.cu.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" @@ -155,9 +156,8 @@ void GPUScatterAssign(const phi::GPUContext& ctx, // set block and grid num int block = 512; int64_t n = slice_size * index_size; - int64_t grid = (n + block - 1) / block; - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - grid = grid > maxGridDimX ? maxGridDimX : grid; + dim3 grid = dim3((n + block - 1) / block); + paddle::platform::LimitGridDim(ctx, &grid); // if not overwrite mode, init data if (!overwrite) { @@ -188,9 +188,8 @@ void GPUScatterGradForX(const phi::GPUContext& ctx, int64_t block = 512; int64_t n = slice_size * index_size; int64_t height = (n + block - 1) / block; - - int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid = height < max_grid_dimx ? height : max_grid_dimx; + dim3 grid = dim3((n + block - 1) / block); + paddle::platform::LimitGridDim(ctx, &grid); ScatterInitCUDAKernel<<>>( p_index, p_output, index_size, slice_size); @@ -230,9 +229,8 @@ void GPUScatterNdAdd(const phi::GPUContext& ctx, int block = 512; int64_t n = slice_size * remain_numel; - int64_t grid = (n + block - 1) / block; - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - grid = grid > maxGridDimX ? maxGridDimX : grid; + dim3 grid = dim3((n + block - 1) / block); + paddle::platform::LimitGridDim(ctx, &grid); ScatterNdCUDAKernel<<>>( p_update, diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu index 75ae1bbcd0a..84094f4c1ee 100644 --- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -19,6 +19,7 @@ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/math_function.h" DECLARE_bool(cudnn_deterministic); @@ -35,7 +36,7 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad, int64_t stride, int64_t size, int64_t delta) { - CUDA_KERNEL_LOOP(idx, N) { + CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { int64_t pre_idx = idx / (stride * size); int64_t dim_idx = idx % (stride * size) / stride; IndexT src_dim_idx = index[dim_idx]; @@ -45,15 +46,6 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad, } } -template -__global__ void index_select_grad_init(T* input_grad, int64_t N) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - input_grad[idx] = 0.0; -} - template void IndexSelectGradKernel(const Context& ctx, const DenseTensor& x, @@ -97,8 +89,8 @@ void IndexSelectGradKernel(const Context& ctx, dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim); paddle::platform::LimitGridDim(ctx, &grid_dim); - index_select_grad_init<<>>(in_grad_data, - numel); + phi::funcs::SetConstant index_select_grad_init; + index_select_grad_init(ctx, x_grad, static_cast(0)); if (FLAGS_cudnn_deterministic) { VLOG(2) << "Run grad kernel of index_select with single thread."; diff --git a/paddle/phi/kernels/gpu/index_select_kernel.cu b/paddle/phi/kernels/gpu/index_select_kernel.cu index 38a6582d790..0a6ac69cef0 100644 --- a/paddle/phi/kernels/gpu/index_select_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_kernel.cu @@ -32,7 +32,7 @@ __global__ void index_select_cuda_kernel(const T* input, int64_t stride, int64_t size, int64_t delta) { - CUDA_KERNEL_LOOP(idx, N) { + CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { int64_t pre_idx = idx / (stride * size); int64_t dim_idx = idx % (stride * size) / stride; IndexT src_dim_idx = index[dim_idx]; -- GitLab