未验证 提交 5d4980c0 编写于 作者: F FlyingQianMM 提交者: GitHub

add a inner loop for index_select_grad_init() in index_select op when dealing...

add a inner loop for index_select_grad_init() in index_select op when dealing with large-shape data (PaddlePaddle#41563) (#41669)
上级 aec47f8a
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
// TODO(paddle-dev): move gpu_primitives.h to phi // TODO(paddle-dev): move gpu_primitives.h to phi
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
...@@ -110,11 +111,8 @@ void GPUGather(const phi::GPUContext& ctx, ...@@ -110,11 +111,8 @@ void GPUGather(const phi::GPUContext& ctx,
int block = 512; int block = 512;
int64_t n = slice_size * index_size; int64_t n = slice_size * index_size;
int64_t grid = (n + block - 1) / block; dim3 grid = dim3((n + block - 1) / block);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; paddle::platform::LimitGridDim(ctx, &grid);
if (grid > maxGridDimX) {
grid = maxGridDimX;
}
GatherCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>( GatherCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
p_src, p_index, p_output, index_size, slice_size); p_src, p_index, p_output, index_size, slice_size);
...@@ -155,11 +153,8 @@ void GPUGatherNd(const phi::GPUContext& ctx, ...@@ -155,11 +153,8 @@ void GPUGatherNd(const phi::GPUContext& ctx,
int block = 512; int block = 512;
int64_t n = slice_size * remain_numel; int64_t n = slice_size * remain_numel;
int64_t grid = (n + block - 1) / block; dim3 grid = dim3((n + block - 1) / block);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; paddle::platform::LimitGridDim(ctx, &grid);
if (grid > maxGridDimX) {
grid = maxGridDimX;
}
GatherNdCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(p_input, GatherNdCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(p_input,
g_input_dims, g_input_dims,
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
...@@ -155,9 +156,8 @@ void GPUScatterAssign(const phi::GPUContext& ctx, ...@@ -155,9 +156,8 @@ void GPUScatterAssign(const phi::GPUContext& ctx,
// set block and grid num // set block and grid num
int block = 512; int block = 512;
int64_t n = slice_size * index_size; int64_t n = slice_size * index_size;
int64_t grid = (n + block - 1) / block; dim3 grid = dim3((n + block - 1) / block);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; paddle::platform::LimitGridDim(ctx, &grid);
grid = grid > maxGridDimX ? maxGridDimX : grid;
// if not overwrite mode, init data // if not overwrite mode, init data
if (!overwrite) { if (!overwrite) {
...@@ -188,9 +188,8 @@ void GPUScatterGradForX(const phi::GPUContext& ctx, ...@@ -188,9 +188,8 @@ void GPUScatterGradForX(const phi::GPUContext& ctx,
int64_t block = 512; int64_t block = 512;
int64_t n = slice_size * index_size; int64_t n = slice_size * index_size;
int64_t height = (n + block - 1) / block; int64_t height = (n + block - 1) / block;
dim3 grid = dim3((n + block - 1) / block);
int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; paddle::platform::LimitGridDim(ctx, &grid);
int64_t grid = height < max_grid_dimx ? height : max_grid_dimx;
ScatterInitCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>( ScatterInitCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
p_index, p_output, index_size, slice_size); p_index, p_output, index_size, slice_size);
...@@ -230,9 +229,8 @@ void GPUScatterNdAdd(const phi::GPUContext& ctx, ...@@ -230,9 +229,8 @@ void GPUScatterNdAdd(const phi::GPUContext& ctx,
int block = 512; int block = 512;
int64_t n = slice_size * remain_numel; int64_t n = slice_size * remain_numel;
int64_t grid = (n + block - 1) / block; dim3 grid = dim3((n + block - 1) / block);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; paddle::platform::LimitGridDim(ctx, &grid);
grid = grid > maxGridDimX ? maxGridDimX : grid;
ScatterNdCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>( ScatterNdCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
p_update, p_update,
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/funcs/math_function.h"
DECLARE_bool(cudnn_deterministic); DECLARE_bool(cudnn_deterministic);
...@@ -35,7 +36,7 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad, ...@@ -35,7 +36,7 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad,
int64_t stride, int64_t stride,
int64_t size, int64_t size,
int64_t delta) { int64_t delta) {
CUDA_KERNEL_LOOP(idx, N) { CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) {
int64_t pre_idx = idx / (stride * size); int64_t pre_idx = idx / (stride * size);
int64_t dim_idx = idx % (stride * size) / stride; int64_t dim_idx = idx % (stride * size) / stride;
IndexT src_dim_idx = index[dim_idx]; IndexT src_dim_idx = index[dim_idx];
...@@ -45,15 +46,6 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad, ...@@ -45,15 +46,6 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad,
} }
} }
// Zero-initialize the gradient buffer of length N before the scatter-add
// pass of index_select's backward kernel.
//
// Uses a grid-stride loop so correctness no longer depends on the launch
// grid covering N with one thread per element: when the caller clamps the
// grid to the device's max gridDim.x for large-shape inputs, each thread
// simply zeroes multiple elements instead of leaving an uninitialized tail.
template <typename T>
__global__ void index_select_grad_init(T* input_grad, int64_t N) {
  // Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit unsigned
  // product and can overflow for very large grids.
  int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
  for (int64_t idx =
           static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       idx < N;
       idx += stride) {
    // static_cast<T>(0) instead of the double literal 0.0 — T may be
    // half/bfloat16/integer, where a double constant converts silently.
    input_grad[idx] = static_cast<T>(0);
  }
}
template <typename T, typename Context> template <typename T, typename Context>
void IndexSelectGradKernel(const Context& ctx, void IndexSelectGradKernel(const Context& ctx,
const DenseTensor& x, const DenseTensor& x,
...@@ -97,8 +89,8 @@ void IndexSelectGradKernel(const Context& ctx, ...@@ -97,8 +89,8 @@ void IndexSelectGradKernel(const Context& ctx,
dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim); dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim);
paddle::platform::LimitGridDim(ctx, &grid_dim); paddle::platform::LimitGridDim(ctx, &grid_dim);
index_select_grad_init<T><<<grid_dim, block_dim, 0, stream>>>(in_grad_data, phi::funcs::SetConstant<phi::GPUContext, T> index_select_grad_init;
numel); index_select_grad_init(ctx, x_grad, static_cast<T>(0));
if (FLAGS_cudnn_deterministic) { if (FLAGS_cudnn_deterministic) {
VLOG(2) << "Run grad kernel of index_select with single thread."; VLOG(2) << "Run grad kernel of index_select with single thread.";
......
...@@ -32,7 +32,7 @@ __global__ void index_select_cuda_kernel(const T* input, ...@@ -32,7 +32,7 @@ __global__ void index_select_cuda_kernel(const T* input,
int64_t stride, int64_t stride,
int64_t size, int64_t size,
int64_t delta) { int64_t delta) {
CUDA_KERNEL_LOOP(idx, N) { CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) {
int64_t pre_idx = idx / (stride * size); int64_t pre_idx = idx / (stride * size);
int64_t dim_idx = idx % (stride * size) / stride; int64_t dim_idx = idx % (stride * size) / stride;
IndexT src_dim_idx = index[dim_idx]; IndexT src_dim_idx = index[dim_idx];
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册