From 44271ece52a182d5435531690e307c4b373e0104 Mon Sep 17 00:00:00 2001 From: FlyingQianMM <245467267@qq.com> Date: Sat, 7 May 2022 14:55:47 +0800 Subject: [PATCH] Reduce the number of threads per block of deformable_psroi_pooling to solve the bug where too many resources requested for launch (PaddlePaddle#42531) (#42533) --- .../operators/deformable_psroi_pooling_op.cu | 54 +++++++++---------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index caad1b169a6..448f67a4bad 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -39,10 +39,10 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; +using paddle::platform::PADDLE_CUDA_NUM_THREADS; -const int CUDA_NUM_THREADS = 1024; static inline int GET_BLOCKS(const int N) { - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; + return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; } template @@ -252,8 +252,8 @@ class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { T* top_data = out->mutable_data(ctx.GetPlace()); T* top_count_data = top_count->mutable_data(ctx.GetPlace()); - DeformablePSROIPoolForwardKernel<<>>( + DeformablePSROIPoolForwardKernel<<< + GET_BLOCKS(count), PADDLE_CUDA_NUM_THREADS, 0, dev_ctx.stream()>>>( count, bottom_data, (T)spatial_scale, channels, height, width, pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, (T)trans_std, sample_per_part, output_dim, group_height, group_width, @@ -344,6 +344,19 @@ __global__ void DeformablePSROIPoolBackwardAccKernel( gw = min(max(gw, 0), group_width - 1); gh = min(max(gh, 0), group_height - 1); + int c = (ctop * group_height + gh) * group_width + gw; + int bottom_index_base = c * height * width; + int bottom_index = + roi_batch_ind * channels * height * width + bottom_index_base; + int trans_index_x = + (((n * num_classes + class_id) * 2) * part_height + part_h) * + part_width + + part_w; + int trans_index_y = + (((n * num_classes + class_id) * 2 + 1) * part_height + part_h) * + part_width + + part_w; + // sampling in each bin for (int ih = 0; ih < sample_per_part; ih++) { for (int iw = 0; iw < sample_per_part; iw++) { @@ -354,7 +367,6 @@ __global__ void DeformablePSROIPoolBackwardAccKernel( } w = min(max(w, 0.), width - 1.); h = min(max(h, 0.), height - 1.); - int c = (ctop * group_height + gh) * group_width + gw; int x0 = floor(w); int x1 = ceil(w); int y0 = floor(h); @@ -366,25 +378,20 @@ __global__ void DeformablePSROIPoolBackwardAccKernel( T q01 = (1 - dist_x) * dist_y; T q10 = dist_x * (1 - dist_y); T q11 = dist_x * dist_y; - int bottom_index_base = c * height * width; // compute gradient of input if (bottom_data_diff) { platform::CudaAtomicAdd( - bottom_data_diff + roi_batch_ind * channels * height * width + - bottom_index_base + y0 * width + x0, + bottom_data_diff + bottom_index + y0 * width + x0, q00 * diff_val); platform::CudaAtomicAdd( - bottom_data_diff + roi_batch_ind * channels * height * width + - bottom_index_base + y1 * width + x0, + bottom_data_diff + bottom_index + y1 * width + x0, q01 * diff_val); platform::CudaAtomicAdd( - bottom_data_diff + roi_batch_ind * channels * height * width + - bottom_index_base + y0 * width + x1, + bottom_data_diff + bottom_index + y0 * width + x1, q10 * diff_val); platform::CudaAtomicAdd( - bottom_data_diff + roi_batch_ind * channels * height * width + - bottom_index_base + y1 * width + x1, + bottom_data_diff + bottom_index + y1 * width + x1, q11 * diff_val); } @@ -405,19 +412,8 @@ __global__ void DeformablePSROIPoolBackwardAccKernel( u00 * (1 - dist_x)) * trans_std * diff_val; diff_y *= roi_height; - platform::CudaAtomicAdd( - bottom_trans_diff + - (((n * num_classes + class_id) * 2) * part_height + part_h) * - part_width + - part_w, - diff_x); - platform::CudaAtomicAdd( - bottom_trans_diff + - (((n * num_classes + class_id) * 2 + 1) * part_height + - part_h) * - part_width + - part_w, - diff_y); + platform::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x); + platform::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y); } } } @@ -520,8 +516,8 @@ class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); - DeformablePSROIPoolBackwardAccKernel<<>>( + DeformablePSROIPoolBackwardAccKernel<<< + GET_BLOCKS(count), PADDLE_CUDA_NUM_THREADS, 0, dev_ctx.stream()>>>( count, top_diff, top_count_data, num_rois, (T)spatial_scale, channels, height, width, pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois, -- GitLab