From 4b3e8d567bd5dc45c2413ad4f7199cef12cbb35b Mon Sep 17 00:00:00 2001 From: wawltor Date: Wed, 22 Jun 2022 16:24:14 +0800 Subject: [PATCH] fix the cumsum bug for large size (#43722) --- paddle/phi/kernels/gpu/cum_kernel.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/phi/kernels/gpu/cum_kernel.cu b/paddle/phi/kernels/gpu/cum_kernel.cu index 837914fb2d..40d7f74379 100644 --- a/paddle/phi/kernels/gpu/cum_kernel.cu +++ b/paddle/phi/kernels/gpu/cum_kernel.cu @@ -176,10 +176,8 @@ __global__ void BlockScanKernel(T* d_out, } temp_storage; int bx = blockIdx.x; - int by = blockIdx.y; BlockPrefixCallbackOp prefix_op(Identity::value, op); - T block_aggregate = static_cast(0); // Obtain this block's segment of consecutive keys (blocked across threads) int item_per_block = BLOCK_THREADS * ITEMS_PER_THREAD; @@ -192,7 +190,7 @@ __global__ void BlockScanKernel(T* d_out, valid_item = scan_size; } - int offset = bx * scan_size + block_offset + by * (inner_size * scan_size); + int offset = block_offset + bx * scan_size; T thread_keys[ITEMS_PER_THREAD]; BlockLoadT(temp_storage.load) @@ -307,6 +305,7 @@ void ScanKernel(const Context& dev_ctx, int outer_size = height / scan_size; int inner_size = width; // Consider the size of shared memory, here block size is 128 + dim3 scan_grid(outer_size, inner_size); dim3 reverse_grid = scan_grid; if (reverse) { @@ -322,13 +321,14 @@ void ScanKernel(const Context& dev_ctx, in_data, out_data, scan_size, outer_size, inner_size); } } + int64_t grid_size = outer_size * inner_size; if (!transpose && !reverse) { - BlockScanKernel<<>>( + BlockScanKernel<<>>( out_data, in_data, outer_size, inner_size, scan_size, exclusive, op); } else { BlockScanKernel - <<>>(next_out_data, + <<>>(next_out_data, next_in_data, outer_size, inner_size, -- GitLab