From 53c288a3041308c7cf14049b2e3f85b120f3d92e Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Thu, 21 May 2020 14:06:21 +0800 Subject: [PATCH] fix(dnn/cuda): fix topk grid oversize GitOrigin-RevId: d3c811a034e09f72576173130d9aac26d601fbf6 --- dnn/src/cuda/topk/topk_radix.cu | 170 ++++++++++++++++++++++---------- dnn/test/common/topk.cpp | 6 ++ 2 files changed, 125 insertions(+), 51 deletions(-) diff --git a/dnn/src/cuda/topk/topk_radix.cu b/dnn/src/cuda/topk/topk_radix.cu index eedc3dcb1..cf28d82bf 100644 --- a/dnn/src/cuda/topk/topk_radix.cu +++ b/dnn/src/cuda/topk/topk_radix.cu @@ -470,7 +470,17 @@ static size_t get_scan_workspace(uint32_t size) { uint32_t topk::find_kth_radix_workspace(uint32_t batch, uint32_t length) { using namespace cuda_topk_impl::kth; - return (batch * get_grid_dim_x(length) * NR_BUCKET + batch * 2) * + int device_id; + if (cudaGetDevice(&device_id) != cudaSuccess) { + megdnn_trap(); + } + cudaDeviceProp prop; + if (cudaGetDeviceProperties(&prop, device_id) != cudaSuccess) { + megdnn_trap(); + } + uint32_t grid_dim_y_limit = prop.maxGridSize[1]; + uint32_t limit = batch > grid_dim_y_limit ? grid_dim_y_limit : batch; + return (limit * get_grid_dim_x(length) * NR_BUCKET + limit * 2) * sizeof(uint32_t); } @@ -491,35 +501,65 @@ cudaError_t topk::find_kth_radix(const ctype* input, ctype* output, // assert megdnn_trap(); } + + int device_id; + if (cudaGetDevice(&device_id) != cudaSuccess) { + megdnn_trap(); + } + cudaDeviceProp prop; + if (cudaGetDeviceProperties(&prop, device_id) != cudaSuccess) { + megdnn_trap(); + } + uint32_t grid_dim_y_limit = prop.maxGridSize[1]; + + uint32_t batch_idx = 0; uint32_t grid_dim_x = get_grid_dim_x(length); - dim3 grid_dim(grid_dim_x, batch); - uint32_t* dev_k = static_cast(workspace); - uint32_t* dev_prefix = dev_k + batch; - uint32_t* bucket_cnt = dev_prefix + batch; - - compute_histogram<<>>( - input, bucket_cnt, length, lda, nullptr); - // use float to make compiler happy; it is not used since last == false - update_prefix_and_k - <<>>(bucket_cnt, dev_prefix, dev_k, k, - grid_dim_x, nullptr); - - compute_histogram<<>>( - input, bucket_cnt, length, lda, dev_prefix); - update_prefix_and_k - <<>>(bucket_cnt, dev_prefix, dev_k, k, - grid_dim_x, nullptr); - compute_histogram<<>>( - input, bucket_cnt, length, lda, dev_prefix); - update_prefix_and_k - <<>>(bucket_cnt, dev_prefix, dev_k, k, - grid_dim_x, nullptr); - - compute_histogram<<>>( - input, bucket_cnt, length, lda, dev_prefix); - update_prefix_and_k<<>>( - bucket_cnt, dev_prefix, dev_k, k, grid_dim_x, output); + uint32_t grid_dim_y = 1; + + while (batch_idx < batch) { + if (batch - batch_idx >= grid_dim_y_limit) { + grid_dim_y = grid_dim_y_limit; + } else { + grid_dim_y = batch - batch_idx; + } + + dim3 grid_dim(grid_dim_x, grid_dim_y); + uint32_t* dev_k = static_cast(workspace); + uint32_t* dev_prefix = dev_k + grid_dim_y; + uint32_t* bucket_cnt = dev_prefix + grid_dim_y; + + compute_histogram<<>>( + input + batch_idx * lda, bucket_cnt, length, lda, nullptr); + + // use float to make compiler happy; it is not used since last == false + update_prefix_and_k + <<>>( + bucket_cnt, dev_prefix, dev_k, k, grid_dim_x, nullptr); + + compute_histogram<<>>( + input + batch_idx * lda, bucket_cnt, length, lda, dev_prefix); + + update_prefix_and_k + <<>>( + bucket_cnt, dev_prefix, dev_k, k, grid_dim_x, nullptr); + + compute_histogram<<>>( + input + batch_idx * lda, bucket_cnt, length, lda, dev_prefix); + update_prefix_and_k + <<>>( + bucket_cnt, dev_prefix, dev_k, k, grid_dim_x, nullptr); + + compute_histogram<<>>( + input + batch_idx * lda, bucket_cnt, length, lda, dev_prefix); + + update_prefix_and_k + <<>>(bucket_cnt, dev_prefix, + dev_k, k, grid_dim_x, + output + batch_idx); + + batch_idx += grid_dim_y; + } return cudaGetLastError(); } @@ -530,12 +570,18 @@ cudaError_t topk::topk_select(const ctype* input, const ctype* thresh, int32_t lda, int32_t k, cudaStream_t stream) { using namespace cuda_topk_impl; using namespace cuda_topk_impl::select; - uint32_t length_split = DIVUP(length, REDUCE_SIZE), - scan_size = batch * length_split; - size_t scan_wk = get_scan_workspace(scan_size); - uint64_t *scan_inp = static_cast(workspace) + - scan_wk / sizeof(uint64_t), - *scan_out = scan_inp + scan_size; + + int device_id; + if (cudaGetDevice(&device_id) != cudaSuccess) { + megdnn_trap(); + } + cudaDeviceProp prop; + if (cudaGetDeviceProperties(&prop, device_id) != cudaSuccess) { + megdnn_trap(); + } + uint32_t batch_upper_limit = prop.maxGridSize[1]; + + uint32_t length_split = DIVUP(length, REDUCE_SIZE); void (*kptr_reduce_block_cnt)(const ctype*, const ctype*, uint32_t, int32_t, uint64_t*, uint32_t); @@ -585,25 +631,47 @@ cudaError_t topk::topk_select(const ctype* input, const ctype* thresh, #undef CASE_SHARD #undef CASE_SHARD_ON - // reduce to scan_inp - kptr_reduce_block_cnt<<>>( - input, thresh, length, lda, scan_inp, length_split); + uint32_t batch_idx = 0; + uint32_t batch_real = 1; - // scan to scan_out - scan_out += 1; // set scan[-1] to 0 - cudaError_t err = invoke_cub_scan(scan_inp, scan_out, workspace, scan_wk, - scan_size, stream); - if (err != cudaSuccess) { - return err; - } - kern_init_zero<<<1, 1, 0, stream>>>(scan_out - 1); + while (batch_idx < batch) { + if (batch - batch_idx >= batch_upper_limit) { + batch_real = batch_upper_limit; + } else { + batch_real = batch - batch_idx; + } - // copy result - kptr_copy<<>>( - input, thresh, scan_out, length_split, output_value, output_idx, - length, k, lda); + size_t scan_size = batch_real * length_split; + size_t scan_wk = get_scan_workspace(scan_size); + uint64_t *scan_inp = static_cast(workspace) + + scan_wk / sizeof(uint64_t), + *scan_out = scan_inp + scan_size; + + // reduce to scan_inp + kptr_reduce_block_cnt<<< + dim3(DIVUP(length_split, REDUCE_SHARD), batch_real), + dim3(REDUCE_WARP_SIZE, REDUCE_SHARD), 0, stream>>>( + input + batch_idx * lda, thresh + batch_idx, length, lda, + scan_inp, length_split); + + // scan to scan_out + scan_out += 1; // set scan[-1] to 0 + cudaError_t err = invoke_cub_scan(scan_inp, scan_out, workspace, + scan_wk, scan_size, stream); + if (err != cudaSuccess) { + return err; + } + kern_init_zero<<<1, 1, 0, stream>>>(scan_out - 1); + + // copy result + kptr_copy<<>>( + input + batch_idx * lda, thresh + batch_idx, scan_out, + length_split, output_value + std::abs(k) * batch_idx, + output_idx + std::abs(k) * batch_idx, length, k, lda); + + batch_idx += batch_real; + } return cudaGetLastError(); } diff --git a/dnn/test/common/topk.cpp b/dnn/test/common/topk.cpp index efd2fe81a..77f3996c1 100644 --- a/dnn/test/common/topk.cpp +++ b/dnn/test/common/topk.cpp @@ -169,6 +169,12 @@ void test::run_topk_test(Handle* handle) { run(5, 123, 3, mode); // equiv to sort run(-5, 123, 3, mode); // equiv to rev sort run(5, 3, 1231, mode, 2000); // non contig + +//! opencl on armv7's CI does not support large batch. +//! but P30 and MI9 are ok. fix it in the future. +#if !defined(MEGDNN_ARMV7) && defined(MGB_CUDA) + run(3, 70000, 5, mode, 10); // non contig +#endif } // special case to check if tie-break is correct -- GitLab