diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 9da8551eb2d7ea66ad434c42b54522432095ce29..8e4a07556fb51dbb15ef948fcee120e2f68e089a 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -256,36 +256,65 @@ __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, * 3. go to the second setp, until one thread's topk value is null; * 4. go to the first setp, until get the topk value. */ + template __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, - const T* src, int lds, int dim, int k) { + const T* src, int lds, int dim, int k, + int grid_dim, int num) { __shared__ Pair sh_topk[BlockSize]; __shared__ int maxid[BlockSize / 2]; const int tid = threadIdx.x; const int warp = threadIdx.x / 32; - output += blockIdx.x * output_stride; - indices += blockIdx.x * k; - Pair topk[MaxLength]; - int beam = MaxLength; - Pair max; - bool is_empty = false; - bool firststep = true; + const int bid = blockIdx.x; + for (int i = bid; i < num; i += grid_dim) { + output += i * output_stride; + indices += i * k; + + Pair topk[MaxLength]; + int beam = MaxLength; + Pair max; + bool is_empty = false; + bool firststep = true; + + for (int k = 0; k < MaxLength; k++) { + topk[k].set(-INFINITY, -1); + } + while (k) { + ThreadGetTopK( + topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid); - for (int k = 0; k < MaxLength; k++) { - topk[k].set(-INFINITY, -1); + sh_topk[tid] = topk[0]; + BlockReduce(sh_topk, maxid, topk, &output, + &indices, &beam, &k, tid, warp); + } } - while (k) { - ThreadGetTopK(topk, &beam, k, - src + blockIdx.x * lds, &firststep, - &is_empty, &max, dim, tid); - - sh_topk[tid] = topk[0]; - BlockReduce(sh_topk, maxid, topk, &output, - &indices, &beam, &k, tid, warp); +} + +inline static int GetDesiredBlockDim(int dim) { + if (dim > 128) { + return 256; + } else if (dim > 64) { + return 128; + } else if (dim > 32) { + return 64; + } else { + return 32; } } +#define FIXED_BLOCK_DIM_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kBlockDim = (dim); \ + __VA_ARGS__; \ + } break + +#define FIXED_BLOCK_DIM(...) \ + FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) + template class TopkOpCUDAKernel : public framework::OpKernel { public: @@ -310,18 +339,26 @@ class TopkOpCUDAKernel : public framework::OpKernel { // NOTE: pass lds and dim same to input width. // NOTE: old matrix implementation of stride is different to eigen. // TODO(typhoonzero): refine this kernel. - dim3 threads(256, 1); - dim3 grid(input_height, 1); - - KeMatrixTopK<<< - grid, threads, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>( - output_data, output->dims()[1], indices_data, input_data, input_width, - input_width, static_cast(k)); + const int kMaxHeight = 2048; + int gridx = input_height < kMaxHeight ? input_height : kMaxHeight; + auto& dev_ctx = ctx.cuda_device_context(); + + switch (GetDesiredBlockDim(input_width)) { + FIXED_BLOCK_DIM( + KeMatrixTopK<<>>( + output_data, output->dims()[1], indices_data, input_data, + input_width, input_width, static_cast(k), gridx, + input_height)); + default: + PADDLE_THROW("Error"); + } } }; +#undef FIXED_BLOCK_DIM_BASE +#undef FIXED_BLOCK_DIM + } // namespace operators } // namespace paddle