未验证 提交 2c687df0 编写于 作者: C carryyu 提交者: GitHub

Optimize topk's performance when k is small and input_width is large (#45312)

* Optimize topk's performance when k is small and input_width is large

* 修改blockdim设置逻辑

* Update top_k_function_cuda.h
上级 18860735
......@@ -27,9 +27,11 @@ limitations under the License. */
#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h"
#define FINAL_MASK 0xffffffff
#ifdef __HIPCC__
namespace rocprim {
namespace detail {
......@@ -105,6 +107,14 @@ inline static int GetDesiredBlockDim(int dim) {
}
}
inline static int getMaxLength(int k) {
if (k / 5 < 1) {
return 1;
} else if (k / 5 >= 1) {
return min(k / 5, 5);
}
}
template <typename T>
__global__ void InitIndex(T* indices, T num_rows, T num_cols) {
int col_id = threadIdx.x;
......@@ -248,7 +258,11 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[],
if (k < MaxLength - (*beam)) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(-static_cast<T>(INFINITY), -1);
if (largest) {
topk[k].set(-static_cast<T>(INFINITY), -1);
} else {
topk[k].set(static_cast<T>(INFINITY), -1);
}
}
}
if (!(*is_empty)) {
......@@ -258,79 +272,98 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[],
}
*max = topk[MaxLength - 1];
if ((*max).v == -static_cast<T>(1)) *is_empty = true;
if ((*max).id == -1) *is_empty = true;
*beam = 0;
}
}
template <typename T>
__forceinline__ __device__ Pair<T> WarpReduce(Pair<T> input,
const bool& largest) {
if (largest) {
#pragma unroll
for (int offset = 16; offset > 0; offset >>= 1) {
T tmp_val = platform::CudaShuffleDownSync(FINAL_MASK, input.v, offset);
int tmp_id = platform::CudaShuffleDownSync(FINAL_MASK, input.id, offset);
if (input.v < tmp_val || (input.v == tmp_val && input.id > tmp_id)) {
input.v = tmp_val;
input.id = tmp_id;
}
}
} else {
#pragma unroll
for (int offset = 16; offset > 0; offset >>= 1) {
T tmp_val = platform::CudaShuffleDownSync(FINAL_MASK, input.v, offset);
int tmp_id = platform::CudaShuffleDownSync(FINAL_MASK, input.id, offset);
if (input.v > tmp_val || (input.v == tmp_val && input.id > tmp_id)) {
input.v = tmp_val;
input.id = tmp_id;
}
}
}
return input;
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk,
int* maxid,
__device__ __forceinline__ void BlockReduce(Pair<T> shared_max[],
Pair<T> topk[],
T** topVal,
int64_t** topIds,
int* beam,
int* k,
const int tid,
const int warp,
const int wid,
const int lane,
const bool& largest) {
while (true) {
__syncthreads();
if (tid < BlockSize / 2) {
if (largest) {
if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) {
maxid[tid] = tid + BlockSize / 2;
} else {
maxid[tid] = tid;
}
} else {
if (sh_topk[tid] > sh_topk[tid + BlockSize / 2]) {
maxid[tid] = tid + BlockSize / 2;
} else {
maxid[tid] = tid;
}
}
Pair<T> input_now = topk[0];
input_now = WarpReduce(input_now, largest);
if (lane == 0) {
shared_max[wid] = input_now;
}
__syncthreads();
for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) {
if (tid < stride) {
if (largest) {
if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) {
maxid[tid] = maxid[tid + stride];
}
} else {
if (sh_topk[maxid[tid]] > sh_topk[maxid[tid + stride]]) {
maxid[tid] = maxid[tid + stride];
}
}
}
__syncthreads();
if (largest) {
input_now = (tid < BlockSize / 32)
? shared_max[lane]
: Pair<T>(-static_cast<T>(INFINITY), -1);
} else {
input_now = (tid < BlockSize / 32)
? shared_max[lane]
: Pair<T>(static_cast<T>(INFINITY), -1);
}
if (wid == 0) {
input_now = WarpReduce(input_now, largest);
if (lane == 0) shared_max[0] = input_now;
}
__syncthreads();
if (tid == 0) {
**topVal = sh_topk[maxid[0]].v;
**topIds = sh_topk[maxid[0]].id;
**topVal = input_now.v;
**topIds = input_now.id;
(*topVal)++;
(*topIds)++;
}
if (tid == maxid[0]) (*beam)++;
if (--(*k) == 0) break;
__syncthreads();
if (tid == maxid[0]) {
int tid_max = shared_max[0].id % BlockSize;
if (tid == tid_max) {
(*beam)++;
if (*beam < MaxLength) {
sh_topk[tid] = topk[*beam];
topk[0] = topk[*beam];
}
}
// NOTE(zcd): temporary solution
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
if (maxid[0] / 32 == warp) {
if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) ==
MaxLength)
break;
if (--(*k) == 0) break;
if (MaxLength < 5) {
if (*beam >= MaxLength) break;
} else {
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
if (tid_max / 32 == wid) {
if (platform::CudaShuffleSync(mask, *beam, tid_max % 32, 32) ==
MaxLength)
break;
}
}
}
}
......@@ -355,14 +388,13 @@ __global__ void KeMatrixTopK(T* output,
int grid_dim,
int num,
bool largest = true) {
__shared__ Pair<T> sh_topk[BlockSize];
const int tid = threadIdx.x;
const int warp = threadIdx.x / 32;
const int wid = tid / 32;
const int lane = tid % 32;
const int bid = blockIdx.x;
for (int i = bid; i < num; i += grid_dim) {
int top_num = k;
__shared__ int maxid[BlockSize / 2];
__shared__ Pair<T> shared_max[BlockSize / 32];
T* out = output + i * output_stride;
int64_t* inds = indices + i * k;
Pair<T> topk[MaxLength];
......@@ -389,17 +421,15 @@ __global__ void KeMatrixTopK(T* output,
dim,
tid,
largest);
sh_topk[tid] = topk[0];
BlockReduce<T, MaxLength, BlockSize>(sh_topk,
maxid,
BlockReduce<T, MaxLength, BlockSize>(shared_max,
topk,
&out,
&inds,
&beam,
&top_num,
tid,
warp,
wid,
lane,
largest);
}
}
......
......@@ -38,12 +38,27 @@ using Tensor = framework::Tensor;
__VA_ARGS__; \
} break
#define FIXED_BLOCK_DIM(...) \
FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \
#define FIXED_MAXLENGTH_BASE(MaxLength, ...) \
case (MaxLength): { \
constexpr auto maxLength = (MaxLength); \
__VA_ARGS__; \
} break
#define FIXED_BLOCK_DIM(...) \
FIXED_BLOCK_DIM_BASE(1024, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(512, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
#define FIXED_MAXLENGTH(...) \
FIXED_MAXLENGTH_BASE(1, ##__VA_ARGS__); \
FIXED_MAXLENGTH_BASE(2, ##__VA_ARGS__); \
FIXED_MAXLENGTH_BASE(3, ##__VA_ARGS__); \
FIXED_MAXLENGTH_BASE(4, ##__VA_ARGS__); \
FIXED_MAXLENGTH_BASE(5, ##__VA_ARGS__)
template <typename DeviceContext, typename T>
class TopkOpCUDAKernel : public framework::OpKernel<T> {
public:
......@@ -95,18 +110,25 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
// TODO(typhoonzero): refine this kernel.
const int kMaxHeight = 2048;
int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
switch (GetDesiredBlockDim(input_width)) {
FIXED_BLOCK_DIM(
KeMatrixTopK<T, 5, kBlockDim>
<<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(output_data,
k,
indices_data,
input_data,
input_width,
input_width,
static_cast<int>(k),
gridx,
input_height));
paddle::platform::GpuLaunchConfig config =
paddle::platform::GetGpuLaunchConfig1D(dev_ctx, input_width);
switch (config.thread_per_block.x) {
FIXED_BLOCK_DIM(switch (getMaxLength(k)) {
FIXED_MAXLENGTH(
KeMatrixTopK<T, maxLength, kBlockDim>
<<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(output_data,
k,
indices_data,
input_data,
input_width,
input_width,
static_cast<int>(k),
gridx,
input_height));
default:
PADDLE_THROW(platform::errors::Fatal(
"the input k has error in the topk cuda kernel."));
});
default:
PADDLE_THROW(platform::errors::Unavailable(
"Calculation error occurred in TopK Operator's CUDA Kernel."));
......
......@@ -31,12 +31,27 @@ namespace ops = paddle::operators;
__VA_ARGS__; \
} break
#define FIXED_BLOCK_DIM(...) \
FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \
#define FIXED_MAXLENGTH_BASE(MaxLength, ...) \
case (MaxLength): { \
constexpr auto maxLength = (MaxLength); \
__VA_ARGS__; \
} break
#define FIXED_BLOCK_DIM(...) \
FIXED_BLOCK_DIM_BASE(1024, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(512, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
#define FIXED_MAXLENGTH(...) \
FIXED_MAXLENGTH_BASE(1, ##__VA_ARGS__); \
FIXED_MAXLENGTH_BASE(2, ##__VA_ARGS__); \
FIXED_MAXLENGTH_BASE(3, ##__VA_ARGS__); \
FIXED_MAXLENGTH_BASE(4, ##__VA_ARGS__); \
FIXED_MAXLENGTH_BASE(5, ##__VA_ARGS__)
template <typename T, typename Context>
void TopkKernel(const Context& dev_ctx,
const DenseTensor& x,
......@@ -158,7 +173,9 @@ void TopkKernel(const Context& dev_ctx,
// NOTE: old matrix implementation of stride is different to eigen.
const int kMaxHeight = 2048;
int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
switch (ops::GetDesiredBlockDim(input_width)) {
paddle::platform::GpuLaunchConfig config =
paddle::platform::GetGpuLaunchConfig1D(dev_ctx, input_width);
switch (config.thread_per_block.x) {
#ifdef PADDLE_WITH_HIP
FIXED_BLOCK_DIM(
ops::KeMatrixTopK<T, 20, kBlockDim>
......@@ -173,18 +190,23 @@ void TopkKernel(const Context& dev_ctx,
input_height,
largest));
#else
FIXED_BLOCK_DIM(
ops::KeMatrixTopK<T, 5, kBlockDim>
<<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(output_data,
k,
indices_data,
input_data,
input_width,
input_width,
static_cast<int>(k),
gridx,
input_height,
largest));
FIXED_BLOCK_DIM(switch (ops::getMaxLength(k)) {
FIXED_MAXLENGTH(
ops::KeMatrixTopK<T, maxLength, kBlockDim>
<<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(output_data,
k,
indices_data,
input_data,
input_width,
input_width,
static_cast<int>(k),
gridx,
input_height,
largest));
default:
PADDLE_THROW(
errors::Fatal("the input k has error in the topk cuda kernel."));
});
#endif
default:
PADDLE_THROW(errors::Fatal(
......@@ -259,7 +281,9 @@ void TopkKernel(const Context& dev_ctx,
const int kMaxHeight = 2048;
int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
switch (ops::GetDesiredBlockDim(input_width)) {
paddle::platform::GpuLaunchConfig config =
paddle::platform::GetGpuLaunchConfig1D(dev_ctx, input_width);
switch (config.thread_per_block.x) {
#ifdef PADDLE_WITH_HIP
FIXED_BLOCK_DIM(
ops::KeMatrixTopK<T, 20, kBlockDim>
......@@ -274,18 +298,23 @@ void TopkKernel(const Context& dev_ctx,
input_height,
largest));
#else
FIXED_BLOCK_DIM(
ops::KeMatrixTopK<T, 5, kBlockDim>
<<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(trans_out.data<T>(),
k,
trans_ind.data<int64_t>(),
trans_input.data<T>(),
input_width,
input_width,
static_cast<int>(k),
gridx,
input_height,
largest));
FIXED_BLOCK_DIM(switch (ops::getMaxLength(k)) {
FIXED_MAXLENGTH(ops::KeMatrixTopK<T, maxLength, kBlockDim>
<<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
trans_out.data<T>(),
k,
trans_ind.data<int64_t>(),
trans_input.data<T>(),
input_width,
input_width,
static_cast<int>(k),
gridx,
input_height,
largest));
default:
PADDLE_THROW(
errors::Fatal("the input k has error in the topk cuda kernel."));
});
#endif
default:
PADDLE_THROW(errors::Fatal(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册