From fc93266b0ac0fcd65c48f4c0cb95162e5dfd147b Mon Sep 17 00:00:00 2001
From: Jeng Bai-Cheng
Date: Fri, 10 Jul 2020 11:00:20 +0800
Subject: [PATCH] Improve qkv transpose performance (#23919)

Use vector instructions (LDG.128) to improve the qkv transpose. This
provides a 1.4X speedup at the same GPU base frequency.

test=develop
---
 .../tensorrt/plugin/qkv_to_context_plugin.cu  | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
index fe3ea180593..240ecaa2589 100644
--- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
@@ -115,7 +115,18 @@ inline void TransposeQKV(const int batch, const int seq_len,
                          const half *input, half *output, cudaStream_t stream) {
   int scratch_size = batch * head_num * seq_len * seq_len;
   const dim3 grid(seq_len, batch, 3);
-  if (head_size % 2 == 0 && scratch_size % 2 == 0) {
+  if (head_size % 8 == 0 && scratch_size % 8 == 0) {
+    int h = head_size / 8;
+    const int4 *input4 = reinterpret_cast<const int4 *>(input);
+    int4 *output4 = reinterpret_cast<int4 *>(output);
+    dim3 block(h, head_num, 1);
+    // limit h * head_num to max block size(1024).
+    PADDLE_ENFORCE_LE(h * head_num, 1024,
+                      platform::errors::InvalidArgument(
+                          "head_num (%d) * head_size (%d) should <= %d",
+                          head_num, head_size, 1024 * 8));
+    TransposeQkvKernel<int4><<<grid, block, 0, stream>>>(h, input4, output4);
+  } else if (head_size % 2 == 0 && scratch_size % 2 == 0) {
     const int h = head_size / 2;
     const half2 *input2 = reinterpret_cast<const half2 *>(input);
     half2 *output2 = reinterpret_cast<half2 *>(output);
-- 
GitLab
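
Note (not part of the patch): the hunk above only changes the host-side dispatch; the body of the templated TransposeQkvKernel is outside the diff context. The standalone CUDA sketch below illustrates the vectorized-copy idea under the [batch, seq_len, 3, head_num, head_size] -> [3, batch, head_num, seq_len, head_size] layout implied by grid(seq_len, batch, 3) and block(h, head_num, 1). The kernel name transpose_qkv_sketch and the example shapes are assumptions for illustration, not the plugin's actual code.

#include <cstdio>

#include <cuda_fp16.h>
#include <cuda_runtime.h>

// One thread copies one element of type T. Instantiating T = int4 makes each
// copy 16 bytes wide (8 halves), which lets the compiler emit 128-bit
// LDG.128 / STG.128 memory instructions; T = half2 gives 32-bit accesses.
template <typename T>
__global__ void transpose_qkv_sketch(const int H, const T *input, T *output) {
  const int h = threadIdx.x;  // offset inside one head, in units of T
  const int n = threadIdx.y;  // head index          (blockDim.y == head_num)
  const int s = blockIdx.x;   // sequence position   (gridDim.x  == seq_len)
  const int b = blockIdx.y;   // batch index         (gridDim.y  == batch)
  const int m = blockIdx.z;   // 0 = Q, 1 = K, 2 = V (gridDim.z  == 3)

  const int N = blockDim.y;
  const int S = gridDim.x;
  const int B = gridDim.y;

  // input [B, S, 3, N, H]  ->  output [3, B, N, S, H]
  const int in_idx = (((b * S + s) * 3 + m) * N + n) * H + h;
  const int out_idx = (((m * B + b) * N + n) * S + s) * H + h;
  output[out_idx] = input[in_idx];
}

// Host-side dispatch mirroring the patch: pick the widest vector type whose
// element count divides head_size. Shapes below are illustrative only.
int main() {
  const int batch = 1, seq_len = 128, head_num = 12, head_size = 64;
  const size_t elems =
      static_cast<size_t>(batch) * seq_len * 3 * head_num * head_size;

  half *in = nullptr, *out = nullptr;
  cudaMalloc(reinterpret_cast<void **>(&in), elems * sizeof(half));
  cudaMalloc(reinterpret_cast<void **>(&out), elems * sizeof(half));

  const dim3 grid(seq_len, batch, 3);
  if (head_size % 8 == 0) {
    const int h = head_size / 8;  // 8 halves per int4
    const dim3 block(h, head_num, 1);
    transpose_qkv_sketch<int4><<<grid, block>>>(
        h, reinterpret_cast<const int4 *>(in), reinterpret_cast<int4 *>(out));
  } else if (head_size % 2 == 0) {
    const int h = head_size / 2;  // 2 halves per half2
    const dim3 block(h, head_num, 1);
    transpose_qkv_sketch<half2><<<grid, block>>>(
        h, reinterpret_cast<const half2 *>(in),
        reinterpret_cast<half2 *>(out));
  } else {
    const dim3 block(head_size, head_num, 1);
    transpose_qkv_sketch<half><<<grid, block>>>(head_size, in, out);
  }
  cudaDeviceSynchronize();
  printf("kernel status: %s\n", cudaGetErrorString(cudaGetLastError()));

  cudaFree(in);
  cudaFree(out);
  return 0;
}

The PADDLE_ENFORCE_LE in the patch reflects the same constraint as this sketch's block shape: block.x * block.y = (head_size / 8) * head_num must not exceed CUDA's 1024-threads-per-block limit.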