From fc93266b0ac0fcd65c48f4c0cb95162e5dfd147b Mon Sep 17 00:00:00 2001
From: Jeng Bai-Cheng
Date: Fri, 10 Jul 2020 11:00:20 +0800
Subject: [PATCH] Improve qkv transpose performance (#23919)

Use vector instructions (LDG.128) to improve the qkv transpose. This
provides a 1.4X speedup at the same GPU base frequency.

test=develop
---
 .../tensorrt/plugin/qkv_to_context_plugin.cu  | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
index fe3ea180593..240ecaa2589 100644
--- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
@@ -115,7 +115,18 @@ inline void TransposeQKV(const int batch, const int seq_len,
                          const half *input, half *output, cudaStream_t stream) {
   int scratch_size = batch * head_num * seq_len * seq_len;
   const dim3 grid(seq_len, batch, 3);
-  if (head_size % 2 == 0 && scratch_size % 2 == 0) {
+  if (head_size % 8 == 0 && scratch_size % 8 == 0) {
+    int h = head_size / 8;
+    const int4 *input4 = reinterpret_cast<const int4 *>(input);
+    int4 *output4 = reinterpret_cast<int4 *>(output);
+    dim3 block(h, head_num, 1);
+    // limit h * head_num to max block size(1024).
+    PADDLE_ENFORCE_LE(h * head_num, 1024,
+                      platform::errors::InvalidArgument(
+                          "head_num (%d) * head_size (%d) should <= %d",
+                          head_num, head_size, 1024 * 8));
+    TransposeQkvKernel<int4><<<grid, block, 0, stream>>>(h, input4, output4);
+  } else if (head_size % 2 == 0 && scratch_size % 2 == 0) {
     const int h = head_size / 2;
     const half2 *input2 = reinterpret_cast<const half2 *>(input);
     half2 *output2 = reinterpret_cast<half2 *>(output);
-- 
GitLab
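
Note (not part of the patch): the hunk above only changes the host-side dispatch; the body of the templated TransposeQkvKernel is outside the diff context. The standalone CUDA sketch below illustrates the vectorized-copy idea under the [batch, seq_len, 3, head_num, head_size] -> [3, batch, head_num, seq_len, head_size] layout implied by grid(seq_len, batch, 3) and block(h, head_num, 1). The kernel name transpose_qkv_sketch and the example shapes are assumptions for illustration, not the plugin's actual code.

#include <cstdio>

#include <cuda_fp16.h>
#include <cuda_runtime.h>

// One thread copies one element of type T. Instantiating T = int4 makes each
// copy 16 bytes wide (8 halves), which lets the compiler emit 128-bit
// LDG.128 / STG.128 memory instructions; T = half2 gives 32-bit accesses.
template <typename T>
__global__ void transpose_qkv_sketch(const int H, const T *input, T *output) {
  const int h = threadIdx.x;  // offset inside one head, in units of T
  const int n = threadIdx.y;  // head index          (blockDim.y == head_num)
  const int s = blockIdx.x;   // sequence position   (gridDim.x  == seq_len)
  const int b = blockIdx.y;   // batch index         (gridDim.y  == batch)
  const int m = blockIdx.z;   // 0 = Q, 1 = K, 2 = V (gridDim.z  == 3)

  const int N = blockDim.y;
  const int S = gridDim.x;
  const int B = gridDim.y;

  // input [B, S, 3, N, H]  ->  output [3, B, N, S, H]
  const int in_idx = (((b * S + s) * 3 + m) * N + n) * H + h;
  const int out_idx = (((m * B + b) * N + n) * S + s) * H + h;
  output[out_idx] = input[in_idx];
}

// Host-side dispatch mirroring the patch: pick the widest vector type whose
// element count divides head_size. Shapes below are illustrative only.
int main() {
  const int batch = 1, seq_len = 128, head_num = 12, head_size = 64;
  const size_t elems =
      static_cast<size_t>(batch) * seq_len * 3 * head_num * head_size;

  half *in = nullptr, *out = nullptr;
  cudaMalloc(reinterpret_cast<void **>(&in), elems * sizeof(half));
  cudaMalloc(reinterpret_cast<void **>(&out), elems * sizeof(half));

  const dim3 grid(seq_len, batch, 3);
  if (head_size % 8 == 0) {
    const int h = head_size / 8;  // 8 halves per int4
    const dim3 block(h, head_num, 1);
    transpose_qkv_sketch<int4><<<grid, block>>>(
        h, reinterpret_cast<const int4 *>(in), reinterpret_cast<int4 *>(out));
  } else if (head_size % 2 == 0) {
    const int h = head_size / 2;  // 2 halves per half2
    const dim3 block(h, head_num, 1);
    transpose_qkv_sketch<half2><<<grid, block>>>(
        h, reinterpret_cast<const half2 *>(in),
        reinterpret_cast<half2 *>(out));
  } else {
    const dim3 block(head_size, head_num, 1);
    transpose_qkv_sketch<half><<<grid, block>>>(head_size, in, out);
  }
  cudaDeviceSynchronize();
  printf("kernel status: %s\n", cudaGetErrorString(cudaGetLastError()));

  cudaFree(in);
  cudaFree(out);
  return 0;
}

The PADDLE_ENFORCE_LE in the patch reflects the same constraint as this sketch's block shape: block.x * block.y = (head_size / 8) * head_num must not exceed CUDA's 1024-threads-per-block limit.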