Unverified commit 57f6a469, authored by Yiqun Liu, committed by GitHub

Enable matmul + bias fusion in fused_gat_attention. (#50755)

* Enable matmul + bias fusion in fused_gat_attention.

* Add a variable to control whether using fused matmul + bias.
Parent 7c73910e
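
In short: this change adds an optional fused argument to AttnMatMul::ComputeForward and AttnMatMul::ComputeBackward. When the flag is set and the build has CUDA 11.6+, the matmul and the bias add (and, in backward, all three gradients) run in a single fused GEMM-epilogue call (ComputeFusedGemmEpilogueForward/Backward, backed by cuBLASLt) instead of a separate blas GEMM plus an elementwise bias kernel. The fused_gate_attention kernels opt in through a local use_fused_matmul_bias flag. A minimal sketch of the new call pattern, assuming dev_ctx, m, n, k and the input/weight/bias/out tensors are already set up (that setup is not part of this diff):

    // Sketch only: input is [m, k], weight is [k, n], bias is [n], out is [m, n].
    AttnMatMul<T> linear(dev_ctx, /*transA=*/false, /*transB=*/false,
                         m, n, k, /*compute_bias=*/true);
    // fused = true takes the cuBLASLt epilogue path (CUDA >= 11.6);
    // fused = false keeps the original blas GEMM + elementwise bias add.
    linear.ComputeForward(&weight, &input, &bias, /*output=*/&out,
                          /*bias_out=*/&out, /*fused=*/true);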
@@ -14,12 +14,13 @@ limitations under the License. */
 #pragma once
 
-#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
+#include "paddle/fluid/operators/fused/fused_gemm_epilogue_op.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
+#include "paddle/phi/kernels/primitive/kernel_primitives.h"
 
 namespace paddle {
 namespace operators {
@@ -44,13 +45,43 @@ class AttnMatMul {
         input_size_(input_size),
         compute_bias_(compute_bias) {}
 
-  ~AttnMatMul() {}
-
   void ComputeForward(const phi::DenseTensor* weight,
                       const phi::DenseTensor* input,
                       const phi::DenseTensor* bias,
                       phi::DenseTensor* output,
-                      phi::DenseTensor* bias_out) {
+                      phi::DenseTensor* bias_out,
+                      bool fused = false) {
+    VLOG(6) << "input.shape={" << input->dims() << "}, weight.shape={"
+            << weight->dims() << "}, output.shape={" << output->dims()
+            << "}, batch_size=" << bsz_seq_ << ", output_size=" << output_size_
+            << ", input_size=" << input_size_ << ", transA=" << transA_
+            << ", transB=" << transB_ << ", compute_bias=" << compute_bias_
+            << ", fused=" << fused;
+
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
+    if (compute_bias_ && fused) {
+      PADDLE_ENFORCE_EQ(
+          !output || output == bias_out,
+          true,
+          phi::errors::InvalidArgument(
+              "The output (= input * weight) is expected to be nullptr or the "
+              "same as bias_out when fused is true."));
+      ComputeFusedGemmEpilogueForward<T>(dev_ctx_,
+                                         input,
+                                         weight,
+                                         bias,
+                                         bsz_seq_,      // M
+                                         output_size_,  // N
+                                         input_size_,   // K
+                                         transA_,
+                                         transB_,
+                                         "none",
+                                         bias_out,
+                                         nullptr);
+      return;
+    }
+#endif
+
     // Note: for blas.GEMM API in Paddle, it treats all inputs as row-major.
     // here: (transa, transb): nt, input * weight.
     CBLAS_TRANSPOSE transA = transA_ ? CblasTrans : CblasNoTrans;
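
For reference, with transA_ = transB_ = false the fused branch above computes bias_out = input * weight + bias in one call: input is treated as a row-major M x K matrix (M = bsz_seq_, K = input_size_), weight as K x N (N = output_size_), and bias as a length-N vector broadcast over the M rows; "none" means no activation is applied in the epilogue. A naive CPU sketch of the same semantics (illustration only, not the GPU code path):

    // bias_out = input * weight + bias, all row-major; M, N, K as above.
    for (int i = 0; i < M; ++i) {
      for (int j = 0; j < N; ++j) {
        float acc = bias[j];  // the "epilogue": bias added to every output row
        for (int p = 0; p < K; ++p) {
          acc += input[i * K + p] * weight[p * N + j];
        }
        bias_out[i * N + j] = acc;
      }
    }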
@@ -85,7 +116,29 @@ class AttnMatMul {
                        phi::DenseTensor* d_input,
                        phi::DenseTensor* d_weight,
                        phi::DenseTensor* d_bias,
-                       bool use_addto = false) {
+                       bool use_addto = false,
+                       bool fused = false) {
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
+    if (compute_bias_ && fused) {
+      ComputeFusedGemmEpilogueBackward<T>(dev_ctx_,
+                                          d_output,
+                                          input,
+                                          weight,
+                                          nullptr,
+                                          bsz_seq_,      // M
+                                          output_size_,  // N
+                                          input_size_,   // K
+                                          transA_,
+                                          transB_,
+                                          "none",
+                                          d_input,
+                                          d_weight,
+                                          d_bias,
+                                          use_addto);
+      return;
+    }
+#endif
+
     T alpha = static_cast<T>(1.0);
     T beta_dA = use_addto ? static_cast<T>(1.0) : static_cast<T>(0.0);
     T beta_dB = static_cast<T>(0.0);
......
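
The backward path mirrors this: when fused, ComputeFusedGemmEpilogueBackward produces all three gradients of out = input * weight + bias in one shot, namely d_input = d_output * weight^T, d_weight = input^T * d_output, and d_bias = column-wise sum of d_output, with use_addto = true accumulating into d_input instead of overwriting it. A naive reference sketch of that math for the non-transposed, row-major case (illustration only):

    // d_output: M x N, input: M x K, weight: K x N (M, N, K as in the forward pass).
    for (int i = 0; i < M; ++i) {
      for (int p = 0; p < K; ++p) {
        float acc = use_addto ? d_input[i * K + p] : 0.0f;
        for (int j = 0; j < N; ++j) {
          acc += d_output[i * N + j] * weight[p * N + j];  // d_input = d_output * weight^T
        }
        d_input[i * K + p] = acc;
      }
    }
    for (int p = 0; p < K; ++p) {
      for (int j = 0; j < N; ++j) {
        float acc = 0.0f;
        for (int i = 0; i < M; ++i) {
          acc += input[i * K + p] * d_output[i * N + j];  // d_weight = input^T * d_output
        }
        d_weight[p * N + j] = acc;
      }
    }
    for (int j = 0; j < N; ++j) {
      float acc = 0.0f;
      for (int i = 0; i < M; ++i) {
        acc += d_output[i * N + j];  // d_bias = sum of d_output over rows
      }
      d_bias[j] = acc;
    }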
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/operators/fused/fused_softmax_mask.cu.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 #include "paddle/phi/kernels/funcs/dropout_impl.cu.h"
......
@@ -209,7 +209,8 @@ void ComputeGatingLinearForward(const framework::ExecutionContext &ctx,
                                 const GateAttentionConfig<T> &config,
                                 const phi::DenseTensor *query,
                                 const phi::DenseTensor *fmha_out,
-                                phi::DenseTensor *gate_out) {
+                                phi::DenseTensor *gate_bias_out,
+                                bool use_fused_matmul_bias) {
   auto *gate_weight = ctx.Input<phi::DenseTensor>("GateWeight");
   auto *gate_bias = ctx.Input<phi::DenseTensor>("GateBias");
@@ -220,14 +221,18 @@ void ComputeGatingLinearForward(const framework::ExecutionContext &ctx,
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = config.num_heads * config.head_dim;
   int k = config.q_dim;
-  auto gate_attn_compute =
+  auto gate_linear =
       AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
-  gate_attn_compute.ComputeForward(
-      gate_weight, query, gate_bias, gate_out, gate_out);
+  gate_linear.ComputeForward(gate_weight,
+                             query,
+                             gate_bias,
+                             gate_bias_out,
+                             gate_bias_out,
+                             use_fused_matmul_bias);
 
   // gate_out = sigmoid(gate_out) * fmha_out
-  std::vector<const phi::DenseTensor *> ins = {gate_out, fmha_out};
-  std::vector<phi::DenseTensor *> outs = {gate_out};
+  std::vector<const phi::DenseTensor *> ins = {gate_bias_out, fmha_out};
+  std::vector<phi::DenseTensor *> outs = {gate_bias_out};
   phi::funcs::ElementwiseKernel<T>(
       ctx.cuda_device_context(), ins, &outs, SigmoidMultiplyFunctor<T>());
 }
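
Functionally the gating step is unchanged: gate_bias_out first holds query * gate_weight + gate_bias (now optionally computed by the fused path), and the elementwise kernel then overwrites it in place with sigmoid(gate_bias_out) * fmha_out. A per-element sketch of what SigmoidMultiplyFunctor computes, together with the gradient the backward pass needs (variable names x, m, dy are illustrative, not from the diff):

    // Forward, per element: x = (query * gate_weight + gate_bias) element,
    // m = matching fmha_out element.
    float s = 1.0f / (1.0f + expf(-x));  // sigmoid(x)
    float y = s * m;                     // stored back into gate_bias_out
    // Backward, per element, given dy = upstream gradient of y:
    float dm = dy * s;                   // gradient w.r.t. fmha_out
    float dx = dy * m * s * (1.0f - s);  // gradient w.r.t. x (pre-sigmoid value)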
@@ -239,10 +244,12 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx,
                                  const phi::DenseTensor *fmha_out,
                                  const phi::DenseTensor *gate_out_grad,
                                  phi::DenseTensor *query_grad,
-                                 phi::DenseTensor *fmha_out_grad) {
+                                 phi::DenseTensor *fmha_out_grad,
+                                 bool use_fused_matmul_bias) {
   const auto *gate_weight = ctx.Input<phi::DenseTensor>("GateWeight");
   const auto *gate_bias = ctx.Input<phi::DenseTensor>("GateBias");
 
   auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
   // Re-compute gate_bias_out
   phi::DenseTensor gate_bias_out;
   gate_bias_out.Resize(config.gate_out_dims);
@@ -251,10 +258,14 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx,
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = config.num_heads * config.head_dim;
   int k = config.q_dim;
-  auto gate_attn_compute =
+  auto gate_linear =
       AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
-  gate_attn_compute.ComputeForward(
-      gate_weight, query, gate_bias, &gate_bias_out, &gate_bias_out);
+  gate_linear.ComputeForward(gate_weight,
+                             query,
+                             gate_bias,
+                             &gate_bias_out,
+                             &gate_bias_out,
+                             use_fused_matmul_bias);
 
   // Gradient of sigmoid(gate_bias_out) * fmha_out
   // Compute inplace and save gate_bias_out_grad to gate_bias_out.
@@ -272,19 +283,22 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx,
   dev_ctx.Alloc<T>(gate_weight_grad, gate_weight_grad->numel() * sizeof(T));
   dev_ctx.Alloc<T>(gate_bias_grad, gate_bias_grad->numel() * sizeof(T));
 
-  gate_attn_compute.ComputeBackward(query,
-                                    gate_weight,
-                                    &gate_bias_out,
-                                    query_grad,
-                                    gate_weight_grad,
-                                    gate_bias_grad);
+  gate_linear.ComputeBackward(query,
+                              gate_weight,
+                              &gate_bias_out,
+                              query_grad,
+                              gate_weight_grad,
+                              gate_bias_grad,
+                              false,
+                              use_fused_matmul_bias);
 }
 
 template <typename T>
 void ComputeOutputLinearForward(const framework::ExecutionContext &ctx,
                                 const GateAttentionConfig<T> &config,
                                 const phi::DenseTensor *fmha_or_gate_out,
-                                phi::DenseTensor *out) {
+                                phi::DenseTensor *out,
+                                bool use_fused_matmul_bias) {
   const auto *out_linear_weight =
       ctx.Input<phi::DenseTensor>("OutLinearWeight");
   const auto *out_linear_bias = ctx.Input<phi::DenseTensor>("OutLinearBias");
@@ -293,17 +307,22 @@ void ComputeOutputLinearForward(const framework::ExecutionContext &ctx,
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = config.q_dim;
   int k = config.num_heads * config.head_dim;
-  auto out_linear_compute =
+  auto out_linear =
      AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
-  out_linear_compute.ComputeForward(
-      out_linear_weight, fmha_or_gate_out, out_linear_bias, out, out);
+  out_linear.ComputeForward(out_linear_weight,
+                            fmha_or_gate_out,
+                            out_linear_bias,
+                            out,
+                            out,
+                            use_fused_matmul_bias);
 }
 
 template <typename T>
 void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx,
                                  const GateAttentionGradConfig<T> &config,
                                  const phi::DenseTensor *input,
-                                 phi::DenseTensor *input_grad) {
+                                 phi::DenseTensor *input_grad,
+                                 bool use_fused_matmul_bias) {
   auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
   const auto *out_grad =
       ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
@@ -323,14 +342,16 @@ void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx,
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = config.q_dim;
   int k = config.num_heads * config.head_dim;
-  auto out_linear_compute =
+  auto out_linear =
      AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
-  out_linear_compute.ComputeBackward(input,
-                                     out_linear_weight,
-                                     out_grad,
-                                     input_grad,
-                                     out_linear_weight_grad,
-                                     out_linear_bias_grad);
+  out_linear.ComputeBackward(input,
+                             out_linear_weight,
+                             out_grad,
+                             input_grad,
+                             out_linear_weight_grad,
+                             out_linear_bias_grad,
+                             false,
+                             use_fused_matmul_bias);
 }
 
 template <typename T>
@@ -358,6 +379,7 @@ class FusedGateAttentionOpKernel : public framework::OpKernel<T> {
     const bool merge_qkv = ctx.Attr<bool>("merge_qkv");
     const bool has_gating = ctx.Attr<bool>("has_gating");
+    bool use_fused_matmul_bias = true;
 
     auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
     AllocWithDebugInfo<T>(dev_ctx, "softmax_out", softmax_out);
     AllocWithDebugInfo<T>(dev_ctx, "fmha_out", fmha_out);
@@ -413,12 +435,14 @@ class FusedGateAttentionOpKernel : public framework::OpKernel<T> {
 
     // 3. Gating Linear
     if (has_gating) {
-      ComputeGatingLinearForward<T>(ctx, config, query, fmha_out, gate_out);
+      ComputeGatingLinearForward<T>(
+          ctx, config, query, fmha_out, gate_out, use_fused_matmul_bias);
     }
 
     // 4. Output Linear
     phi::DenseTensor *fmha_or_gate_out = has_gating ? gate_out : fmha_out;
-    ComputeOutputLinearForward<T>(ctx, config, fmha_or_gate_out, out);
+    ComputeOutputLinearForward<T>(
+        ctx, config, fmha_or_gate_out, out, use_fused_matmul_bias);
   }
 };
@@ -454,6 +478,7 @@ class FusedGateAttentionGradKernel : public framework::OpKernel<T> {
     bool has_gating = ctx.Attr<bool>("has_gating");
     bool merge_qkv = ctx.Attr<bool>("merge_qkv");
+    bool use_fused_matmul_bias = true;
 
     auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
     AllocWithDebugInfo<T>(dev_ctx, "query_grad", query_grad);
@@ -468,7 +493,8 @@ class FusedGateAttentionGradKernel : public framework::OpKernel<T> {
       phi::DenseTensor gate_out_grad;
       gate_out_grad.Resize(config.gate_out_dims);
       AllocWithDebugInfo<T>(dev_ctx, "gate_out_grad", &gate_out_grad);
-      ComputeOutputLinearBackward<T>(ctx, config, gate_out, &gate_out_grad);
+      ComputeOutputLinearBackward<T>(
+          ctx, config, gate_out, &gate_out_grad, use_fused_matmul_bias);
 
       // 2. Gradient of Gating Linear
       // Forward: gate_out = Sigmoid(Linear(fmha_out)) * fmha_out
@@ -478,10 +504,12 @@ class FusedGateAttentionGradKernel : public framework::OpKernel<T> {
                                      fmha_out,
                                      &gate_out_grad,
                                      query_grad,
-                                     &fmha_out_grad);
+                                     &fmha_out_grad,
+                                     use_fused_matmul_bias);
     } else {
       // 1. Gradient of Output Linear: out = Linear(fmha_grad)
-      ComputeOutputLinearBackward<T>(ctx, config, fmha_out, &fmha_out_grad);
+      ComputeOutputLinearBackward<T>(
+          ctx, config, fmha_out, &fmha_out_grad, use_fused_matmul_bias);
     }
 
     // 3. Gradient of FMHA
......
@@ -14,7 +14,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/fused/fused_gemm_epilogue_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// This file has been adapted from FasterTransformer file:
// https://github.com/NVIDIA/FasterTransformer/blob/v4.0/fastertransformer/cuda/masked_multihead_attention.cu
// We add License in the head.
......
@@ -14,6 +14,12 @@
 #pragma once
 
+#ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>  // NOLINT
+
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
 
 namespace phi {
 namespace backends {
 namespace gpu {
@@ -69,6 +75,22 @@ namespace gpu {
   for (index_type i = __index__; __index__ < (num); \
        __index__ += __stride__, i = __index__)
 
+template <typename T>
+cudaDataType_t ToCudaDataType() {
+  if (std::is_same<T, float>::value) {
+    return CUDA_R_32F;
+  } else if (std::is_same<T, double>::value) {
+    return CUDA_R_64F;
+  } else if (std::is_same<T, phi::dtype::float16>::value) {
+    return CUDA_R_16F;
+#if CUDA_VERSION >= 11000
+  } else if (std::is_same<T, phi::dtype::bfloat16>::value) {
+    return CUDA_R_16BF;
+#endif
+  }
+}
+
 }  // namespace gpu
 }  // namespace backends
 }  // namespace phi
 
 #endif
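
One caveat on the helper added above: as written, ToCudaDataType<T>() falls off the end without returning a value when T is none of the listed types (or when bfloat16 is requested under CUDA < 11.0), which is undefined behavior if such an instantiation is ever reached. A defensive variant would fail loudly instead; a hedged sketch (the error message and the use of PADDLE_THROW here are illustrative, not part of this commit):

    template <typename T>
    cudaDataType_t ToCudaDataType() {
      if (std::is_same<T, float>::value) {
        return CUDA_R_32F;
      } else if (std::is_same<T, double>::value) {
        return CUDA_R_64F;
      } else if (std::is_same<T, phi::dtype::float16>::value) {
        return CUDA_R_16F;
    #if CUDA_VERSION >= 11000
      } else if (std::is_same<T, phi::dtype::bfloat16>::value) {
        return CUDA_R_16BF;
    #endif
      }
      // Unsupported type: throw instead of returning an indeterminate value.
      PADDLE_THROW(phi::errors::InvalidArgument(
          "Unsupported data type when converting to cudaDataType_t."));
    }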