@@ -19,6 +19,7 @@ from paddle.framework import LayerHelper, in_dynamic_mode
 def masked_multihead_attention(
     x,
     cache_kv=None,
+    bias=None,
     src_mask=None,
     cum_offsets=None,
     sequence_lengths=None,
@@ -30,6 +31,7 @@ def masked_multihead_attention(
     seq_len=1,
     rotary_emb_dims=0,
     use_neox_rotary_style=False,
+    compute_dtype='default',
     out_scale=-1,
     quant_round_type=1,
     quant_max_bound=127.0,
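For readability, the sketch below assembles the post-change signature from the two hunks above. Only the parameters visible in the diff are listed; the ones the diff elides are marked with placeholder comments rather than guessed at.

```python
# Post-change signature, assembled from the two signature hunks above.
def masked_multihead_attention(
    x,
    cache_kv=None,
    bias=None,  # new in this change
    src_mask=None,
    cum_offsets=None,
    sequence_lengths=None,
    # ... parameters elided between the two hunks ...
    seq_len=1,
    rotary_emb_dims=0,
    use_neox_rotary_style=False,
    compute_dtype='default',  # new in this change
    out_scale=-1,
    quant_round_type=1,
    quant_max_bound=127.0,
    # ... remaining parameters elided by the diff ...
):
    ...
```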
@@ -43,6 +45,7 @@ def masked_multihead_attention(
     Args:
         x (Tensor): The input tensor could be 2-D tensor. Its shape is [batch_size, 3 * num_head * head_dim].
         cache_kvs (list(Tensor)|tuple(Tensor)): The cache structure tensors for the generation model. Its shape is [2, batch_size, num_head, max_seq_len, head_dim].
+        bias (Tensor, optional): The bias tensor. Its shape is [3, num_head, head_dim].
         src_mask (Tensor, optional): The src_mask tensor. Its shape is [batch_size, 1, 1, sequence_length].
         sequence_lengths (Tensor, optional): The sequence_lengths tensor, used to index input. Its shape is [batch_size, 1].
         rotary_tensor (Tensor, optional): The rotary_tensor tensor. The dtype must be float. Its shape is [batch_size, 1, 1, sequence_length, head_dim].
@@ -53,6 +56,7 @@ def masked_multihead_attention(
         seq_len (int, optional): The seq_len, used to get input length. Default 1.
         rotary_emb_dims (int, optional): The rotary_emb_dims. Default 0.
         use_neox_rotary_style (bool, optional): A flag indicating whether neox_rotary_style is needed or not. Default False.
+        compute_dtype (str, optional): The compute dtype, used to represent the input data type. Default 'default'.
         out_scale (float, optional): The out_scale, used in quant. Default -1.
         quant_round_type (int, optional): The quant_round_type, used in quant. Default 1.
         quant_max_bound (float, optional): The quant_max_bound, used in quant. Default 127.0.
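Below is a minimal usage sketch for the two new parameters. It assumes a GPU build of Paddle (the fused op is CUDA-only), leaves every parameter the diff elides at its default, and uses placeholder tensor values; shapes follow the Args section above.

```python
import paddle
from paddle.incubate.nn.functional import masked_multihead_attention

batch_size, num_head, head_dim, max_seq_len = 2, 8, 64, 128

# Fused QKV input: [batch_size, 3 * num_head * head_dim]
x = paddle.rand([batch_size, 3 * num_head * head_dim]).astype('float16')

# KV cache for decoding: [2, batch_size, num_head, max_seq_len, head_dim]
cache_kv = paddle.zeros(
    [2, batch_size, num_head, max_seq_len, head_dim], dtype='float16'
)

# New in this change: optional QKV bias, shape [3, num_head, head_dim]
bias = paddle.zeros([3, num_head, head_dim], dtype='float16')

# Attention mask: [batch_size, 1, 1, sequence_length]; zeros mask nothing here
src_mask = paddle.zeros([batch_size, 1, 1, max_seq_len], dtype='float16')

outputs = masked_multihead_attention(
    x,
    cache_kv=cache_kv,
    bias=bias,                # new parameter
    src_mask=src_mask,
    compute_dtype='default',  # new parameter; assumed to follow the input dtype
)
# The return bundles the attention output with the updated cache; see the
# op's Returns section for the exact composition.
```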