Unverified commit b0ece266, authored by Yuang Liu and committed by GitHub

[Fuse attention pass] Forward pattern. (#49621)

Parent 13992de7
......@@ -382,6 +382,7 @@ set(IR_PASS_DEPS
graph_to_program_pass
fix_op_run_order_pass
fuse_gemm_epilogue_pass
fused_attention_pass
delete_dropout_op_pass)
if(WITH_CINN)
......
......@@ -187,6 +187,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass");
#endif
#ifdef PADDLE_WITH_CUDA
AppendPassWithCheck(strategy_.fused_attention_, "fused_attention_pass");
#endif
#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060)
AppendPassWithCheck(strategy_.fuse_gemm_epilogue_,
"fuse_gemm_epilogue_pass");
......@@ -519,6 +523,9 @@ USE_PASS(fuse_all_reduce_op_pass);
USE_PASS(runtime_context_cache_pass);
USE_PASS(add_reader_dependency_pass);
USE_PASS(delete_dropout_op_x_pass);
#ifdef PADDLE_WITH_CUDA
USE_PASS(fused_attention_pass);
#endif
#ifdef PADDLE_WITH_CINN
USE_PASS(build_cinn_pass);
#endif
......
......@@ -129,6 +129,8 @@ struct BuildStrategy {
bool sync_batch_norm_{false};
// Fuse GEMM+Epilogue via cublasLt epilogue.
bool fuse_gemm_epilogue_{false};
// Fused multi head attention
bool fused_attention_{false};
// mkldnn_enabled_op_types specify the operator type list to
// use MKLDNN acceleration. It is null in default, means
......@@ -261,6 +263,7 @@ inline std::ostream &operator<<(std::ostream &os,
os << "fuse_broadcast_ops_: " << strategy.fuse_broadcast_ops_ << std::endl;
os << "sync_batch_norm_: " << strategy.sync_batch_norm_ << std::endl;
os << "fuse_gemm_epilogue_: " << strategy.fuse_gemm_epilogue_ << std::endl;
os << "fused_attention_: " << strategy.fused_attention_ << std::endl;
os << "mkldnn_enabled_op_types_: ";
for (auto str : strategy.mkldnn_enabled_op_types_) {
os << str << ", ";
......
......@@ -124,6 +124,7 @@ message BuildStrategy {
optional int32 reduce_strategy = 15 [ default = 0 ];
optional bool fuse_gemm_epilogue = 16 [ default = false ];
optional string debug_graphviz_path = 17;
  optional bool fused_attention = 18 [ default = false ];
}
message ExecutionStrategy {
......
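As a usage sketch (not part of this diff): the new fused_attention option sits next to fuse_gemm_epilogue in BuildStrategy and is toggled from Python in the same way. The CompiledProgram call below is an assumption about typical static-graph usage, not something this commit adds.

import paddle
import paddle.static as static

paddle.enable_static()

build_strategy = static.BuildStrategy()
build_strategy.fused_attention = True      # switch added by this commit
build_strategy.fuse_gemm_epilogue = True   # existing fusion switch, optional

# Assumed typical usage: attach the strategy when compiling the main program.
main_prog = static.default_main_program()
compiled_prog = static.CompiledProgram(main_prog, build_strategy=build_strategy)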
......@@ -223,6 +223,10 @@ cc_library(
fuse_gemm_epilogue_pass
SRCS fuse_gemm_epilogue_pass.cc
DEPS pass graph_pattern_detector)
cc_library(
fused_attention_pass
SRCS fused_attention_pass.cc
DEPS pass graph_pattern_detector)
cc_library(
fuse_relu_depthwise_conv_pass
SRCS fuse_relu_depthwise_conv_pass.cc
......
This diff is collapsed.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
namespace patterns {
// Declare patterns for multi head attention.
// Can detect:
// 1. Pre layer norm, post layer norm or sandwich layer norm.
// 2. Add attn mask for qk product before the softmax or not.
// 3. Do attn dropout or not.
// 4. Add residual to the out linear result or not.
struct FusedAttentionPattern : public PatternBase {
FusedAttentionPattern(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "fused_attention_pattern") {}
PDNode* operator()(PDNode* x,
bool pre_layer_norm, // do pre ln or not
bool post_layer_norm, // do post ln or not
bool has_attn_mask, // add attn mask to qk or not
bool do_dropout, // dropout the softmax(qk) or not
bool add_residual); // add residual to out linear or not
// pre layer norm
PATTERN_DECL_NODE(pre_layer_norm_op);
PATTERN_DECL_NODE(pre_layer_norm_scale);
PATTERN_DECL_NODE(pre_layer_norm_bias);
PATTERN_DECL_NODE(pre_layer_norm_out);
PATTERN_DECL_NODE(pre_layer_norm_mean);
PATTERN_DECL_NODE(pre_layer_norm_variance);
// fuse qkv projection
PATTERN_DECL_NODE(fuse_qkv_matmul_op);
PATTERN_DECL_NODE(fuse_qkv_matmul_w);
PATTERN_DECL_NODE(fuse_qkv_matmul_out);
PATTERN_DECL_NODE(fuse_qkv_ele_add_op);
PATTERN_DECL_NODE(fuse_qkv_ele_add_bias);
PATTERN_DECL_NODE(fuse_qkv_ele_add_out);
PATTERN_DECL_NODE(fuse_qkv_reshape_op);
PATTERN_DECL_NODE(fuse_qkv_reshape_out);
PATTERN_DECL_NODE(fuse_qkv_reshape_x_shape);
PATTERN_DECL_NODE(fuse_qkv_transpose_op);
PATTERN_DECL_NODE(fuse_qkv_transpose_out);
PATTERN_DECL_NODE(fuse_qkv_transpose_x_shape);
PATTERN_DECL_NODE(fuse_qkv_split_op);
PATTERN_DECL_NODE(fuse_qkv_split_out_q); // q
PATTERN_DECL_NODE(fuse_qkv_split_out_k); // k
PATTERN_DECL_NODE(fuse_qkv_split_out_v); // v
// core attention
PATTERN_DECL_NODE(qk_matmul_op);
PATTERN_DECL_NODE(qk_matmul_out);
PATTERN_DECL_NODE(qk_scale_op);
PATTERN_DECL_NODE(qk_scale_out);
PATTERN_DECL_NODE(add_mask_ele_add_op);
PATTERN_DECL_NODE(add_mask_ele_add_mask);
PATTERN_DECL_NODE(add_mask_ele_add_out);
PATTERN_DECL_NODE(qk_softmax_op);
PATTERN_DECL_NODE(qk_softmax_out);
PATTERN_DECL_NODE(attn_dropout_op);
PATTERN_DECL_NODE(attn_dropout_out);
PATTERN_DECL_NODE(attn_dropout_mask);
PATTERN_DECL_NODE(qkv_matmul_op);
PATTERN_DECL_NODE(qkv_matmul_out);
PATTERN_DECL_NODE(qkv_transpose_op);
PATTERN_DECL_NODE(qkv_transpose_out);
PATTERN_DECL_NODE(qkv_transpose_x_shape);
PATTERN_DECL_NODE(qkv_reshape_op);
PATTERN_DECL_NODE(qkv_reshape_out);
PATTERN_DECL_NODE(qkv_reshape_x_shape);
// out linear
PATTERN_DECL_NODE(out_linear_matmul_op);
PATTERN_DECL_NODE(out_linear_matmul_w);
PATTERN_DECL_NODE(out_linear_matmul_out);
PATTERN_DECL_NODE(out_linear_ele_add_op);
PATTERN_DECL_NODE(out_linear_ele_add_bias);
PATTERN_DECL_NODE(out_linear_ele_add_out);
PATTERN_DECL_NODE(out_linear_dropout_op);
PATTERN_DECL_NODE(out_linear_dropout_out);
PATTERN_DECL_NODE(out_linear_dropout_mask);
// residual
PATTERN_DECL_NODE(residual_ele_add_op);
PATTERN_DECL_NODE(residual_ele_add_out);
// post layer norm
PATTERN_DECL_NODE(post_layer_norm_op);
PATTERN_DECL_NODE(post_layer_norm_scale);
PATTERN_DECL_NODE(post_layer_norm_bias);
PATTERN_DECL_NODE(post_layer_norm_out);
PATTERN_DECL_NODE(post_layer_norm_mean);
PATTERN_DECL_NODE(post_layer_norm_variance);
};
// Declare the grad pattern for multi head attention
struct FusedAttentionGradPattern : public PatternBase {
FusedAttentionGradPattern(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "fused_attention_pattern") {}
PDNode* operator()(PDNode* x,
bool pre_layer_norm, // pre ln
bool post_layer_norm, // post ln
bool has_attn_mask, // add attn mask to qk or not
bool do_dropout, // dropout the softmax(qk) or not
bool add_residual); // add residual to out linear or not
// TODO(Yuang Liu): add backward pattern
};
} // namespace patterns
class FusedAttentionsPass : public FusePassBase {
public:
virtual ~FusedAttentionsPass() {}
protected:
void ApplyImpl(Graph* graph) const;
const std::string name_scope_{"fused_attention_pass"};
private:
  // The naming rule for the helper functions.
  // A function name contains at most six parts, in this order:
// 1. Do pre layer norm? [Pre]
// 2. Add mask in the core attention part? [Mask]
// 3. Do dropout in the core attention part? [Drop]
// 4. Add residual? [Res]
// 5. Do post layer norm? [Post]
// 6. Forward or Backward? [Fwd/Bwd]
  // For every option that is enabled, the corresponding abbreviation appears
  // in the function name; otherwise it is omitted. (A toy sketch of this rule
  // follows the header below.)
ir::Graph* PreMaskDropResPostFwd(Graph* graph) const;
ir::Graph* PreMaskDropResPostBwd(Graph* graph) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
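To make the helper naming rule above concrete, here is a toy Python illustration (not code from the pass) that rebuilds a helper name from the five optional switches plus the Fwd/Bwd suffix:

def helper_name(pre_ln, has_mask, do_dropout, add_residual, post_ln, forward=True):
    # Toy illustration of the naming rule documented in FusedAttentionsPass:
    # every enabled option contributes its abbreviation, in the documented
    # order, followed by Fwd or Bwd.
    parts = []
    if pre_ln:
        parts.append("Pre")
    if has_mask:
        parts.append("Mask")
    if do_dropout:
        parts.append("Drop")
    if add_residual:
        parts.append("Res")
    if post_ln:
        parts.append("Post")
    parts.append("Fwd" if forward else "Bwd")
    return "".join(parts)

# With every option enabled, this reproduces the two methods declared above.
assert helper_name(True, True, True, True, True) == "PreMaskDropResPostFwd"
assert helper_name(True, True, True, True, True, forward=False) == "PreMaskDropResPostBwd"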
......@@ -714,6 +714,32 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT
build_strategy = static.BuildStrategy()
build_strategy.fuse_gemm_epilogue = True
)DOC")
.def_property(
"fused_attention",
[](const BuildStrategy &self) { return self.fused_attention_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fused_attention_ = b;
},
R"DOC((bool, optional): fused_attention indicate whether
to fuse the whole multi head attention part with one op,
it may make the execution faster. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fused_attention = True
)DOC")
.def_property(
"fuse_bn_act_ops",
[](const BuildStrategy &self) { return self.fuse_bn_act_ops_; },
......
......@@ -71,6 +71,19 @@ class FuseReluDepthwiseConvPass(CPPPassWrapper):
return PassType.FUSION_OPT
@register_pass("fused_attention")
class FusedAttentionPass(CPPPassWrapper):
def __init__(self):
super().__init__()
@property
def cpp_name(self):
return "fused_attention_pass"
def _type(self):
return PassType.FUSION_OPT
@register_pass("fuse_gemm_epilogue")
class FuseGemmEpiloguePass(CPPPassWrapper):
def __init__(self):
......
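For reference, the newly registered wrapper is applied like any other CPPPassWrapper through PassManager; this minimal sketch mirrors what the unit test added later in this commit does, and it assumes the attention model has already been built into the default static-graph programs.

import paddle
from paddle.distributed.passes import PassManager, new_pass

paddle.enable_static()

# Programs that would normally already hold the attention model.
main_prog = paddle.static.default_main_program()
startup_prog = paddle.static.default_startup_program()

# Rewrite the programs in place with the fused attention pass.
pass_manager = PassManager([new_pass("fused_attention")])
pass_manager.apply([main_prog], [startup_prog])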
......@@ -76,6 +76,7 @@ if(NOT WITH_GPU)
list(REMOVE_ITEM TEST_OPS test_fused_transformer_encoder_layer)
list(REMOVE_ITEM TEST_OPS test_fused_bias_dropout_residual_layer_norm_op)
list(REMOVE_ITEM TEST_OPS test_fused_bias_dropout_residual_layer_norm_op_api)
list(REMOVE_ITEM TEST_OPS test_fused_attention_pass)
endif()
list(REMOVE_ITEM TEST_OPS test_fused_ec_moe_op)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
import paddle.nn.functional as F
from paddle.distributed.passes import PassManager, new_pass
paddle.enable_static()
class MultiHeadAttention(paddle.nn.Layer):
def __init__(
self,
embed_dim,
num_heads,
add_residual=True,
pre_ln=True,
post_ln=False,
attn_dropout=True,
):
super(MultiHeadAttention, self).__init__()
self.embed_dim = embed_dim
self.kdim = embed_dim
self.vdim = embed_dim
self.num_heads = num_heads
self.add_residual = add_residual
self.pre_ln = pre_ln
self.post_ln = post_ln
self.attn_dropout = attn_dropout
self.head_dim = embed_dim // num_heads
assert (
self.head_dim * num_heads == self.embed_dim
), "embed_dim must be divisible by num_heads"
self.norm1 = paddle.nn.LayerNorm(embed_dim, epsilon=1e-5)
self.norm2 = paddle.nn.LayerNorm(embed_dim, epsilon=1e-5)
self.qkv_proj = paddle.nn.Linear(embed_dim, 3 * embed_dim)
self.out_proj = paddle.nn.Linear(embed_dim, embed_dim)
self.dropout = paddle.nn.Dropout(0.1, mode="upscale_in_train")
def forward(self, x, attn_mask=None):
residual = x
if self.pre_ln:
# pre layer norm
x = self.norm1(x)
# compute qkv
qkv = self.qkv_proj(x)
qkv = paddle.reshape(qkv, [0, 0, self.num_heads, 3 * self.head_dim])
qkv = paddle.transpose(qkv, [0, 2, 1, 3])
q, k, v = paddle.split(qkv, num_or_sections=3, axis=-1)
# compute core attention
product = paddle.matmul(x=q, y=k, transpose_y=True)
product = paddle.scale(product, scale=self.head_dim**-0.5)
if attn_mask is not None:
product = product + attn_mask
weights = F.softmax(product)
if self.attn_dropout:
weights = F.dropout(
weights, 0.1, training=self.training, mode="upscale_in_train"
)
out = paddle.matmul(weights, v)
out = paddle.transpose(out, perm=[0, 2, 1, 3])
out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
# project to output
out = self.out_proj(out)
out = self.dropout(out)
if self.add_residual:
out = residual + out
if self.post_ln:
# post layer norm
out = self.norm2(out)
return out
@unittest.skipIf(
not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFusedAttentionPass(unittest.TestCase):
def setUp(self):
self.add_residual = True
self.pre_ln = True
self.post_ln = True
self.attn_dropout = True
self.add_mask = True
def test_pass(self):
batch_size = 2
seq_len = 1024
hidden_size = 768
num_heads = 12
x_data = np.random.rand(batch_size, seq_len, hidden_size).astype(
'float32'
)
mask_data = np.random.rand(
batch_size, num_heads, seq_len, seq_len
).astype('float32')
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
data = paddle.static.data(
name="x",
shape=[-1, seq_len, hidden_size],
dtype='float32',
)
if self.add_mask:
attn_mask = paddle.static.data(
name="attn_mask",
shape=[-1, num_heads, seq_len, seq_len],
dtype='float32',
)
else:
attn_mask = None
multi_head_attn = MultiHeadAttention(
hidden_size,
num_heads,
add_residual=self.add_residual,
pre_ln=self.pre_ln,
post_ln=self.post_ln,
attn_dropout=self.attn_dropout,
)
out = multi_head_attn(data, attn_mask)
loss = paddle.mean(out)
sgd_optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(loss)
pass_manager = PassManager([new_pass("fused_attention")])
pass_manager.apply([main_prog], [startup_prog])
ops = main_prog.global_block().ops
assert ops[0].type == 'reduce_mean'
if __name__ == "__main__":
np.random.seed(0)
unittest.main()