Unverified commit 7e8ef328, authored by Yuang Liu, committed by GitHub

Fused attention pass backward op replace. (#50186)

Parent f2ec69b4
......@@ -16,6 +16,7 @@
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
......@@ -252,6 +253,31 @@ struct FusedAttentionGradPattern : public PatternBase {
} // namespace patterns
class FusedAttentionPassCache {
public:
ir::Node* GetNodeFromCache(const std::string name) {
if (var_name_to_ir_node_cache_.count(name)) {
return var_name_to_ir_node_cache_.find(name)->second;
}
PADDLE_THROW(platform::errors::InvalidArgument(
"The key (%s) of FusedAttentionCache does not exist.", name));
}
void InsertIntoCache(const std::string name, ir::Node* node) {
if (!var_name_to_ir_node_cache_.count(name)) {
var_name_to_ir_node_cache_.insert({name, node});
} else {
PADDLE_THROW(platform::errors::AlreadyExists(
"The key (%s) of FusedAttentionCache already exists.", name));
}
}
void ResetCache() { var_name_to_ir_node_cache_.clear(); }
private:
std::unordered_map<std::string, ir::Node*> var_name_to_ir_node_cache_;
};
class FusedAttentionsPass : public FusePassBase {
public:
virtual ~FusedAttentionsPass() {}
......@@ -273,9 +299,17 @@ class FusedAttentionsPass : public FusePassBase {
// If true, the function name will have an abbreviation part.
// If false, the function name won't contain an abbreviation for it.
- ir::Graph* PreMaskDropResFwd(Graph* graph) const;
+ ir::Graph* PreMaskDropResFwd(Graph* graph,
+                              FusedAttentionPassCache* cache) const;
+ ir::Graph* PreMaskDropResBwd(Graph* graph,
+                              FusedAttentionPassCache* cache) const;
- ir::Graph* PreMaskDropResBwd(Graph* graph) const;
const std::string GenerateCacheKey(const std::string anchor,
const std::string var_name,
int block_id) const {
return anchor + "_" + std::to_string(block_id) + "_" + var_name;
}
};
} // namespace ir
......
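Note on the cache added above: it lets the backward rewrite (PreMaskDropResBwd) look up IR nodes that were recorded while the forward pattern (PreMaskDropResFwd) was rewritten, with entries keyed by anchor, block id, and variable name via GenerateCacheKey. Below is a minimal standalone sketch of that key scheme, not Paddle code: the Node struct, the NodeCache class, the "fused_attention" anchor, and the "ln_scale" variable name are hypothetical stand-ins used only for illustration.

// Standalone sketch of the cache-key scheme (hypothetical names, not the Paddle API).
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>

struct Node {  // stand-in for ir::Node
  std::string name;
};

// Mirrors GenerateCacheKey: anchor + "_" + block_id + "_" + var_name.
std::string GenerateCacheKey(const std::string& anchor,
                             const std::string& var_name,
                             int block_id) {
  return anchor + "_" + std::to_string(block_id) + "_" + var_name;
}

class NodeCache {  // simplified analogue of FusedAttentionPassCache
 public:
  Node* Get(const std::string& key) {
    auto it = cache_.find(key);
    if (it == cache_.end()) {
      throw std::invalid_argument("The key (" + key + ") does not exist.");
    }
    return it->second;
  }
  void Insert(const std::string& key, Node* node) {
    if (!cache_.emplace(key, node).second) {
      throw std::invalid_argument("The key (" + key + ") already exists.");
    }
  }
  void Reset() { cache_.clear(); }

 private:
  std::unordered_map<std::string, Node*> cache_;
};

int main() {
  Node ln_scale{"ln_scale"};
  NodeCache cache;
  // Forward rewrite: record a node under anchor "fused_attention" in block 0.
  cache.Insert(GenerateCacheKey("fused_attention", "ln_scale", 0), &ln_scale);
  // Backward rewrite: retrieve the same node by rebuilding the same key.
  std::cout << cache.Get("fused_attention_0_ln_scale")->name << "\n";
  return 0;
}

Including the block id (and the anchor) in the key presumably keeps entries from different blocks or different pattern matches from colliding when the same variable name appears more than once.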
......@@ -375,7 +375,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("BiasDropoutResidualOut",
"Result of residual + dropout(src + bias).")
.AsIntermediate();
AddOutput("CacheKVOut", "The udpated cache KV.");
AddOutput("CacheKVOut", "The udpated cache KV.").AsDispensable();
AddOutput("Y", "Result after attention.");
AddAttr<int>("num_heads", "The number head for multi_head_attention.")
......
......@@ -157,11 +157,20 @@ class TestFusedAttentionPass(unittest.TestCase):
assert ops[2].type == 'fused_attention'
assert ops[3].type == 'reduce_mean'
assert ops[5].type == 'reduce_mean_grad'
assert ops[6].type == 'fused_attention_grad'
# two ops for linear, one op for reduce mean
# one fill constant
# one op for reduce mean grad, two ops for linear bwd
# after those, ops[8] and ops[9] should be the sgd optimizer ops
assert ops[8].type == 'sgd'
assert ops[9].type == 'sgd'
exe = paddle.static.Executor()
exe.run(startup_prog)
rst = exe.run(
main_prog,
feed={'x': x_data, 'attn_mask': mask_data},
fetch_list=[loss],
)
if __name__ == "__main__":
......