Unverified commit f55f9d79, authored by wz1qqx, committed by GitHub

[XPU]Add act add fuse (#53965)

Parent 75fc4bf0
......@@ -248,6 +248,8 @@ if(WITH_XPU)
pass_library(stack_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
pass_library(fused_multi_transformer_cachekv_layout_trans_pass inference DIR
xpu DEPS ${XPU_PASS_DEPS})
pass_library(add_activation_xpu_fuse_pass inference DIR xpu DEPS
${XPU_PASS_DEPS})
endif()
cc_library(
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace ir {
namespace patterns {
/*
fuse the ele_add + activation block into an add_act_xpu op
For example:
graph:
ele_x
|
|
elementwise_add -----ele_y
|
|
act
|
|
out_Out
------------------------------------------------------
After the pass is applied:
Input
| ele_y
| /
| /
Input_max ---- add_act_fusion ---- ele_y_max
| \
| \
| OutputMax
Output
*/
struct AddActXPUPattern : public PatternBase {
AddActXPUPattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& act_type);
// declare operator node's name
PATTERN_DECL_NODE(ele_add);
PATTERN_DECL_NODE(act);
// declare variable node's name
PATTERN_DECL_NODE(ele_x);
PATTERN_DECL_NODE(ele_y);
PATTERN_DECL_NODE(ele_out);
PATTERN_DECL_NODE(act_out);
private:
std::string act_type_;
};
AddActXPUPattern::AddActXPUPattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& act_type)
: PatternBase(pattern, name_scope, name_scope), act_type_(act_type) {
auto ele_add =
pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add");
auto ele_x = pattern->NewNode(ele_x_repr())
->assert_is_op_input("elementwise_add", "X")
->assert_var_not_persistable()
->AsInput();
auto ele_y = pattern->NewNode(ele_y_repr())
->assert_is_op_input("elementwise_add", "Y")
->assert_var_not_persistable()
->AsInput();
auto ele_out = pattern->NewNode(ele_out_repr())
->assert_is_op_output("elementwise_add", "Out")
->assert_has_n_outputs(1);
ele_add->LinksFrom({ele_x, ele_y}).LinksTo({ele_out});
ele_out->assert_is_op_input(act_type_, "X");
auto act = pattern->NewNode(act_repr())->assert_is_op(act_type_);
auto act_out =
pattern->NewNode(act_out_repr())->assert_is_op_output(act_type_, "Out");
act->LinksFrom({ele_out}).LinksTo({act_out});
}
} // namespace patterns
class AddActXPUFusePass : public FusePassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
private:
int ApplyImpl(ir::Graph* graph, const std::string& act_type) const;
const std::string name_scope_{"add_activation_xpu_fuse_pass"};
};
void AddActXPUFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::PreconditionNotMet("graph should not be null."));
Init(name_scope_, graph);
int found_subgraph_count = 0;
for (auto act_type : {"relu", "gelu"}) {
found_subgraph_count += ApplyImpl(graph, act_type);
}
AddStatis(found_subgraph_count);
}
int AddActXPUFusePass::ApplyImpl(ir::Graph* graph,
const std::string& act_type) const {
GraphPatternDetector gpd;
patterns::AddActXPUPattern pattern(
gpd.mutable_pattern(), name_scope_, act_type);
int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(4) << "handle AddActXPUFusePass fuse";
/* declare operator node's name */
GET_IR_NODE(ele_add);
GET_IR_NODE(act);
/* declare variable node's name*/
GET_IR_NODE(ele_x);
GET_IR_NODE(ele_y);
GET_IR_NODE(ele_out);
GET_IR_NODE(act_out);
auto* block = ele_add->Op()->Block();
auto* scope = param_scope();
PADDLE_ENFORCE_NOT_NULL(
scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
std::string fused_op_out_name;
fused_op_out_name = act_out->Name();
std::string fused_op_out_max_name = fused_op_out_name + "_max";
VarDesc fused_op_out_max_desc(fused_op_out_max_name);
Node* fused_op_out_max = graph->CreateVarNode(&fused_op_out_max_desc);
// Generate add_act fused op
framework::OpDesc fused_op_desc(block);
fused_op_desc.SetType("add_act_xpu");
// set attrs for fused op
fused_op_desc.SetAttr("act_type", ConvertActivationType(act_type));
fused_op_desc.SetInput("x", {ele_x->Name()});
fused_op_desc.SetInput("y", {ele_y->Name()});
fused_op_desc.SetOutput("out", {fused_op_out_name});
fused_op_desc.SetOutput("out_max", {fused_op_out_max_name});
// relink fused op
auto* fused_op = graph->CreateOpNode(&fused_op_desc);
IR_NODE_LINK_TO(ele_x, fused_op);
IR_NODE_LINK_TO(ele_y, fused_op);
IR_NODE_LINK_TO(fused_op, act_out);
IR_NODE_LINK_TO(fused_op, fused_op_out_max);
// delete useless node
std::unordered_set<const Node*> delete_nodes = {ele_add, act, ele_out};
GraphSafeRemoveNodes(graph, delete_nodes);
found_subgraph_count++;
};
gpd(graph, handler);
return found_subgraph_count;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(add_activation_xpu_fuse_pass,
paddle::framework::ir::AddActXPUFusePass);
REGISTER_PASS_CAPABILITY(add_activation_xpu_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination().EQ(
"add_act_xpu", 0));
......@@ -12,13 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.h"
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -36,165 +35,211 @@ class Scope;
namespace paddle {
namespace framework {
namespace ir {
namespace patterns {
struct LinkAddActPattern : public PatternBase {
LinkAddActPattern(PDPattern* pattern, const std::string& name_scope);
// declare operator node's name
PATTERN_DECL_NODE(fusion_op);
// declare variable node's name
PATTERN_DECL_NODE(x);
PATTERN_DECL_NODE(ele_y);
};
LinkAddActPattern::LinkAddActPattern(PDPattern* pattern,
const std::string& name_scope)
: PatternBase(pattern, name_scope, name_scope) {
auto* fusion_op =
pattern->NewNode(fusion_op_repr())->assert_is_op("add_act_xpu");
auto* x = pattern->NewNode(x_repr())->assert_is_op_input("add_act_xpu", "x");
auto* ele_y =
pattern->NewNode(ele_y_repr())->assert_is_op_input("add_act_xpu", "y");
fusion_op->LinksFrom({x, ele_y});
}
struct FusionXPUOpPattern : public PatternBase {
FusionXPUOpPattern(PDPattern* pattern,
struct LinkConv2dPattern : public PatternBase {
LinkConv2dPattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& op_type,
bool with_branch);
// declare operator node's name
PATTERN_DECL_NODE(fusion_op);
// declare variable node's name
PATTERN_DECL_NODE(input);
PATTERN_DECL_NODE(x);
PATTERN_DECL_NODE(branch);
private:
std::string op_type_;
bool with_branch_{false};
};
FusionXPUOpPattern::FusionXPUOpPattern(PDPattern* pattern,
LinkConv2dPattern::LinkConv2dPattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& op_type,
bool with_branch)
: PatternBase(pattern, name_scope, name_scope),
op_type_(op_type),
with_branch_(with_branch) {
auto* fusion_op = pattern->NewNode(fusion_op_repr())->assert_is_op(op_type_);
auto* input =
pattern->NewNode(input_repr())->assert_is_op_input(op_type_, "x");
: PatternBase(pattern, name_scope, name_scope), with_branch_(with_branch) {
auto* fusion_op =
pattern->NewNode(fusion_op_repr())->assert_is_op("conv2d_xpu");
auto* x = pattern->NewNode(x_repr())->assert_is_op_input("conv2d_xpu", "x");
PDNode* branch = nullptr;
if (with_branch_) {
branch =
pattern->NewNode(branch_repr())->assert_is_op_input(op_type_, "branch");
fusion_op->LinksFrom({input, branch});
} else {
fusion_op->LinksFrom({input});
branch = pattern->NewNode(branch_repr())
->assert_is_op_input("conv2d_xpu", "branch");
fusion_op->LinksFrom({branch});
}
fusion_op->LinksFrom({x});
}
} // namespace patterns
struct LinkFcPattern : public PatternBase {
LinkFcPattern(PDPattern* pattern, const std::string& name_scope);
class LinkXPUOpMaxPass : public FusePassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
// declare operator node's name
PATTERN_DECL_NODE(fusion_op);
// declare variable node's name
PATTERN_DECL_NODE(x);
};
private:
void ApplyImpl(ir::Graph* graph,
const std::string& op_type,
bool with_branch) const;
LinkFcPattern::LinkFcPattern(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, name_scope) {
auto* fusion_op = pattern->NewNode(fusion_op_repr())->assert_is_op("fc_xpu");
auto* x = pattern->NewNode(x_repr())->assert_is_op_input("fc_xpu", "x");
const std::string name_scope_{"link_xpu_op_max_pass"};
// ops with x_max/out_max
std::set<std::string> op_types_{"fc_xpu", "conv2d_xpu"};
};
fusion_op->LinksFrom({x});
}
/*
Origin subgraph:
fusion_xpu_op0
/ \
| |
out0 out0_max
|
\
fusion_op
Fused subgraph:
fusion_xpu_op0
/ \
| |
out0 out0_max
| |
\ /
fusion_op
Origin subgraph1:
fusion_xpu_op0 fusion_xpu_op1
/ \ / \
| | | |
out0 out0_max out1 out1_max
| |
(x) \ / (branch)
fusion_xpu_op2
Fused subgraph1:
fusion_xpu_op0 fusion_xpu_op1
/ \ / \
| | | |
out0 out0_max out1 out1_max
| | | |
(x) \ |(x_max) |(branch) /(branch_max)
\ | | /
\ | | /
\ | | /
fusion_xpu_op2
*/
void LinkXPUOpMaxPass::ApplyImpl(ir::Graph* graph) const {
Init(name_scope_, graph);
for (auto op_type : op_types_) {
for (auto with_branch : {true, false}) {
ApplyImpl(graph, op_type, with_branch);
} // namespace patterns
void LinkXPUOpMaxPass::LinkAddActMax(ir::Graph* graph) const {
GraphPatternDetector gpd;
patterns::LinkAddActPattern pattern(gpd.mutable_pattern(), name_scope_);
int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(4) << "handle LinkAddActMax";
/* declare operator node's name */
GET_IR_NODE(fusion_op);
/* declare variable node's name*/
GET_IR_NODE(x);
GET_IR_NODE(ele_y);
auto* fusion_op_desc = fusion_op->Op();
auto* x_pre_op = x->inputs[0]->Op();
if (x->inputs.size() > 0 && x->inputs[0]->IsOp() &&
x_pre_op->HasOutput("out_max")) {
auto preop_max_var_name = x_pre_op->Output("out_max");
for (auto max_node : x->inputs[0]->outputs) {
if (preop_max_var_name[0] == max_node->Name()) {
fusion_op_desc->SetInput("x_max", {max_node->Name()});
IR_NODE_LINK_TO(max_node, fusion_op);
}
}
}
auto* ele_y_pre_op = ele_y->inputs[0]->Op();
if (ele_y->inputs.size() > 0 && ele_y->inputs[0]->IsOp() &&
ele_y_pre_op->HasOutput("out_max")) {
auto preop_max_var_name = ele_y_pre_op->Output("out_max");
for (auto max_node : ele_y->inputs[0]->outputs) {
if (preop_max_var_name[0] == max_node->Name()) {
fusion_op_desc->SetInput("y_max", {max_node->Name()});
IR_NODE_LINK_TO(max_node, fusion_op);
}
}
}
found_subgraph_count++;
};
gpd(graph, handler);
AddStatis(found_subgraph_count);
}
void LinkXPUOpMaxPass::ApplyImpl(ir::Graph* graph,
const std::string& op_type,
bool with_branch) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::PreconditionNotMet("graph should not be null."));
void LinkXPUOpMaxPass::LinkConv2dMax(ir::Graph* graph, bool with_branch) const {
GraphPatternDetector gpd;
patterns::FusionXPUOpPattern pattern(
gpd.mutable_pattern(), name_scope_, op_type, with_branch);
patterns::LinkConv2dPattern pattern(
gpd.mutable_pattern(), name_scope_, with_branch);
int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(4) << "handle LinkXPUOpMaxPass fuse";
VLOG(4) << "handle LinkConv2dMax";
/* declare operator node's name */
GET_IR_NODE(fusion_op);
GET_IR_NODE(input);
/* declare variable node's name*/
GET_IR_NODE(x);
GET_IR_NODE(branch);
auto* fusion_op_desc = fusion_op->Op();
if (fusion_op_desc->HasAttr("has_branch")) {
bool fusion_op_branch =
PADDLE_GET_CONST(bool, fusion_op_desc->GetAttr("has_branch"));
if (fusion_op_branch != with_branch) {
return;
}
}
if (input->inputs.size() > 0 && input->inputs[0]->IsOp() &&
input->inputs[0]->Op()->HasOutput("out_max")) {
auto input_max_name = input->inputs[0]->Op()->Output("out_max");
for (auto max_node : input->inputs[0]->outputs) {
if (input_max_name[0] == max_node->Name()) {
auto* x_pre_op = x->inputs[0]->Op();
if (x->inputs.size() > 0 && x->inputs[0]->IsOp() &&
x_pre_op->HasOutput("out_max")) {
auto preop_max_var_name = x_pre_op->Output("out_max");
for (auto max_node : x->inputs[0]->outputs) {
if (preop_max_var_name[0] == max_node->Name()) {
fusion_op_desc->SetInput("x_max", {max_node->Name()});
IR_NODE_LINK_TO(max_node, fusion_op);
found_subgraph_count++;
}
}
}
if (with_branch) {
auto* branch_pre_op = branch->inputs[0]->Op();
if (branch->inputs.size() > 0 && branch->inputs[0]->IsOp() &&
branch->inputs[0]->Op()->HasOutput("out_max")) {
auto branch_max_name = branch->inputs[0]->Op()->Output("out_max");
branch_pre_op->HasOutput("out_max")) {
auto preop_max_var_name = branch_pre_op->Output("out_max");
for (auto max_node : branch->inputs[0]->outputs) {
if (branch_max_name[0] == max_node->Name()) {
if (preop_max_var_name[0] == max_node->Name()) {
fusion_op_desc->SetInput("branch_max", {max_node->Name()});
IR_NODE_LINK_TO(max_node, fusion_op);
found_subgraph_count++;
}
}
}
}
found_subgraph_count++;
};
gpd(graph, handler);
AddStatis(found_subgraph_count);
}
void LinkXPUOpMaxPass::LinkFcMax(ir::Graph* graph) const {
GraphPatternDetector gpd;
patterns::LinkFcPattern pattern(gpd.mutable_pattern(), name_scope_);
int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(4) << "handle LinkFcMax";
/* declare operator node's name */
GET_IR_NODE(fusion_op);
/* declare variable node's name*/
GET_IR_NODE(x);
auto* fusion_op_desc = fusion_op->Op();
auto* x_pre_op = x->inputs[0]->Op();
if (x->inputs.size() > 0 && x->inputs[0]->IsOp() &&
x_pre_op->HasOutput("out_max")) {
auto preop_max_var_name = x_pre_op->Output("out_max");
for (auto max_node : x->inputs[0]->outputs) {
if (preop_max_var_name[0] == max_node->Name()) {
fusion_op_desc->SetInput("x_max", {max_node->Name()});
IR_NODE_LINK_TO(max_node, fusion_op);
}
}
}
found_subgraph_count++;
};
gpd(graph, handler);
AddStatis(found_subgraph_count);
}
void LinkXPUOpMaxPass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::PreconditionNotMet("graph should not be null."));
Init(name_scope_, graph);
LinkFcMax(graph);
for (auto with_branch : {true, false}) {
LinkConv2dMax(graph, with_branch);
}
LinkAddActMax(graph);
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -203,5 +248,7 @@ REGISTER_PASS(link_xpu_op_max_pass, paddle::framework::ir::LinkXPUOpMaxPass);
REGISTER_PASS_CAPABILITY(link_xpu_op_max_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination().EQ(
"fc_xpu", 0));
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("fc_xpu", 0)
.EQ("conv2d_xpu", 0)
.EQ("add_act_xpu", 0));
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace ir {
class LinkXPUOpMaxPass : public FusePassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
private:
/*
Origin subgraph:
fusion_xpu_op0
/ \
| |
out0 out0_max
|
\
fc_xpu
Fused subgraph:
fusion_xpu_op0
/ \
| |
out0 out0_max
| |
\ /
fc_xpu
*/
void LinkFcMax(ir::Graph* graph) const;
/*
Origin subgraph:
fusion_xpu_op0 fusion_xpu_op1
/ \ / \
| | | |
out0 out0_max out1 out1_max
| |
(x) \ / (branch)
conv2d_xpu
Fused subgraph:
fusion_xpu_op0 fusion_xpu_op1
/ \ / \
| | | |
out0 out0_max out1 out1_max
| | | |
(x) \ |(x_max) |(branch) /(branch_max)
\ | | /
\ | | /
\ | | /
conv2d_xpu
*/
void LinkConv2dMax(ir::Graph* graph, bool with_branch) const;
/*
Origin subgraph:
fusion_xpu_op0 fusion_xpu_op1
/ \ / \
| | | |
out0 out0_max out1 out1_max
| |
(x) \ / (y)
add_act_xpu
Fused subgraph:
fusion_xpu_op0 fusion_xpu_op1
/ \ / \
| | | |
out0 out0_max out1 out1_max
| | | |
(x) \ |(x_max) |(y) /(y_max)
\ | | /
\ | | /
\ | | /
add_act_xpu
*/
void LinkAddActMax(ir::Graph* graph) const;
const std::string name_scope_{"link_xpu_op_max_pass"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -527,6 +527,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
"sigmoid_elementmul_fuse_pass",
"fc_xpu_fuse_pass",
"conv2d_xpu_fuse_pass",
"add_activation_xpu_fuse_pass",
"link_xpu_op_max_pass",
"inplace_op_var_pass",
"delete_isolated_node_pass",
......
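The new pass is placed before link_xpu_op_max_pass so that the add_act_xpu nodes it creates can still get their x_max/y_max inputs linked. A hedged sketch of disabling the fusion for an A/B comparison from the Python inference API (model paths are hypothetical; Config.delete_pass and enable_xpu are assumed to be available as in current Paddle Inference):
import paddle.inference as paddle_infer

config = paddle_infer.Config("model.pdmodel", "model.pdiparams")  # hypothetical paths
config.enable_xpu()
# Drop the new fusion pass to compare accuracy/performance against the unfused graph
config.delete_pass("add_activation_xpu_fuse_pass")
predictor = paddle_infer.create_predictor(config)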
......@@ -4,6 +4,16 @@
# if one operator have "support_dygraph_mode : true", it supports dygraph mode,
# otherwise the operator only could be used in static mode.
- op : add_act_xpu
args : (Tensor x, Tensor x_max, Tensor y, Tensor y_max, int act_type)
output : Tensor(out), Tensor(out_max)
infer_meta :
func : AddActXPUInferMeta
kernel :
func : add_act_xpu
data_type : x
optional : x_max, y_max
- op : conv2d_xpu
args : (Tensor x, Tensor x_max, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, Tensor branch_max, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, bool has_bias, bool has_branch, int act_type, float act_param)
output : Tensor(out), Tensor(out_max)
......
......@@ -22,6 +22,8 @@ namespace xpu {
XPUOpMap& get_kl2_ops() {
// Ops supported on KL2, indexed by op_name, data_type, and place
static XPUOpMap s_xpu2_kernels{
{"add_act_xpu",
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
{"abs", XPUKernelSet({phi::DataType::FLOAT32})},
{"abs_grad",
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
......
......@@ -19,9 +19,66 @@ limitations under the License. */
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
namespace phi {
void AddActXPUInferMeta(const MetaTensor& x,
const MetaTensor& x_max,
const MetaTensor& y,
const MetaTensor& y_max,
int act_type,
MetaTensor* out,
MetaTensor* out_max) {
int axis = -1;
if (x.dims() != y.dims()) {
auto x_dims = x.dims();
auto y_dims = y.dims();
int max_dim = std::max(x_dims.size(), y_dims.size());
if (x_dims.size() == y_dims.size()) {
PADDLE_ENFORCE_EQ((axis == -1) || (axis == 0),
true,
phi::errors::InvalidArgument(
"axis should be -1 or 0 while the dimension of "
"tensor X (%s) is equal to the dimension of "
"tensor Y (%s), but received axis: %s",
x_dims.size(),
y_dims.size(),
axis));
}
PADDLE_ENFORCE_EQ((axis >= (-1 * max_dim)) && (axis < max_dim),
true,
phi::errors::InvalidArgument(
"The axis range must be [%s, %s), but axis is %s. "
"Please set the axis again.",
-1 * max_dim,
max_dim,
axis));
axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
: axis);
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
funcs::GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
auto out_dims = phi::make_ddim(out_dims_array);
out->set_dims(out_dims);
} else {
out->set_dims(x.dims());
}
out->set_dtype(x.dtype());
out->set_layout(x.layout());
out->share_lod(x);
out_max->set_dims(phi::make_ddim({6}));
out_max->set_dtype(x.dtype());
out_max->set_layout(x.layout());
}
inline int ConvOutSize(int input_size,
int filter_size,
int dilation,
......
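AddActXPUInferMeta resolves the output shape with the same right-aligned broadcasting rule as elementwise_add with axis = -1 (via funcs::GetBroadcastDimsArrays). A quick NumPy illustration of that rule, for intuition only:
import numpy as np

x_shape = (4, 3, 16, 16)
y_shape = (3, 1, 16)  # lower rank, right-aligned against x when axis == -1
print(np.broadcast_shapes(x_shape, y_shape))  # (4, 3, 16, 16) -> dims set on `out`
# out_max, by contrast, is always a fixed 6-element float buffer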
......@@ -22,6 +22,14 @@ namespace phi {
// Common InferMeta Functions for fusion operators.
// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
void AddActXPUInferMeta(const MetaTensor& x,
const MetaTensor& x_max,
const MetaTensor& y,
const MetaTensor& y_max,
int act_type,
MetaTensor* out,
MetaTensor* out_max);
void Conv2dXPUInferMeta(const MetaTensor& x,
const MetaTensor& x_max,
const MetaTensor& filter,
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
namespace fusion {
template <typename T, typename Context>
void AddActXPUKernel(const Context& ctx,
const DenseTensor& x,
const paddle::optional<DenseTensor>& x_max,
const DenseTensor& y,
const paddle::optional<DenseTensor>& y_max,
int act_type,
DenseTensor* out,
DenseTensor* out_max) {
using XPUType = typename XPUTypeTrait<T>::Type;
auto* x_data = reinterpret_cast<const XPUType*>(x.data<T>());
const float* x_max_data =
x_max.get_ptr() == nullptr ? nullptr : x_max.get_ptr()->data<float>();
auto* y_data = reinterpret_cast<const XPUType*>(y.data<T>());
const float* y_max_data =
y_max.get_ptr() == nullptr ? nullptr : y_max.get_ptr()->data<float>();
auto* out_data = reinterpret_cast<XPUType*>(ctx.template Alloc<T>(out));
std::vector<int64_t> x_shape = phi::vectorize(x.dims());
std::vector<int64_t> y_shape = phi::vectorize(y.dims());
xpu::Activation_t act(static_cast<xpu::Activation_t::act_enum>(act_type));
int r =
xpu::add_activation_fusion<XPUType, XPUType, XPUType>( // TX/TY/TZ/TID
/* baidu::xpu::api::Context* ctx */ ctx.x_context(),
/* const TX* x */ x_data,
/* const TY* y */ y_data,
/* TZ* z */ out_data,
/* const std::vector<int64_t>& x_shape */ x_shape,
/* const std::vector<int64_t>& y_shape */ y_shape,
/* const float* max_x */ x_max_data,
/* const float* max_y */ y_max_data,
/* float* max_z */ ctx.template Alloc<float>(out_max),
/* const baidu::xpu::api::Activation_t& act */ act);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "add_act_xpu");
}
} // namespace fusion
} // namespace phi
PD_REGISTER_KERNEL(add_act_xpu,
XPU,
ALL_LAYOUT,
phi::fusion::AddActXPUKernel,
float,
phi::dtype::float16) {}
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from functools import partial
import hypothesis.strategies as st
import numpy as np
from auto_scan_test import PassAutoScanTest
from program_config import OpConfig, ProgramConfig, TensorConfig
class TestAddActXPUFusePass(PassAutoScanTest):
    def sample_predictor_configs(self, program_config):
        config = self.create_inference_config(use_xpu=True)
        yield config, ["add_act_xpu"], (1e-3, 1e-3)

    def sample_program_config(self, draw):
        batch_size = draw(st.integers(min_value=1, max_value=50))

        # Generate input tensors X and Y of elementwise_add
        def generate_input():
            return np.random.random([batch_size, 3, 100, 100]).astype(
                np.float32
            )

        axis = -1

        # Here we will compose a program
        # There is still some risk that the program is invalid or causes bugs while running
        # Use function `is_program_valid` to filter out invalid programs before running
        # Use function `add_skip_pass_case` to ignore programs even if they cause bugs while running
        elementwise_op = OpConfig(
            type='elementwise_add',
            inputs={'X': ['eltwise_X'], 'Y': ['eltwise_Y']},
            outputs={'Out': ['eltwise_output']},
            axis=axis,
        )
        relu_op = OpConfig(
            "relu",
            inputs={"X": ["eltwise_output"]},
            outputs={"Out": ["relu_out"]},
        )

        mini_graph = [elementwise_op, relu_op]

        program_config = ProgramConfig(
            ops=mini_graph,
            weights={},
            inputs={
                "eltwise_X": TensorConfig(data_gen=partial(generate_input)),
                "eltwise_Y": TensorConfig(data_gen=partial(generate_input)),
            },
            outputs=mini_graph[-1].outputs["Out"],
        )
        return program_config

    def test(self):
        self.run_and_statis(
            quant=False,
            max_examples=25,
            passes=["add_activation_xpu_fuse_pass"],
        )


if __name__ == "__main__":
    unittest.main()
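The test above only exercises relu, while the pass also matches gelu. A hypothetical variation of sample_program_config could draw the activation type as well; the names act_type, act_op, and act_out below are introduced for this sketch and are not part of the original test:
        # inside sample_program_config, replacing the relu_op block
        act_type = draw(st.sampled_from(["relu", "gelu"]))
        act_op = OpConfig(
            act_type,
            inputs={"X": ["eltwise_output"]},
            outputs={"Out": ["act_out"]},
        )
        mini_graph = [elementwise_op, act_op]
        # outputs=mini_graph[-1].outputs["Out"] then exposes act_out unchanged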