Unverified · Commit 7c8c9b7d authored by leolishaohao, committed by GitHub

[XPU] add squeeze_excitation_block_xpu op&pass to optimize ppocr_v3_det model (#56773)

* [XPU] add squeeze_excitation_block_xpu op&pass to optimize ppocr_v3_det model test=kunlun

* fix

* fix code style

* remove xpu name
Parent commit: c170074d
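As background, the subgraph this pass fuses is the standard squeeze-and-excitation block: a global average pool over H and W, two 1x1 convolutions (channel-reducing then channel-expanding, each optionally with bias and activation), a channel-wise multiply with the original input, and an optional residual branch add plus final activation. A minimal NumPy sketch, assuming relu/sigmoid activations and illustrative names (the pass itself matches conv2d_xpu ops and accepts several activation types):

import numpy as np

def se_block(x, w1, b1, w2, b2, branch=None):
    # x: [N, C, H, W]; w1: [C//r, C]; w2: [C, C//r]
    squeeze = x.mean(axis=(2, 3))                      # global average pool -> [N, C]
    mid = np.maximum(squeeze @ w1.T + b1, 0.0)         # 1x1 conv (reduce) + relu
    scale = 1.0 / (1.0 + np.exp(-(mid @ w2.T + b2)))   # 1x1 conv (expand) + sigmoid
    out = x * scale[:, :, None, None]                  # channel-wise re-weighting
    if branch is not None:                             # optional residual branch
        out = out + branch
    return out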
@@ -290,6 +290,8 @@ if(WITH_XPU)
pass_library(fast_where_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
pass_library(fast_layernorm_xpu_fuse_pass inference DIR xpu DEPS
${XPU_PASS_DEPS})
pass_library(squeeze_excitation_fuse_pass inference DIR xpu DEPS
${XPU_PASS_DEPS})
pass_library(elementwise_mul_add_fuse_pass inference DIR xpu DEPS
${XPU_PASS_DEPS})
endif()
@@ -615,4 +617,8 @@ if(WITH_XPU)
test_fast_where_xpu_fuse_pass
SRCS xpu/fast_where_xpu_fuse_pass_test.cc
DEPS fast_where_xpu_fuse_pass)
cc_test(
test_squeeze_excitation_fuse_pass
SRCS xpu/squeeze_excitation_fuse_pass_test.cc
DEPS squeeze_excitation_fuse_pass)
endif()
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass.h"
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
namespace math {
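// Transpose a row-major h x w matrix `in` into a w x h matrix `out`.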
template <typename T>
static inline void Transpose(const T* in, T* out, int h, int w) {
for (int h1 = 0; h1 < w; ++h1) {
for (int w1 = 0; w1 < h; ++w1) {
out[h1 * h + w1] = in[w1 * w + h1];
}
}
}
} // namespace math
namespace patterns {
struct SqueezeExcitationFusePattern : public PatternBase {
SqueezeExcitationFusePattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& op_type,
const std::string& act_type,
bool with_branch,
bool with_bias);
// declare operator node's name
PATTERN_DECL_NODE(pool2d);
PATTERN_DECL_NODE(mul_1);
PATTERN_DECL_NODE(mul_2);
PATTERN_DECL_NODE(ew_mul);
PATTERN_DECL_NODE(ew_branch_add);
PATTERN_DECL_NODE(block_act);
// declare variable node's name
PATTERN_DECL_NODE(x);
PATTERN_DECL_NODE(pool2d_out);
PATTERN_DECL_NODE(mul_1_w);
PATTERN_DECL_NODE(mul_1_w_max);
PATTERN_DECL_NODE(mul_1_bias);
PATTERN_DECL_NODE(mul_1_out);
PATTERN_DECL_NODE(mul_1_out_max);
PATTERN_DECL_NODE(mul_2_w);
PATTERN_DECL_NODE(mul_2_w_max);
PATTERN_DECL_NODE(mul_2_bias);
PATTERN_DECL_NODE(mul_2_out);
PATTERN_DECL_NODE(mul_2_out_max);
PATTERN_DECL_NODE(ew_mul_out);
PATTERN_DECL_NODE(ew_branch_add_in);
PATTERN_DECL_NODE(ew_branch_add_out);
PATTERN_DECL_NODE(block_act_out);
};
SqueezeExcitationFusePattern::SqueezeExcitationFusePattern(
PDPattern* pattern,
const std::string& name_scope,
const std::string& op_type,
const std::string& act_type,
bool with_branch,
bool with_bias)
: PatternBase(pattern, name_scope, name_scope) {
auto* x = pattern->NewNode(x_repr())
->assert_is_op_input("pool2d", "X")
->assert_is_op_input("elementwise_mul", "X")
->AsInput();
auto pool2d_teller = [](const Node* x) {
auto* op_desc = x->Op();
bool has_adap = op_desc->HasAttr("adaptive");
if (has_adap) {
auto ksize =
PADDLE_GET_CONST(std::vector<int>, op_desc->GetAttr("ksize"));
if (ksize[0] != 1 || ksize[1] != 1) {
return false;
}
} else if (PADDLE_GET_CONST(bool, op_desc->GetAttr("global_pooling")) ==
false) {
return false;
}
return true;
};
auto* pool2d = pattern->NewNode(pool2d_repr())
->assert_is_op("pool2d")
->assert_op_attr<std::string>("pooling_type", "avg")
->assert_more(pool2d_teller);
auto* pool2d_out = pattern->NewNode(pool2d_out_repr())
->assert_is_op_output("pool2d", "Out")
->assert_is_op_input(op_type, "x");
auto mul_w_teller = [](const Node* x) {
auto* var_desc = x->Var();
auto filter_dims = var_desc->GetShape();
auto in_c = filter_dims[0];
auto out_c = filter_dims[1];
auto bigger = std::max(in_c, out_c);
auto smaller = std::min(in_c, out_c);
if (bigger % smaller != 0) {
return false;
}
return true;
};
auto* mul_1 = pattern->NewNode(mul_1_repr())->assert_is_op(op_type);
auto* mul_1_w = pattern->NewNode(mul_1_w_repr())
->assert_is_op_input(op_type, "filter")
->assert_more(mul_w_teller);
auto* mul_1_w_max = pattern->NewNode(mul_1_w_max_repr())
->assert_is_op_input(op_type, "filter_max");
auto* mul_1_out = pattern->NewNode(mul_1_out_repr())
->assert_is_op_output(op_type, "out")
->assert_is_op_input(op_type, "x");
auto* mul_1_out_max = pattern->NewNode(mul_1_out_max_repr())
->assert_is_op_output(op_type, "out_max");
auto* mul_2 = pattern->NewNode(mul_2_repr())->assert_is_op(op_type);
auto* mul_2_w = pattern->NewNode(mul_2_w_repr())
->assert_is_op_input(op_type, "filter")
->assert_more(mul_w_teller);
auto* mul_2_w_max = pattern->NewNode(mul_2_w_max_repr())
->assert_is_op_input(op_type, "filter_max");
auto* mul_2_out = pattern->NewNode(mul_2_out_repr())
->assert_is_op_output(op_type, "out")
->assert_is_op_input("elementwise_mul", "Y");
auto* mul_2_out_max = pattern->NewNode(mul_2_out_max_repr())
->assert_is_op_output(op_type, "out_max");
PDNode* mul_1_bias = nullptr;
PDNode* mul_2_bias = nullptr;
if (with_bias) {
mul_1_bias = pattern->NewNode(mul_1_bias_repr())
->assert_is_op_input(op_type, "bias");
mul_2_bias = pattern->NewNode(mul_2_bias_repr())
->assert_is_op_input(op_type, "bias");
}
auto* ew_mul =
pattern->NewNode(ew_mul_repr())->assert_is_op("elementwise_mul");
auto* ew_mul_out = pattern->NewNode(ew_mul_out_repr())
->assert_is_op_output("elementwise_mul", "Out");
// branch
PDNode* ew_branch_add_in = nullptr;
PDNode* ew_branch_add = nullptr;
PDNode* ew_branch_add_out = nullptr;
if (with_branch) {
ew_branch_add_in = pattern->NewNode(ew_branch_add_in_repr())
->assert_is_op_input("elementwise_add", "X")
->AsInput();
ew_branch_add =
pattern->NewNode(ew_branch_add_repr())->assert_is_op("elementwise_add");
ew_branch_add_out = pattern->NewNode(ew_branch_add_out_repr())
->assert_is_op_output("elementwise_add", "out");
}
// act
PDNode* block_act = nullptr;
PDNode* block_act_out = nullptr;
if (act_type != "linear") {
block_act = pattern->NewNode(block_act_repr())->assert_is_op(act_type);
block_act_out = pattern->NewNode(block_act_out_repr())
->assert_is_op_output(act_type, "Out");
}
// pass
pool2d->LinksFrom({x}).LinksTo({pool2d_out});
mul_1->LinksFrom({mul_1_w, mul_1_w_max, pool2d_out})
.LinksTo({mul_1_out, mul_1_out_max});
mul_2->LinksFrom({mul_2_w, mul_2_w_max, mul_1_out})
.LinksTo({mul_2_out, mul_2_out_max});
ew_mul->LinksFrom({x, mul_2_out}).LinksTo({ew_mul_out});
if (with_branch) {
ew_mul_out->assert_is_op_input("elementwise_add", "Y");
ew_branch_add->LinksFrom({ew_mul_out, ew_branch_add_in})
.LinksTo({ew_branch_add_out});
} else {
ew_branch_add_out = ew_mul_out;
}
if (act_type != "linear") {
ew_branch_add_out->assert_is_op_input(act_type, "X");
block_act->LinksFrom({ew_branch_add_out}).LinksTo({block_act_out});
} else {
block_act_out = ew_branch_add_out;
}
if (with_bias) {
mul_1->LinksFrom({mul_1_bias});
mul_2->LinksFrom({mul_2_bias});
}
}
} // namespace patterns
void SqueezeExcitationFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph,
platform::errors::PreconditionNotMet("graph should not be null. "));
Init(name_scope_, graph);
int found_subgraph_count = 0;
for (auto with_branch : {true, false}) {
for (auto with_bias : {true, false}) {
for (auto op_type : {"conv2d_xpu"}) {
for (auto act_type : {"relu",
"sigmoid",
"tanh",
"leaky_relu",
"hard_swish",
"hard_sigmoid",
"relu6",
"linear"}) {
found_subgraph_count +=
ApplyImpl(graph, op_type, act_type, with_branch, with_bias);
}
}
}
}
AddStatis(found_subgraph_count);
}
int SqueezeExcitationFusePass::ApplyImpl(ir::Graph* graph,
const std::string& op_type,
const std::string& act_type,
bool with_branch,
bool with_bias) const {
GraphPatternDetector gpd;
patterns::SqueezeExcitationFusePattern pattern(gpd.mutable_pattern(),
name_scope_,
op_type,
act_type,
with_branch,
with_bias);
int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(4) << "handle SqueezeExcitationFusePass";
/* declare operator node's name */
GET_IR_NODE(pool2d);
GET_IR_NODE(mul_1);
GET_IR_NODE(mul_2);
GET_IR_NODE(ew_mul);
GET_IR_NODE(ew_branch_add);
GET_IR_NODE(block_act);
/* declare variable node's name */
GET_IR_NODE(x);
GET_IR_NODE(mul_1_w);
GET_IR_NODE(mul_1_w_max);
GET_IR_NODE(mul_1_bias);
GET_IR_NODE(mul_1_out);
GET_IR_NODE(mul_2_w);
GET_IR_NODE(mul_2_w_max);
GET_IR_NODE(mul_2_bias);
GET_IR_NODE(mul_2_out);
GET_IR_NODE(ew_mul_out);
GET_IR_NODE(ew_branch_add_in);
GET_IR_NODE(ew_branch_add_out);
GET_IR_NODE(block_act_out);
auto* block = pool2d->Op()->Block();
auto* scope = param_scope();
PADDLE_ENFORCE_NOT_NULL(
scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
framework::OpDesc fused_op_desc(block);
fused_op_desc.SetType("squeeze_excitation_block");
fused_op_desc.SetInput("x", {x->Name()});
if (with_branch) {
fused_op_desc.SetInput("branch", {ew_branch_add_in->Name()});
}
// filter
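// The two 1x1 conv weights are transposed and packed back to back into one
// persistable filter tensor consumed by the fused op.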
auto mul_1_w_name = mul_1_w->Name();
auto* mul_1_w_t =
scope->FindVar(mul_1_w_name)->GetMutable<phi::DenseTensor>();
auto mul_1_w_dims = mul_1_w_t->dims();
auto mul_1_w_len = mul_1_w_t->numel();
int16_t* mul_1_w_ptr = mul_1_w_t->data<int16_t>();
auto* mul_2_w_t =
scope->FindVar(mul_2_w->Name())->GetMutable<phi::DenseTensor>();
auto mul_2_w_dims = mul_2_w_t->dims();
auto mul_2_w_len = mul_2_w_t->numel();
int16_t* mul_2_w_ptr = mul_2_w_t->data<int16_t>();
if (mul_1_w_dims[0] != mul_2_w_dims[1] ||
mul_1_w_dims[1] != mul_2_w_dims[0] ||
mul_1_w_len != mul_1_w_dims[0] * mul_1_w_dims[1]) {
LOG(FATAL) << "Error: Dims of excitation mul1 weight is: " << mul_1_w_dims
<< ", but get dims of excitation mul2 weight is: "
<< mul_2_w_dims;
}
std::vector<int16_t> encode_filter_int16;
encode_filter_int16.resize(mul_1_w_len + mul_2_w_len);
PADDLE_ENFORCE_EQ(mul_1_w_dims[1] % mul_1_w_dims[0] == 0,
1,
platform::errors::InvalidArgument(
"Reduction ratio of excitation is not an integer."
"Received mul_1_w_dims[1]: %d, mul_1_w_dims[0]: %d",
mul_1_w_dims[1],
mul_1_w_dims[0]));
fused_op_desc.SetAttr(
"filter_dims",
std::vector<int>{static_cast<int>(mul_1_w_dims[1] / mul_1_w_dims[0]),
static_cast<int>(mul_1_w_dims[1])});
paddle::framework::ir::math::Transpose(mul_1_w_ptr,
encode_filter_int16.data(),
mul_1_w_dims[0],
mul_1_w_dims[1]);
paddle::framework::ir::math::Transpose(
mul_2_w_ptr,
encode_filter_int16.data() + mul_1_w_len,
mul_2_w_dims[0],
mul_2_w_dims[1]);
std::string new_filter_name = "se_" + mul_1_w_name;
Node* new_filter_node = nullptr;
VarDesc dst_desc(new_filter_name);
dst_desc.SetPersistable(true);
dst_desc.SetShape({mul_1_w_len + mul_2_w_len});
dst_desc.SetDataType(framework::TransToProtoVarType(mul_1_w_t->dtype()));
new_filter_node = graph->CreateVarNode(&dst_desc);
auto* block_dst_desc = block->Var(new_filter_name);
block_dst_desc->SetPersistable(dst_desc.Persistable());
block_dst_desc->SetShape(dst_desc.GetShape());
block_dst_desc->SetDataType(dst_desc.GetDataType());
phi::DenseTensor new_filter_t;
new_filter_t.Resize(DDim({mul_1_w_len + mul_2_w_len}));
new_filter_t.set_type(phi::DataType::INT16);
auto* cpu_ctx = static_cast<phi::CPUContext*>(
platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
auto* new_filter_data = cpu_ctx->Alloc<int16_t>(&new_filter_t);
memcpy(new_filter_data,
encode_filter_int16.data(),
(mul_1_w_len + mul_2_w_len) * sizeof(int16_t));
Assign(new_filter_t,
scope->Var(new_filter_name)->GetMutable<phi::DenseTensor>());
fused_op_desc.SetInput("filter", {new_filter_name});
// filter max
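// The per-weight max buffers (max_ptr_size floats each) are concatenated in
// the same order as the packed filter.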
std::vector<float> encode_filter_max;
int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1);
int filter_max_size = max_ptr_size + max_ptr_size;
encode_filter_max.resize(filter_max_size);
auto mul_1_w_max_name = mul_1_w_max->Name();
auto mul_2_w_max_name = mul_2_w_max->Name();
auto* mul_1_w_max_t =
scope->FindVar(mul_1_w_max_name)->GetMutable<phi::DenseTensor>();
auto* mul_2_w_max_t =
scope->FindVar(mul_2_w_max_name)->GetMutable<phi::DenseTensor>();
float* mul_1_w_max_ptr = mul_1_w_max_t->data<float>();
float* mul_2_w_max_ptr = mul_2_w_max_t->data<float>();
memcpy(encode_filter_max.data(),
mul_1_w_max_ptr,
max_ptr_size * sizeof(float));
memcpy(encode_filter_max.data() + max_ptr_size,
mul_2_w_max_ptr,
max_ptr_size * sizeof(float));
std::string new_filter_max_name = new_filter_name + "_max";
Node* new_filter_max_node = nullptr;
VarDesc filter_max_desc(new_filter_max_name);
filter_max_desc.SetPersistable(true);
filter_max_desc.SetShape({filter_max_size});
filter_max_desc.SetDataType(
framework::TransToProtoVarType(mul_1_w_max_t->dtype()));
new_filter_max_node = graph->CreateVarNode(&filter_max_desc);
auto* block_filter_max_desc = block->Var(new_filter_max_name);
block_filter_max_desc->SetPersistable(filter_max_desc.Persistable());
block_filter_max_desc->SetShape(filter_max_desc.GetShape());
block_filter_max_desc->SetDataType(filter_max_desc.GetDataType());
phi::DenseTensor new_filter_max_t;
new_filter_max_t.Resize(DDim({filter_max_size}));
new_filter_max_t.set_type(phi::DataType::FLOAT32);
auto* new_filter_max_data = cpu_ctx->Alloc<float>(&new_filter_max_t);
memcpy(new_filter_max_data,
encode_filter_max.data(),
(filter_max_size) * sizeof(float));
Assign(new_filter_max_t,
scope->Var(new_filter_max_name)->GetMutable<phi::DenseTensor>());
fused_op_desc.SetInput("filter_max", {new_filter_max_name});
// bias
std::string new_bias_name = new_filter_name + "_bias";
VarDesc new_bias_desc(new_bias_name);
new_bias_desc.SetPersistable(true);
new_bias_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32);
Node* new_bias_node = graph->CreateVarNode(&new_bias_desc);
if (with_bias) {
auto mul_1_bias_name = mul_1_bias->Name();
auto mul_2_bias_name = mul_2_bias->Name();
auto* mul_1_bias_t =
scope->FindVar(mul_1_bias_name)->GetMutable<phi::DenseTensor>();
auto* mul_2_bias_t =
scope->FindVar(mul_2_bias_name)->GetMutable<phi::DenseTensor>();
int mul_1_bias_numel = mul_1_bias_t->numel();
int mul_2_bias_numel = mul_2_bias_t->numel();
std::vector<float> encode_bias;
encode_bias.resize(mul_1_bias_numel + mul_2_bias_numel);
float* mul_1_bias_ptr = mul_1_bias_t->data<float>();
float* mul_2_bias_ptr = mul_2_bias_t->data<float>();
memcpy(
encode_bias.data(), mul_1_bias_ptr, mul_1_bias_numel * sizeof(float));
memcpy(encode_bias.data() + mul_1_bias_numel,
mul_2_bias_ptr,
mul_2_bias_numel * sizeof(float));
new_bias_desc.SetShape({mul_1_bias_numel + mul_2_bias_numel});
auto* block_new_bias_dst_desc = block->Var(new_bias_name);
block_new_bias_dst_desc->SetPersistable(new_bias_desc.Persistable());
block_new_bias_dst_desc->SetShape(new_bias_desc.GetShape());
block_new_bias_dst_desc->SetDataType(new_bias_desc.GetDataType());
phi::DenseTensor new_bias_t;
new_bias_t.Resize(DDim({mul_1_bias_numel + mul_2_bias_numel}));
new_bias_t.set_type(phi::DataType::FLOAT32);
auto* cpu_ctx = static_cast<phi::CPUContext*>(
platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
auto* new_bias_data = cpu_ctx->Alloc<float>(&new_bias_t);
memcpy(new_bias_data,
encode_bias.data(),
(mul_1_bias_numel + mul_2_bias_numel) * sizeof(float));
Assign(new_bias_t,
scope->Var(new_bias_name)->GetMutable<phi::DenseTensor>());
fused_op_desc.SetInput("bias", {new_bias_name});
}
fused_op_desc.SetAttr("has_bias", with_bias);
fused_op_desc.SetAttr("has_branch", with_branch);
std::string output_name;
if (act_type != "linear") {
output_name = block_act_out->Name();
} else if (with_branch) {
output_name = ew_branch_add_out->Name();
} else {
output_name = ew_mul_out->Name();
}
fused_op_desc.SetOutput("out", {output_name});
std::string max_output_name = output_name + "_max";
VarDesc max_out_desc(max_output_name);
auto* max_output_node = graph->CreateVarNode(&max_out_desc);
fused_op_desc.SetOutput("out_max", {max_output_name});
fused_op_desc.SetAttr("op_type", std::vector<int>{4});
fused_op_desc.SetAttr("place_x", std::vector<int>{0});
fused_op_desc.SetAttr("place_y", std::vector<int>{9});
fused_op_desc.SetAttr("place_z", std::vector<int>{10});
fused_op_desc.SetAttr("strides", std::vector<int>{});
fused_op_desc.SetAttr("paddings", std::vector<int>{});
fused_op_desc.SetAttr("dilations", std::vector<int>{});
fused_op_desc.SetAttr("groups", std::vector<int>{});
fused_op_desc.SetAttr("block_lod", std::vector<int>{1});
fused_op_desc.SetAttr("conv_bias", std::vector<int>{with_bias});
std::map<std::string, int> act_map{{"linear", 0},
{"relu", 1},
{"sigmoid", 2},
{"tanh", 3},
{"leaky_relu", 5},
{"hard_swish", 14},
{"hard_sigmoid", 15},
{"relu6", 17}};
float block_act_param_ = 0.f;
if (act_type == "leak_relu") {
block_act_param_ =
PADDLE_GET_CONST(float, block_act->Op()->GetAttr("alpha"));
} else if (act_type == "hard_sigmoid") {
block_act_param_ =
PADDLE_GET_CONST(float, block_act->Op()->GetAttr("slope"));
}
fused_op_desc.SetAttr(
"act_type",
std::vector<int>{
PADDLE_GET_CONST(int, mul_1->Op()->GetAttr("act_type")),
PADDLE_GET_CONST(int, mul_2->Op()->GetAttr("act_type")),
act_map[act_type]});
fused_op_desc.SetAttr(
"act_param",
std::vector<float>{
PADDLE_GET_CONST(float, mul_1->Op()->GetAttr("act_param")),
PADDLE_GET_CONST(float, mul_2->Op()->GetAttr("act_param")),
block_act_param_});
auto* new_op_node = graph->CreateOpNode(&fused_op_desc);
IR_NODE_LINK_TO(x, new_op_node);
if (with_branch) {
IR_NODE_LINK_TO(ew_branch_add_in, new_op_node);
}
IR_NODE_LINK_TO(new_filter_node, new_op_node);
IR_NODE_LINK_TO(new_filter_max_node, new_op_node);
if (with_bias) {
IR_NODE_LINK_TO(new_bias_node, new_op_node);
}
if (act_type != "linear") {
IR_NODE_LINK_TO(new_op_node, block_act_out);
} else if (with_branch) {
IR_NODE_LINK_TO(new_op_node, ew_branch_add_out);
} else {
IR_NODE_LINK_TO(new_op_node, ew_mul_out);
}
IR_NODE_LINK_TO(new_op_node, max_output_node);
// delete useless node
std::unordered_set<const Node*> delete_nodes = {
pool2d, mul_1, mul_1_out, mul_2, mul_2_out, ew_mul};
if (with_bias) {
delete_nodes.insert(mul_1_bias);
delete_nodes.insert(mul_2_bias);
}
if (with_branch) {
delete_nodes.insert(ew_branch_add);
}
GraphSafeRemoveNodes(graph, delete_nodes);
found_subgraph_count++;
};
gpd(graph, handler);
return found_subgraph_count;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(squeeze_excitation_fuse_pass,
paddle::framework::ir::SqueezeExcitationFusePass);
REGISTER_PASS_CAPABILITY(squeeze_excitation_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination().EQ(
"squeeze_excitation_block", 0));
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace ir {
/*
Squeeze and Excitation Block Fusion for SE-ResNet
Origin subgraph
Input
| \
| \
| \
| |
| Global Pooling
| |
| conv2d_xpu
| |
| |
| conv2d_xpu
\ |
\ |
elementwise_mul
|
Output
------------------------------------------------------
After the pass is applied:
in_Input
in_Filter | in_FilterMax
\ | /
\ | /
in_Branch ------- squeeze_excitation_block ------ in_Bias
|
|
|
out_Output
*/
class SqueezeExcitationFusePass : public FusePassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
private:
int ApplyImpl(ir::Graph* graph,
const std::string& op_type,
const std::string& act_type,
bool with_branch,
bool with_bias) const;
const std::string name_scope_{"squeeze_excitation_fuse_pass"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
TEST(SqueezeExcitationFusePass, V1) {
Layers layers;
auto* block = layers.Block();
auto* pool2d_inp = layers.data("pool2d_inp", {1, 24, 14, 14});
auto* pool2d_out = layers.pool2d(pool2d_inp, false);
auto* conv2d_xpu_op1_out = layers.data("conv2d_xpu_op1_out");
OpDesc* conv2d_xpu_op1 = block->AppendOp();
conv2d_xpu_op1->SetType("conv2d_xpu");
conv2d_xpu_op1->SetInput("x", {pool2d_out->Name()});
conv2d_xpu_op1->SetOutput("out", {conv2d_xpu_op1_out->Name()});
auto* conv2d_xpu_op2_out = layers.data("conv2d_xpu_op2_out");
OpDesc* conv2d_xpu_op2 = block->AppendOp();
conv2d_xpu_op2->SetType("conv2d_xpu");
conv2d_xpu_op2->SetInput("x", {conv2d_xpu_op1_out->Name()});
conv2d_xpu_op2->SetOutput("out", {conv2d_xpu_op2_out->Name()});
layers.elementwise_mul(pool2d_inp, conv2d_xpu_op2_out);
std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
auto pass = PassRegistry::Instance().Get("squeeze_excitation_fuse_pass");
pass->Apply(graph.get());
auto num = GetNumOpNodes(graph, "pool2d") +
GetNumOpNodes(graph, "conv2d_xpu") +
GetNumOpNodes(graph, "elementwise_mul");
PADDLE_ENFORCE_EQ(num,
0,
platform::errors::PreconditionNotMet(
"pool2d/conv2d_xpu/elementwise_mul ops should be "
"removed from graph, but graph "
"still has %d ops. ",
num));
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(squeeze_excitation_fuse_pass);
@@ -547,6 +547,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
"fc_xpu_fuse_pass",
"conv2d_xpu_fuse_pass",
"conv2d_transpose_xpu_fuse_pass",
"squeeze_excitation_fuse_pass",
"add_activation_xpu_fuse_pass",
"add_layernorm_xpu_fuse_pass",
"fast_layernorm_xpu_fuse_pass",
......
@@ -208,6 +208,16 @@
data_type : input
optional : bias_qk
- op : squeeze_excitation_block
args : (Tensor x, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, int[] act_type, float[] act_param, int[] filter_dims)
output : Tensor(out)
infer_meta :
func : SqueezeExcitationInferMeta
kernel :
func : squeeze_excitation_block
data_type : x
optional : bias, branch
- op : yolo_box_xpu
args : (Tensor x, Tensor x_max, Tensor grid, Tensor stride, Tensor anchor_grid, float offset)
output : Tensor(out), Tensor(out_max)
......
@@ -1005,6 +1005,7 @@ XPUOpMap& get_kl2_ops() {
{"sequence_conv_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"sequence_unpad", XPUKernelSet({phi::DataType::FLOAT32})},
// Fused op
{"squeeze_excitation_block", XPUKernelSet({phi::DataType::FLOAT32})},
{"resnet_basic_block_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"resnet_basic_block", XPUKernelSet({phi::DataType::FLOAT32})},
{"fused_gemm_epilogue",
......
@@ -964,4 +964,29 @@ void FusedScaleBiasReluConvBnstatsInferMeta(
eq_bias->set_dims(c_dims);
}
void SqueezeExcitationInferMeta(const MetaTensor& x,
const MetaTensor& filter,
const MetaTensor& filter_max,
const MetaTensor& bias,
const MetaTensor& branch,
const std::vector<int>& act_type,
const std::vector<float>& act_param,
const std::vector<int>& filter_dims,
MetaTensor* out) {
auto in_dims = x.dims();
// do some checks
PADDLE_ENFORCE_EQ(
in_dims.size(),
4,
phi::errors::InvalidArgument(
"The input should be a 4-D Tensor. But "
"received: input's dimension is %u, input's shape is [%s].",
in_dims.size(),
in_dims));
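// Output keeps N, H and W from the input; the channel dim comes from
// filter_dims[1] (the expanded channel count).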
std::vector<int64_t> out_shape(
{in_dims[0], filter_dims[1], in_dims[2], in_dims[3]});
// set output dims
out->set_dims(DDim(out_shape.data(), out_shape.size()));
}
} // namespace phi
@@ -234,4 +234,14 @@ void FusedScaleBiasReluConvBnstatsInferMeta(
MetaTensor* eq_scale,
MetaTensor* eq_bias);
void SqueezeExcitationInferMeta(const MetaTensor& x,
const MetaTensor& filter,
const MetaTensor& filter_max,
const MetaTensor& bias,
const MetaTensor& branch,
const std::vector<int>& act_type,
const std::vector<float>& act_param,
const std::vector<int>& filter_dims,
MetaTensor* out);
} // namespace phi
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "glog/logging.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
namespace fusion {
template <typename T, typename TW, typename Context>
void SqueezeExcitationKernelImpl(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& filter_max,
const paddle::optional<DenseTensor>& bias,
const paddle::optional<DenseTensor>& branch,
const std::vector<int>& act_type,
const std::vector<float>& act_param,
const std::vector<int>& filter_dims,
DenseTensor* out) {
using XPUTypeX = typename XPUTypeTrait<T>::Type;
using XPUTypeW = typename XPUTypeTrait<TW>::Type;
auto* weight1_ptr = filter.data<TW>();
auto weight_len = filter.numel();
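// The fuse pass packs the two 1x1 conv weights back to back; their numel is
// equal, so the midpoint split below recovers both.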
auto weight1_len = weight_len / 2;
auto* weight2_ptr = weight1_ptr + weight1_len;
auto input_dims = x.dims();
int batch = static_cast<int>(input_dims[0]);
int channel = static_cast<int>(input_dims[1]);
int h = static_cast<int>(input_dims[2]);
int w = static_cast<int>(input_dims[3]);
auto* input_data = reinterpret_cast<const XPUTypeX*>(x.data<T>());
const XPUTypeX* branch_data = nullptr;
auto* branch_tensor = branch.get_ptr();
if (branch_tensor != nullptr) {
branch_data = reinterpret_cast<const XPUTypeX*>(branch_tensor->data<T>());
}
const float* bias1_ptr =
bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data<float>();
const float* bias2_ptr = (bias1_ptr != nullptr)
? (bias1_ptr + filter_dims[1] / filter_dims[0])
: nullptr;
int max_ptr_size = 6;
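// Assumed fixed stride of the weight-max buffers; the fuse pass sizes them
// with get_xpu_max_ptr_size, so this relies on that value being 6 here.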
const float* w1_maxptr = filter_max.data<float>();
const float* w2_maxptr = w1_maxptr + max_ptr_size;
auto* out_data =
reinterpret_cast<XPUTypeX*>(ctx.template Alloc<XPUTypeX>(out));
std::vector<xpu::Activation_t> act;
for (size_t i = 0; i < 3; i++) {
xpu::Activation_t cur_act = (xpu::Activation_t::act_enum)act_type[i];
if (act_type[i] == 5) {
cur_act.leaky_alpha = act_param[i];
} else if (act_type[i] == 15) {
cur_act.hard_sigmoid_slope = act_param[i];
}
act.push_back(cur_act);
}
int r = xpu::squeeze_excitation_block<T, int16_t, int16_t>(
/* baidu::xpu::api::Context* ctx */ ctx.x_context(),
/* const T* x */ input_data,
/* const TW* weight1 */ reinterpret_cast<const XPUTypeW*>(weight1_ptr),
/* const TW* weight2 */ reinterpret_cast<const XPUTypeW*>(weight2_ptr),
/* T* y */ out_data,
/* int64_t n */ batch,
/* int64_t c */ channel,
/* int64_t h */ h,
/* int64_t w */ w,
/* int64_t r */ filter_dims[0],
/* const float* w1_maxptr */ reinterpret_cast<const float*>(w1_maxptr),
/* const float* w2_maxptr */ reinterpret_cast<const float*>(w2_maxptr),
/* const float* bias1 */ bias1_ptr,
/* const float* bias2 */ bias2_ptr,
/* const T* branch */ branch_data,
/* const Activation_t& excitation_act1 */ act[0],
/* const Activation_t& excitation_act2 */ act[1],
/* const Activation_t& block_act */ act[2]);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "squeeze_excitation_block");
}
template <typename T, typename Context>
void SqueezeExcitationKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& filter,
const DenseTensor& filter_max,
const paddle::optional<DenseTensor>& bias,
const paddle::optional<DenseTensor>& branch,
const std::vector<int>& act_type,
const std::vector<float>& act_param,
const std::vector<int>& filter_dims,
DenseTensor* out) {
SqueezeExcitationKernelImpl<T, int16_t, Context>(ctx,
x,
filter,
filter_max,
bias,
branch,
act_type,
act_param,
filter_dims,
out);
}
} // namespace fusion
} // namespace phi
PD_REGISTER_KERNEL(squeeze_excitation_block,
XPU,
ALL_LAYOUT,
phi::fusion::SqueezeExcitationKernel,
float) {}
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from functools import partial
import hypothesis.strategies as st
import numpy as np
from auto_scan_test import PassAutoScanTest
from program_config import OpConfig, ProgramConfig, TensorConfig
class TestSqueezeExcitationFusePass(PassAutoScanTest):
def sample_predictor_configs(self, program_config):
config = self.create_inference_config(use_xpu=True)
yield config, ["squeeze_excitation_block"], (1e-3, 1e-3)
def sample_program_config(self, draw):
def generate_data(shape):
return np.random.random(shape).astype(np.float32)
x_shape = draw(
st.lists(
st.integers(min_value=1, max_value=12), min_size=4, max_size=4
)
)
x_shape[1] = 24
oc = 6
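# 24 -> 6 -> 24 channels gives an integer reduction ratio of 4, which
# satisfies the divisibility check the pass applies to the 1x1 conv weights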
conv2d_op1_w_shape = [oc, x_shape[1], 1, 1]
conv2d_op1_b_shape = [oc]
conv2d_op2_w_shape = [x_shape[1], oc, 1, 1]
conv2d_op2_b_shape = [x_shape[1]]
# Randomly choose whether to add a relu operator
has_relu = draw(st.sampled_from([True, False]))
pool2d_op = OpConfig(
type="pool2d",
inputs={"X": ["pool2d_x"]},
outputs={"Out": ["pool2d_out"]},
adaptive=True,
data_format="NCHW",
global_pooling=False,
ksize=[1, 1],
pooling_type="avg",
)
ops = [pool2d_op]
conv2d_op = OpConfig(
"conv2d",
inputs={
"Input": ["pool2d_out"],
"Filter": ["conv2d_weight"],
},
outputs={"Output": ["conv2d_out"]},
data_format="NCHW",
dilations=[1, 1],
padding_algorithm="EXPLICIT",
groups=1,
paddings=[0, 0, 0, 0],
strides=[1, 1],
has_bias=False,
)
ew_bias_op = OpConfig(
"elementwise_add",
inputs={"X": ["conv2d_out"], "Y": ["ew_bias"]},
outputs={"Out": ["add_out"]},
axis=1,
)
ops.extend([conv2d_op, ew_bias_op])
conv2d_input = "add_out"
# optional activation after the first conv
if has_relu:
relu_op = OpConfig(
"relu", inputs={"X": ["add_out"]}, outputs={"Out": ["relu_out"]}
)
conv2d_input = "relu_out"
ops.append(relu_op)
conv2d_op2 = OpConfig(
"conv2d",
inputs={
"Input": [conv2d_input],
"Filter": ["conv2d_weight2"],
},
outputs={"Output": ["conv2d_out2"]},
data_format="NCHW",
dilations=[1, 1],
padding_algorithm="EXPLICIT",
groups=1,
paddings=[0, 0, 0, 0],
strides=[1, 1],
has_bias=False,
)
ew_bias_op2 = OpConfig(
"elementwise_add",
inputs={"X": ["conv2d_out2"], "Y": ["ew_bias2"]},
outputs={"Out": ["add_out2"]},
axis=1,
)
ops.extend([conv2d_op2, ew_bias_op2])
ele_mul_input = "add_out2"
# optional activation after the second conv
if has_relu:
relu_op2 = OpConfig(
"relu",
inputs={"X": ["add_out2"]},
outputs={"Out": ["relu_out2"]},
)
ele_mul_input = "relu_out2"
ops.append(relu_op2)
ew_mul_op = OpConfig(
"elementwise_mul",
inputs={"X": ["pool2d_x"], "Y": [ele_mul_input]},
outputs={"Out": ["ew_mul_out"]},
axis=-1,
)
ops.append(ew_mul_op)
program_config = ProgramConfig(
ops=ops,
weights={
"conv2d_weight": TensorConfig(
data_gen=partial(generate_data, conv2d_op1_w_shape)
),
"ew_bias": TensorConfig(shape=conv2d_op1_b_shape),
"conv2d_weight2": TensorConfig(
data_gen=partial(generate_data, conv2d_op2_w_shape)
),
"ew_bias2": TensorConfig(shape=conv2d_op2_b_shape),
},
inputs={
"pool2d_x": TensorConfig(shape=x_shape),
},
outputs=ops[-1].outputs["Out"],
)
return program_config
def test(self):
self.run_and_statis(
quant=False,
max_examples=25,
passes=["squeeze_excitation_fuse_pass"],
)
if __name__ == "__main__":
unittest.main()