Unverified commit 945f918c authored by zhupengyang, committed by GitHub

[XPU] add fc_xpu op&pass to optimize ernie model (#50277)

Parent 62fe3cf5
......@@ -210,6 +210,14 @@ if(WITH_IPU)
pass_library(inference_dtype_transfer_pass base DIR ipu)
endif()
if(WITH_XPU)
cc_library(
quant_utils
SRCS xpu/quant_utils.cc
DEPS pass)
pass_library(fc_xpu_fuse_pass inference DIR xpu DEPS quant_utils)
endif()
cc_library(
fuse_bn_act_pass
SRCS fuse_bn_act_pass.cc
......
......@@ -96,7 +96,6 @@ void AdaptivePool2dConvertGlobalPass::ApplyImpl(ir::Graph* graph) const {
}
}
}
// LOG(INFO) << "--- processed " << num << " nodes";
AddStatis(num);
}
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <map>
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace ir {
namespace patterns {
struct FcXPUPattern : public PatternBase {
FcXPUPattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& mul_type,
bool with_bias,
const std::string& act_type);
// declare operator node's name
PATTERN_DECL_NODE(mul);
PATTERN_DECL_NODE(add);
PATTERN_DECL_NODE(act);
// declare variable node's name
PATTERN_DECL_NODE(mul_x);
PATTERN_DECL_NODE(mul_w);
PATTERN_DECL_NODE(mul_out);
PATTERN_DECL_NODE(bias);
PATTERN_DECL_NODE(add_out);
PATTERN_DECL_NODE(act_out);
private:
std::string mul_type_;
bool with_bias_{false};
std::string act_type_;
};
FcXPUPattern::FcXPUPattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& mul_type,
bool with_bias,
const std::string& act_type)
: PatternBase(pattern, name_scope, name_scope),
mul_type_(mul_type),
with_bias_(with_bias),
act_type_(act_type) {
auto* mul_x = pattern->NewNode(mul_x_repr())
->assert_is_op_input(mul_type_, "X")
->assert_var_not_persistable();
auto* mul_w = pattern->NewNode(mul_w_repr())
->assert_is_op_input(mul_type_, "Y")
->assert_is_persistable_var()
->assert_more([](Node* node) {
return node->Var()->GetShape().size() == 2;
});
auto* mul =
pattern->NewNode(mul_repr())
->assert_is_op(mul_type_)
->assert_more([](Node* node) {
auto op_type = node->Op()->Type();
if (op_type == "matmul") {
return !PADDLE_GET_CONST(bool,
node->Op()->GetAttr("transpose_X"));
} else if (op_type == "matmul_v2") {
return !PADDLE_GET_CONST(bool, node->Op()->GetAttr("trans_x"));
} else {
return true;
}
});
auto* mul_out = pattern->NewNode(mul_out_repr())
->assert_is_op_output(mul_type_, "Out")
->assert_var_not_persistable();
mul->LinksFrom({mul_x, mul_w}).LinksTo({mul_out});
PDNode* bias = nullptr;
PDNode* add = nullptr;
PDNode* add_out = nullptr;
PDNode* act = nullptr;
PDNode* act_out = nullptr;
if (with_bias_) {
mul_out->assert_is_op_input("elementwise_add", "X");
bias = pattern->NewNode(bias_repr())
->assert_is_op_input("elementwise_add", "Y")
->assert_is_persistable_var();
add = pattern->NewNode(add_repr())->assert_is_op("elementwise_add");
add_out = pattern->NewNode(add_out_repr())
->assert_is_op_output("elementwise_add", "Out")
->assert_var_not_persistable();
add->LinksFrom({mul_out, bias}).LinksTo({add_out});
} else {
add_out = mul_out;
}
if (!act_type_.empty()) {
add_out->assert_is_op_input(act_type_, "X");
act = pattern->NewNode(act_repr())->assert_is_op(act_type_);
act_out = pattern->NewNode(act_out_repr())
->assert_is_op_output(act_type_, "Out")
->assert_var_not_persistable();
act->LinksFrom({add_out}).LinksTo({act_out});
}
}
} // namespace patterns
/*
1. fuse mul/matmul/matmul_v2 + add + act into fc_xpu
2. add is optional
3. act is optional
Origin subgraph:
    mul_x  mul_w
       \    /
        \  /
         mul
          |
          |
      mul_out  bias
          \     /
           \   /
      elementwise_add
            |
            |
   elementwise_add_out
            |
            |
           act
            |
            |
         act_out

Fused subgraph:
   mul_x  mul_w  bias  mul_w_max
      \     |     /       |
       \    |    /        |
        \   |   /         |
         fc_xpu------------
            |
            |
         act_out
*/
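// Example (hypothetical shapes): matmul_v2(x[2, 3, 128], w[128, 256]) +
// elementwise_add(bias[256]) + relu is rewritten into a single fc_xpu op with
// in_num_col_dims = 2, the int16-quantized weight in w, and its replicated
// absolute max in w_max.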
class FcXPUFusePass : public FusePassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
private:
void ApplyImpl(ir::Graph* graph,
const std::string& mul_type,
bool with_bias,
const std::string& act_type) const;
const std::string name_scope_{"fc_xpu_fuse_pass"};
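// Activation name -> XPU activation id. FcXPUKernel casts this id straight to
// xpu::Activation_t::act_enum, so the ids must stay in sync with that enum.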
const std::map<std::string, int> act_map_{{"", 0},
{"relu", 1},
{"sigmoid", 2},
{"tanh", 3},
{"gelu", 4},
{"leaky_relu", 5},
{"hard_swish", 14},
{"hard_sigmoid", 15},
{"relu6", 17}};
};
void FcXPUFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::PreconditionNotMet("graph should not be null."));
Init(name_scope_, graph);
for (auto mul_type : {"mul", "matmul", "matmul_v2"}) {
for (auto with_bias : {true, false}) {
for (auto act_type : {
"relu",
"gelu",
"",
}) {
ApplyImpl(graph, mul_type, with_bias, act_type);
}
}
}
}
void FcXPUFusePass::ApplyImpl(ir::Graph* graph,
const std::string& mul_type,
bool with_bias,
const std::string& act_type) const {
GraphPatternDetector gpd;
patterns::FcXPUPattern pattern(
gpd.mutable_pattern(), name_scope_, mul_type, with_bias, act_type);
int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(4) << "handle FcXPUFusePass fuse";
GET_IR_NODE(mul_x);
GET_IR_NODE(mul_w);
GET_IR_NODE(mul);
GET_IR_NODE(mul_out);
GET_IR_NODE(bias);
GET_IR_NODE(add);
GET_IR_NODE(add_out);
GET_IR_NODE(act);
GET_IR_NODE(act_out);
auto* block = mul->Op()->Block();
auto* scope = param_scope();
auto mul_w_name = mul_w->Name();
auto mul_w_tensor =
scope->FindVar(mul_w_name)->GetMutable<phi::DenseTensor>();
// 1. Transform weight to int16/int31
// 2. Avoid transforming repeatedly, because the weight may be shared with other ops.
// TODO(zhupengyang): support int31
std::string mul_w_max_name = mul_w_name + "_max";
Node* mul_w_max = nullptr;
if (mul_w_tensor->dtype() != phi::DataType::INT16) {
// Create weight_max node
VarDesc mul_w_max_desc(mul_w_max_name);
mul_w_max_desc.SetPersistable(true);
mul_w_max = graph->CreateVarNode(&mul_w_max_desc);
// Create weight_max var/tensor
auto mul_w_max_var = block->Var(mul_w_max_name);
mul_w_max_var->SetPersistable(true);
auto mul_w_max_tensor =
scope->Var(mul_w_max_name)->GetMutable<phi::DenseTensor>();
auto* xpu_ctx = static_cast<phi::XPUContext*>(
platform::DeviceContextPool::Instance().Get(phi::XPUPlace()));
int max_ptr_size = xpu_ctx->x_context()->max_ptr_size();
bool transpose_w = false;
if (mul_type == "matmul") {
transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("transpose_Y"));
} else if (mul_type == "matmul_v2") {
transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("trans_y"));
}
QuantWeight<int16_t>(
mul_w_tensor, mul_w_max_tensor, !transpose_w, max_ptr_size);
}
// Generate fc_xpu op
framework::OpDesc fc_xpu_op_desc(block);
fc_xpu_op_desc.SetType("fc_xpu");
fc_xpu_op_desc.SetInput("x", {mul_x->Name()});
fc_xpu_op_desc.SetInput("w", {mul_w->Name()});
fc_xpu_op_desc.SetInput("w_max", {mul_w_max_name});
if (bias) {
fc_xpu_op_desc.SetInput("bias", {bias->Name()});
}
fc_xpu_op_desc.SetAttr(
"in_num_col_dims",
static_cast<int>(mul_x->Var()->GetShape().size() - 1));
if (mul_type == "mul") {
fc_xpu_op_desc.SetAttr(
"in_num_col_dims",
PADDLE_GET_CONST(int, mul->Op()->GetAttr("in_num_col_dims")));
}
fc_xpu_op_desc.SetAttr("transpose_x", false);
fc_xpu_op_desc.SetAttr("alpha", 1.f);
fc_xpu_op_desc.SetAttr("beta", 0.f);
if (mul_type == "matmul") {
fc_xpu_op_desc.SetAttr(
"alpha", PADDLE_GET_CONST(float, mul->Op()->GetAttr("alpha")));
fc_xpu_op_desc.SetAttr(
"beta", PADDLE_GET_CONST(float, mul->Op()->GetAttr("beta")));
}
fc_xpu_op_desc.SetAttr("act_type", 0);
fc_xpu_op_desc.SetAttr("act_alpha", 0.f);
if (act) {
fc_xpu_op_desc.SetAttr("act_type", act_map_.at(act_type));
if (act_type == "leaky_relu") {
fc_xpu_op_desc.SetAttr(
"act_alpha", PADDLE_GET_CONST(float, act->Op()->GetAttr("alpha")));
} else if (act_type == "hard_sigmoid") {
fc_xpu_op_desc.SetAttr(
"act_alpha", PADDLE_GET_CONST(float, act->Op()->GetAttr("slope")));
}
}
if (act_out) {
fc_xpu_op_desc.SetOutput("out", {act_out->Name()});
} else if (add_out) {
fc_xpu_op_desc.SetOutput("out", {add_out->Name()});
} else {
fc_xpu_op_desc.SetOutput("out", {mul_out->Name()});
}
auto* fc_xpu = graph->CreateOpNode(&fc_xpu_op_desc);
SAFE_IR_NODE_LINK_TO(mul_x, fc_xpu);
SAFE_IR_NODE_LINK_TO(mul_w, fc_xpu);
SAFE_IR_NODE_LINK_TO(mul_w_max, fc_xpu);
SAFE_IR_NODE_LINK_TO(bias, fc_xpu);
if (act_out) {
SAFE_IR_NODE_LINK_TO(fc_xpu, act_out);
} else if (add_out) {
SAFE_IR_NODE_LINK_TO(fc_xpu, add_out);
} else {
SAFE_IR_NODE_LINK_TO(fc_xpu, mul_out);
}
// delete useless node
std::unordered_set<const Node*> delete_nodes;
if (act != nullptr && add != nullptr) {
delete_nodes = {mul, mul_out, add, add_out, act};
} else if (act) {
delete_nodes = {mul, mul_out, act};
} else if (add) {
delete_nodes = {mul, mul_out, add};
}
GraphSafeRemoveNodes(graph, delete_nodes);
found_subgraph_count++;
};
gpd(graph, handler);
AddStatis(found_subgraph_count);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(fc_xpu_fuse_pass, paddle::framework::ir::FcXPUFusePass);
REGISTER_PASS_CAPABILITY(fc_xpu_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination().EQ(
"fc_xpu", 0));
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace paddle {
namespace framework {
namespace ir {
#define GET_IR_NODE(node_) SAFE_GET_IR_NODE_FROM_SUBGRAPH(node_, node_, pattern)
// Get an ir::Node* from the matched subgraph.
// var: variable.
// arg: the argument declared by PATTERN_DECL_NODE in a pattern definition.
// pat: the pattern object.
#define SAFE_GET_IR_NODE_FROM_SUBGRAPH(var, arg, pat) \
Node* var = nullptr; \
if (pat.arg##_n()) { \
PADDLE_ENFORCE_NE(subgraph.count(pat.arg##_n()), \
0UL, \
platform::errors::NotFound( \
"Node not found for PDNode %s", pat.arg##_repr())); \
var = subgraph.at(pat.arg##_n()); \
PADDLE_ENFORCE_NOT_NULL(var, \
platform::errors::NotFound( \
"node %s not exists in the sub-graph", #arg)); \
}
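// Link two IR nodes only when both exist; optional pattern nodes (e.g. bias,
// act in fc_xpu_fuse_pass) may be nullptr, in which case the link is skipped.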
#define SAFE_IR_NODE_LINK_TO(a, b) \
if (a != nullptr && b != nullptr) { \
IR_NODE_LINK_TO(a, b) \
}
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
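// Transpose a row-major h x w matrix `in` into the w x h matrix `out`.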
template <typename T>
static void Transpose(const T* in, T* out, int h, int w) {
for (int h1 = 0; h1 < w; ++h1) {
for (int w1 = 0; w1 < h; ++w1) {
out[h1 * h + w1] = in[w1 * w + h1];
}
}
}
static float FindMaxAbs(const float* data, int len) {
float max_f = 0.0f;
for (int i = 0; i < len; ++i) {
float max = std::abs(data[i]);
if (max > max_f) {
max_f = max;
}
}
return max_f;
}
static float IEEECompliance0(float f) {
uint32_t* ptr = reinterpret_cast<uint32_t*>(&f);
uint32_t sign = (*ptr) & 0x80000000;
uint32_t uf = 0;
// nan -> inf
if (std::isnan(f)) {
uf = (sign | 0x7F800000);
float* ptr = reinterpret_cast<float*>(&uf);
return *ptr;
} else if (std::isnormal(f) || (std::isinf(f)) || (f == 0)) {
return f;
} else {
// denormal -> +-0
uf = 0x0;
float* ptr = reinterpret_cast<float*>(&uf);
return *ptr;
}
}
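// Round to the nearest integer, with ties (x.5) going to the nearest even
// integer (banker's rounding), e.g. 2.5 -> 2 and 3.5 -> 4.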
static inline long RoundHalfToEven(const float src) { // NOLINT
long ret = llround(src); // NOLINT
if (fabs(fabs(round(src) - src) - 0.5) > 0) {
return ret;
} else {
if (abs(ret) % 2 == 0) {
return ret;
} else {
return ret + (ret > 0 ? -1 : 1);
}
}
}
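// Symmetric quantization: q = RoundHalfToEven(f * RMAX / max), clamped to
// [-RMAX, RMAX]; NaN/Inf/denormal inputs and maxima are normalized first.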
template <typename T, int RMAX>
static T Fp32ToIntx(const float f, float max) {
max = IEEECompliance0(max);
float input = IEEECompliance0(f);
// +0 and -0 -> +0
if (input == 0) {
input = 0.0f;
}
float tmp = RMAX / max;
if (std::isinf(tmp)) {
uint32_t* ptr = reinterpret_cast<uint32_t*>(&input);
if ((*ptr) >> 31 & 1) {
return T(-RMAX);
} else {
return T(RMAX);
}
}
tmp = input * tmp;
if (std::isnan(tmp)) {
return T(RMAX);
}
tmp = IEEECompliance0(tmp);
// Early check to avoid INF or a large value getting into the converter function.
if (tmp > RMAX) {
return T(RMAX);
}
if (tmp < -RMAX) {
return T(-RMAX);
}
T ret = (T)RoundHalfToEven(tmp);
if (ret > RMAX) {
ret = T(RMAX);
}
if (ret < -RMAX) {
ret = T(-RMAX);
}
return ret;
}
template <typename T>
static void QuantFP32ToIntX(const float* src_ptr,
T* dst_ptr,
float max_val,
int numel) {
LOG(FATAL) << "Not supported.";
}
template <>
void QuantFP32ToIntX<int16_t>(const float* src_ptr,
int16_t* dst_ptr,
float max_val,
int numel) {
for (int i = 0; i < numel; i++) {
dst_ptr[i] = Fp32ToIntx<int16_t, 32767>(src_ptr[i], max_val);
}
}
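// Quantize an fp32 weight tensor in place: optionally transpose the 2-D
// weight, find the absolute max over all elements, replicate that max
// max_ptr_size times into weight_max (max_ptr_size is queried from the XPU
// runtime by the caller), then convert every element to T.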
template <typename T>
void QuantWeight(phi::DenseTensor* weight,
phi::DenseTensor* weight_max,
bool transpose,
int max_ptr_size) {
// Transpose
auto* weight_data = weight->data<float>();
auto dims = weight->dims();
auto size = weight->numel();
std::vector<float> transpose_data(weight_data, weight_data + size);
if (transpose) {
PADDLE_ENFORCE_EQ(
dims.size(),
2,
platform::errors::InvalidArgument(
"Only support 2D weight, but received weight rank is [%d].",
dims.size()));
Transpose(weight_data, transpose_data.data(), dims[0], dims[1]);
weight->Resize({dims[1], dims[0]});
}
weight_data = transpose_data.data();
// Find max
float max_val = FindMaxAbs(weight_data, size);
std::vector<float> max_vec(max_ptr_size, max_val);
weight_max->set_type(paddle::experimental::CppTypeToDataType<float>::Type());
weight_max->Resize({max_ptr_size});
auto* dev_ctx = static_cast<phi::CPUContext*>(
platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
memcpy(dev_ctx->Alloc<float>(weight_max),
max_vec.data(),
max_ptr_size * sizeof(float));
// Quant
std::vector<T> quant_data(size);
QuantFP32ToIntX(weight_data, quant_data.data(), max_val, size);
weight->set_type(paddle::experimental::CppTypeToDataType<T>::Type());
memcpy(dev_ctx->Alloc<T>(weight), quant_data.data(), size * sizeof(T));
}
template void QuantWeight<int16_t>(phi::DenseTensor* weight,
phi::DenseTensor* weight_max,
bool transpose,
int max_ptr_size);
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace paddle {
namespace framework {
namespace ir {
// 1. Quantize weight from fp32 to int16/int31.
// 2. Weight data is updated in place.
// 3. Generate the weight max tensor.
template <typename T>
void QuantWeight(phi::DenseTensor* weight,
phi::DenseTensor* weight_max,
bool transpose,
int max_ptr_size);
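// Typical use (see fc_xpu_fuse_pass): QuantWeight<int16_t>(weight, weight_max,
// /*transpose=*/!transpose_w, max_ptr_size), with max_ptr_size taken from
// xpu_ctx->x_context()->max_ptr_size().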
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -513,6 +513,19 @@ void CpuPassStrategy::EraseFcMkldnnPasses() {
}
}
XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
passes_.assign({
"delete_dropout_op_pass",
// "multi_encoder_xpu_fuse_pass",
// "embedding_with_eltwise_add_xpu_fuse_pass",
"fc_xpu_fuse_pass",
// "multi_encoder_slice_link_xpu_fuse_pass",
// "generate_sequence_xpu_fuse_pass",
// "link_previous_out_max_xpu_pass",
});
use_xpu_ = true;
}
IpuPassStrategy::IpuPassStrategy() : PassStrategy({}) {
passes_.assign({"inference_process_pass"});
}
......
......@@ -290,7 +290,7 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
/// mode.
class PD_INFER_DECL XpuPassStrategy final : public PassStrategy {
public:
XpuPassStrategy() : PassStrategy({}) { use_xpu_ = true; }
XpuPassStrategy();
};
/// \class NpuPassStrategy
......
......@@ -11,6 +11,7 @@
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/binary.h"
#include "paddle/phi/infermeta/fusion.h"
#include "paddle/phi/infermeta/multiary.h"
#include "paddle/phi/infermeta/nullary.h"
#include "paddle/phi/infermeta/ternary.h"
......
- op : fc_xpu
args : (Tensor x, Tensor w, Tensor w_max, Tensor bias, int in_num_col_dims, bool transpose_x, float alpha, float beta, int act_type, float act_alpha)
output : Tensor
infer_meta :
func : FcXPUInferMeta
kernel :
func : fc_xpu
optional : bias
- op : share_buffer
args : (Tensor[] x, bool[] share_dims_and_dtype={})
output : Tensor[](out){x.size()}, Tensor[](xout){x.size()}
......
......@@ -93,6 +93,7 @@ XPUOpMap& get_kl1_ops() {
phi::DataType::BOOL,
phi::DataType::FLOAT16,
phi::DataType::FLOAT32})},
{"fc_xpu", XPUKernelSet({phi::DataType::FLOAT32})},
{"fill_any_like", XPUKernelSet({phi::DataType::INT64})},
{"fill_constant",
XPUKernelSet({phi::DataType::INT32,
......
......@@ -224,6 +224,7 @@ XPUOpMap& get_kl2_ops() {
phi::DataType::BOOL,
phi::DataType::FLOAT16,
phi::DataType::FLOAT32})},
{"fc_xpu", XPUKernelSet({phi::DataType::FLOAT32})},
{"fill",
XPUKernelSet({phi::DataType::INT64,
phi::DataType::INT32,
......
cc_library(
infermeta
SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc
SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc fusion.cc
DEPS convert_utils meta_tensor infermeta_utils)
cc_library(
backward_infermeta
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/infermeta/fusion.h"
#include <vector>
#include "paddle/phi/common/layout.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/core/meta_tensor.h"
namespace phi {
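// The output keeps the first in_num_col_dims dims of x and appends w.dims()[0].
// Example (hypothetical shapes): x = [2, 3, 128], w = [256, 128],
// in_num_col_dims = 2 -> out = [2, 3, 256].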
void FcXPUInferMeta(const MetaTensor& x,
const MetaTensor& w,
const MetaTensor& w_max,
const MetaTensor& bias,
int in_num_col_dims,
bool transpose_x,
float alpha,
float beta,
int act_type,
float act_alpha,
MetaTensor* out) {
std::vector<int> out_shape(in_num_col_dims + 1);
for (int i = 0; i < in_num_col_dims; i++) {
out_shape[i] = x.dims()[i];
}
out_shape[in_num_col_dims] = w.dims()[0];
out->set_dims(DDim(out_shape.data(), out_shape.size()));
out->set_dtype(x.dtype());
out->set_layout(x.layout());
}
} // namespace phi
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/meta_tensor.h"
namespace phi {
// Common InferMeta Functions for fusion operators.
// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
void FcXPUInferMeta(const MetaTensor& x,
const MetaTensor& w,
const MetaTensor& w_max,
const MetaTensor& bias,
int in_num_col_dims,
bool transpose_x,
float alpha,
float beta,
int act_type,
float act_alpha,
MetaTensor* out);
} // namespace phi
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
namespace fusion {
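// x is flattened to an m x k matrix using in_num_col_dims; n is w.dims()[0]
// because fc_xpu_fuse_pass leaves the weight in [n, k] layout (hence
// w_trans = true in the fc_fusion call below).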
template <typename T, typename Context>
void FcXPUKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& w,
const DenseTensor& w_max,
const paddle::optional<DenseTensor>& bias,
int in_num_col_dims,
bool transpose_x,
float alpha,
float beta,
int act_type,
float act_alpha,
DenseTensor* out) {
auto in_mat_dims = flatten_to_2d(x.dims(), in_num_col_dims);
int m = in_mat_dims[0];
int k = in_mat_dims[1];
int n = w.dims()[0];
const float* bias_data =
bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data<T>();
xpu::Activation_t act(static_cast<xpu::Activation_t::act_enum>(act_type));
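// act_type ids follow the act map in fc_xpu_fuse_pass: 5 is leaky_relu and
// 15 is hard_sigmoid; both carry an extra coefficient passed via act_alpha.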
if (act_type == 5) {
act.leaky_alpha = act_alpha;
} else if (act_type == 15) {
act.hard_sigmoid_slope = act_alpha;
}
ctx.template Alloc<T>(out);
int r = xpu::fc_fusion<T, int16_t, T, int16_t>( // TX, TW, TY, TGEMM
ctx.x_context(), // ctx
x.data<T>(), // x
w.data<int16_t>(), // w
out->data<T>(), // y
m, // m
n, // n
k, // k
transpose_x, // x_trans
true, // w_trans
nullptr, // x_maxptr
w_max.data<float>(), // w_maxptr
nullptr, // y_maxptr
transpose_x ? m : k, // ldx
k, // ldw
n, // ldy
alpha, // alpha
beta, // beta
bias_data, // bias
act);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_xpu");
}
} // namespace fusion
} // namespace phi
PD_REGISTER_KERNEL(fc_xpu, XPU, ALL_LAYOUT, phi::fusion::FcXPUKernel, float) {}
......@@ -93,6 +93,22 @@ if(WITH_MKLDNN)
endforeach()
endif()
file(
GLOB TEST_XPU_IR_PASSES
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"test_xpu_*.py")
string(REPLACE ".py" "" TEST_XPU_IR_PASSES "${TEST_XPU_IR_PASSES}")
foreach(TEST_XPU_IR_PASS ${TEST_XPU_IR_PASSES})
list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES ${TEST_XPU_IR_PASS})
endforeach()
if(WITH_XPU)
foreach(target ${TEST_XPU_IR_PASSES})
py_test_modules(${target} MODULES ${target})
set_tests_properties(${target} PROPERTIES LABELS "RUN_TYPE=INFER")
endforeach()
endif()
# below are cutlass unit tests
file(
GLOB TEST_CUTLASS
......
......@@ -223,6 +223,7 @@ class AutoScanTest(unittest.TestCase):
passes: Optional[List[str]] = None,
use_gpu: bool = False,
use_mkldnn: bool = False,
use_xpu: bool = False,
ir_optim: Optional[bool] = None,
):
config = paddle_infer.Config()
......@@ -235,6 +236,8 @@ class AutoScanTest(unittest.TestCase):
config.enable_use_gpu(100, 0)
if use_mkldnn:
config.enable_mkldnn()
if use_xpu:
config.enable_xpu()
if passes is not None:
config.pass_builder().set_passes(passes)
self.passes = passes
......@@ -571,6 +574,8 @@ class PassAutoScanTest(AutoScanTest):
dic['use_mkldnn'] = enable_mkldnn
enable_gpu = config.use_gpu()
dic['use_gpu'] = enable_gpu
enable_xpu = config.use_xpu()
dic['use_xpu'] = enable_xpu
if not self.passes:
dic['passes'] = self.passes
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import hypothesis.strategies as st
from auto_scan_test import PassAutoScanTest
from program_config import OpConfig, ProgramConfig, TensorConfig
class TestFcXPUFusePass(PassAutoScanTest):
def sample_predictor_configs(self, program_config):
config = self.create_inference_config(use_xpu=True)
yield config, ["fc_xpu"], (1e-3, 1e-3)
def sample_program_config(self, draw):
# 1. matmul_v2
# Generate shape of input:X of matmul_v2
x_shape = draw(
st.lists(
st.integers(min_value=1, max_value=4), min_size=2, max_size=4
)
)
# Generate attr trans_x, trans_y
trans_x = False
trans_y = draw(st.booleans())
# Generate legal shape of input:Y of matmul_v2
y_shape = draw(
st.lists(
st.integers(min_value=1, max_value=8), min_size=2, max_size=2
)
)
if trans_y:
y_shape[1] = x_shape[-1]
else:
y_shape[0] = x_shape[-1]
# 2. elementwise_add
# Generate legal attr:axis of elementwise_add
axis = -1
# Generate legal shape of input:Y of elementwise_add
bias_shape = [y_shape[0]] if trans_y else [y_shape[1]]
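# The bias length equals N, the output feature dimension of matmul_v2.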
# 3. activation
# Randomly choose whether to add a relu operator
has_relu = draw(st.booleans())
# Here we compose a program.
# There is still some risk that the program is invalid or causes a bug while running.
# Use function `is_program_valid` to filter out invalid programs before running.
# Use function `add_skip_pass_case` to ignore programs even if they cause a bug while running.
matmul_v2_op = OpConfig(
"matmul_v2",
inputs={"X": ["matmul_v2_x"], "Y": ["matmul_v2_y"]},
outputs={"Out": ["matmul_v2_out"]},
trans_x=trans_x,
trans_y=trans_y,
)
add_op = OpConfig(
"elementwise_add",
inputs={"X": ["matmul_v2_out"], "Y": ["bias"]},
outputs={"Out": ["add_out"]},
axis=axis,
)
ops = [matmul_v2_op, add_op]
if has_relu:
relu_op = OpConfig(
"relu", inputs={"X": ["add_out"]}, outputs={"Out": ["relu_out"]}
)
ops.append(relu_op)
program_config = ProgramConfig(
ops=ops,
weights={
"matmul_v2_y": TensorConfig(shape=y_shape),
"bias": TensorConfig(shape=bias_shape),
},
inputs={
"matmul_v2_x": TensorConfig(shape=x_shape),
},
outputs=ops[-1].outputs["Out"],
)
return program_config
def test(self):
self.run_and_statis(
quant=False, max_examples=25, passes=["fc_xpu_fuse_pass"]
)
if __name__ == "__main__":
unittest.main()