diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc
index 50db74e46d1d6929b84b9fb89b11f48c485a8e25..61bd888715c702fe8974dc93a36626a65a715497 100644
--- a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -26,7 +26,7 @@ using string::PrettyLogDetail;
 
 void MatmulActivationMkldnnFusePass::ApplyImpl(Graph* graph) const {
   auto act_types = GetSupportedActivations();
-  auto matmul_types = {"matmul", "matmul_v2"};
+  auto matmul_types = {"fused_matmul", "matmul", "matmul_v2"};
 
   for (const auto& matmul_type : matmul_types)
     for (auto& act_type : act_types) {
@@ -61,8 +61,17 @@ void MatmulActivationMkldnnFusePass::FuseMatmulAct(
     GET_IR_NODE_FROM_SUBGRAPH(
         activation_out, activation_out, matmul_act_pattern);
 
-    SetActivationAttrs(matmul->Op(), activation->Op(), act_type);
-    matmul->Op()->SetOutput("Out", {activation_out->Name()});
+    OpDesc* matmul_op = matmul->Op();
+
+    matmul_op->SetType("fused_matmul");
+    if (matmul_type == "matmul") {
+      matmul_op->SetAttr("trans_x", matmul_op->GetAttr("transpose_X"));
+      matmul_op->SetAttr("trans_y", matmul_op->GetAttr("transpose_Y"));
+      matmul_op->SetAttr("matmul_alpha", matmul_op->GetAttr("alpha"));
+    }
+
+    SetActivationAttrs(matmul_op, activation->Op(), act_type);
+    matmul_op->SetOutput("Out", {activation_out->Name()});
 
     IR_NODE_LINK_TO(matmul, activation_out);
     GraphSafeRemoveNodes(graph, {activation, matmul_out});
@@ -88,11 +97,6 @@ MatmulActivationMkldnnFusePass::MatmulActivationMkldnnFusePass() {
       .AddInput("Y")
       .IsTensor()
       .End()
-      .AddInput(
-          "ResidualData")  // Extra tensor used in matmul+elementwise_add fuse
-      .IsTensor()
-      .IsOptional()
-      .End()
       .AddOutput("Out")
       .IsTensor()
       .End()
@@ -113,8 +117,24 @@ MatmulActivationMkldnnFusePass::MatmulActivationMkldnnFusePass() {
       .AddInput("Y")
       .IsTensor()
       .End()
-      .AddInput(
-          "ResidualData")  // Extra tensor used in matmul+elementwise_add fuse
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("trans_x")
+      .IsType<bool>()
+      .End()
+      .AddAttr("trans_y")
+      .IsType<bool>()
+      .End();
+
+  AddOpCompat(OpCompat("fused_matmul"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Y")
+      .IsTensor()
+      .End()
+      .AddInput("ResidualData")
       .IsTensor()
       .IsOptional()
       .End()
@@ -126,6 +146,50 @@ MatmulActivationMkldnnFusePass::MatmulActivationMkldnnFusePass() {
       .End()
       .AddAttr("trans_y")
       .IsType<bool>()
+      .End()
+      .AddAttr("matmul_alpha")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fuse_activation")
+      .IsType<std::string>()
+      .IsOptional()
+      .End()
+      .AddAttr("fuse_alpha")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fuse_beta")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_output_scale")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_reshape_X")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_transpose_X")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_reshape_Y")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_transpose_Y")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_reshape_Out")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_transpose_Out")
+      .IsType<std::vector<int>>()
+      .IsOptional()
       .End();
 
   AddOpCompat(OpCompat("abs"))
@@ -279,6 +343,7 @@ REGISTER_PASS(matmul_activation_mkldnn_fuse_pass,
 REGISTER_PASS_CAPABILITY(matmul_activation_mkldnn_fuse_pass)
     .AddCombination(
         paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("fused_matmul", 0)
             .LE("matmul", 1)
             .EQ("matmul_v2", 0)
             .EQ("abs", 0)
diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc
index f045377465e0322207d2d5ebdb888f74878e8d43..680600a403251548dc47a416d2786653e19bf630 100644
--- a/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@ namespace ir {
 using string::PrettyLogDetail;
 
 void MatmulElementwiseAddMKLDNNFusePass::ApplyImpl(Graph* graph) const {
-  auto matmul_types = {"matmul", "matmul_v2"};
+  auto matmul_types = {"fused_matmul", "matmul", "matmul_v2"};
   auto matmul_as_x = {true, false};
 
   for (const auto& matmul_type : matmul_types)
@@ -65,6 +65,12 @@ void MatmulElementwiseAddMKLDNNFusePass::FuseMatmulElementwiseAdd(
       return;
     }
 
+    matmul->Op()->SetType("fused_matmul");
+    if (matmul_type == "matmul") {
+      matmul->Op()->SetAttr("trans_x", matmul->Op()->GetAttr("transpose_X"));
+      matmul->Op()->SetAttr("trans_y", matmul->Op()->GetAttr("transpose_Y"));
+      matmul->Op()->SetAttr("matmul_alpha", matmul->Op()->GetAttr("alpha"));
+    }
     matmul->Op()->SetInput("ResidualData", {elementwise_addend->Name()});
     matmul->Op()->SetOutput("Out", {elementwise_add_out->Name()});
 
@@ -125,6 +131,71 @@ MatmulElementwiseAddMKLDNNFusePass::MatmulElementwiseAddMKLDNNFusePass() {
       .IsType<bool>()
       .End();
 
+  AddOpCompat(OpCompat("fused_matmul"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Y")
+      .IsTensor()
+      .End()
+      .AddInput("ResidualData")
+      .IsTensor()
+      .IsOptional()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("trans_x")
+      .IsType<bool>()
+      .End()
+      .AddAttr("trans_y")
+      .IsType<bool>()
+      .End()
+      .AddAttr("matmul_alpha")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fuse_activation")
+      .IsType<std::string>()
+      .IsOptional()
+      .End()
+      .AddAttr("fuse_alpha")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fuse_beta")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_output_scale")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_reshape_X")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_transpose_X")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_reshape_Y")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_transpose_Y")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_reshape_Out")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_transpose_Out")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End();
+
   AddOpCompat(OpCompat("elementwise_add"))
       .AddInput("X")
       .IsTensor()
@@ -149,6 +220,7 @@ REGISTER_PASS(matmul_elementwise_add_mkldnn_fuse_pass,
 REGISTER_PASS_CAPABILITY(matmul_elementwise_add_mkldnn_fuse_pass)
     .AddCombination(
         paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("fused_matmul", 0)
             .LE("matmul", 1)
             .EQ("matmul_v2", 0)
             .LE("elementwise_add", 1));
diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc
index 40dbaa03a0615f1456c6530ed1340741d443f193..779c39834c6e3a1c04bd60610208d2ae56fbf252 100644
--- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ namespace ir {
 using string::PrettyLogDetail;
 
 void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(Graph *graph) const {
-  auto matmul_types = {"matmul", "matmul_v2"};
+  auto matmul_types = {"fused_matmul", "matmul", "matmul_v2"};
 
   for (const auto &matmul_type : matmul_types) {
     Fuse(graph, matmul_type);
@@ -84,6 +84,12 @@ void MatmulTransposeReshapeMKLDNNPass::Fuse(
     }
 
     OpDesc *matmul_desc = matmul_op->Op();
+    matmul_desc->SetType("fused_matmul");
+    if (matmul_type == "matmul") {
+      matmul_desc->SetAttr("trans_x", matmul_desc->GetAttr("transpose_X"));
+      matmul_desc->SetAttr("trans_y", matmul_desc->GetAttr("transpose_Y"));
+      matmul_desc->SetAttr("matmul_alpha", matmul_desc->GetAttr("alpha"));
+    }
     matmul_desc->SetOutput("Out", {reshape_out->Name()});
     matmul_desc->SetAttr("fused_reshape_Out", reshape_shape);
     matmul_desc->SetAttr("fused_transpose_Out", transpose_axis);
@@ -149,6 +155,71 @@ MatmulTransposeReshapeMKLDNNPass::MatmulTransposeReshapeMKLDNNPass() {
       .IsType<bool>()
       .End();
 
+  AddOpCompat(OpCompat("fused_matmul"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Y")
+      .IsTensor()
+      .End()
+      .AddInput("ResidualData")
+      .IsTensor()
+      .IsOptional()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("trans_x")
+      .IsType<bool>()
+      .End()
+      .AddAttr("trans_y")
+      .IsType<bool>()
+      .End()
+      .AddAttr("matmul_alpha")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fuse_activation")
+      .IsType<std::string>()
+      .IsOptional()
+      .End()
+      .AddAttr("fuse_alpha")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fuse_beta")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_output_scale")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_reshape_X")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_transpose_X")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_reshape_Y")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_transpose_Y")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_reshape_Out")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_transpose_Out")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End();
+
   AddOpCompat(OpCompat("transpose2"))
       .AddInput("X")
       .IsTensor()
@@ -189,6 +260,7 @@ REGISTER_PASS(matmul_transpose_reshape_mkldnn_fuse_pass,
 REGISTER_PASS_CAPABILITY(matmul_transpose_reshape_mkldnn_fuse_pass)
     .AddCombination(
         paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("fused_matmul", 0)
             .LE("matmul", 1)
             .EQ("matmul_v2", 0)
             .EQ("transpose2", 0)
diff --git a/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc
index cb06f6eb1205e94d0a1861183014edfc1a67de02..579764355d86cde4de363b9300e25f5b058d8a15 100644
--- a/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -27,6 +27,7 @@ using string::PrettyLogDetail;
 void FuseOperatorScaleOneDNNPass::ApplyImpl(Graph *graph) const {
   const std::vector<std::string> fusable_ops{
       "fc",
+      "fused_matmul",
       "matmul",
       "matmul_v2",
       "elementwise_add",
@@ -85,6 +86,19 @@ void FuseOperatorScaleOneDNNPass::FuseScale(Graph *graph,
       scale = *(scale_tensor->data<float>());
     }
 
+    if (op_type == "matmul") {
+      operator_op->Op()->SetType("fused_matmul");
+      operator_op->Op()->SetAttr("trans_x",
+                                 operator_op->Op()->GetAttr("transpose_X"));
+      operator_op->Op()->SetAttr("trans_y",
+                                 operator_op->Op()->GetAttr("transpose_Y"));
+      operator_op->Op()->SetAttr("matmul_alpha",
+                                 operator_op->Op()->GetAttr("alpha"));
+    }
+    if (op_type == "matmul_v2") {
+      operator_op->Op()->SetType("fused_matmul");
+    }
+
     operator_op->Op()->SetAttr("fused_output_scale", scale);
     operator_op->Op()->SetOutput("Out", {scale_out->Name()});
 
@@ -111,6 +125,7 @@ REGISTER_PASS_CAPABILITY(operator_scale_onednn_fuse_pass)
     .AddCombination(
         paddle::framework::compatible::OpVersionComparatorCombination()
             .EQ("fc", 0)
+            .EQ("fused_matmul", 0)
             .LE("matmul", 1)
             .EQ("matmul_v2", 0)
             .LE("elementwise_add", 1)
diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc
index 25a79509b53f531ce53cd354bea1e16f9680f5c0..508cad94e8136eca50afa0e6c27503aa9335511c 100644
--- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@ namespace framework {
 namespace ir {
 
 void ReshapeTransposeMatmulMkldnnFusePass::ApplyImpl(Graph *graph) const {
-  auto matmul_types = {"matmul", "matmul_v2"};
+  auto matmul_types = {"matmul", "matmul_v2", "fused_matmul"};
 
   for (const auto &matmul_type : matmul_types) {
     Fuse(graph,
@@ -102,6 +102,25 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse(
                                               matmul_type + " encountered.");
     }
 
+    // Return if input of fused_matmul is already fused
+    if (matmul_type == "fused_matmul") {
+      auto is_already_fused_X =
+          matmul_desc->HasAttr("fused_reshape_X")
+              ? !(PADDLE_GET_CONST(std::vector<int>,
+                                   matmul_desc->GetAttr("fused_reshape_X"))
+                      .empty())
+              : false;
+      if (is_already_fused_X && matmul_input_name == "X") return;
+
+      auto is_already_fused_Y =
+          matmul_desc->HasAttr("fused_reshape_Y")
+              ? !(PADDLE_GET_CONST(std::vector<int>,
+                                   matmul_desc->GetAttr("fused_reshape_Y"))
+                      .empty())
+              : false;
+      if (is_already_fused_Y && matmul_input_name == "Y") return;
+    }
+
     auto reshape_shape =
         paddle::get<std::vector<int>>(reshape_op->Op()->GetAttr("shape"));
     auto transpose_axis =
@@ -123,6 +142,12 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse(
       return;
     }
 
+    matmul_desc->SetType("fused_matmul");
+    if (matmul_type == "matmul") {
+      matmul_desc->SetAttr("trans_x", matmul_desc->GetAttr("transpose_X"));
+      matmul_desc->SetAttr("trans_y", matmul_desc->GetAttr("transpose_Y"));
+      matmul_desc->SetAttr("matmul_alpha", matmul_desc->GetAttr("alpha"));
+    }
     matmul_desc->SetInput(matmul_input_name, {(reshape_in)->Name()});
     matmul_desc->SetAttr("fused_reshape_" + matmul_input_name, reshape_shape);
     matmul_desc->SetAttr("fused_transpose_" + matmul_input_name,
@@ -220,6 +245,71 @@ ReshapeTransposeMatmulMkldnnFusePass::ReshapeTransposeMatmulMkldnnFusePass() {
       .AddAttr("trans_y")
       .IsType<bool>()
       .End();
+
+  AddOpCompat(OpCompat("fused_matmul"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Y")
+      .IsTensor()
+      .End()
+      .AddInput("ResidualData")
+      .IsTensor()
+      .IsOptional()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("trans_x")
+      .IsType<bool>()
+      .End()
+      .AddAttr("trans_y")
+      .IsType<bool>()
+      .End()
+      .AddAttr("matmul_alpha")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fuse_activation")
+      .IsType<std::string>()
+      .IsOptional()
+      .End()
+      .AddAttr("fuse_alpha")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fuse_beta")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_output_scale")
+      .IsType<float>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_reshape_X")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_transpose_X")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_reshape_Y")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_transpose_Y")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_reshape_Out")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End()
+      .AddAttr("fused_transpose_Out")
+      .IsType<std::vector<int>>()
+      .IsOptional()
+      .End();
 }
 
 }  // namespace ir
@@ -234,5 +324,6 @@ REGISTER_PASS_CAPABILITY(reshape_transpose_matmul_mkldnn_fuse_pass)
         paddle::framework::compatible::OpVersionComparatorCombination()
             .EQ("reshape2", 0)
             .EQ("transpose2", 0)
+            .EQ("fused_matmul", 0)
             .EQ("matmul", 1)
             .EQ("matmul_v2", 0));
diff --git a/paddle/fluid/operators/compat/fused_matmul.pbtxt b/paddle/fluid/operators/compat/fused_matmul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1a858da2e72eeddcaea1619bec7bd4b3159d2fea
--- /dev/null
+++ b/paddle/fluid/operators/compat/fused_matmul.pbtxt
@@ -0,0 +1,93 @@
+type: "fused_matmul"
+def {
+  inputs {
+    name: "X"
+  }
+  inputs {
+    name: "Y"
+  }
+  inputs {
+    name: "ResidualData"
+  }
+  outputs {
+    name: "Out"
+  }
+  attrs {
+    name: "trans_x"
+    type: BOOLEAN
+  }
+  attrs {
+    name: "trans_y"
+    type: BOOLEAN
+  }
+}
+extra {
+  attrs {
+    name: "matmul_alpha"
+    type: FLOAT
+  }
+  attrs {
+    name: "fuse_activation"
+    type: STRING
+  }
+  attrs {
+    name: "fuse_alpha"
+    type: FLOAT
+  }
+  attrs {
+    name: "fuse_beta"
+    type: FLOAT
+  }
+  attrs {
+    name: "fused_output_scale"
+    type: FLOAT
+  }
+  attrs {
+    name: "fused_reshape_X"
+    type: INTS
+  }
+  attrs {
+    name: "fused_transpose_X"
+    type: INTS
+  }
+  attrs {
+    name: "fused_reshape_Y"
+    type: INTS
+  }
+  attrs {
+    name: "fused_transpose_Y"
+    type: INTS
+  }
+  attrs {
+    name: "fused_reshape_Out"
+    type: INTS
+  }
+  attrs {
+    name: "fused_transpose_Out"
+    type: INTS
+  }
+  attrs {
+    name: "mkldnn_data_type"
+    type: STRING
+  }
+  attrs {
+    name: "Scale_x"
+    type: FLOAT
+  }
+  attrs {
+    name: "Scale_y"
+    type: FLOAT
+  }
+  attrs {
+    name: "Scale_in_eltwise"
+    type: FLOAT
+  }
+  attrs {
+    name: "Scale_out"
+    type: FLOAT
+  }
+  attrs {
+    name: "force_fp32_output"
+    type: BOOLEAN
+  }
+}
diff --git a/paddle/fluid/operators/compat/matmul_v2.pbtxt b/paddle/fluid/operators/compat/matmul_v2.pbtxt
index cefb964a59f71286cbc4c685f6bf8e8fe8b2f672..5f43e1f8bf0e0c502566a2cc783b8927e5df56cc 100644
--- a/paddle/fluid/operators/compat/matmul_v2.pbtxt
+++ b/paddle/fluid/operators/compat/matmul_v2.pbtxt
@@ -39,28 +39,4 @@ extra {
     name: "op_device"
     type: STRING
   }
-  attrs {
-    name: "fused_reshape_X"
-    type: INTS
-  }
-  attrs {
-    name: "fused_reshape_Y"
-    type: INTS
-  }
-  attrs {
-    name: "fused_transpose_X"
-    type: INTS
-  }
-  attrs {
-    name: "fused_transpose_Y"
-    type: INTS
-  }
-  attrs {
-    name: "fused_reshape_Out"
-    type: INTS
-  }
-  attrs {
-    name: "fused_transpose_Out"
-    type: INTS
-  }
 }
diff --git a/paddle/fluid/operators/fused/fused_matmul_op.cc b/paddle/fluid/operators/fused/fused_matmul_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bd204b53c1bd307ee6a90800cad1d9d31170ecd3
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_matmul_op.cc
@@ -0,0 +1,206 @@
+//   Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/matmul_v2_op.h"
+
+namespace paddle {
+namespace operators {
+
+static std::vector<int64_t> GetInputShape(phi::DDim dim,
+                                          std::vector<int> shape,
+                                          std::vector<int> axis) {
+  PADDLE_ENFORCE_GT(dim.size(),
+                    0,
+                    phi::errors::InvalidArgument(
+                        "The Input(%s) has not been initialized properly. The "
+                        "shape of Input(%s) = [%s].",
+                        dim));
+
+  auto is_input_fused = (!shape.empty() && !axis.empty());
+  if (is_input_fused) {
+    dim = dim.reshape(shape).transpose(axis);
+  }
+  return phi::vectorize(dim);
+}
+
+class FusedMatmulOp : public MatMulV2Op {
+ public:
+  using MatMulV2Op::MatMulV2Op;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fused_matmul");
+    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "fused_matmul");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fused_matmul");
+    bool trans_x = ctx->Attrs().Get<bool>("trans_x");
+    bool trans_y = ctx->Attrs().Get<bool>("trans_y");
+
+    std::vector<int64_t> dims_x =
+        GetInputShape(ctx->GetInputDim("X"),
+                      ctx->Attrs().Get<std::vector<int>>("fused_reshape_X"),
+                      ctx->Attrs().Get<std::vector<int>>("fused_transpose_X"));
+    std::vector<int64_t> dims_y =
+        GetInputShape(ctx->GetInputDim("Y"),
+                      ctx->Attrs().Get<std::vector<int>>("fused_reshape_Y"),
+                      ctx->Attrs().Get<std::vector<int>>("fused_transpose_Y"));
+
+    auto ndims_x = dims_x.size();
+    auto ndims_y = dims_y.size();
+    PADDLE_ENFORCE_GT(ndims_x,
+                      0,
+                      phi::errors::InvalidArgument(
+                          "The Input(X) dims size must be greater than 0,"
+                          " but received dims size is 0. "));
+    PADDLE_ENFORCE_GT(ndims_y,
+                      0,
+                      phi::errors::InvalidArgument(
+                          "The Input(Y) dims size must be greater than 0,"
+                          " but received dims size is 0. "));
+
+    bool x_broadcasted = false;
+    bool y_broadcasted = false;
+
+    if (ndims_x == 1) {
+      dims_x.insert(dims_x.begin(), 1);
+      ndims_x = 2;
+      x_broadcasted = true;
+    }
+
+    if (ndims_y == 1) {
+      dims_y.push_back(1);
+      ndims_y = 2;
+      y_broadcasted = true;
+    }
+
+    size_t M, N;
+    if (trans_x) {
+      M = dims_x[ndims_x - 1];
+    } else {
+      M = dims_x[ndims_x - 2];
+    }
+    if (trans_y) {
+      N = dims_y[ndims_y - 2];
+    } else {
+      N = dims_y[ndims_y - 1];
+    }
+
+    std::vector<int64_t> new_dims;
+    if (ndims_x > ndims_y) {
+      new_dims.assign(dims_x.begin(), dims_x.end() - 2);
+    } else if (ndims_x < ndims_y) {
+      new_dims.assign(dims_y.begin(), dims_y.end() - 2);
+    } else {
+      new_dims.reserve(ndims_x);
+      for (size_t i = 0; i < ndims_x - 2; ++i) {
+        new_dims.push_back(std::max(dims_x[i], dims_y[i]));
+      }
+    }
+    if (!x_broadcasted) {
+      new_dims.push_back(M);
+    }
+    if (!y_broadcasted) {
+      new_dims.push_back(N);
+    }
+    if (x_broadcasted && y_broadcasted) {
+      new_dims.push_back(1);
+    }
+
+    auto ddim_out = phi::make_ddim(new_dims);
+
+    auto shape = ctx->Attrs().Get<std::vector<int>>("fused_reshape_Out");
+    auto axis = ctx->Attrs().Get<std::vector<int>>("fused_transpose_Out");
+
+    auto is_output_fused = (!shape.empty() && !axis.empty());
+    if (is_output_fused) {
+      ddim_out = ddim_out.transpose(axis).reshape(shape);
+    }
+
+    ctx->SetOutputDim("Out", ddim_out);
+    ctx->ShareLoD("X", "Out");
+  }
+};
+
+class FusedMatmulOpMaker : public MatMulV2OpMaker {
+ protected:
+  void Apply() override {
+    AddInput("ResidualData",
+             "Extra input from matmul_elementwise_add_mkldnn_fuse_pass")
+        .AsDispensable()
+        .AsExtra();
+    AddAttr<float>("matmul_alpha", "Output scale used in matmul_v1")
+        .SetDefault(1.0f);
+    AddAttr<std::string>(
+        "fuse_activation",
+        "Activation type from matmul_activation_mkldnn_fuse_pass")
+        .SetDefault("");
+    AddAttr<float>("fuse_alpha",
+                   "Activation alpha from matmul_activation_mkldnn_fuse_pass")
+        .SetDefault(0.0f);
+    AddAttr<float>("fuse_beta",
+                   "Activation beta from matmul_activation_mkldnn_fuse_pass")
+        .SetDefault(0.0f);
+    AddAttr<float>("fused_output_scale",
+                   "Output scale from operator_scale_onednn_fuse_pass")
+        .SetDefault(1.0f);
+    AddAttr<std::vector<int>>("fused_reshape_X",
+                              "Reshape's shape attribute from "
+                              "reshape_transpose_matmul_mkldnn_fuse_pass")
+        .SetDefault({});
+    AddAttr<std::vector<int>>("fused_transpose_X",
+                              "Transpose's axis attribute from "
+                              "reshape_transpose_matmul_mkldnn_fuse_pass")
+        .SetDefault({});
+    AddAttr<std::vector<int>>("fused_reshape_Y",
+                              "Reshape's shape attribute from "
+                              "reshape_transpose_matmul_mkldnn_fuse_pass")
+        .SetDefault({});
+    AddAttr<std::vector<int>>("fused_transpose_Y",
+                              "Transpose's axis attribute from "
+                              "reshape_transpose_matmul_mkldnn_fuse_pass")
+        .SetDefault({});
+    AddAttr<std::vector<int>>("fused_reshape_Out",
+                              "Reshape's shape attribute from "
+                              "matmul_transpose_reshape_mkldnn_fuse_pass")
+        .SetDefault({});
+    AddAttr<std::vector<int>>("fused_transpose_Out",
+                              "Transpose's axis attribute from "
+                              "matmul_transpose_reshape_mkldnn_fuse_pass")
+        .SetDefault({});
+    AddAttr<std::string>("mkldnn_data_type", "oneDNN operator data type")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
+    AddAttr<float>("Scale_x", "Matmul X input quantization scale")
+        .SetDefault(1.0f);
+    AddAttr<float>("Scale_y", "Matmul Y input quantization scale")
+        .SetDefault(1.0f);
+    AddAttr<float>("Scale_in_eltwise", "Matmul ResidualData quantization scale")
+        .SetDefault(0.0f);
+    AddAttr<float>("Scale_out", "Matmul output quantization scale")
+        .SetDefault(1.0f);
+    AddAttr<bool>("force_fp32_output",
+                  "Flag determining if output should be converted to FP32")
+        .SetDefault(false);
+    AddComment(
+        R"DOC(Matrix multiplication extended with oneDNN-specific fusion logic.)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(
+    fused_matmul,
+    ops::FusedMatmulOp,
+    ops::FusedMatmulOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc
index c52fc08c91d5258d3c52b44234f74b3b3474b442..dee182ca1034cbba566622fd6aba31a76f91ed82 100644
--- a/paddle/fluid/operators/matmul_v2_op.cc
+++ b/paddle/fluid/operators/matmul_v2_op.cc
@@ -1,4 +1,4 @@
-//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//   Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -24,168 +24,131 @@
 namespace paddle {
 namespace operators {
 
-static framework::DDim GetDimForInput(const framework::InferShapeContext& ctx,
-                                      const std::string input_name) {
-  auto shape = ctx.Attrs().Get<std::vector<int>>("fused_reshape_" + input_name);
-  auto axis =
-      ctx.Attrs().Get<std::vector<int>>("fused_transpose_" + input_name);
-  auto dim = ctx.GetInputDim(input_name);
-
-  PADDLE_ENFORCE_GT(dim.size(),
+void MatMulV2Op::InferShape(framework::InferShapeContext* ctx) const {
+  OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "matmul_v2");
+  OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "matmul_v2");
+  OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "matmul_v2");
+  bool trans_x = ctx->Attrs().Get<bool>("trans_x");
+  bool trans_y = ctx->Attrs().Get<bool>("trans_y");
+
+  std::vector<int64_t> dims_x = phi::vectorize(ctx->GetInputDim("X"));
+  std::vector<int64_t> dims_y = phi::vectorize(ctx->GetInputDim("Y"));
+  auto ndims_x = dims_x.size();
+  auto ndims_y = dims_y.size();
+  PADDLE_ENFORCE_GT(ndims_x,
                     0,
-                    platform::errors::InvalidArgument(
-                        "The Input(%s) has not been initialized properly. The "
-                        "shape of Input(%s) = [%s].",
-                        dim));
-
-  if (!shape.empty() && !axis.empty()) {
-    dim = dim.reshape(shape).transpose(axis);
-  }
-  return dim;
-}
-
-class MatMulV2Op : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "matmul_v2");
-    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "matmul_v2");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "matmul_v2");
-    bool trans_x = ctx->Attrs().Get<bool>("trans_x");
-    bool trans_y = ctx->Attrs().Get<bool>("trans_y");
-
-    std::vector<int64_t> dims_x = phi::vectorize(GetDimForInput(*ctx, "X"));
-    std::vector<int64_t> dims_y = phi::vectorize(GetDimForInput(*ctx, "Y"));
-    auto ndims_x = dims_x.size();
-    auto ndims_y = dims_y.size();
-    PADDLE_ENFORCE_GT(ndims_x,
-                      0,
-                      platform::errors::InvalidArgument(
-                          "The Input(X) dims size must be greater than 0,"
-                          " but received dims size is 0. "));
-    PADDLE_ENFORCE_GT(ndims_y,
-                      0,
-                      platform::errors::InvalidArgument(
-                          "The Input(Y) dims size must be greater than 0,"
-                          " but received dims size is 0. "));
-
-    bool x_broadcasted = false, y_broadcasted = false;
-    if (ndims_x == 1) {
-      dims_x.insert(dims_x.begin(), 1);
-      ndims_x = 2;
-      x_broadcasted = true;
-    }
-
-    if (ndims_y == 1) {
-      dims_y.push_back(1);
-      ndims_y = 2;
-      y_broadcasted = true;
-    }
+                    phi::errors::InvalidArgument(
+                        "The Input(X) dims size must be greater than 0,"
+                        " but received dims size is 0. "));
+  PADDLE_ENFORCE_GT(ndims_y,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "The Input(Y) dims size must be greater than 0,"
+                        " but received dims size is 0. "));
 
-    size_t M, N;
-    if (trans_x) {
-      M = dims_x[ndims_x - 1];
-    } else {
-      M = dims_x[ndims_x - 2];
-    }
-    if (trans_y) {
-      N = dims_y[ndims_y - 2];
-    } else {
-      N = dims_y[ndims_y - 1];
-    }
+  bool x_broadcasted = false;
+  bool y_broadcasted = false;
 
-    std::vector<int64_t> new_dims;
-    if (ndims_x > ndims_y) {
-      new_dims.assign(dims_x.begin(), dims_x.end() - 2);
-    } else if (ndims_x < ndims_y) {
-      new_dims.assign(dims_y.begin(), dims_y.end() - 2);
-    } else {
-      new_dims.reserve(ndims_x);
-      for (size_t i = 0; i < ndims_x - 2; ++i) {
-        new_dims.push_back(std::max(dims_x[i], dims_y[i]));
-      }
-    }
-    if (!x_broadcasted) {
-      new_dims.push_back(M);
-    }
-    if (!y_broadcasted) {
-      new_dims.push_back(N);
-    }
-    if (x_broadcasted && y_broadcasted) {
-      new_dims.push_back(1);
-    }
+  if (ndims_x == 1) {
+    dims_x.insert(dims_x.begin(), 1);
+    ndims_x = 2;
+    x_broadcasted = true;
+  }
 
-    auto ddim_out = phi::make_ddim(new_dims);
+  if (ndims_y == 1) {
+    dims_y.push_back(1);
+    ndims_y = 2;
+    y_broadcasted = true;
+  }
 
-#ifdef PADDLE_WITH_MKLDNN
-    auto shape = ctx->Attrs().Get<std::vector<int>>("fused_reshape_Out");
-    auto axis = ctx->Attrs().Get<std::vector<int>>("fused_transpose_Out");
+  size_t M, N;
+  if (trans_x) {
+    M = dims_x[ndims_x - 1];
+  } else {
+    M = dims_x[ndims_x - 2];
+  }
+  if (trans_y) {
+    N = dims_y[ndims_y - 2];
+  } else {
+    N = dims_y[ndims_y - 1];
+  }
 
-    if (!shape.empty() && !axis.empty()) {
-      ddim_out = ddim_out.transpose(axis).reshape(shape);
+  std::vector<int64_t> new_dims;
+  if (ndims_x > ndims_y) {
+    new_dims.assign(dims_x.begin(), dims_x.end() - 2);
+  } else if (ndims_x < ndims_y) {
+    new_dims.assign(dims_y.begin(), dims_y.end() - 2);
+  } else {
+    new_dims.reserve(ndims_x);
+    for (size_t i = 0; i < ndims_x - 2; ++i) {
+      new_dims.push_back(std::max(dims_x[i], dims_y[i]));
     }
-#endif
-
-    ctx->SetOutputDim("Out", ddim_out);
-    ctx->ShareLoD("X", "Out");
   }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto input_data_type =
-        OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y");
-    return phi::KernelKey(input_data_type, ctx.GetPlace());
+  if (!x_broadcasted) {
+    new_dims.push_back(M);
+  }
+  if (!y_broadcasted) {
+    new_dims.push_back(N);
+  }
+  if (x_broadcasted && y_broadcasted) {
+    new_dims.push_back(1);
   }
 
-  phi::KernelKey GetKernelTypeForVar(
-      const std::string& var_name,
-      const phi::DenseTensor& tensor,
-      const phi::KernelKey& expected_kernel_type) const override {
-    if (framework::IsComplexType(expected_kernel_type.dtype())) {
-      // only promote inputs’s types when contains complex input
-      return phi::KernelKey(tensor.place(), tensor.layout(), tensor.dtype());
-    } else {
+  ctx->SetOutputDim("Out", phi::make_ddim(new_dims));
+  ctx->ShareLoD("X", "Out");
+}
+
+phi::KernelKey MatMulV2Op::GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const {
+  auto input_data_type =
+      OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y");
+  return phi::KernelKey(input_data_type, ctx.GetPlace());
+}
+
+phi::KernelKey MatMulV2Op::GetKernelTypeForVar(
+    const std::string& var_name,
+    const phi::DenseTensor& tensor,
+    const phi::KernelKey& expected_kernel_type) const {
+  if (framework::IsComplexType(expected_kernel_type.dtype())) {
+    // only promote inputs’s types when contains complex input
+    return phi::KernelKey(tensor.place(), tensor.layout(), tensor.dtype());
+  } else {
 #ifdef PADDLE_WITH_MKLDNN
-      // When matmul_v2 is first oneDNN op in a chain (there was some non oneDNN
-      // op previously) then we also need to rotate shape NHWC -> NCWH
-      if ((expected_kernel_type.layout() == phi::DataLayout::ONEDNN) &&
-          (tensor.layout() != phi::DataLayout::ONEDNN) &&
-          phi::OneDNNContext::tls().get_cur_paddle_data_layout() ==
-              phi::DataLayout::kNHWC) {
-        return phi::KernelKey(tensor.place(),
-                              phi::DataLayout::kNHWC,
-                              expected_kernel_type.dtype());
-      }
-#endif
+    // When matmul_v2 is first oneDNN op in a chain (there was some non oneDNN
+    // op previously) then we also need to rotate shape NHWC -> NCWH
+    if ((expected_kernel_type.layout() == phi::DataLayout::ONEDNN) &&
+        (tensor.layout() != phi::DataLayout::ONEDNN) &&
+        phi::OneDNNContext::tls().get_cur_paddle_data_layout() ==
+            phi::DataLayout::kNHWC) {
       return phi::KernelKey(
-          tensor.place(), tensor.layout(), expected_kernel_type.dtype());
+          tensor.place(), phi::DataLayout::kNHWC, expected_kernel_type.dtype());
     }
+#endif
+    return phi::KernelKey(
+        tensor.place(), tensor.layout(), expected_kernel_type.dtype());
   }
-};
+}
 
-class MatMulV2OpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "tensor of shape (d0, d1 ... M, K)");
-    AddInput("Y", "tensor of shape (d0, d1 ... K, N)");
-    AddOutput("Out", "tensor of shape (d0, d1 ... M, N)");
-    AddAttr<bool>("trans_x",
-                  "Set true to transpose the last two dimensions of X before "
-                  "doing multiplication")
-        .SetDefault(false);
-    AddAttr<bool>("trans_y",
-                  "Set true to transpose the last two dimensions of Y before "
-                  "doing multiplication")
-        .SetDefault(false);
-    AddComment(
-        R"DOC(Matrix multiplication Out = X * Y. A has shape (d0, d1 ... M, K),
+void MatMulV2OpMaker::Make() {
+  AddInput("X", "tensor of shape (d0, d1 ... M, K)");
+  AddInput("Y", "tensor of shape (d0, d1 ... K, N)");
+  AddOutput("Out", "tensor of shape (d0, d1 ... M, N)");
+  AddAttr<bool>("trans_x",
+                "Set true to transpose the last two dimensions of X before "
+                "doing multiplication")
+      .SetDefault(false);
+  AddAttr<bool>("trans_y",
+                "Set true to transpose the last two dimensions of Y before "
+                "doing multiplication")
+      .SetDefault(false);
+  AddComment(
+      R"DOC(Matrix multiplication Out = X * Y. A has shape (d0, d1 ... M, K),
         B has shape (d0, d1 ... K, N), Out has shape ((d0, d1 ... M, N)).
         In addition, it also follows the broadcast rule which is similar as
         numpy.matmul.
 )DOC");
-  }
-};
+  Apply();
+}
 
 class MatMulV2OpGrad : public framework::OperatorWithKernel {
  public:
diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h
index 70bdd0736bf4ecc2322d0a6e5c8d34c320d8a8f5..a27bf5a33e2f8fc302c033875397667c6ab727ff 100644
--- a/paddle/fluid/operators/matmul_v2_op.h
+++ b/paddle/fluid/operators/matmul_v2_op.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -37,6 +37,29 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+class MatMulV2Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  phi::KernelKey GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+
+  phi::KernelKey GetKernelTypeForVar(
+      const std::string& var_name,
+      const phi::DenseTensor& tensor,
+      const phi::KernelKey& expected_kernel_type) const override;
+};
+
+class MatMulV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() final;
+
+ protected:
+  virtual void Apply() {}
+};
+
 // Reshape a rank-3 tensor from P x M x N to (P * M) x N.
 // Identity op if the tensor is not of rank 3.
 static phi::DenseTensor FoldInitDims(const phi::DenseTensor& input) {
diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h
index 94f0fa2a606c3642e835d8184e98186b14bed3e5..02624b9a49fbaaee4c4eb9ca733774823f663493 100644
--- a/paddle/fluid/operators/ops_extra_info.h
+++ b/paddle/fluid/operators/ops_extra_info.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -102,12 +102,6 @@ const std::unordered_map<std::string, ExtraAttrPropertySet>
         {"fused_output_scale", ExtraAttrProperty::ONEDNN},
         {"fuse_residual_connection", ExtraAttrProperty::ONEDNN},
         {"fuse_with_relu", ExtraAttrProperty::ONEDNN},
-        {"fused_reshape_Out", ExtraAttrProperty::ONEDNN},
-        {"fused_transpose_Out", ExtraAttrProperty::ONEDNN},
-        {"fused_reshape_X", ExtraAttrProperty::ONEDNN},
-        {"fused_reshape_Y", ExtraAttrProperty::ONEDNN},
-        {"fused_transpose_X", ExtraAttrProperty::ONEDNN},
-        {"fused_transpose_Y", ExtraAttrProperty::ONEDNN},
         {"mkldnn_data_type", ExtraAttrProperty::ONEDNN},
         {"scale_x", ExtraAttrProperty::ONEDNN},
         {"scale_y", ExtraAttrProperty::ONEDNN},
@@ -226,8 +220,7 @@ class ExtraInfoUtils {
   std::unordered_map<std::string, std::vector<std::string>>
       g_extra_input_names_map_ = {{"conv2d", {"Bias", "ResidualData"}},
                                   {"conv2d_transpose", {"Bias"}},
-                                  {"conv2d_grad", {"Bias"}},
-                                  {"matmul_v2", {"ResidualData"}}};
+                                  {"conv2d_grad", {"Bias"}}};
   std::vector<std::string> empty_extra_input_names_;
 };
 
diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h
index c398138e2d5fa06ae6c35ca7901bb925689dcbb9..cb3f59036ee2ac364b271a839574ca9cfba656cc 100644
--- a/paddle/phi/backends/onednn/onednn_reuse.h
+++ b/paddle/phi/backends/onednn/onednn_reuse.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -53,26 +53,31 @@ constexpr bool is_bfloat16() {
 
 static void AppendActivation(const OneDNNContext& dev_ctx,
                              dnnl::post_ops& post_ops,  // NOLINT
-                             float activation_scale = 1.0f) {
-  const auto invalid_attribute =
-      dev_ctx.HasDnnAttr("fuse_activation")
-          ? PADDLE_GET_CONST(std::string, dev_ctx.GetDnnAttr("fuse_activation"))
-                .empty()
-          : true;
-  if (invalid_attribute) return;
-
-  const auto fuse_activation =
-      dev_ctx.HasDnnAttr("fuse_activation")
-          ? PADDLE_GET_CONST(std::string, dev_ctx.GetDnnAttr("fuse_activation"))
-          : "";
-  const auto fuse_alpha =
-      dev_ctx.HasDnnAttr("fuse_alpha")
-          ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fuse_alpha"))
-          : 0.0f;
-  const auto fuse_beta =
-      dev_ctx.HasDnnAttr("fuse_beta")
-          ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fuse_beta"))
-          : 0.0f;
+                             float activation_scale = 1.0f,
+                             std::string fuse_activation = "",
+                             float fuse_alpha = 0.0f,
+                             float fuse_beta = 0.0f) {
+  if (fuse_activation == "") {
+    const auto invalid_attribute =
+        dev_ctx.HasDnnAttr("fuse_activation")
+            ? PADDLE_GET_CONST(std::string,
+                               dev_ctx.GetDnnAttr("fuse_activation"))
+                  .empty()
+            : true;
+    if (invalid_attribute) return;
+
+    fuse_activation =
+        dev_ctx.HasDnnAttr("fuse_activation")
+            ? PADDLE_GET_CONST(std::string,
+                               dev_ctx.GetDnnAttr("fuse_activation"))
+            : "";
+    fuse_alpha = dev_ctx.HasDnnAttr("fuse_alpha")
+                     ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fuse_alpha"))
+                     : 0.0f;
+    fuse_beta = dev_ctx.HasDnnAttr("fuse_beta")
+                    ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fuse_beta"))
+                    : 0.0f;
+  }
 
   if (fuse_activation == "hard_sigmoid") {
     post_ops.append_eltwise(activation_scale,
diff --git a/paddle/phi/kernels/fusion/onednn/fused_matmul_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_matmul_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f54db963b09deb799f41690fc654213f5a0ab05c
--- /dev/null
+++ b/paddle/phi/kernels/fusion/onednn/fused_matmul_kernel.cc
@@ -0,0 +1,616 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "paddle/phi/backends/onednn/onednn_reuse.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+using dnnl::engine;
+using dnnl::inner_product_forward;
+using dnnl::memory;
+using dnnl::prop_kind;
+using dnnl::stream;
+using paddle::framework::ReshapeToMatrix;
+
+namespace phi {
+
+template <typename XT, typename YT, typename OT>
+class FusedMatmulOneDNNHandler
+    : public funcs::OneDNNHandlerNoCachingT<XT, dnnl::matmul> {
+ public:
+  FusedMatmulOneDNNHandler(const OneDNNContext &dev_ctx,
+                           const DenseTensor *residual_data,
+                           const std::vector<int64_t> &x_org_dims,
+                           const std::vector<int64_t> &y_org_dims,
+                           bool trans_x,
+                           bool trans_y,
+                           const float matmul_alpha,
+                           const std::vector<int64_t> &x_strides_override,
+                           const std::vector<int64_t> &y_strides_override,
+                           bool is_output_fused,
+                           const std::string &fuse_activation,
+                           const float fuse_alpha,
+                           const float fuse_beta,
+                           const float fused_output_scale,
+                           const float scale_x,
+                           const float scale_y,
+                           const float scale_in_eltwise,
+                           const float scale_out,
+                           const bool force_fp32_output)
+      : funcs::OneDNNHandlerNoCachingT<XT, dnnl::matmul>(dev_ctx.GetEngine(),
+                                                         dev_ctx.GetPlace()) {
+    // M X K * K X N
+    std::vector<int64_t> x_dims(x_org_dims);
+    std::vector<int64_t> y_dims(y_org_dims);
+
+    const int MB_idx = x_dims.size() - 3;
+    const int H_idx = x_dims.size() - 2;
+    const int W_idx = x_dims.size() - 1;
+
+    if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]);
+    if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]);
+
+    const memory::dim M = x_dims[H_idx];
+    const memory::dim K = x_dims[W_idx];
+    const memory::dim N = y_dims[W_idx];
+
+    std::vector<int64_t> x_strides(x_dims.size() - 3, 1);
+    std::vector<int64_t> y_strides(x_dims.size() - 3, 1);
+    std::vector<int64_t> out_strides(x_dims.size() - 3, 1);
+    std::vector<int64_t> out_ddims(x_dims.size() - 3, 1);
+
+    x_strides.reserve(x_dims.size());
+    y_strides.reserve(x_dims.size());
+    out_strides.reserve(x_dims.size());
+
+    if (x_strides_override.empty()) {
+      if (trans_x) {
+        x_strides.insert(x_strides.end(), {M * K, 1, M});
+      } else {
+        x_strides.insert(x_strides.end(), {M * K, K, 1});
+      }
+    } else {
+      x_strides = x_strides_override;
+    }
+
+    if (y_strides_override.empty()) {
+      if (trans_y) {
+        y_strides.insert(y_strides.end(), {N * K, 1, K});
+      } else {
+        y_strides.insert(y_strides.end(), {N * K, N, 1});
+      }
+    } else {
+      y_strides = y_strides_override;
+    }
+
+    out_strides.insert(out_strides.end(), {M * N, N, 1});
+    out_ddims.insert(out_ddims.end(),
+                     {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N});
+
+    for (int i = x_dims.size() - 4; i >= 0; --i) {
+      out_ddims[i] = std::max(x_dims[i], y_dims[i]);
+      if (x_strides_override.empty()) {
+        x_strides[i] = x_dims[i + 1] * x_strides[i + 1];
+      }
+      if (y_strides_override.empty()) {
+        y_strides[i] = y_dims[i + 1] * y_strides[i + 1];
+      }
+      out_strides[i] = out_ddims[i + 1] * out_strides[i + 1];
+    }
+
+    // TODO(jczaja): Why not for int8??
+    if (!funcs::is_int8<OT>() && is_output_fused) {
+      out_strides = FakeTransposeStrides(out_ddims);
+    }
+
+    auto x_md = memory::desc(x_dims, funcs::OneDNNGetDataType<XT>(), x_strides);
+    auto y_md = memory::desc(y_dims, funcs::OneDNNGetDataType<YT>(), y_strides);
+    auto out_md =
+        memory::desc(out_ddims, funcs::OneDNNGetDataType<OT>(), out_strides);
+
+    const auto matmul_attrs = CreateMatmulAttrs(dev_ctx,
+                                                residual_data,
+                                                matmul_alpha,
+                                                fuse_activation,
+                                                fuse_alpha,
+                                                fuse_beta,
+                                                fused_output_scale,
+                                                scale_x,
+                                                scale_y,
+                                                scale_in_eltwise,
+                                                scale_out,
+                                                force_fp32_output);
+
+    this->AcquireForwardPrimitiveDescriptor(matmul_attrs, x_md, y_md, out_md);
+  }
+
+  float ComputeOutputScale(float matmul_alpha,
+                           const float scale_x,
+                           const float scale_y,
+                           const float scale_in_eltwise,
+                           const float scale_out,
+                           const bool force_fp32_output) {
+    float f_scale_out = force_fp32_output ? 1.0f : scale_out;
+    matmul_alpha *= f_scale_out / (scale_x * scale_y);
+    return matmul_alpha;
+  }
+
+  dnnl::primitive_attr CreateMatmulAttrs(const OneDNNContext &dev_ctx,
+                                         const DenseTensor *residual_data,
+                                         const float matmul_alpha,
+                                         const std::string &fuse_activation,
+                                         const float fuse_alpha,
+                                         const float fuse_beta,
+                                         const float fused_output_scale,
+                                         const float scale_x,
+                                         const float scale_y,
+                                         const float scale_in_eltwise,
+                                         const float scale_out,
+                                         const bool force_fp32_output) {
+    dnnl::primitive_attr matmul_attrs;
+    dnnl::post_ops post_operations;
+
+    float computed_scale_out = ComputeOutputScale(matmul_alpha,
+                                                  scale_x,
+                                                  scale_y,
+                                                  scale_in_eltwise,
+                                                  scale_out,
+                                                  force_fp32_output);
+    if (computed_scale_out != 1.0f) {
+      matmul_attrs.set_output_scales(0, {computed_scale_out});
+    }
+
+    if (residual_data) {
+      auto residual_data_tz = vectorize(residual_data->dims());
+      auto residual_data_md = memory::desc(residual_data_tz,
+                                           funcs::OneDNNGetDataType<OT>(),
+                                           dnnl::memory::format_tag::any);
+      post_operations.append_binary(dnnl::algorithm::binary_add,
+                                    residual_data_md);
+      if (scale_in_eltwise != 0.0f) {
+        float sum_scale = scale_out / scale_in_eltwise;
+        post_operations.append_sum(sum_scale);
+      }
+    }
+
+    funcs::AppendActivation(
+        dev_ctx, post_operations, 1.0f, fuse_activation, fuse_alpha, fuse_beta);
+
+    if (fused_output_scale != 1.0f) {
+      post_operations.append_eltwise(
+          1.0, dnnl::algorithm::eltwise_linear, fused_output_scale, 0.0f);
+    }
+
+    matmul_attrs.set_post_ops(post_operations);
+    return matmul_attrs;
+  }
+
+  std::vector<int64_t> FakeTransposeStrides(
+      const std::vector<int64_t> &matmul_out_dims) const {
+    // fuse matmul_v2 + transpose + reshape guarantees that output is 4D and
+    // transpose axis are: {0, 2, 1, 3}
+    std::vector<int64_t> transpose_axis = {0, 2, 1, 3};
+    std::vector<int64_t> fake_strides(transpose_axis.size());
+    int ndims = static_cast<int>(transpose_axis.size());
+
+    int total_stride = 1;
+
+    for (int i = ndims - 1; i >= 0; --i) {
+      fake_strides[transpose_axis[i]] = total_stride;
+      total_stride *= matmul_out_dims[transpose_axis[i]];
+    }
+
+    return fake_strides;
+  }
+
+  std::shared_ptr<memory> AcquireWeightsMemory(const DenseTensor *input) {
+    const YT *input_data = input->data<YT>();
+    return this->AcquireMemoryFromPrimitive(
+        this->fwd_pd_->weights_desc(), funcs::to_void_cast<YT>(input_data));
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireDstMemory(const OneDNNContext &dev_ctx,
+                                                 DenseTensor *output) {
+    // We cannot use base AcquireDstMemory as it makes an allocation request
+    // base on DST memory primitive size. This is fine in general, but in MatMul
+    // we have primitive that covers only one batch of Data and then shift
+    // pointer for every new batch. Hence DenseTensor size is bigger that
+    // dst memory primitive size. So would we request less memory that is there
+    // and it triggers an assertion.  So as there is no 'any' format here we can
+    // leave default size of DenseTensor as computed in ComputeInferShape
+    OT *ptr = dev_ctx.template Alloc<OT>(output);
+    return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr);
+  }
+};
+
+static DDim RowMatrixDimsFromVector(const DDim &x_dim) {
+  return x_dim.size() > 1 ? x_dim : make_ddim({1, x_dim[0]});
+}
+
+static DDim ColumnMatrixDimsFromVector(const DDim &y_dim) {
+  return y_dim.size() > 1 ? y_dim : make_ddim({y_dim[0], 1});
+}
+
+static std::vector<int64_t> TransposeAxis(const std::vector<int64_t> &x,
+                                          const std::vector<int> &axis) {
+  size_t in_rank = x.size();
+  size_t axis_size = axis.size();
+
+  auto axis_set = std::set<int>(axis.begin(), axis.end());
+  PADDLE_ENFORCE_EQ(axis_set.size(),
+                    axis_size,
+                    phi::errors::InvalidArgument(
+                        "In an axis array, elements must be unique."));
+
+  PADDLE_ENFORCE_EQ(
+      in_rank,
+      axis_size,
+      phi::errors::InvalidArgument("The input dimension's size "
+                                   "should be equal to the axis's size. "
+                                   "But received dimension is %d, "
+                                   "axis's size is %d",
+                                   in_rank,
+                                   axis_size));
+
+  PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()),
+                    axis_size,
+                    phi::errors::InvalidArgument(
+                        "Axis values must be ranging from 0 to (dims - 1)."));
+
+  std::vector<int64_t> new_x(x.size());
+  for (size_t i = 0; i < x.size(); i++) {
+    new_x[i] = x[axis[i]];
+  }
+  return new_x;
+}
+
+static std::vector<int64_t> GetInputStrides(const std::string input_name,
+                                            const DDim &input_dims,
+                                            std::vector<int> shape,
+                                            std::vector<int> axis,
+                                            const bool transpose_input) {
+  auto new_dims = input_dims;
+  if (!shape.empty() && !axis.empty()) {
+    new_dims = input_dims.reshape(shape).transpose(axis);
+  }
+
+  auto &MatrixDimsFromVector =
+      input_name == "X" ? RowMatrixDimsFromVector : ColumnMatrixDimsFromVector;
+  funcs::MatDescriptor mat_dim = funcs::CreateMatrixDescriptor(
+      MatrixDimsFromVector(new_dims), 0, transpose_input);
+
+  std::vector<int64_t> strides;
+  if (!shape.empty()) {
+    auto shape2 = input_dims.reshape(shape);
+    strides.push_back(1);
+    for (auto i = shape2.size() - 1; i > 0; --i) {
+      strides.insert(strides.begin(),
+                     strides.front() * static_cast<int64_t>(shape2[i]));
+    }
+    strides = TransposeAxis(strides, axis);
+    if (shape.size() == 2)
+      strides.insert(strides.begin(),
+                     static_cast<int64_t>(shape[0] * shape[1]));
+    mat_dim.stride_ = strides[0];
+    if (mat_dim.trans_) std::swap(*strides.rbegin(), *(++strides.rbegin()));
+  }
+  return strides;
+}
+
+template <typename T, typename T_out>
+void ExecuteFusedMatmul(const OneDNNContext &dev_ctx,
+                        const DenseTensor &x,
+                        const DenseTensor &y,
+                        const DenseTensor *residual_data,
+                        const std::vector<int64_t> &x_dims,
+                        const std::vector<int64_t> &y_dims,
+                        bool trans_x,
+                        bool trans_y,
+                        const float matmul_alpha,
+                        const std::vector<int64_t> &x_strides_override,
+                        const std::vector<int64_t> &y_strides_override,
+                        const bool is_output_fused,
+                        const std::vector<int> &fused_transpose_Out,
+                        const std::string &fuse_activation,
+                        const float fuse_alpha,
+                        const float fuse_beta,
+                        const float fused_output_scale,
+                        const float scale_x,
+                        const float scale_y,
+                        const float scale_in_eltwise,
+                        const float scale_out,
+                        const bool force_fp32_output,
+                        DenseTensor *out) {
+  FusedMatmulOneDNNHandler<T, T, T_out> handler(dev_ctx,
+                                                residual_data,
+                                                x_dims,
+                                                y_dims,
+                                                trans_x,
+                                                trans_y,
+                                                matmul_alpha,
+                                                x_strides_override,
+                                                y_strides_override,
+                                                is_output_fused,
+                                                fuse_activation,
+                                                fuse_alpha,
+                                                fuse_beta,
+                                                fused_output_scale,
+                                                scale_x,
+                                                scale_y,
+                                                scale_in_eltwise,
+                                                scale_out,
+                                                force_fp32_output);
+
+  const auto src_memory_p = handler.AcquireSrcMemory(&x);
+  const auto weights_memory_p = handler.AcquireWeightsMemory(&y);
+  const auto dst_memory_p = handler.AcquireDstMemory(dev_ctx, out);
+
+  auto matmul_p = handler.AcquireForwardPrimitive();
+
+  std::unordered_map<int, memory> matmul_args = {
+      {DNNL_ARG_SRC, *src_memory_p},
+      {DNNL_ARG_WEIGHTS, *weights_memory_p},
+      {DNNL_ARG_DST, *dst_memory_p}};
+
+  if (residual_data) {
+    const auto residual_data_memory_p = handler.AcquireSrcMemory(residual_data);
+    matmul_args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1,
+                        *residual_data_memory_p});
+  }
+
+  auto &astream = OneDNNContext::tls().get_stream();
+  matmul_p->execute(astream, matmul_args);
+  astream.wait();
+
+  if (is_output_fused && !funcs::is_int8<T_out>()) {
+    auto permuted_md =
+        dst_memory_p->get_desc().permute_axes(fused_transpose_Out);
+    out->set_mem_desc(permuted_md.reshape(vectorize<int64_t>(out->dims())));
+  } else {
+    out->set_mem_desc(
+        dst_memory_p->get_desc().reshape(vectorize<int64_t>(out->dims())));
+  }
+}
+
+std::vector<int64_t> GetInputShape(DDim input_dims,
+                                   std::vector<int> shape,
+                                   std::vector<int> axis) {
+  if (!shape.empty() && !axis.empty()) {
+    return vectorize(input_dims.reshape(shape).transpose(axis));
+  }
+  return vectorize(input_dims);
+}
+
+void CalculateMatrixDims(const std::vector<int64_t> &x_dims,
+                         const std::vector<int64_t> &y_dims,
+                         std::vector<int64_t> *x_bd_dims,
+                         std::vector<int64_t> *y_bd_dims,
+                         DenseTensor *out,
+                         const bool is_output_fused) {
+  if (x_dims.size() == 1) {
+    (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[0];
+  } else if (x_dims.size() == 2) {
+    (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[1];
+    (*x_bd_dims)[(*x_bd_dims).size() - 2] = x_dims[0];
+  } else {
+    for (size_t i = 0; i < x_dims.size(); ++i) {
+      (*x_bd_dims)[(*x_bd_dims).size() - x_dims.size() + i] = x_dims[i];
+    }
+  }
+  if (y_dims.size() == 1) {
+    (*y_bd_dims)[(*x_bd_dims).size() - 2] = y_dims[0];
+  } else if (y_dims.size() == 2) {
+    (*y_bd_dims)[(*y_bd_dims).size() - 1] = y_dims[1];
+    (*y_bd_dims)[(*y_bd_dims).size() - 2] = y_dims[0];
+  } else {
+    for (size_t i = 0; i < y_dims.size(); ++i) {
+      (*y_bd_dims)[(*y_bd_dims).size() - y_dims.size() + i] = y_dims[i];
+    }
+  }
+
+  if (!is_output_fused && x_dims.size() > 2 && y_dims.size() > 2) {
+    auto out_dims = vectorize(out->dims());
+    for (size_t i = 0; i < (*x_bd_dims).size() - 2; ++i) {
+      PADDLE_ENFORCE_EQ(
+          (*x_bd_dims)[i] == (*y_bd_dims)[i] || (*x_bd_dims)[i] == 1 ||
+              (*y_bd_dims)[i] == 1,
+          true,
+          errors::InvalidArgument(
+              "Tensor dimensions are incorrect for broadcasting."
+              "Dimensions in X and Y must be same or equal to 1, but "
+              "received x_dim[%d]=%d and y_dims[%d]= %d",
+              i,
+              (*x_bd_dims)[i],
+              i,
+              (*y_bd_dims)[i]));
+      (out_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]);
+    }
+    out->Resize(make_ddim((out_dims)));
+  }
+}
+
+template <typename T, typename Context>
+void FusedMatmulKernel(const Context &dev_ctx,
+                       const DenseTensor &x,
+                       const DenseTensor &y,
+                       const paddle::optional<DenseTensor> &residual_data,
+                       bool transpose_x,
+                       bool transpose_y,
+                       const float matmul_alpha,
+                       const std::string &fuse_activation,
+                       const float fuse_alpha,
+                       const float fuse_beta,
+                       const float fused_output_scale,
+                       const std::vector<int> &fused_reshape_X,
+                       const std::vector<int> &fused_transpose_X,
+                       const std::vector<int> &fused_reshape_Y,
+                       const std::vector<int> &fused_transpose_Y,
+                       const std::vector<int> &fused_reshape_Out,
+                       const std::vector<int> &fused_transpose_Out,
+                       const std::string &mkldnn_data_type,
+                       const float scale_x,
+                       const float scale_y,
+                       const float scale_in_eltwise,
+                       const float scale_out,
+                       const bool force_fp32_output,
+                       DenseTensor *out) {
+  if (dev_ctx.HasDnnAttr("head_number")) {
+    const auto head_number =
+        PADDLE_GET_CONST(int, dev_ctx.GetDnnAttr("head_number"));
+    PADDLE_ENFORCE_EQ(
+        head_number,
+        1,
+        errors::Unimplemented(
+            "oneDNN matmul doesn't support multiple heads. Expected "
+            "head_number=1. But received `head_number` is %d",
+            head_number));
+  }
+
+  constexpr bool is_int8 = funcs::is_int8<T>();
+  constexpr bool is_bfloat16 = funcs::is_bfloat16<T>();
+
+  bool fuse_relu = false;
+  if (fuse_activation == "relu" || fuse_activation == "relu6") {
+    fuse_relu = true;
+  }
+
+  auto x_dims = GetInputShape(x.dims(), fused_reshape_X, fused_transpose_X);
+  auto y_dims = GetInputShape(y.dims(), fused_reshape_Y, fused_transpose_Y);
+  auto is_output_fused =
+      !fused_reshape_Out.empty() && !fused_transpose_Out.empty();
+
+  auto x_strides_override = GetInputStrides(
+      "X", x.dims(), fused_reshape_X, fused_transpose_X, transpose_x);
+  auto y_strides_override = GetInputStrides(
+      "Y", y.dims(), fused_reshape_Y, fused_transpose_Y, transpose_y);
+
+  int ndims = std::max(x_dims.size(), y_dims.size());
+  ndims = std::max(ndims, 3);
+
+  std::vector<int64_t> x_bd_dims(ndims, 1);
+  std::vector<int64_t> y_bd_dims(ndims, 1);
+
+  CalculateMatrixDims(
+      x_dims, y_dims, &x_bd_dims, &y_bd_dims, out, is_output_fused);
+
+  if (force_fp32_output || ((!is_int8) && (!is_bfloat16))) {
+    ExecuteFusedMatmul<T, float>(dev_ctx,
+                                 x,
+                                 y,
+                                 residual_data.get_ptr(),
+                                 x_bd_dims,
+                                 y_bd_dims,
+                                 transpose_x,
+                                 transpose_y,
+                                 matmul_alpha,
+                                 x_strides_override,
+                                 y_strides_override,
+                                 is_output_fused,
+                                 fused_transpose_Out,
+                                 fuse_activation,
+                                 fuse_alpha,
+                                 fuse_beta,
+                                 fused_output_scale,
+                                 scale_x,
+                                 scale_y,
+                                 scale_in_eltwise,
+                                 scale_out,
+                                 force_fp32_output,
+                                 out);
+  } else if (is_bfloat16) {
+    ExecuteFusedMatmul<T, phi::dtype::bfloat16>(dev_ctx,
+                                                x,
+                                                y,
+                                                residual_data.get_ptr(),
+                                                x_bd_dims,
+                                                y_bd_dims,
+                                                transpose_x,
+                                                transpose_y,
+                                                matmul_alpha,
+                                                x_strides_override,
+                                                y_strides_override,
+                                                is_output_fused,
+                                                fused_transpose_Out,
+                                                fuse_activation,
+                                                fuse_alpha,
+                                                fuse_beta,
+                                                fused_output_scale,
+                                                scale_x,
+                                                scale_y,
+                                                scale_in_eltwise,
+                                                scale_out,
+                                                force_fp32_output,
+                                                out);
+  } else if (fuse_relu) {
+    ExecuteFusedMatmul<T, uint8_t>(dev_ctx,
+                                   x,
+                                   y,
+                                   residual_data.get_ptr(),
+                                   x_bd_dims,
+                                   y_bd_dims,
+                                   transpose_x,
+                                   transpose_y,
+                                   matmul_alpha,
+                                   x_strides_override,
+                                   y_strides_override,
+                                   is_output_fused,
+                                   fused_transpose_Out,
+                                   fuse_activation,
+                                   fuse_alpha,
+                                   fuse_beta,
+                                   fused_output_scale,
+                                   scale_x,
+                                   scale_y,
+                                   scale_in_eltwise,
+                                   scale_out,
+                                   force_fp32_output,
+                                   out);
+  } else {
+    ExecuteFusedMatmul<T, int8_t>(dev_ctx,
+                                  x,
+                                  y,
+                                  residual_data.get_ptr(),
+                                  x_bd_dims,
+                                  y_bd_dims,
+                                  transpose_x,
+                                  transpose_y,
+                                  matmul_alpha,
+                                  x_strides_override,
+                                  y_strides_override,
+                                  is_output_fused,
+                                  fused_transpose_Out,
+                                  fuse_activation,
+                                  fuse_alpha,
+                                  fuse_beta,
+                                  fused_output_scale,
+                                  scale_x,
+                                  scale_y,
+                                  scale_in_eltwise,
+                                  scale_out,
+                                  force_fp32_output,
+                                  out);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(fused_matmul,
+                   OneDNN,
+                   ONEDNN,
+                   phi::FusedMatmulKernel,
+                   float,
+                   phi::dtype::bfloat16,
+                   int8_t,
+                   uint8_t) {}
diff --git a/paddle/phi/kernels/onednn/matmul_kernel.cc b/paddle/phi/kernels/onednn/matmul_kernel.cc
index c820e738f09348b7d207dbd81e33fcb40b615d98..8f9baec36686ed68245d0a34d837821322526e9f 100644
--- a/paddle/phi/kernels/onednn/matmul_kernel.cc
+++ b/paddle/phi/kernels/onednn/matmul_kernel.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -51,8 +51,7 @@ void CalculateMatrixDims(const std::vector<int64_t> &x_dims,
                          const std::vector<int64_t> &y_dims,
                          std::vector<int64_t> *x_bd_dims,
                          std::vector<int64_t> *y_bd_dims,
-                         DenseTensor *out,
-                         const bool is_output_fused) {
+                         DenseTensor *out) {
   if (x_dims.size() == 1) {
     (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[0];
   } else if (x_dims.size() == 2) {
@@ -74,7 +73,7 @@ void CalculateMatrixDims(const std::vector<int64_t> &x_dims,
     }
   }
 
-  if (!is_output_fused && x_dims.size() > 2 && y_dims.size() > 2) {
+  if (x_dims.size() > 2 && y_dims.size() > 2) {
     auto out_dims = vectorize(out->dims());
     for (size_t i = 0; i < (*x_bd_dims).size() - 2; ++i) {
       PADDLE_ENFORCE_EQ(
@@ -121,15 +120,6 @@ void MatmulKernel(const Context &dev_ctx,
           ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("force_fp32_output"))
           : false;
 
-  bool fuse_relu = false;
-  if (dev_ctx.HasDnnAttr("fuse_activation")) {
-    auto act_type =
-        PADDLE_GET_CONST(std::string, dev_ctx.GetDnnAttr("fuse_activation"));
-    if (act_type == "relu" || act_type == "relu6") {
-      fuse_relu = true;
-    }
-  }
-
   auto x_dims = vectorize(GetDimsForInput(dev_ctx, x.dims(), "X"));
   auto y_dims = vectorize(GetDimsForInput(dev_ctx, y.dims(), "Y"));
 
@@ -139,12 +129,7 @@ void MatmulKernel(const Context &dev_ctx,
   std::vector<int64_t> x_bd_dims(ndims, 1);
   std::vector<int64_t> y_bd_dims(ndims, 1);
 
-  CalculateMatrixDims(x_dims,
-                      y_dims,
-                      &x_bd_dims,
-                      &y_bd_dims,
-                      out,
-                      funcs::IsOutputFused(dev_ctx));
+  CalculateMatrixDims(x_dims, y_dims, &x_bd_dims, &y_bd_dims, out);
 
   if (force_fp32_output || ((!is_int8) && (!is_bfloat16))) {
     funcs::ExecuteMatmul<T, float>(
@@ -152,9 +137,6 @@ void MatmulKernel(const Context &dev_ctx,
   } else if (is_bfloat16) {
     funcs::ExecuteMatmul<T, paddle::platform::bfloat16>(
         dev_ctx, x, y, x_bd_dims, y_bd_dims, transpose_x, transpose_y, out);
-  } else if (fuse_relu) {
-    funcs::ExecuteMatmul<T, uint8_t>(
-        dev_ctx, x, y, x_bd_dims, y_bd_dims, transpose_x, transpose_y, out);
   } else {
     funcs::ExecuteMatmul<T, int8_t>(
         dev_ctx, x, y, x_bd_dims, y_bd_dims, transpose_x, transpose_y, out);
diff --git a/paddle/phi/ops/compat/fused_matmul_sig.cc b/paddle/phi/ops/compat/fused_matmul_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..18e3eb52b803c342366cda57a79d85dc8303a1c4
--- /dev/null
+++ b/paddle/phi/ops/compat/fused_matmul_sig.cc
@@ -0,0 +1,47 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature FusedMatmulOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("fused_matmul",
+                         {"X", "Y", "ResidualData"},
+                         {"trans_x",
+                          "trans_y",
+                          "matmul_alpha",
+                          "fuse_activation",
+                          "fuse_alpha",
+                          "fuse_beta",
+                          "fused_output_scale",
+                          "fused_reshape_X",
+                          "fused_transpose_X",
+                          "fused_reshape_Y",
+                          "fused_transpose_Y",
+                          "fused_reshape_Out",
+                          "fused_transpose_Out",
+                          "mkldnn_data_type",
+                          "Scale_x",
+                          "Scale_y",
+                          "Scale_in_eltwise",
+                          "Scale_out",
+                          "force_fp32_output"},
+                         {"Out"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(fused_matmul, phi::FusedMatmulOpArgumentMapping);
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py
index e4f38b45fa24adfd4614afc9658ca25b288dc37c..1ef1cb9d2af3792894e87d8b17a8cb8ad1c2caef 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -103,12 +103,6 @@ class TestMapMatmulToMulPass(PassAutoScanTest):
             alpha=alpha,
             trans_x=transpose_X,
             trans_y=transpose_Y,
-            fused_reshape_Out=[],
-            fused_transpose_Out=[],
-            fused_reshape_X=[],
-            fused_reshape_Y=[],
-            fused_transpose_X=[],
-            fused_transpose_Y=[],
         )
 
         ops = [
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py
index efd12c856592f1a5c65d0d506e6c114ed2f66cdb..129103d1bc6aa2ca479c3be264678fff4f20b7e9 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -92,12 +92,6 @@ class TestMapMatmulToMulPass(PassAutoScanTest):
             alpha=alpha,
             trans_x=transpose_X,
             trans_y=transpose_Y,
-            fused_reshape_Out=[],
-            fused_transpose_Out=[],
-            fused_reshape_X=[],
-            fused_reshape_Y=[],
-            fused_transpose_X=[],
-            fused_transpose_Y=[],
         )
 
         ops = [
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py
index 1bfb629cf965cccc64a9859c1033b0c4dbc8f100..dee099954626b5b0188045a3c933466529008290 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -76,12 +76,6 @@ class TestMatmulV2ScaleFusePass(PassAutoScanTest):
             outputs={"Out": ["matmul_out"]},
             trans_x=transpose_X,
             trans_y=transpose_Y,
-            fused_reshape_X=[],
-            fused_reshape_Y=[],
-            fused_transpose_X=[],
-            fused_transpose_Y=[],
-            fused_reshape_Out=[],
-            fused_transpose_Out=[],
         )
         is_scale_tensor = draw(st.booleans())
         if is_scale_tensor:
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py
index 964aad16b971107ca92a1d41355263c1b7030a60..2b64a6be86f74099eeaf76fa81f4ba7182634273 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the 'License');
 # you may not use this file except in compliance with the License.
@@ -146,7 +146,7 @@ class TestMatmulActivationMkldnnFusePass(PassAutoScanTest):
                 'operator_scale_onednn_fuse_pass',
             ],
         )
-        yield config, ['matmul'], (1e-5, 1e-5)
+        yield config, ['fused_matmul'], (1e-5, 1e-5)
 
     def test(self):
         self.run_and_statis(
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py
index 0e0c542be632c3546640d885bb165579af7c756d..3d99e057d79217a59caa9ce92abeaa891d747324 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the 'License');
 # you may not use this file except in compliance with the License.
@@ -137,7 +137,7 @@ class TestMatmulElementwiseAddActivationMkldnnFusePass(PassAutoScanTest):
                 'matmul_activation_mkldnn_fuse_pass',
             ],
         )
-        yield config, ['matmul'], (1e-5, 1e-5)
+        yield config, ['fused_matmul'], (1e-5, 1e-5)
 
     def test(self):
         self.run_and_statis(
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py
index b359d4a4c93c43056550e7cd0f0654502451c4ee..c84c9f02ce0378062540399902fa8656c5a9ac4b 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the 'License');
 # you may not use this file except in compliance with the License.
@@ -76,7 +76,7 @@ class TestMatmulElementwiseAddMkldnnFusePass(PassAutoScanTest):
         config = self.create_inference_config(
             use_mkldnn=True, passes=['matmul_elementwise_add_mkldnn_fuse_pass']
         )
-        yield config, ['matmul'], (1e-5, 1e-5)
+        yield config, ['fused_matmul'], (1e-5, 1e-5)
 
     def test(self):
         self.run_and_statis(
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py
index 3694041af10a85698c1de22dfe3af284308634a2..0b643b9061d04e1ab23ae4bcdec3222bb8d93051 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -150,7 +150,7 @@ class TestMatmulv2ActivationMkldnnFusePass(PassAutoScanTest):
                 'operator_scale_onednn_fuse_pass',
             ],
         )
-        yield config, ['matmul_v2'], (1e-5, 1e-5)
+        yield config, ['fused_matmul'], (1e-5, 1e-5)
 
     def test(self):
         self.run_and_statis(
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py
index f81a0cce52d29f471613785dcdddf131462acd56..e667c10fe6a0359cde7b40379531f2ac3993a721 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the 'License');
 # you may not use this file except in compliance with the License.
@@ -87,7 +87,7 @@ class TestMatmulV2ElementwiseAddMkldnnFusePass(PassAutoScanTest):
 
     def sample_predictor_configs(self, program_config):
         config = self.create_inference_config(use_mkldnn=True)
-        yield config, ['matmul_v2'], (1e-5, 1e-5)
+        yield config, ['fused_matmul'], (1e-5, 1e-5)
 
     def test(self):
         self.run_and_statis(
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py
index 2d368433edc3cafa3a8b661f633493bb44ab0af5..45b17d59aeba5f676d216ad39610eae91a17bce0 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -90,12 +90,6 @@ class TestMatmulv2TransposeReshapeMkldnnFusePass(PassAutoScanTest):
             attrs={
                 "trans_x": transpose_X,
                 "trans_y": transpose_Y,
-                "fused_reshape_X": [],
-                "fused_reshape_Y": [],
-                "fused_transpose_X": [],
-                "fused_transpose_Y": [],
-                "fused_reshape_Out": [],
-                "fused_transpose_Out": [],
             },
         )
 
@@ -135,17 +129,8 @@ class TestMatmulv2TransposeReshapeMkldnnFusePass(PassAutoScanTest):
         return program_config
 
     def sample_predictor_configs(self, program_config):
-        # gpu_cpu_map_matmul_v2_to_matmul_pass will affect the type of final fused op
-        fused_op = "matmul_v2"
-        input1_dim1 = program_config.inputs["input_data1"].shape[0]
-        input2_dim1 = program_config.inputs["input_data2"].shape[0]
-        input1_dim2 = program_config.inputs["input_data1"].shape[1]
-        input2_dim2 = program_config.inputs["input_data2"].shape[1]
-        if input1_dim1 == input2_dim1 and input1_dim2 == input2_dim2:
-            fused_op = "matmul"
-
         config = self.create_inference_config(use_mkldnn=True)
-        yield config, [fused_op], (1e-5, 1e-5)
+        yield config, ['fused_matmul'], (1e-5, 1e-5)
 
     def test(self):
         self.run_and_statis(
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py
index 2fd966710b488fba9c9bb7fd66e37dfb9eb47cbe..9a72e806b322682650eb353febdc7ca430e6a2e5 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -93,12 +93,6 @@ class TestMkldnnMatmulv2Op(MkldnnAutoScanTest):
             attrs={
                 "trans_x": kwargs["transpose_X"],
                 "trans_y": kwargs["transpose_Y"],
-                "fused_reshape_X": [],
-                "fused_reshape_Y": [],
-                "fused_transpose_X": [],
-                "fused_transpose_Y": [],
-                "fused_reshape_Out": [],
-                "fused_transpose_Out": [],
             },
         )
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py
index 85cdfd314a7cf456fc938b8d602fa748490080fc..d7ad3f64162b30298081d3a1e8b98e3ed7d01546 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -116,7 +116,7 @@ class TestOneDNNMatmulTransposeReshapeFusePass(PassAutoScanTest):
 
     def sample_predictor_configs(self, program_config):
         config = self.create_inference_config(use_mkldnn=True)
-        yield config, ['matmul'], (1e-5, 1e-5)
+        yield config, ['fused_matmul'], (1e-5, 1e-5)
 
     def test(self):
         self.run_and_statis(
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py
index 2f9051fe16b5c34546a1eb35e9b85ab725918d8c..ef5098b00704e2fb059a29b6bb241f3af87265cf 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -154,7 +154,7 @@ class TestOneDNNReshapeTransposeMatmulFusePass(PassAutoScanTest):
 
     def sample_predictor_configs(self, program_config):
         config = self.create_inference_config(use_mkldnn=True)
-        yield config, ['matmul'], (1e-5, 1e-5)
+        yield config, ['fused_matmul'], (1e-5, 1e-5)
 
     def test(self):
         self.run_and_statis(