From 92d8d0bc757d520bd1f9f5876b508a8e2154df6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?=
Date: Thu, 14 Apr 2022 12:41:50 +0200
Subject: [PATCH] FC+elementwise_add (residual connection) (#41776)

* Change tensor name to match activation

* declare fc_eltwise_add pass

* merge conv_eltwise refactor PR

* first compilable draft

* unittest feedback tools

* Fuse pass tester

* Move IsReachable() to shared file

* 100% coverage of fuse_pass_tester.cc

* register pass

* Add bias node

* Improve unit tests / remove bias node from pattern

* improve fc_eltwiseadd_unittest

* cancel eltwise_add fuse if act is already fused

* Add elementwise_input scale

* Residual MVP

* Add new FC attrs

* Add more test cases

* Add missing op attrs

* Adapt code to new Elementwise pattern

* reuse existing fcpattern

* improve code style

* remove unused arguments

* fix typo

* remove whitespace

* remove int8 related code

* Remove attributes from base ops

* style

* style check

* Remove input from base op

* Set attribute during fuse

* ut timeout

* download and test model

* DRY

* apply feedback from review

* Style check

* fix typo

* cosmetic changes

* explicitly set residual as output

* VIT-OCR accuracy check

* trigger CI

* remove whitespaces

* fix missing data file
---
 paddle/fluid/framework/ir/CMakeLists.txt      |   2 +
 .../fc_elementwise_add_mkldnn_fuse_pass.cc    | 144 +++++++++++++
 .../fc_elementwise_add_mkldnn_fuse_pass.h     |  48 +++++
 ...elementwise_add_mkldnn_fuse_pass_tester.cc | 202 ++++++++++++++++++
 .../inference/api/paddle_pass_builder.cc      |   8 +
 .../fluid/inference/api/paddle_pass_builder.h |   4 +
 .../fluid/inference/tests/api/CMakeLists.txt  |  13 ++
 .../tests/api/analyzer_bert_tester.cc         |   1 +
 .../tests/api/analyzer_vit_ocr_tester.cc      | 117 ++++++++++
 paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc |  41 +++-
 .../unittests/ir/inference/CMakeLists.txt     |   1 +
 ...est_mkldnn_fc_elementwise_add_fuse_pass.py | 101 +++++++++
 12 files changed, 671 insertions(+), 11 deletions(-)
 create mode 100644 paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc
 create mode 100644 paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h
 create mode 100644 paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc
 create mode 100644 paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_elementwise_add_fuse_pass.py

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index e8696a3c227..207ee713bf4 100755
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -122,6 +122,7 @@ if(WITH_MKLDNN)
   pass_library(conv_activation_mkldnn_fuse_pass inference DIR mkldnn)
   pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn)
   pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
+  pass_library(fc_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
   pass_library(scale_matmul_fuse_pass inference DIR mkldnn)
   pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn)
   pass_library(cpu_bfloat16_pass inference DIR mkldnn)
@@ -208,6 +209,7 @@ if (WITH_MKLDNN)
   cc_test(test_conv_activation_mkldnn_fuse_pass SRCS mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc DEPS conv_activation_mkldnn_fuse_pass)
   cc_test(test_conv_concat_relu_mkldnn_fuse_pass SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc DEPS conv_concat_relu_mkldnn_fuse_pass)
   cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util)
+  cc_test(test_fc_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS fc_elementwise_add_mkldnn_fuse_pass pass_test_util)
   cc_test(test_fc_act_mkldnn_fuse_pass SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc DEPS fc_act_mkldnn_fuse_pass pass_test_util)
   cc_test(test_batch_norm_act_fuse_pass SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc DEPS batch_norm_act_fuse_pass pass_test_util)
   set(TEST_CONV_BN_PASS_DEPS conv_bn_fuse_pass graph_to_program_pass conv_op conv_transpose_op math_function im2col vol2col batch_norm_op gelu_op activation_op elementwise_add_op concat_and_split naive_executor device_context eigen_function)
diff --git a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc
new file mode 100644
index 00000000000..2e62597f2ee
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc
@@ -0,0 +1,144 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+FCResidualConnectionMKLDNNFusePass::FCResidualConnectionMKLDNNFusePass() {
+  AddOpCompat(OpCompat("fc"))
+      .AddInput("Input")
+      .IsTensor()
+      .End()
+      .AddInput("W")
+      .IsTensor()
+      .End()
+      .AddInput("Bias")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("in_num_col_dims")
+      .IsNumGE(1)
+      .End();
+
+  AddOpCompat(OpCompat("elementwise_add"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Y")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("axis")
+      .IsIntIn({-1, 0, 1})
+      .End();
+}
+
+GraphWithStats FCResidualConnectionMKLDNNFusePass::FuseFC(
+    const std::string& name_scope, const GraphWithStats& graph_with_stats,
+    bool fc_as_x) const {
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+  patterns::FCMKLDNN fc_pattern{pattern, name_scope};
+  bool fc_has_bias = true;
+  auto fc_output = fc_pattern(
+      gpd.mutable_pattern()->NewNode("fc")->AsInput()->assert_is_op_input(
+          "fc", "Input"),
+      fc_has_bias);
+
+  patterns::ResidualElementwise elementwise_pattern{pattern, name_scope,
+                                                    fc_as_x};
+  elementwise_pattern(
+      fc_output, pattern->NewNode(elementwise_pattern.residual_data_repr()),
+      "elementwise_add", fc_as_x);
+  fc_output->AsIntermediate();
+
+  int found_fc_count = 0;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(fc_op, fc, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(fc_input, input, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(fc_weights, weights, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(fc_output, output, fc_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op,
+                              elementwise_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(residual_data, residual_data,
+                              elementwise_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out,
+                              elementwise_pattern);
+
+    if (FindFuseOption(*fc_op, *elementwise_op) != FUSE_MKLDNN) return;
+    if (!IsReachable(g, residual_data, fc_output)) return;
+    if (HasFusedActivation(fc_op)) return;
+
+    if (!IsCompat(subgraph, g)) {
+      LOG(WARNING)
+          << "op compat for fc_elementwise_add_mkldnn_fuse_pass failed.";
+      return;
+    }
+
+    fc_op->Op()->SetOutput("ResidualData", {residual_data->Name()});
+    fc_op->Op()->SetOutput("Out", {elementwise_out->Name()});
+    fc_op->Op()->SetAttr("fuse_residual_connection", true);
+
+    GraphSafeRemoveNodes(g, {fc_output, elementwise_op});
+
+    IR_NODE_LINK_TO(residual_data, fc_op);
+    IR_NODE_LINK_TO(fc_op, elementwise_out);
+
+    found_fc_count++;
+  };
+
+  gpd(graph_with_stats.first, handler);
+  if (!Has("disable_logs") || !Get<bool>("disable_logs")) {
+    std::stringstream msg_ss;
+    std::string fusionMode = fc_as_x ? "x" : "y";
+    msg_ss << "---    Fused " << found_fc_count << " fc (as " << fusionMode
+           << ") + elementwise_add patterns";
+    paddle::string::PrettyLogDetail(msg_ss.str().c_str());
+  }
+
+  return std::make_pair(graph_with_stats.first,
+                        found_fc_count + graph_with_stats.second);
+}
+
+void FCResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const {
+  FusePassBase::Init(name_scope_, graph);
+  auto graph_with_stats = FuseFC(name_scope_, std::make_pair(graph, 0), true);
+  graph_with_stats = FuseFC(name_scope_, graph_with_stats, false);
+
+  AddStatis(graph_with_stats.second);
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(fc_elementwise_add_mkldnn_fuse_pass,
+              paddle::framework::ir::FCResidualConnectionMKLDNNFusePass);
+REGISTER_PASS_CAPABILITY(fc_elementwise_add_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .LE("fc", 0)
+            .LE("elementwise_add", 1));
diff --git a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h
new file mode 100644
index 00000000000..f92ce5bfc70
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+using GraphWithStats = std::pair<ir::Graph*, int>;
+
+class FCResidualConnectionMKLDNNFusePass : public FusePassBase {
+ private:
+  GraphWithStats FuseFC(const std::string& name_scope,
+                        const GraphWithStats& graph_with_stats,
+                        bool fc_as_x) const;
+
+ public:
+  FCResidualConnectionMKLDNNFusePass();
+  virtual ~FCResidualConnectionMKLDNNFusePass() {}
+
+ protected:
+  void ApplyImpl(ir::Graph* graph) const;
+
+  static bool HasFusedActivation(Node* fc_node) {
+    return !(
+        fc_node->Op()->GetAttrIfExists<std::string>("activation_type").empty());
+  }
+
+  const std::string name_scope_{"fc_elementwise_add_mkldnn_fuse"};
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc
new file mode 100644
index 00000000000..d2d27be3fce
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc
@@ -0,0 +1,202 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/ir/pass_test_util.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// The elementwise_add node and the FC output variable node are deleted,
+// and the original FC node is replaced by its fused version, so each
+// applied fusion shrinks the graph by 2 vertices overall.
+constexpr int nodes_removed = 3;
+constexpr int nodes_added = 1;
+
+OpDesc* Create_Op_FC(ProgramDesc* prog,
+                     const std::vector<test::InOutVarNamePair>& inputs,
+                     const std::vector<test::InOutVarNamePair>& outputs) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType("fc");
+  op->SetAttr("use_mkldnn", true);
+  op->SetAttr("in_num_col_dims", 1);
+
+  for (const auto& input : inputs) {
+    op->SetInput(input.first, {input.second});
+  }
+  for (const auto& output : outputs) {
+    op->SetOutput(output.first, {output.second});
+  }
+
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
+  return op;
+}
+
+OpDesc* Create_Op_elementwise_add(
+    ProgramDesc* prog, const std::vector<test::InOutVarNamePair>& inputs,
+    const std::vector<test::InOutVarNamePair>& outputs,
+    bool use_mkldnn = true) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType("elementwise_add");
+  op->SetAttr("use_mkldnn", use_mkldnn);
+  op->SetAttr("axis", -1);
+
+  for (const auto& input : inputs) {
+    op->SetInput(input.first, {input.second});
+  }
+  for (const auto& output : outputs) {
+    op->SetOutput(output.first, {output.second});
+  }
+
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
+  return op;
+}
+
+TEST(FCElementwiseAddMKLDNNFusePass, FCBiasAsY) {
+  auto prog =
+      test::BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});
+
+  test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}});
+  Create_Op_FC(&prog, {{"Input", "b"}, {"Bias", "bias"}, {"W", "weights"}},
+               {{"Out", "c"}});
+  Create_Op_elementwise_add(&prog, {{"X", "a"}, {"Y", "c"}}, {{"Out", "d"}});
+  test::CreateOp(&prog, "relu", {{"X", "d"}}, {{"Out", "e"}});
+
+  Graph graph(prog);
+
+  EXPECT_TRUE(test::RunPassAndAssert(&graph,
+                                     "fc_elementwise_add_mkldnn_fuse_pass", "a",
+                                     "e", nodes_removed, nodes_added));
+  EXPECT_TRUE(test::AssertOpsCount(graph, {{"fc", 1}, {"elementwise_add", 0}}));
+}
+
+TEST(FCElementwiseAddMKLDNNFusePass, FCBiasAsX) {
+  auto prog =
+      test::BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});
+
+  test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}});
+  Create_Op_FC(&prog, {{"Input", "b"}, {"Bias", "bias"}, {"W", "weights"}},
+               {{"Out", "c"}});
+
+  Create_Op_elementwise_add(&prog, {{"X", "c"}, {"Y", "a"}}, {{"Out", "d"}});
+  test::CreateOp(&prog, "relu", {{"X", "d"}}, {{"Out", "e"}});
+
+  Graph graph(prog);
+
+  EXPECT_TRUE(test::RunPassAndAssert(&graph,
+                                     "fc_elementwise_add_mkldnn_fuse_pass", "a",
+                                     "e", nodes_removed, nodes_added));
+  EXPECT_TRUE(test::AssertOpsCount(graph, {{"fc", 1}, {"elementwise_add", 0}}));
+}
+
+TEST(FCElementwiseAddMKLDNNFusePass, NoFusion_NotResidualConnection) {
+  auto prog = test::BuildProgramDesc({"a", "b", "c", "d", "e", "f", "g"},
+                                     {"bias", "weights", "bias2", "weights2"});
+
+  test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}});
+  Create_Op_FC(&prog, {{"Input", "b"}, {"Bias", "bias"}, {"W", "weights"}},
+               {{"Out", "c"}});
+
+  Create_Op_FC(&prog, {{"Input", "d"}, {"Bias", "bias2"}, {"W", "weights2"}},
+               {{"Out", "e"}});
+
+  Create_Op_elementwise_add(&prog, {{"X", "c"}, {"Y", "e"}}, {{"Out", "f"}});
+  test::CreateOp(&prog, "relu", {{"X", "f"}}, {{"Out", "g"}});
+
+  Graph graph(prog);
+
+  EXPECT_TRUE(test::RunPassAndAssert(
+      &graph, "fc_elementwise_add_mkldnn_fuse_pass", "a", "g", 0, 0));
+  EXPECT_TRUE(test::AssertOpsCount(graph, {{"fc", 2}, {"elementwise_add", 1}}));
+}
+
+TEST(FCElementwiseAddMKLDNNFusePass, FC_Residual_VITOCR) {
+  auto prog = test::BuildProgramDesc(
+      {"a", "b", "c", "d", "e", "f", "g", "h", "i"},
+      {"ln_bias", "ln_scale", "bias", "weights", "bias2", "weights2"});
+
+  Create_Op_elementwise_add(&prog, {{"X", "a"}, {"Y", "b"}}, {{"Out", "c"}});
+
+  test::CreateOp(&prog, "layer_norm",
+                 {{"X", "c"}, {"Bias", "ln_bias"}, {"Scale", "ln_scale"}},
+                 {{"Y", "d"}});
+  Create_Op_FC(&prog, {{"Input", "d"}, {"Bias", "bias"}, {"W", "weights"}},
+               {{"Out", "e"}});
+  test::CreateOp(&prog, "gelu", {{"X", "e"}}, {{"Out", "f"}});
+  Create_Op_FC(&prog, {{"Input", "f"}, {"Bias", "bias2"}, {"W", "weights2"}},
+               {{"Out", "g"}});
+  Create_Op_elementwise_add(&prog, {{"X", "g"}, {"Y", "c"}}, {{"Out", "h"}});
+  test::CreateOp(&prog, "relu", {{"X", "h"}}, {{"Out", "i"}});
+
+  Graph graph(prog);
+
+  EXPECT_TRUE(test::RunPassAndAssert(&graph,
+                                     "fc_elementwise_add_mkldnn_fuse_pass", "a",
+                                     "i", nodes_removed, nodes_added));
+  EXPECT_TRUE(test::AssertOpsCount(graph, {{"fc", 2}, {"elementwise_add", 1}}));
+}
+
+TEST(FCElementwiseAddMKLDNNFusePass, FC_Residual_Sequence) {
+  auto prog = test::BuildProgramDesc(
+      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"},
+      {"ln_bias", "ln_scale", "bias", "weights", "bias2", "weights2",
+       "ln_bias2", "ln_scale2", "bias3", "weights3", "bias4", "weights4"});
+
+  Create_Op_elementwise_add(&prog, {{"X", "a"}, {"Y", "b"}}, {{"Out", "c"}});
+
+  test::CreateOp(&prog, "layer_norm",
+                 {{"X", "c"}, {"Bias", "ln_bias"}, {"Scale", "ln_scale"}},
+                 {{"Y", "d"}});
+  Create_Op_FC(&prog, {{"Input", "d"}, {"Bias", "bias"}, {"W", "weights"}},
+               {{"Out", "e"}});
+  test::CreateOp(&prog, "gelu", {{"X", "e"}}, {{"Out", "f"}});
+  Create_Op_FC(&prog, {{"Input", "f"}, {"Bias", "bias2"}, {"W", "weights2"}},
+               {{"Out", "g"}});
+  Create_Op_elementwise_add(&prog, {{"X", "g"}, {"Y", "c"}}, {{"Out", "h"}});
+  test::CreateOp(&prog, "layer_norm",
+                 {{"X", "h"}, {"Bias", "ln_bias2"}, {"Scale", "ln_scale2"}},
+                 {{"Y", "i"}});
+  Create_Op_FC(&prog, {{"Input", "i"}, {"Bias", "bias3"}, {"W", "weights3"}},
+               {{"Out", "j"}});
+  test::CreateOp(&prog, "gelu", {{"X", "j"}}, {{"Out", "k"}});
+  Create_Op_FC(&prog, {{"Input", "k"}, {"Bias", "bias4"}, {"W", "weights4"}},
+               {{"Out", "l"}});
+  Create_Op_elementwise_add(&prog, {{"X", "h"}, {"Y", "l"}}, {{"Out", "m"}});
+
+  Graph graph(prog);
+
+  EXPECT_TRUE(test::RunPassAndAssert(&graph,
+                                     "fc_elementwise_add_mkldnn_fuse_pass", "a",
+                                     "m", nodes_removed * 2, nodes_added * 2));
+  EXPECT_TRUE(test::AssertOpsCount(graph, {{"fc", 4}, {"elementwise_add", 1}}));
+}
+
+TEST(FCElementwiseAddMKLDNNFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("fc_elementwise_add_mkldnn_fuse_pass"));
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(fc_elementwise_add_mkldnn_fuse_pass);
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index ce733c53059..01988d5f539 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -20,6 +20,7 @@
 #include <cudnn.h>
 #endif
 #include <glog/logging.h>
+#include <algorithm>
 #include <sstream>
 
 namespace paddle {
@@ -60,6 +61,12 @@ void PaddlePassBuilder::DeletePass(const std::string &pass_type) {
   }
 }
 
+size_t PaddlePassBuilder::GetPassIndex(const std::string &pass_type) {
+  auto iter = std::find(std::begin(passes_), std::end(passes_), pass_type);
+  if (iter == std::end(passes_)) return -1;
+  return std::distance(std::begin(passes_), iter);
+}
+
 void PaddlePassBuilder::InsertPass(size_t idx, const std::string &pass_type) {
   passes_.insert(std::begin(passes_) + idx, pass_type);
 }
@@ -300,6 +307,7 @@ void CpuPassStrategy::EnableMKLDNN() {
           // Disabled due to topology-dependent speed-up
           // "fc_mkldnn_pass",
           // "fc_act_mkldnn_fuse_pass",
+          "fc_elementwise_add_mkldnn_fuse_pass",   //
           "batch_norm_act_fuse_pass",              //
           "softplus_activation_mkldnn_fuse_pass",  //
           "shuffle_channel_mkldnn_detect_pass",    //
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 231ee2cb1e8..db6bde62ddc 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -71,6 +71,10 @@ class PD_INFER_DECL PaddlePassBuilder {
   /// \param[in] idx the position to delete.
   void DeletePass(size_t idx);
 
+  /// \brief Get the index of a certain pass.
+  /// \param[in] pass_type the type of the pass to look up.
+  size_t GetPassIndex(const std::string &pass_type);
+
   /// \brief Delete all passes that has a certain type 'pass_type'.
   /// \param[in] pass_type the certain pass type to be deleted.
   void DeletePass(const std::string &pass_type);
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 06d1cd0814e..e9b8c0ce70f 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -345,6 +345,19 @@ inference_analysis_test(test_analyzer_transformer_profile SRCS analyzer_transfor
                         ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8
                         --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
 
+# VIT-OCR
+set(VIT_OCR_URL "https://paddle-qa.bj.bcebos.com/inference_model/2.1.1/ocr")
+set(VIT_OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/vit_ocr")
+if (NOT EXISTS ${VIT_OCR_INSTALL_DIR}/vit_ocr.tgz)
+  inference_download_and_uncompress_without_verify(${VIT_OCR_INSTALL_DIR} ${VIT_OCR_URL} vit_ocr.tgz)
+endif()
+if (NOT EXISTS ${VIT_OCR_INSTALL_DIR}/datavit.txt)
+  file(DOWNLOAD ${VIT_OCR_URL}/datavit.txt ${VIT_OCR_INSTALL_DIR}/datavit.txt)
+endif()
+inference_analysis_test(test_analyzer_vit_ocr SRCS analyzer_vit_ocr_tester.cc
+    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+    ARGS --infer_model=${VIT_OCR_INSTALL_DIR}/vit_ocr --infer_data=${VIT_OCR_INSTALL_DIR}/datavit.txt)
+
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR}/ocr.tar.gz)
diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
index 8f7e5100922..224bbaa7aab 100644
--- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
@@ -158,6 +158,7 @@ void profile(bool use_mkldnn = false) {
     config.EnableMKLDNN();
     config.pass_builder()->AppendPass("fc_mkldnn_pass");
     config.pass_builder()->AppendPass("fc_act_mkldnn_fuse_pass");
+    config.pass_builder()->AppendPass("fc_elementwise_add_mkldnn_fuse_pass");
   }
 
   std::vector<std::vector<PaddleTensor>> outputs;
diff --git a/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc
new file mode 100644
index 00000000000..029f2f0421d
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc
@@ -0,0 +1,117 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fstream>
+#include <iostream>
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+struct Record {
+  std::vector<float> data;
+  std::vector<int32_t> shape;
+};
+
+Record ProcessALine(const std::string &line) {
+  std::vector<std::string> columns;
+  split(line, '\t', &columns);
+  CHECK_EQ(columns.size(), 2UL)
+      << "data format error, should be <data>\t<shape>";
+
+  Record record;
+  std::vector<std::string> data_strs;
+  split(columns[0], ' ', &data_strs);
+  for (auto &d : data_strs) {
+    record.data.push_back(std::stof(d));
+  }
+
+  std::vector<std::string> shape_strs;
+  split(columns[1], ' ', &shape_strs);
+  for (auto &s : shape_strs) {
+    record.shape.push_back(std::stoi(s));
+  }
+
+  return record;
+}
+
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  std::string line;
+  std::ifstream file(FLAGS_infer_data);
+  std::getline(file, line);
+  auto record = ProcessALine(line);
+
+  PaddleTensor input;
+  input.shape = record.shape;
+  input.dtype = PaddleDType::FLOAT32;
+  size_t input_size = record.data.size() * sizeof(float);
+  input.data.Resize(input_size);
+  memcpy(input.data.data(), record.data.data(), input_size);
+  std::vector<PaddleTensor> input_slots;
+  input_slots.assign({input});
+  (*inputs).emplace_back(input_slots);
+}
+
+void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
+  cfg->SetModel(FLAGS_infer_model + "/inference.pdmodel",
+                FLAGS_infer_model + "/inference.pdiparams");
+
+  if (use_mkldnn) {
+    cfg->EnableMKLDNN();
+    cfg->SwitchIrOptim();
+
+    size_t insertingIndex = cfg->pass_builder()->GetPassIndex(
+        "fc_elementwise_add_mkldnn_fuse_pass");
+    cfg->pass_builder()->InsertPass(insertingIndex, "fc_act_mkldnn_fuse_pass");
+    cfg->pass_builder()->InsertPass(insertingIndex, "fc_mkldnn_pass");
+  }
+}
+
+// Compare results of NativeConfig and AnalysisConfig
+void compare(bool use_mkldnn = false) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg, use_mkldnn);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
+}
+
+TEST(Analyzer_vit_ocr, compare) { compare(); }
+
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_vit_ocr, compare_mkldnn) { compare(true /* use_mkldnn */); }
+#endif
+
+#ifdef PADDLE_WITH_MKLDNN
+// Check the fuse status
+TEST(Analyzer_vit_ocr, fuse_status) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg, true);
+  int num_ops;
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_status = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
+
+  CHECK_EQ(fuse_status.at("fc_mkldnn_pass"), 33);
+  CHECK_EQ(fuse_status.at("conv_activation_mkldnn_fuse"), 2);
+  CHECK_EQ(fuse_status.at("fc_elementwise_add_mkldnn_fuse"), 16);
+}
+#endif
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index 30db4b3be66..4078d012fce 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -410,19 +410,17 @@ class FCPrimitiveFactory {
       const ExecutionContext& ctx) {
     auto scale_in_data = ctx.Attr<float>("Scale_in");
     auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
+    bool has_activation = !ctx.Attr<std::string>("activation_type").empty();
+    bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
 
     // If the output will be in floats, we don't multiply by scale_out.
-    float activation_scale = 1.0f;
-    float inner_scale = 1.0f;
-    if (!ctx.Attr<bool>("force_fp32_output")) {
-      // if has activation use it's scale, otherwise use inner scale.
-      if (!ctx.Attr<std::string>("activation_type").empty()) {
-        activation_scale = ctx.Attr<float>("Scale_out");
-      } else {
-        inner_scale = ctx.Attr<float>("Scale_out");
-      }
-    }
+    float scale = (!force_fp32_output && has_activation)
+                      ? ctx.Attr<float>("Scale_out")
+                      : 1.0f;
+    float inner_scale = (force_fp32_output || has_activation)
+                            ? 1.0f
+                            : ctx.Attr<float>("Scale_out");
 
     const size_t weight_scales_num = scale_weights_data.size();
     std::vector<float> output_shift_scale(weight_scales_num);
 
@@ -435,7 +433,7 @@ class FCPrimitiveFactory {
           inner_scale / (scale_in_data * scale_weights_data[i]);
     }
 
-    return make_tuple(output_shift_scale, activation_scale);
+    return make_tuple(output_shift_scale, scale);
   }
 
   // Computing MKL-DNN's scaling mask which determines along which dimension
@@ -467,6 +465,12 @@ class FCPrimitiveFactory {
     std::tie(output_shift_scale, scale) = ComputeOutputShiftScale(ctx);
     int mask = CreateMask(1, output_shift_scale.size() > 1);
     attributes.set_output_scales(mask, output_shift_scale);
+    float sum_scale = 1.0f;
+
+    if (ctx.HasAttr("fuse_residual_connection") &&
+        ctx.Attr<bool>("fuse_residual_connection")) {
+      post_operations.append_sum(sum_scale);
+    }
 
     if (ctx.Attr<std::string>("activation_type") == "relu") {
       constexpr float negative_slope = 0.0f;
@@ -531,6 +535,21 @@ class FCPrimitiveFactory {
   dnnl::memory CreateDstMemory(
       const dnnl::inner_product_forward::primitive_desc& fc_prim_desc,
       const ExecutionContext& ctx, Tensor* output) {
+    if (ctx.HasAttr("fuse_residual_connection") &&
+        ctx.Attr<bool>("fuse_residual_connection")) {
+      auto* residual_param = ctx.Output<Tensor>("ResidualData");
+
+      PADDLE_ENFORCE_EQ(
+          output->dims(), residual_param->dims(),
+          platform::errors::InvalidArgument(
+              "Output and elementwise parameter need to have the "
+              "same dimension sizes, but got output's dimension = %d"
+              " and residual param's dimension = %d.",
+              output->dims().size(), residual_param->dims().size()));
+
+      output->ShareDataWith(*residual_param);
+    }
+
     auto dst_desc = fc_prim_desc.dst_desc();
     auto buffer_size = dst_desc.get_size();
     T_out* output_data =
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
index 808821f06cb..c23e2eaa154 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
@@ -141,5 +141,6 @@ if (WITH_MKLDNN)
     set_tests_properties(test_conv_eltwiseadd_bn_fuse_pass PROPERTIES TIMEOUT 300)
     set_tests_properties(test_mkldnn_conv_mish_fuse_pass PROPERTIES TIMEOUT 300)
    set_tests_properties(test_mkldnn_fc_mish_fuse_pass PROPERTIES TIMEOUT 300)
+    set_tests_properties(test_mkldnn_fc_elementwise_add_fuse_pass PROPERTIES TIMEOUT 120)
   endif()
 endif()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_elementwise_add_fuse_pass.py
new file mode 100644
index 00000000000..22b8960497b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_elementwise_add_fuse_pass.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from auto_scan_test import PassAutoScanTest, SkipReasons
+from program_config import TensorConfig, ProgramConfig, OpConfig
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+from typing import Optional, List, Callable, Dict, Any, Set
+import unittest
+
+import hypothesis
+from hypothesis import given, settings, seed, example, assume
+import hypothesis.strategies as st
+
+
+class TestFCElementwiseAddMkldnnFusePass(PassAutoScanTest):
+    def sample_program_config(self, draw):
+        axis = draw(st.sampled_from([-1, 0, 1]))
+        fc_as_x = draw(st.sampled_from([True, False]))
+        fc_in = draw(st.sampled_from([32, 64]))
+        fc_wei = draw(st.sampled_from([32, 64]))
+
+        def generate_input():
+            return np.random.random([fc_in, fc_wei]).astype(np.float32)
+
+        def generate_fc_weight():
+            return np.random.random([fc_wei, fc_wei]).astype(np.float32)
+
+        def generate_fc_bias():
+            return np.random.random([fc_wei]).astype(np.float32)
+
+        relu_op = OpConfig(
+            type="relu",
+            inputs={"X": ["input_data"]},
+            outputs={"Out": ["relu_out"]},
+            attrs={})
+
+        fc_op = OpConfig(
+            type="fc",
+            inputs={
+                "Input": ["relu_out"],
+                "W": ["fc_weight"],
+                "Bias": ["fc_bias"]
+            },
+            outputs={"Out": ["fc_output"]},
+            attrs={
+                "use_mkldnn": True,
+                "padding_weights": False,
+                "activation_type": "",
+                "in_num_col_dims": 1,
+            })
+
+        if fc_as_x:
+            inputs = {"X": ["fc_output"], "Y": ["input_data"]}
+        else:
+            inputs = {"X": ["input_data"], "Y": ["fc_output"]}
+
+        elt_add_op = OpConfig(
+            type="elementwise_add",
+            inputs=inputs,
+            outputs={"Out": ["elementwise_output"]},
+            attrs={'axis': axis})
+
+        model_net = [relu_op, fc_op, elt_add_op]
+
+        program_config = ProgramConfig(
+            ops=model_net,
+            weights={
+                "fc_weight": TensorConfig(data_gen=partial(generate_fc_weight)),
+                "fc_bias": TensorConfig(data_gen=partial(generate_fc_bias)),
+            },
+            inputs={
+                "input_data": TensorConfig(data_gen=partial(generate_input))
+            },
+            outputs=["elementwise_output"])
+
+        return program_config
+
+    def sample_predictor_configs(self, program_config):
+        config = self.create_inference_config(use_mkldnn=True)
+        yield config, ["relu", "fc"], (1e-5, 1e-5)
+
+    def test(self):
+        self.run_and_statis(
+            quant=False, passes=["fc_elementwise_add_mkldnn_fuse_pass"])
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab