Unverified · Commit bea1f04c · Authored by Xinyu Chen · Committed by GitHub

onednn: remove fc_elementwise_add fusion (#55504)

* onednn: remove fc+eltwiseadd fusion pass
* onednn: remove post-sum fusion in fc kernel
* onednn: tests: make unfused add run into f32
Parent 5b8f0637
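For context: the pass removed by this commit rewrote an fc op followed by elementwise_add into a single oneDNN fc that accumulated the residual input through a sum post-op. The plain-NumPy sketch below (tensor names are illustrative, not Paddle API) contrasts the previously fused computation with the two separate ops that remain after this change; per the last bullet above, the leftover elementwise_add now runs in f32 even when the fc itself is quantized.

# Editor's illustrative sketch only, not part of the diff below.
import numpy as np

x = np.random.rand(4, 8).astype(np.float32)           # fc "Input"
w = np.random.rand(8, 16).astype(np.float32)          # fc "W"
b = np.random.rand(16).astype(np.float32)             # fc "Bias"
residual = np.random.rand(4, 16).astype(np.float32)   # former "ResidualData"

# Fused path (before this commit): one fc primitive, residual accumulated
# via a sum post-op while writing the destination.
fused_out = x @ w + b + residual

# Unfused path (after this commit): fc and elementwise_add stay separate ops.
fc_out = x @ w + b
unfused_out = fc_out + residual

assert np.allclose(fused_out, unfused_out)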
@@ -174,7 +174,6 @@ if(WITH_MKLDNN)
pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(int8_scale_calculation_mkldnn_pass inference DIR mkldnn)
pass_library(params_quantization_mkldnn_pass inference DIR mkldnn)
pass_library(fc_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(scale_matmul_fuse_pass inference DIR mkldnn)
pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn)
pass_library(cpu_bfloat16_pass inference DIR mkldnn)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/utils/string/pretty_log.h"
namespace paddle {
namespace framework {
namespace ir {
FCResidualConnectionMKLDNNFusePass::FCResidualConnectionMKLDNNFusePass() {
AddOpCompat(OpCompat("fc"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("W")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("in_num_col_dims")
.IsNumGE(1)
.End();
AddOpCompat(OpCompat("elementwise_add"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("axis")
.IsIntIn({-1, 0, 1})
.End();
}
GraphWithStats FCResidualConnectionMKLDNNFusePass::FuseFC(
const std::string& name_scope,
const GraphWithStats& graph_with_stats,
bool fc_as_x) const {
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::FCMKLDNN fc_pattern{pattern, name_scope};
auto fc_output = fc_pattern(false /* with residual */);
patterns::ResidualElementwise elementwise_pattern{
pattern, name_scope, fc_as_x};
elementwise_pattern(
fc_output,
pattern->NewNode(elementwise_pattern.residual_data_repr()),
"elementwise_add",
fc_as_x);
fc_output->AsIntermediate();
int found_fc_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "Fuse fc + elementwise_add as residual";
GET_IR_NODE_FROM_SUBGRAPH(fc_op, fc, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_input, input, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_weights, weights, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_output, output, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(
elementwise_op, elementwise_op, elementwise_pattern);
GET_IR_NODE_FROM_SUBGRAPH(
residual_data, residual_data, elementwise_pattern);
GET_IR_NODE_FROM_SUBGRAPH(
elementwise_out, elementwise_out, elementwise_pattern);
if (FindFuseOption(*fc_op, *elementwise_op) != FUSE_MKLDNN) {
VLOG(4) << "Skipping fusion for " << fc_op->Name() << "(" << fc_op->id()
<< ") with " << elementwise_op->Name() << "("
<< elementwise_op->id()
<< ") because not both ops have use_mkldnn";
return;
}
if (!IsReachable(g, residual_data, fc_output)) {
VLOG(4) << "Skipping fusion for " << fc_op->Name() << "(" << fc_op->id()
<< ") with " << elementwise_op->Name() << "("
<< elementwise_op->id() << ") because residual input "
<< residual_data->Name() << "(" << residual_data->id()
<< ") is not "
"reachable";
return;
}
if (HasFusedActivation(fc_op)) {
VLOG(4) << "Skipping fusion for " << fc_op->Name() << "(" << fc_op->id()
<< ") with " << elementwise_op->Name() << "("
<< elementwise_op->id() << ") because fc has activation fused";
return;
}
if (!IsCompat(subgraph, g)) {
LOG(WARNING)
<< "op compat for fc_elementwise_add_mkldnn_fuse_pass failed.";
return;
}
fc_op->Op()->SetInput("ResidualData", {residual_data->Name()});
fc_op->Op()->SetOutput("Out", {elementwise_out->Name()});
fc_op->Op()->SetAttr("fuse_residual_connection", true);
GraphSafeRemoveNodes(g, {fc_output, elementwise_op});
IR_NODE_LINK_TO(residual_data, fc_op);
IR_NODE_LINK_TO(fc_op, elementwise_out);
found_fc_count++;
};
gpd(graph_with_stats.first, handler);
if ((!Has("disable_logs") || !Get<bool>("disable_logs")) &&
(found_fc_count > 0)) {
std::stringstream msg_ss;
std::string fusionMode = fc_as_x ? "x" : "y";
msg_ss << "--- Fused " << found_fc_count << " fc (as " << fusionMode
<< ") + elementwise_add patterns";
paddle::string::PrettyLogDetail(msg_ss.str().c_str());
}
return std::make_pair(graph_with_stats.first,
found_fc_count + graph_with_stats.second);
}
void FCResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const {
FusePassBase::Init(name_scope_, graph);
auto graph_with_stats = FuseFC(name_scope_, std::make_pair(graph, 0), true);
graph_with_stats = FuseFC(name_scope_, graph_with_stats, false);
AddStatis(graph_with_stats.second);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(fc_elementwise_add_mkldnn_fuse_pass,
paddle::framework::ir::FCResidualConnectionMKLDNNFusePass);
REGISTER_PASS_CAPABILITY(fc_elementwise_add_mkldnn_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.LE("fc", 0)
.LE("elementwise_add", 1));
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
using GraphWithStats = std::pair<ir::Graph*, int>;
class FCResidualConnectionMKLDNNFusePass : public FusePassBase {
private:
GraphWithStats FuseFC(const std::string& name_scope,
const GraphWithStats& graph_with_stats,
bool fc_as_x) const;
public:
FCResidualConnectionMKLDNNFusePass();
virtual ~FCResidualConnectionMKLDNNFusePass() {}
protected:
void ApplyImpl(ir::Graph* graph) const;
static bool HasFusedActivation(Node* fc_node) {
return !(
fc_node->Op()->GetAttrIfExists<std::string>("activation_type").empty());
}
const std::string name_scope_{"fc_elementwise_add_mkldnn_fuse"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
@@ -372,7 +372,6 @@ void CpuPassStrategy::EnableMKLDNN() {
// Disabled due to topology-dependent speed-up
"fc_mkldnn_pass",
"fc_act_mkldnn_fuse_pass",
"fc_elementwise_add_mkldnn_fuse_pass", //
"self_attention_fuse_pass", //
"batch_norm_act_fuse_pass", //
"softplus_activation_onednn_fuse_pass", //
@@ -407,7 +406,6 @@ void CpuPassStrategy::EnableMkldnnBfloat16() {
if (!use_mkldnn_bfloat16_) {
passes_.push_back("fc_mkldnn_pass");
passes_.push_back("fc_act_mkldnn_fuse_pass");
passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass");
passes_.push_back("cpu_bfloat16_placement_pass");
passes_.push_back("cpu_bfloat16_pass");
@@ -463,7 +461,6 @@ void CpuPassStrategy::EnableMkldnnInt8() {
passes_.push_back("repeated_fc_relu_fuse_pass");
passes_.push_back("fc_mkldnn_pass");
passes_.push_back("fc_act_mkldnn_fuse_pass");
passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass");
passes_.push_back("matmul_transpose_reshape_mkldnn_fuse_pass");
passes_.push_back("batch_norm_act_fuse_pass");
passes_.push_back("softplus_activation_onednn_fuse_pass");
@@ -498,9 +495,7 @@ void CpuPassStrategy::DisableMkldnnFcPasses() {
void CpuPassStrategy::EraseFcMkldnnPasses() {
std::vector<std::string> fc_passes_to_erase(
{"fc_mkldnn_pass",
"fc_act_mkldnn_fuse_pass",
"fc_elementwise_add_mkldnn_fuse_pass"});
{"fc_mkldnn_pass", "fc_act_mkldnn_fuse_pass"});
for (const auto &pass : fc_passes_to_erase) {
int idx = GetPassIndex(pass);
if (idx != -1) {
@@ -46,9 +46,6 @@ GetDNNLScales(const ExecutionContext& ctx) {
auto scale_in_data = ctx.Attr<float>("Scale_in");
auto scale_out = ctx.Attr<float>("Scale_out");
auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
auto scale_in_eltwise_data = ctx.HasAttr("Scale_in_eltwise")
? ctx.Attr<float>("Scale_in_eltwise")
: 1.0f;
std::vector<float> dnnl_src_scales = {1.f / scale_in_data};
size_t count = scale_weights_data.size();
@@ -57,7 +54,7 @@ GetDNNLScales(const ExecutionContext& ctx) {
for (size_t i = 0; i < count; i++) {
dnnl_wei_scales[i] = 1.f / scale_weights_data[i];
}
std::vector<float> dnnl_psum_scales = {1.f / scale_in_eltwise_data};
std::vector<float> dnnl_psum_scales = {1.f};
std::vector<float> dnnl_dst_scales = {1.f / scale_out};
return std::make_tuple(
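With the post-sum fusion gone, the only change to the scale plumbing above is that the post-sum scale no longer tracks Scale_in_eltwise and is pinned at 1.0. A small plain-Python sketch of the derivation (attribute names mirror the C++; this is not the Paddle API itself):

# Editor's sketch of the scale computation after this change.
def get_dnnl_scales(scale_in, scale_weights, scale_out):
    dnnl_src_scales = [1.0 / scale_in]
    dnnl_wei_scales = [1.0 / s for s in scale_weights]
    dnnl_psum_scales = [1.0]             # no Scale_in_eltwise attribute any more
    dnnl_dst_scales = [1.0 / scale_out]
    return dnnl_src_scales, dnnl_wei_scales, dnnl_psum_scales, dnnl_dst_scales

print(get_dnnl_scales(0.5, [0.25, 0.125], 2.0))
# ([2.0], [4.0, 8.0], [1.0], [0.5])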
@@ -127,7 +124,6 @@ class FCMKLDNNHandler
dnnl::primitive_attr attributes;
dnnl::post_ops post_operations;
float sum_scale = 1.0f;
float activation_scale = 1.0f;
if (phi::funcs::is_int8<T_w>()) {
std::vector<float> src_scales, wei_scales, psum_scales, dst_scales;
@@ -168,13 +164,6 @@ class FCMKLDNNHandler
dst_scales.data(),
dst_scales.size() * sizeof(float));
}
sum_scale = psum_scales[0];
}
if (ctx.HasAttr("fuse_residual_connection") &&
ctx.Attr<bool>("fuse_residual_connection")) {
post_operations.append_sum(sum_scale);
}
// ReLU from "fc_fuse_pass"
@@ -332,22 +321,6 @@ class FCMKLDNNHandler
std::shared_ptr<dnnl::memory> AcquireCustomDstMemory(
const ExecutionContext& ctx, phi::DenseTensor* out) {
if (ctx.HasAttr("fuse_residual_connection") &&
ctx.Attr<bool>("fuse_residual_connection")) {
auto* residual_param = ctx.Input<phi::DenseTensor>("ResidualData");
PADDLE_ENFORCE_EQ(
out->dims(),
residual_param->dims(),
phi::errors::InvalidArgument(
"Output and elementwise parameter need to have the "
"same dimension sizes, but got output's dimension = %d"
" and residual param's dimension =%d .",
out->dims().size(),
residual_param->dims().size()));
out->ShareDataWith(*residual_param);
}
return this->template AcquireDstMemory<T_out>(out);
}
@@ -458,11 +431,7 @@ class FCMKLDNNKernel : public framework::OpKernel<T_in> {
dst_memory_p =
std::make_shared<dnnl::memory>(inner_product_cache->dst_mem);
if (ctx.HasAttr("fuse_residual_connection") &&
ctx.Attr<bool>("fuse_residual_connection")) {
auto* residual_param = ctx.Input<phi::DenseTensor>("ResidualData");
out->ShareDataWith(*residual_param);
}
auto out_ptr = out->mutable_data<T_out>(
ctx.GetPlace(), dst_memory_p->get_desc().get_size());
dst_memory_p->set_data_handle(out_ptr);
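The removed ShareDataWith calls above existed because oneDNN's sum post-op accumulates into whatever already sits in the destination memory, so the kernel aliased its output buffer to ResidualData before executing. A NumPy illustration of that semantics (editor's sketch, not library code):

# Editor's sketch: dst = sum_scale * dst_prev + (x @ w + b), which is why the
# destination buffer had to start out holding the residual data.
import numpy as np

x = np.ones((2, 3), dtype=np.float32)
w = np.ones((3, 4), dtype=np.float32)
b = np.zeros(4, dtype=np.float32)
residual = np.full((2, 4), 10.0, dtype=np.float32)

dst = residual.copy()                   # "ShareDataWith": dst begins as the residual
sum_scale = 1.0
dst = sum_scale * dst + (x @ w + b)     # effect of the sum post-op
print(dst)                              # 13.0 everywhere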
@@ -370,9 +370,7 @@ TEST(AnalysisPredictor, mkldnn_fc_passes_cpu_pass_strategy) {
CpuPassStrategy cpuPassStrategy;
cpuPassStrategy.EnableMKLDNN();
const std::vector<std::string> fc_passes_to_erase(
{"fc_mkldnn_pass",
"fc_act_mkldnn_fuse_pass",
"fc_elementwise_add_mkldnn_fuse_pass"});
{"fc_mkldnn_pass", "fc_act_mkldnn_fuse_pass"});
for (const auto& pass : fc_passes_to_erase) {
ASSERT_NE(cpuPassStrategy.GetPassIndex(pass), (size_t)-1);
}
@@ -34,7 +34,7 @@ void SetInt8Config(AnalysisConfig *cfg,
pass_builder->DeletePass("constant_folding_pass");
auto warmup_data = std::make_shared<std::vector<PaddleTensor>>(data);
cfg->mkldnn_quantizer_config()->SetEnabledOpTypes(
{"elementwise_add", "matmul", "matmul_v2", "fused_matmul"});
{"matmul", "matmul_v2", "fused_matmul"});
// Exclusion of several matmuls that should not be quantized because they
// reduce the accuracy of the model
cfg->mkldnn_quantizer_config()->SetExcludedOpIds(
@@ -104,7 +104,6 @@ TEST(Analyzer_vit_ocr, fuse_status) {
CHECK_EQ(fuse_statis.at("fc_mkldnn_pass"), 33);
CHECK_EQ(fuse_statis.at("fused_conv2d_gelu_mkldnn_fuse_pass"), 2);
CHECK_EQ(fuse_statis.at("fc_elementwise_add_mkldnn_fuse"), 16);
}
#endif
@@ -301,8 +301,6 @@ if(WITH_GPU AND TENSORRT_FOUND)
set_tests_properties(test_mkldnn_conv_mish_fuse_pass PROPERTIES TIMEOUT 300)
set_tests_properties(test_onednn_fc_activation_fuse_pass PROPERTIES TIMEOUT
300)
set_tests_properties(test_onednn_fc_elementwise_add_fuse_pass
PROPERTIES TIMEOUT 120)
set_tests_properties(test_mkldnn_conv_affine_channel_fuse_pass
PROPERTIES TIMEOUT 60)
endif()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from functools import partial
import hypothesis.strategies as st
import numpy as np
from auto_scan_test import PassAutoScanTest
from program_config import OpConfig, ProgramConfig, TensorConfig
class TestFCElementwiseAddOneDNNFusePass(PassAutoScanTest):
def sample_program_config(self, draw):
axis = draw(st.sampled_from([-1, 0, 1]))
fc_as_x = draw(st.booleans())
fc_in = draw(st.sampled_from([32, 64]))
fc_wei = draw(st.sampled_from([32, 64]))
def generate_data(shape):
return np.random.random(shape).astype(np.float32)
relu_op = OpConfig(
type='relu',
inputs={'X': ['input_data']},
outputs={'Out': ['relu_out']},
attrs={},
)
fc_op = OpConfig(
type='fc',
inputs={
'Input': ['relu_out'],
'W': ['fc_weight'],
'Bias': ['fc_bias'],
},
outputs={'Out': ['fc_output']},
attrs={
'use_mkldnn': True,
'padding_weights': False,
'activation_type': '',
'in_num_col_dims': 1,
},
)
if fc_as_x:
inputs = {'X': ['fc_output'], 'Y': ['input_data']}
else:
inputs = {'X': ['input_data'], 'Y': ['fc_output']}
elt_add_op = OpConfig(
type='elementwise_add',
inputs=inputs,
outputs={'Out': ['elementwise_output']},
attrs={'axis': axis, 'use_mkldnn': True},
)
model_net = [relu_op, fc_op, elt_add_op]
program_config = ProgramConfig(
ops=model_net,
weights={
'fc_weight': TensorConfig(
data_gen=partial(generate_data, [fc_wei, fc_wei])
),
'fc_bias': TensorConfig(
data_gen=partial(generate_data, [fc_wei])
),
},
inputs={
'input_data': TensorConfig(
data_gen=partial(generate_data, [fc_in, fc_wei])
)
},
outputs=['elementwise_output'],
)
return program_config
def sample_predictor_configs(self, program_config):
config = self.create_inference_config(
use_mkldnn=True, passes=['fc_elementwise_add_mkldnn_fuse_pass']
)
yield config, ['relu', 'fc'], (1e-5, 1e-5)
def test(self):
self.run_and_statis(
quant=False, passes=['fc_elementwise_add_mkldnn_fuse_pass']
)
if __name__ == '__main__':
unittest.main()