Unverified · Commit bea1f04c · Authored by Xinyu Chen · Committed by GitHub

onednn: remove fc_elementwise_add fusion (#55504)

* onednn: remove fc+eltwiseadd fusion pass
* onednn: remove post-sum fusion in fc kernel
* onednn: tests: make unfused add run into f32
Parent 5b8f0637
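For context: the pass removed by this commit rewrote an fc op followed by elementwise_add into a single oneDNN fc that accumulated the residual input through a sum post-op. The plain-NumPy sketch below (tensor names are illustrative, not Paddle API) contrasts the previously fused computation with the two separate ops that remain after this change; per the last bullet above, the leftover elementwise_add now runs in f32 even when the fc itself is quantized.

# Editor's illustrative sketch only, not part of the diff below.
import numpy as np

x = np.random.rand(4, 8).astype(np.float32)           # fc "Input"
w = np.random.rand(8, 16).astype(np.float32)          # fc "W"
b = np.random.rand(16).astype(np.float32)             # fc "Bias"
residual = np.random.rand(4, 16).astype(np.float32)   # former "ResidualData"

# Fused path (before this commit): one fc primitive, residual accumulated
# via a sum post-op while writing the destination.
fused_out = x @ w + b + residual

# Unfused path (after this commit): fc and elementwise_add stay separate ops.
fc_out = x @ w + b
unfused_out = fc_out + residual

assert np.allclose(fused_out, unfused_out)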
@@ -174,7 +174,6 @@ if(WITH_MKLDNN)
pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(int8_scale_calculation_mkldnn_pass inference DIR mkldnn)
pass_library(params_quantization_mkldnn_pass inference DIR mkldnn)
pass_library(fc_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(scale_matmul_fuse_pass inference DIR mkldnn)
pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn)
pass_library(cpu_bfloat16_pass inference DIR mkldnn)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/utils/string/pretty_log.h"
namespace paddle {
namespace framework {
namespace ir {
FCResidualConnectionMKLDNNFusePass::FCResidualConnectionMKLDNNFusePass() {
AddOpCompat(OpCompat("fc"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("W")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("in_num_col_dims")
.IsNumGE(1)
.End();
AddOpCompat(OpCompat("elementwise_add"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("axis")
.IsIntIn({-1, 0, 1})
.End();
}
GraphWithStats FCResidualConnectionMKLDNNFusePass::FuseFC(
const std::string& name_scope,
const GraphWithStats& graph_with_stats,
bool fc_as_x) const {
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::FCMKLDNN fc_pattern{pattern, name_scope};
auto fc_output = fc_pattern(false /* with residual */);
patterns::ResidualElementwise elementwise_pattern{
pattern, name_scope, fc_as_x};
elementwise_pattern(
fc_output,
pattern->NewNode(elementwise_pattern.residual_data_repr()),
"elementwise_add",
fc_as_x);
fc_output->AsIntermediate();
int found_fc_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "Fuse fc + elementwise_add as residual";
GET_IR_NODE_FROM_SUBGRAPH(fc_op, fc, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_input, input, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_weights, weights, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_output, output, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(
elementwise_op, elementwise_op, elementwise_pattern);
GET_IR_NODE_FROM_SUBGRAPH(
residual_data, residual_data, elementwise_pattern);
GET_IR_NODE_FROM_SUBGRAPH(
elementwise_out, elementwise_out, elementwise_pattern);
if (FindFuseOption(*fc_op, *elementwise_op) != FUSE_MKLDNN) {
VLOG(4) << "Skipping fusion for " << fc_op->Name() << "(" << fc_op->id()
<< ") with " << elementwise_op->Name() << "("
<< elementwise_op->id()
<< ") because not both ops have use_mkldnn";
return;
}
if (!IsReachable(g, residual_data, fc_output)) {
VLOG(4) << "Skipping fusion for " << fc_op->Name() << "(" << fc_op->id()
<< ") with " << elementwise_op->Name() << "("
<< elementwise_op->id() << ") because residual input "
<< residual_data->Name() << "(" << residual_data->id()
<< ") is not "
"reachable";
return;
}
if (HasFusedActivation(fc_op)) {
VLOG(4) << "Skipping fusion for " << fc_op->Name() << "(" << fc_op->id()
<< ") with " << elementwise_op->Name() << "("
<< elementwise_op->id() << ") because fc has activation fused";
return;
}
if (!IsCompat(subgraph, g)) {
LOG(WARNING)
<< "op compat for fc_elementwise_add_mkldnn_fuse_pass failed.";
return;
}
fc_op->Op()->SetInput("ResidualData", {residual_data->Name()});
fc_op->Op()->SetOutput("Out", {elementwise_out->Name()});
fc_op->Op()->SetAttr("fuse_residual_connection", true);
GraphSafeRemoveNodes(g, {fc_output, elementwise_op});
IR_NODE_LINK_TO(residual_data, fc_op);
IR_NODE_LINK_TO(fc_op, elementwise_out);
found_fc_count++;
};
gpd(graph_with_stats.first, handler);
if ((!Has("disable_logs") || !Get<bool>("disable_logs")) &&
(found_fc_count > 0)) {
std::stringstream msg_ss;
std::string fusionMode = fc_as_x ? "x" : "y";
msg_ss << "--- Fused " << found_fc_count << " fc (as " << fusionMode
<< ") + elementwise_add patterns";
paddle::string::PrettyLogDetail(msg_ss.str().c_str());
}
return std::make_pair(graph_with_stats.first,
found_fc_count + graph_with_stats.second);
}
void FCResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const {
FusePassBase::Init(name_scope_, graph);
auto graph_with_stats = FuseFC(name_scope_, std::make_pair(graph, 0), true);
graph_with_stats = FuseFC(name_scope_, graph_with_stats, false);
AddStatis(graph_with_stats.second);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(fc_elementwise_add_mkldnn_fuse_pass,
paddle::framework::ir::FCResidualConnectionMKLDNNFusePass);
REGISTER_PASS_CAPABILITY(fc_elementwise_add_mkldnn_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.LE("fc", 0)
.LE("elementwise_add", 1));
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
using GraphWithStats = std::pair<ir::Graph*, int>;
class FCResidualConnectionMKLDNNFusePass : public FusePassBase {
private:
GraphWithStats FuseFC(const std::string& name_scope,
const GraphWithStats& graph_with_stats,
bool fc_as_x) const;
public:
FCResidualConnectionMKLDNNFusePass();
virtual ~FCResidualConnectionMKLDNNFusePass() {}
protected:
void ApplyImpl(ir::Graph* graph) const;
static bool HasFusedActivation(Node* fc_node) {
return !(
fc_node->Op()->GetAttrIfExists<std::string>("activation_type").empty());
}
const std::string name_scope_{"fc_elementwise_add_mkldnn_fuse"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
@@ -372,7 +372,6 @@ void CpuPassStrategy::EnableMKLDNN() {
// Disabled due to topology-dependent speed-up
"fc_mkldnn_pass",
"fc_act_mkldnn_fuse_pass",
"fc_elementwise_add_mkldnn_fuse_pass", //
"self_attention_fuse_pass", //
"batch_norm_act_fuse_pass", //
"softplus_activation_onednn_fuse_pass", //
@@ -407,7 +406,6 @@ void CpuPassStrategy::EnableMkldnnBfloat16() {
if (!use_mkldnn_bfloat16_) {
passes_.push_back("fc_mkldnn_pass");
passes_.push_back("fc_act_mkldnn_fuse_pass");
passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass");
passes_.push_back("cpu_bfloat16_placement_pass");
passes_.push_back("cpu_bfloat16_pass");
@@ -463,7 +461,6 @@ void CpuPassStrategy::EnableMkldnnInt8() {
passes_.push_back("repeated_fc_relu_fuse_pass");
passes_.push_back("fc_mkldnn_pass");
passes_.push_back("fc_act_mkldnn_fuse_pass");
passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass");
passes_.push_back("matmul_transpose_reshape_mkldnn_fuse_pass");
passes_.push_back("batch_norm_act_fuse_pass");
passes_.push_back("softplus_activation_onednn_fuse_pass");
@@ -498,9 +495,7 @@ void CpuPassStrategy::DisableMkldnnFcPasses() {
void CpuPassStrategy::EraseFcMkldnnPasses() {
std::vector<std::string> fc_passes_to_erase(
{"fc_mkldnn_pass",
"fc_act_mkldnn_fuse_pass",
"fc_elementwise_add_mkldnn_fuse_pass"});
{"fc_mkldnn_pass", "fc_act_mkldnn_fuse_pass"});
for (const auto &pass : fc_passes_to_erase) {
int idx = GetPassIndex(pass);
if (idx != -1) {
@@ -46,9 +46,6 @@ GetDNNLScales(const ExecutionContext& ctx) {
auto scale_in_data = ctx.Attr<float>("Scale_in");
auto scale_out = ctx.Attr<float>("Scale_out");
auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
auto scale_in_eltwise_data = ctx.HasAttr("Scale_in_eltwise")
? ctx.Attr<float>("Scale_in_eltwise")
: 1.0f;
std::vector<float> dnnl_src_scales = {1.f / scale_in_data};
size_t count = scale_weights_data.size();
@@ -57,7 +54,7 @@ GetDNNLScales(const ExecutionContext& ctx) {
for (size_t i = 0; i < count; i++) {
dnnl_wei_scales[i] = 1.f / scale_weights_data[i];
}
std::vector<float> dnnl_psum_scales = {1.f / scale_in_eltwise_data};
std::vector<float> dnnl_psum_scales = {1.f};
std::vector<float> dnnl_dst_scales = {1.f / scale_out};
return std::make_tuple(
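With the post-sum fusion gone, the only change to the scale plumbing above is that the post-sum scale no longer tracks Scale_in_eltwise and is pinned at 1.0. A small plain-Python sketch of the derivation (attribute names mirror the C++; this is not the Paddle API itself):

# Editor's sketch of the scale computation after this change.
def get_dnnl_scales(scale_in, scale_weights, scale_out):
    dnnl_src_scales = [1.0 / scale_in]
    dnnl_wei_scales = [1.0 / s for s in scale_weights]
    dnnl_psum_scales = [1.0]             # no Scale_in_eltwise attribute any more
    dnnl_dst_scales = [1.0 / scale_out]
    return dnnl_src_scales, dnnl_wei_scales, dnnl_psum_scales, dnnl_dst_scales

print(get_dnnl_scales(0.5, [0.25, 0.125], 2.0))
# ([2.0], [4.0, 8.0], [1.0], [0.5])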
@@ -127,7 +124,6 @@ class FCMKLDNNHandler
dnnl::primitive_attr attributes;
dnnl::post_ops post_operations;
float sum_scale = 1.0f;
float activation_scale = 1.0f;
if (phi::funcs::is_int8<T_w>()) {
std::vector<float> src_scales, wei_scales, psum_scales, dst_scales;
@@ -168,13 +164,6 @@ class FCMKLDNNHandler
dst_scales.data(),
dst_scales.size() * sizeof(float));
}
sum_scale = psum_scales[0];
}
if (ctx.HasAttr("fuse_residual_connection") &&
ctx.Attr<bool>("fuse_residual_connection")) {
post_operations.append_sum(sum_scale);
}
// ReLU from "fc_fuse_pass"
@@ -332,22 +321,6 @@ class FCMKLDNNHandler
std::shared_ptr<dnnl::memory> AcquireCustomDstMemory(
const ExecutionContext& ctx, phi::DenseTensor* out) {
if (ctx.HasAttr("fuse_residual_connection") &&
ctx.Attr<bool>("fuse_residual_connection")) {
auto* residual_param = ctx.Input<phi::DenseTensor>("ResidualData");
PADDLE_ENFORCE_EQ(
out->dims(),
residual_param->dims(),
phi::errors::InvalidArgument(
"Output and elementwise parameter need to have the "
"same dimension sizes, but got output's dimension = %d"
" and residual param's dimension =%d .",
out->dims().size(),
residual_param->dims().size()));
out->ShareDataWith(*residual_param);
}
return this->template AcquireDstMemory<T_out>(out);
}
@@ -458,11 +431,7 @@ class FCMKLDNNKernel : public framework::OpKernel<T_in> {
dst_memory_p =
std::make_shared<dnnl::memory>(inner_product_cache->dst_mem);
if (ctx.HasAttr("fuse_residual_connection") &&
ctx.Attr<bool>("fuse_residual_connection")) {
auto* residual_param = ctx.Input<phi::DenseTensor>("ResidualData");
out->ShareDataWith(*residual_param);
}
auto out_ptr = out->mutable_data<T_out>(
ctx.GetPlace(), dst_memory_p->get_desc().get_size());
dst_memory_p->set_data_handle(out_ptr);
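The removed ShareDataWith calls above existed because oneDNN's sum post-op accumulates into whatever already sits in the destination memory, so the kernel aliased its output buffer to ResidualData before executing. A NumPy illustration of that semantics (editor's sketch, not library code):

# Editor's sketch: dst = sum_scale * dst_prev + (x @ w + b), which is why the
# destination buffer had to start out holding the residual data.
import numpy as np

x = np.ones((2, 3), dtype=np.float32)
w = np.ones((3, 4), dtype=np.float32)
b = np.zeros(4, dtype=np.float32)
residual = np.full((2, 4), 10.0, dtype=np.float32)

dst = residual.copy()                   # "ShareDataWith": dst begins as the residual
sum_scale = 1.0
dst = sum_scale * dst + (x @ w + b)     # effect of the sum post-op
print(dst)                              # 13.0 everywhere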
@@ -370,9 +370,7 @@ TEST(AnalysisPredictor, mkldnn_fc_passes_cpu_pass_strategy) {
CpuPassStrategy cpuPassStrategy;
cpuPassStrategy.EnableMKLDNN();
const std::vector<std::string> fc_passes_to_erase(
{"fc_mkldnn_pass",
"fc_act_mkldnn_fuse_pass",
"fc_elementwise_add_mkldnn_fuse_pass"});
{"fc_mkldnn_pass", "fc_act_mkldnn_fuse_pass"});
for (const auto& pass : fc_passes_to_erase) {
ASSERT_NE(cpuPassStrategy.GetPassIndex(pass), (size_t)-1);
}
@@ -34,7 +34,7 @@ void SetInt8Config(AnalysisConfig *cfg,
pass_builder->DeletePass("constant_folding_pass");
auto warmup_data = std::make_shared<std::vector<PaddleTensor>>(data);
cfg->mkldnn_quantizer_config()->SetEnabledOpTypes(
{"elementwise_add", "matmul", "matmul_v2", "fused_matmul"});
{"matmul", "matmul_v2", "fused_matmul"});
// Exclusion of several matmuls that should not be quantized because they
// reduce the accuracy of the model
cfg->mkldnn_quantizer_config()->SetExcludedOpIds(
@@ -104,7 +104,6 @@ TEST(Analyzer_vit_ocr, fuse_status) {
CHECK_EQ(fuse_statis.at("fc_mkldnn_pass"), 33);
CHECK_EQ(fuse_statis.at("fused_conv2d_gelu_mkldnn_fuse_pass"), 2);
CHECK_EQ(fuse_statis.at("fc_elementwise_add_mkldnn_fuse"), 16);
}
#endif
@@ -301,8 +301,6 @@ if(WITH_GPU AND TENSORRT_FOUND)
set_tests_properties(test_mkldnn_conv_mish_fuse_pass PROPERTIES TIMEOUT 300)
set_tests_properties(test_onednn_fc_activation_fuse_pass PROPERTIES TIMEOUT
300)
set_tests_properties(test_onednn_fc_elementwise_add_fuse_pass
PROPERTIES TIMEOUT 120)
set_tests_properties(test_mkldnn_conv_affine_channel_fuse_pass
PROPERTIES TIMEOUT 60)
endif()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from functools import partial
import hypothesis.strategies as st
import numpy as np
from auto_scan_test import PassAutoScanTest
from program_config import OpConfig, ProgramConfig, TensorConfig
class TestFCElementwiseAddOneDNNFusePass(PassAutoScanTest):
def sample_program_config(self, draw):
axis = draw(st.sampled_from([-1, 0, 1]))
fc_as_x = draw(st.booleans())
fc_in = draw(st.sampled_from([32, 64]))
fc_wei = draw(st.sampled_from([32, 64]))
def generate_data(shape):
return np.random.random(shape).astype(np.float32)
relu_op = OpConfig(
type='relu',
inputs={'X': ['input_data']},
outputs={'Out': ['relu_out']},
attrs={},
)
fc_op = OpConfig(
type='fc',
inputs={
'Input': ['relu_out'],
'W': ['fc_weight'],
'Bias': ['fc_bias'],
},
outputs={'Out': ['fc_output']},
attrs={
'use_mkldnn': True,
'padding_weights': False,
'activation_type': '',
'in_num_col_dims': 1,
},
)
if fc_as_x:
inputs = {'X': ['fc_output'], 'Y': ['input_data']}
else:
inputs = {'X': ['input_data'], 'Y': ['fc_output']}
elt_add_op = OpConfig(
type='elementwise_add',
inputs=inputs,
outputs={'Out': ['elementwise_output']},
attrs={'axis': axis, 'use_mkldnn': True},
)
model_net = [relu_op, fc_op, elt_add_op]
program_config = ProgramConfig(
ops=model_net,
weights={
'fc_weight': TensorConfig(
data_gen=partial(generate_data, [fc_wei, fc_wei])
),
'fc_bias': TensorConfig(
data_gen=partial(generate_data, [fc_wei])
),
},
inputs={
'input_data': TensorConfig(
data_gen=partial(generate_data, [fc_in, fc_wei])
)
},
outputs=['elementwise_output'],
)
return program_config
def sample_predictor_configs(self, program_config):
config = self.create_inference_config(
use_mkldnn=True, passes=['fc_elementwise_add_mkldnn_fuse_pass']
)
yield config, ['relu', 'fc'], (1e-5, 1e-5)
def test(self):
self.run_and_statis(
quant=False, passes=['fc_elementwise_add_mkldnn_fuse_pass']
)
if __name__ == '__main__':
unittest.main()