diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 0c5c4a7024e24dee3b8cc7b9a96c53e7a1e04e6f..4a1f80916ef2b8957b7ef013c3100978b5fd7daf 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -2046,11 +2046,9 @@ PDNode *patterns::Reshape2Matmul::operator()() {
   return matmul_out;
 }
 
-PDNode *patterns::MatmulWithInputOps::operator()(bool with_residual) {
-  auto prev_op_x = pattern->NewNode(prev_op_x_repr())->assert_is_op();
-  auto prev_op_y = pattern->NewNode(prev_op_y_repr())->assert_is_op();
-
-  auto matmul_op = pattern->NewNode(matmul_op_repr())->assert_is_op("matmul");
+PDNode *patterns::FusedMatmul::operator()(bool with_residual) {
+  auto matmul_op =
+      pattern->NewNode(matmul_op_repr())->assert_is_op("fused_matmul");
 
   if (!with_residual) {
     matmul_op->assert_more([&](Node *x) {
@@ -2061,26 +2059,24 @@ PDNode *patterns::MatmulWithInputOps::operator()(bool with_residual) {
 
   auto matmul_in_x = pattern->NewNode(matmul_in_x_repr())
                          ->AsInput()
-                         ->assert_is_op_input("matmul", "X");
+                         ->assert_is_op_input("fused_matmul", "X");
   auto matmul_in_y = pattern->NewNode(matmul_in_y_repr())
                          ->AsInput()
-                         ->assert_is_op_input("matmul", "Y");
+                         ->assert_is_op_input("fused_matmul", "Y");
   auto matmul_out = pattern->NewNode(matmul_out_repr())
                         ->AsOutput()
-                        ->assert_is_op_output("matmul", "Out")
-                        ->assert_is_only_output_of_op("matmul");
+                        ->assert_is_op_output("fused_matmul", "Out")
+                        ->assert_is_only_output_of_op("fused_matmul");
   std::vector<PDNode *> links_from{matmul_in_x, matmul_in_y};
 
   if (with_residual) {
     auto matmul_residual_data =
         pattern->NewNode(matmul_residual_data_repr())
             ->AsInput()
-            ->assert_is_op_input("matmul", "ResidualData");
+            ->assert_is_op_input("fused_matmul", "ResidualData");
     links_from.push_back(matmul_residual_data);
   }
 
-  prev_op_x->LinksTo({matmul_in_x});
-  prev_op_y->LinksTo({matmul_in_y});
   matmul_op->LinksFrom(links_from).LinksTo({matmul_out});
   return matmul_out;
 }
@@ -2835,6 +2831,9 @@ PDNode *patterns::QuantizePlacement::operator()(
     const std::unordered_set<std::string> &quantize_enabled_op_types) {
   auto *op =
       pattern->NewNode(op_repr())->assert_is_ops(quantize_enabled_op_types);
+  op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<bool>("use_mkldnn");
+  });
   return op;
 }
 
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index fff236e6ed6a04a30816d181948129697d04a89a..f25260e0ce04e2465230e8ba781006efb77c5f67 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -1281,15 +1281,13 @@ struct Reshape2Matmul : public PatternBase {
   PATTERN_DECL_NODE(matmul_out);
 };
 
-// Forward pass for two input ops and matmul op.
+// Forward pass for fused_matmul op.
 // matmul_out is a result of the operator.
-struct MatmulWithInputOps : public PatternBase {
-  MatmulWithInputOps(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "matmul_with_input_ops") {}
+struct FusedMatmul : public PatternBase {
+  FusedMatmul(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "fused_matmul") {}
 
   PDNode* operator()(bool with_residual);
-  PATTERN_DECL_NODE(prev_op_x);
-  PATTERN_DECL_NODE(prev_op_y);
   PATTERN_DECL_NODE(matmul_in_x);
   PATTERN_DECL_NODE(matmul_in_y);
   PATTERN_DECL_NODE(matmul_op);
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index f3e9a21142db9b8e824caafbc9c641770c4c2f24..3bc317770c013dd74bc5be361c2e9093f07d42d7 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -880,7 +880,7 @@ void CPUQuantizePass::QuantizeImmutable(Graph* graph,
 void CPUQuantizePass::QuantizeMatmul(Graph* graph, bool with_residual) const {
   GraphPatternDetector gpd;
   auto pattern = gpd.mutable_pattern();
-  patterns::MatmulWithInputOps matmul_pattern{pattern, name_scope_};
+  patterns::FusedMatmul matmul_pattern{pattern, name_scope_};
   matmul_pattern(with_residual);
 
   int quantize_matmul_count = 0;
@@ -894,15 +894,7 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph, bool with_residual) const {
       LogQuantizationDisabled(matmul_op);
       return;
     }
-    GET_IR_NODE_FROM_SUBGRAPH(prev_op_x, prev_op_x, matmul_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(prev_op_y, prev_op_y, matmul_pattern);
 
-    // skip if prev ops are not quantized
-    if (!IsOpDequantized(prev_op_x) && !IsOpDequantized(prev_op_y)) {
-      MarkAndLogCannotQuantizeOp(matmul_op,
-                                 "No other quantizable operators nearby");
-      return;
-    }
     GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, matmul_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, matmul_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, matmul_pattern);
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
index f61e236bb38ee7cc502d9c4560d4411764a7c57f..195aa2728acd57c145d35a8c820b0501c672829c 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -87,7 +87,8 @@ void SetOp(ProgramDesc* prog,
     op->SetInput("Input", {inputs[0]});
     op->SetOutput("Output", {outputs[0]});
     op->SetAttr("Scale", 1.0f);
-  } else if (type == "matmul") {
+  } else if (type == "matmul" || type == "matmul_v2" ||
+             type == "fused_matmul") {
     op->SetInput("X", {inputs[0]});
     if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
     if (inputs.size() > 2) op->SetInput("ResidualData", {inputs[2]});
@@ -176,12 +177,12 @@ void CheckScales(const OpDesc* op, float scale, float shift) {
                scale);
     scale_names.push_back("Scale_in");
     scale_names.push_back("Scale_out");
-  } else if (type == "matmul" || type == "elementwise_add" ||
+  } else if (type == "fused_matmul" || type == "elementwise_add" ||
              type == "elementwise_mul" || type == "elementwise_sub") {
     scale_names.push_back("Scale_x");
     scale_names.push_back("Scale_y");
     scale_names.push_back("Scale_out");
-    if (type == "matmul") {
+    if (type == "fused_matmul") {
       auto const& names = op->InputNames();
       if (std::find(names.begin(), names.end(), "ResidualData") != names.end())
         scale_names.push_back("Scale_in_eltwise");
@@ -594,20 +595,7 @@ ProgramDesc BuildProgramDescMatmul() {
   }
   SetOp(&prog, "dequantize", "Dequantize1",
{"a"}, {"b"}, true); SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true); - SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, "int8"); - SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); - - return prog; -} - -ProgramDesc BuildProgramDescMatmulNotQuantized() { - ProgramDesc prog; - for (auto& v : variable_names_matmul) { - prog.MutableBlock(0)->Var(v); - } - SetOp(&prog, "dropout", "Dropout1", {"a"}, {"b"}, false); - SetOp(&prog, "dropout", "Dropout2", {"c"}, {"d"}, false); - SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, "int8"); + SetOp(&prog, "fused_matmul", "FusedMatmul", {"b", "d"}, {"e"}, true, "int8"); SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); return prog; @@ -621,7 +609,13 @@ ProgramDesc BuildProgramDescMatmulResidual() { SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true); SetOp(&prog, "dequantize", "Dequantize3", {"e"}, {"f"}, true); - SetOp(&prog, "matmul", "Matmul", {"b", "d", "f"}, {"g"}, true, "int8"); + SetOp(&prog, + "fused_matmul", + "FusedMatmul", + {"b", "d", "f"}, + {"g"}, + true, + "int8"); SetOp(&prog, "dropout", "Dropout", {"g"}, {"h"}, true, "float32"); return prog; @@ -631,7 +625,7 @@ TEST(CpuQuantizePass, matmul) { // 2 Quant + 2 IN + 1 DeQuant + 1 OUT int added_nodes = 6; std::unordered_map expected_operators = { - {"matmul", 1}, {"quantize", 2}, {"dequantize", 3}}; + {"fused_matmul", 1}, {"quantize", 2}, {"dequantize", 3}}; MainTest(BuildProgramDescMatmul(), variable_names_matmul, expected_operators, @@ -639,23 +633,11 @@ TEST(CpuQuantizePass, matmul) { SCALE * S8_MAX); } -TEST(CpuQuantizePass, matmul_not_quantized) { - // nothing change - int added_nodes = 0; - std::unordered_map expected_operators = { - {"matmul", 1}, {"quantize", 0}, {"dequantize", 0}}; - MainTest(BuildProgramDescMatmulNotQuantized(), - variable_names_matmul, - expected_operators, - added_nodes, - 1.0f); -} - TEST(CpuQuantizePass, matmul_residual) { // 3 Quant + 3 IN + 1 DeQuant + 1 OUT int added_nodes = 8; std::unordered_map expected_operators = { - {"matmul", 1}, {"quantize", 3}, {"dequantize", 4}}; + {"fused_matmul", 1}, {"quantize", 3}, {"dequantize", 4}}; MainTest(BuildProgramDescMatmulResidual(), variable_names_matmul, expected_operators, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 08b66d8f2f56ee169f018d1f6996b6d2e93dac98..19fbb2a907032c180d00d3400a3d67ff7c1b3925 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -22,17 +22,44 @@ namespace ir { class Graph; +void ReplaceWithFusedOp(Node* op) { + const std::string matmul_type = op->Op()->Type(); + if (matmul_type == "matmul" || matmul_type == "matmul_v2") { + op->Op()->SetType("fused_matmul"); + if (matmul_type == "matmul") { + op->Op()->SetAttr("trans_x", op->Op()->GetAttr("transpose_X")); + op->Op()->SetAttr("trans_y", op->Op()->GetAttr("transpose_Y")); + op->Op()->SetAttr("matmul_alpha", op->Op()->GetAttr("alpha")); + } + } +} + void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Marks operators which are to be quantized."; std::unordered_set supported_op_types = - std::unordered_set( - {"concat", "conv2d", "depthwise_conv2d", - "fused_conv2d", "fused_conv3d", "elementwise_add", - "elementwise_mul", "elementwise_sub", "fc", - "matmul", "nearest_interp", 
"nearest_interp_v2", - "pool2d", "prior_box", "reshape2", - "transpose2", "fusion_gru", "fusion_lstm", - "multi_gru", "slice", "split"}); + std::unordered_set({"concat", + "conv2d", + "depthwise_conv2d", + "fused_conv2d", + "fused_conv3d", + "fused_matmul", + "elementwise_add", + "elementwise_mul", + "elementwise_sub", + "fc", + "matmul", + "matmul_v2", + "nearest_interp", + "nearest_interp_v2", + "pool2d", + "prior_box", + "reshape2", + "transpose2", + "fusion_gru", + "fusion_lstm", + "multi_gru", + "slice", + "split"}); const auto& excluded_ids_list = Get>("quantize_excluded_op_ids"); const auto& op_types_list = @@ -69,6 +96,8 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { if (op->Op()->GetAttrIfExists("skip_quant") == 1) { return; } + + ReplaceWithFusedOp(op); op->Op()->SetAttr("mkldnn_data_type", std::string("int8")); }; gpd(graph, handler); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc index b508ab4139c8b8ca8e66caaaaa2b26a5ac953be3..5cbd64c49d200ceee2c2de6ec406c689b6318704 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc @@ -30,6 +30,7 @@ void SetOp(ProgramDesc* prog, auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); + op->SetAttr("use_mkldnn", true); op->SetAttr("mkldnn_data_type", mkldnn_data_type); if (type == "conv2d") { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h index b091236ddd8f3248598505eb10bd55f30aee1d6f..699ca1aed378a70445a150888d79619aecfe1932 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h @@ -36,7 +36,8 @@ static void SaveInfoInTheFirstOp( for (auto* op_node : ir::TopologyVarientSort(*graph, static_cast(0))) { if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || - op_node->Op()->Type() == "fetch") + op_node->Op()->Type() == "fetch" || + op_node->Op()->Type() == "fill_constant") continue; op_node->Op()->SetAttr(flag, true); @@ -57,7 +58,8 @@ static void SaveInfoInTheFirstOp(ir::Graph* graph, for (auto* op_node : ir::TopologyVarientSort(*graph, static_cast(0))) { if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || - op_node->Op()->Type() == "fetch") + op_node->Op()->Type() == "fetch" || + op_node->Op()->Type() == "fill_constant") continue; op_node->Op()->SetAttr(flag, true); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 2975322dece3582ea590947ba6585b89cd5f8807..bf541b4d1b0a0aacb822a949b2478a1c306511d3 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -672,22 +672,8 @@ void AnalysisConfig::EnableMkldnnInt8( #ifdef PADDLE_WITH_MKLDNN use_mkldnn_int8_ = true; use_fc_padding_ = false; - if (!op_list.empty()) { - for (auto &type : op_list) { - if (!quantize_enabled_op_types_.count(type)) { - LOG(ERROR) << "There are unsupported operators in the configured " - "quantization operator list. 
The unsupported operator " - "is: " - << type; - use_mkldnn_int8_ = false; - break; - } - } - if (use_mkldnn_int8_) { - quantize_enabled_op_types_.clear(); - quantize_enabled_op_types_.insert(op_list.begin(), op_list.end()); - } - } + if (!op_list.empty()) + quantize_enabled_op_types_.insert(op_list.begin(), op_list.end()); #else LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnInt8"; use_mkldnn_int8_ = false; diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index eab8818a1df009feea4e391d6c2f75a819b8a597..8e12f15e7a01819c7364238c85760d897a800eb3 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -1191,26 +1191,7 @@ struct PD_INFER_DECL AnalysisConfig { std::unordered_set bfloat16_enabled_op_types_; bool use_mkldnn_int8_{false}; std::unordered_set quantize_excluded_op_ids_{}; - std::unordered_set quantize_enabled_op_types_{ - "concat", - "conv2d", - "depthwise_conv2d", - "fused_conv2d", - "elementwise_add", - "elementwise_mul", - "fc", - "matmul", - "nearest_interp", - "nearest_interp_v2", - "pool2d", - "prior_box", - "reshape2", - "transpose2", - "fusion_gru", - "fusion_lstm", - "multi_gru", - "slice", - "split"}; + std::unordered_set quantize_enabled_op_types_{}; bool disable_mkldnn_fc_passes_{false}; diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_int8_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ernie_int8_tester.cc index 43f1f8a1163f134ab00dda06961151a12ce19322..87f4883b7f0b7be77a47e5145ac6bf3253cf2a15 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ernie_int8_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ernie_int8_tester.cc @@ -29,11 +29,16 @@ void SetInt8Config(AnalysisConfig *cfg, std::vector data) { cfg->SetModel(FLAGS_infer_model); cfg->EnableMKLDNN(); - cfg->DisableMkldnnFcPasses(); // fc passes caused loss in accuracy cfg->EnableMkldnnQuantizer(); auto pass_builder = cfg->pass_builder(); pass_builder->DeletePass("constant_folding_pass"); auto warmup_data = std::make_shared>(data); + cfg->mkldnn_quantizer_config()->SetEnabledOpTypes( + {"elementwise_add", "matmul", "matmul_v2", "fused_matmul"}); + // Exclusion of several matmules that should not be quantized due to the fact + // that they reduce the accuracy of the model + cfg->mkldnn_quantizer_config()->SetExcludedOpIds( + {75, 172, 269, 366, 463, 560, 657, 754, 851, 948, 1045, 1142}); cfg->mkldnn_quantizer_config()->SetWarmupData(warmup_data); cfg->mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_batch_size); cfg->SwitchSpecifyInputNames();
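A short usage sketch (not part of the patch), assuming a placeholder model path: it illustrates how EnableMkldnnInt8 behaves after this change, where the given op list is inserted into quantize_enabled_op_types_ as-is instead of being validated against the removed built-in whitelist, so types such as fused_matmul can be requested directly.

    #include "paddle/fluid/inference/api/paddle_analysis_config.h"

    int main() {
      paddle::AnalysisConfig cfg;
      cfg.SetModel("./model_dir");  // placeholder path, not taken from the patch
      cfg.EnableMKLDNN();
      // With this patch the listed types are simply added to
      // quantize_enabled_op_types_; the call no longer fails (and no longer
      // disables INT8) when it sees a type outside the old whitelist.
      cfg.EnableMkldnnInt8(
          {"elementwise_add", "matmul", "matmul_v2", "fused_matmul"});
      return 0;
    }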