From ce08fdcf2b55896a3709f7fb496469f4cb20b425 Mon Sep 17 00:00:00 2001
From: "joanna.wozna.intel"
Date: Wed, 8 Apr 2020 12:00:28 +0200
Subject: [PATCH] Add support for INT8 matmul in C-API quantization (#23463)

* Integrate matmul with cpu_quantize_pass test=develop
* Add matmul checking scales test=develop
* Change condition of matmul quantization test=develop
* Remove redundant var test=develop
---
 .../framework/ir/graph_pattern_detector.cc    | 21 +++++
 .../framework/ir/graph_pattern_detector.h     | 16 ++++
 .../framework/ir/mkldnn/cpu_quantize_pass.cc  | 78 ++++++++++++++--
 .../framework/ir/mkldnn/cpu_quantize_pass.h   |  4 +
 .../ir/mkldnn/cpu_quantize_pass_tester.cc     | 91 +++++++++++++++++++
 .../inference/api/mkldnn_quantizer_config.cc  |  4 +
 6 files changed, 204 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 888c5ee8f4..8822c1a4c9 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1349,6 +1349,27 @@ PDNode *patterns::Reshape::operator()() {
   return reshape_out;
 }
 
+PDNode *patterns::Matmul::operator()() {
+  auto prev_op_x = pattern->NewNode(prev_op_x_repr())->assert_is_op();
+  auto prev_op_y = pattern->NewNode(prev_op_y_repr())->assert_is_op();
+
+  auto matmul_op = pattern->NewNode(matmul_op_repr())->assert_is_op("matmul");
+  auto matmul_in_x = pattern->NewNode(matmul_in_x_repr())
+                         ->AsInput()
+                         ->assert_is_op_input("matmul", "X");
+  auto matmul_in_y = pattern->NewNode(matmul_in_y_repr())
+                         ->AsInput()
+                         ->assert_is_op_input("matmul", "Y");
+  auto matmul_out = pattern->NewNode(matmul_out_repr())
+                        ->AsOutput()
+                        ->assert_is_op_output("matmul", "Out");
+
+  prev_op_x->LinksTo({matmul_in_x});
+  prev_op_y->LinksTo({matmul_in_y});
+  matmul_op->LinksFrom({matmul_in_x, matmul_in_y}).LinksTo({matmul_out});
+  return matmul_out;
+}
+
 PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
   auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
 
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 6efda66d82..f2a415814d 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -835,6 +835,22 @@ struct Reshape : public PatternBase {
   PATTERN_DECL_NODE(next_op);
 };
 
+// Matmul op
+// Forward pass for matmul.
+// matmul_out is a result of the operator.
+struct Matmul : public PatternBase {
+  Matmul(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "matmul") {}
+
+  PDNode* operator()();
+  PATTERN_DECL_NODE(prev_op_x);
+  PATTERN_DECL_NODE(prev_op_y);
+  PATTERN_DECL_NODE(matmul_in_x);
+  PATTERN_DECL_NODE(matmul_in_y);
+  PATTERN_DECL_NODE(matmul_op);
+  PATTERN_DECL_NODE(matmul_out);
+};
+
 // Concat op
 // Forward pass for concat.
 // concat_out is a result of the operator.
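For readers new to Paddle's pattern detector, the sketch below (illustrative only, not part of the patch) shows how a pattern such as patterns::Matmul is typically driven from inside a pass. It mirrors the QuantizeMatmul handler added further down; the surrounding pass is assumed to provide the Graph* named graph, and the "quantize" scope string is the one this pass uses.

    // Illustrative sketch (not part of the patch): driving patterns::Matmul.
    // The detector walks the graph and invokes the handler once per match of
    // prev ops -> X/Y inputs -> matmul -> Out.
    GraphPatternDetector gpd;
    patterns::Matmul matmul_pattern{gpd.mutable_pattern(), "quantize"};
    matmul_pattern();  // instantiate the PDNodes and links declared above

    auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                       Graph* g) {
      // GET_IR_NODE_FROM_SUBGRAPH binds the matched ir::Node* to a local name.
      GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, matmul_pattern);
      GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, matmul_pattern);
      // ... inspect or rewrite the matched nodes here ...
    };
    gpd(graph, handler);  // run detection over the whole graph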
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index ba168c621f..95ec17ac7f 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -190,6 +190,16 @@ double CPUQuantizePass::GetScaleValueForNode(const Node* node,
   return scale_data.second.data()[0];
 }
 
+bool CPUQuantizePass::IsOpDequantized(const Node* node) const {
+  return node->Op()->Type() == "dequantize" ||
+         node->Op()->GetAttrIfExists<bool>("use_quantizer");
+}
+
+bool CPUQuantizePass::IsOpQuantized(const Node* node) const {
+  return node->Op()->Type() == "quantize" ||
+         node->Op()->GetAttrIfExists<bool>("use_quantizer");
+}
+
 void CPUQuantizePass::QuantizeConv(Graph* graph,
                                    bool with_residual_data) const {
   GraphPatternDetector gpd;
@@ -449,11 +459,8 @@ void CPUQuantizePass::QuantizeTranspose(Graph* graph) const {
     GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, transpose_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, transpose_pattern);
 
-    // skip if prev op and next op are not quantized
-    if (!(prev_op->Op()->Type() == "dequantize" ||
-          (prev_op->Op()->GetAttrIfExists<bool>("use_quantizer"))) &&
-        !(next_op->Op()->Type() == "quantize" ||
-          (next_op->Op()->GetAttrIfExists<bool>("use_quantizer")))) {
+    // skip if prev op and next op are not quantized
+    if (!(IsOpDequantized(prev_op)) && !(IsOpQuantized(next_op))) {
       return;
     }
     GET_IR_NODE_FROM_SUBGRAPH(transpose_in, transpose_in, transpose_pattern);
@@ -500,11 +507,8 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const {
     GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, reshape_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, reshape_pattern);
 
-    // skip if prev op and next op is not quantized
-    if (!(prev_op->Op()->Type() == "dequantize" ||
-          (prev_op->Op()->GetAttrIfExists<bool>("use_quantizer"))) &&
-        !(next_op->Op()->Type() == "quantize" ||
-          (next_op->Op()->GetAttrIfExists<bool>("use_quantizer")))) {
+    // skip if prev op and next op are not quantized
+    if (!(IsOpDequantized(prev_op)) && !(IsOpQuantized(next_op))) {
       return;
     }
 
@@ -530,6 +534,59 @@
   PrettyLogDetail("--- quantized %d reshape ops", quantize_reshape_count);
 }
 
+void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+  patterns::Matmul matmul_pattern{pattern, name_scope_};
+  matmul_pattern();
+
+  int quantize_matmul_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "Quantize matmul op";
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, matmul_pattern);
+    auto* matmul_op_desc = matmul_op->Op();
+
+    // skip if should not be quantized
+    if (!matmul_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+      return;
+    }
+    GET_IR_NODE_FROM_SUBGRAPH(prev_op_x, prev_op_x, matmul_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(prev_op_y, prev_op_y, matmul_pattern);
+
+    // skip if prev ops are not quantized
+    if (!IsOpDequantized(prev_op_x) || !IsOpDequantized(prev_op_y)) {
+      return;
+    }
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, matmul_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, matmul_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, matmul_pattern);
+
+    bool is_x_unsigned{false}, is_y_unsigned{false};
+    auto input_x_scale = GetScaleValueForNode(matmul_in_x, &is_x_unsigned);
+    auto input_y_scale = GetScaleValueForNode(matmul_in_y, &is_y_unsigned);
+    PADDLE_ENFORCE_EQ(
+        is_x_unsigned, is_y_unsigned,
+        platform::errors::InvalidArgument(
+            "Matmul inputs should have the same value of is_unsigned"));
+    QuantizeInput(g, matmul_op, matmul_in_x, "X", input_x_scale, is_x_unsigned,
+                  "Scale_x");
+    QuantizeInput(g, matmul_op, matmul_in_y, "Y", input_y_scale, is_y_unsigned,
+                  "Scale_y");
+
+    bool is_output_unsigned{false};
+    auto output_scale = GetScaleValueForNode(matmul_out, &is_output_unsigned);
+    DequantizeOutput(g, matmul_op, matmul_out, "Out", output_scale,
+                     is_output_unsigned, "Scale_out");
+
+    ++quantize_matmul_count;
+  };
+  gpd(graph, handler);
+  AddStatis(quantize_matmul_count);
+
+  PrettyLogDetail("--- quantized %d matmul ops", quantize_matmul_count);
+}
+
 void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Quantizing the graph.";
   PADDLE_ENFORCE(graph);
@@ -545,6 +602,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
   QuantizeTranspose(graph);
   QuantizeFc(graph);
   QuantizeReshape(graph);
+  QuantizeMatmul(graph);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
index 66a48e46e9..cca691d443 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
@@ -58,6 +58,8 @@ class CPUQuantizePass : public FusePassBase {
 
   void QuantizeReshape(Graph* graph) const;
 
+  void QuantizeMatmul(Graph* graph) const;
+
   void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
                      double scale_to_one, bool is_unsigned,
                      std::string scale_attr_name = "") const;
@@ -76,6 +78,8 @@ class CPUQuantizePass : public FusePassBase {
   LoDTensor GetScaleTensorForNode(const Node* node) const;
   double GetScaleValueForNode(const Node* node,
                               bool* is_unsigned = nullptr) const;
+  bool IsOpDequantized(const Node* node) const;
+  bool IsOpQuantized(const Node* node) const;
 
   const std::string name_scope_{"quantize"};
 };
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
index 26e675545e..8a9a431e4d 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -74,6 +74,14 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     op->SetInput("Input", {inputs[0]});
     op->SetOutput("Output", {outputs[0]});
     op->SetAttr("Scale", 1.0f);
+  } else if (type == "matmul") {
+    op->SetInput("X", {inputs[0]});
+    if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
+    op->SetOutput("Out", {outputs[0]});
+    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("Scale_x", 1.0f);
+    op->SetAttr("Scale_y", 1.0f);
+    op->SetAttr("Scale_out", 1.0f);
   }
 }
 
@@ -513,6 +521,89 @@ TEST(CPUQuantizePass, check_scales) {
   MainTestCheckScales(BuildProgramDescCheckScalesConv(), var_names, "a");
 }
 
+static const std::initializer_list<std::string> variable_names_matmul = {
+    "a", "b", "c", "d", "e", "f"};
+
+ProgramDesc BuildProgramDescMatmul() {
+  ProgramDesc prog;
+  for (auto& v : variable_names_matmul) {
+    prog.MutableBlock(0)->Var(v);
+  }
+  SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
+  SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true);
+  SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, true);
+  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
+
+  return prog;
+}
+
+ProgramDesc BuildProgramDescMatmulNotQuantized() {
+  ProgramDesc prog;
+  for (auto& v : variable_names_matmul) {
+    prog.MutableBlock(0)->Var(v);
+  }
+  SetOp(&prog, "dropout", "Dropout", {"a"}, {"b"}, false);
"dropout", "Dropout", {"a"}, {"b"}, false); + SetOp(&prog, "dequantize", "Dequantize", {"c"}, {"d"}, true); + SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, true); + SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false); + + return prog; +} + +void MainTestMatmul(const ProgramDesc& prog, int matmul_count, int quant_count, + int dequant_count, int added_nodes_count, float scale) { + std::unique_ptr graph(new ir::Graph(prog)); + int original_nodes_num, current_nodes_num; + PreparePass(&graph, prog, variable_names_matmul, &original_nodes_num, + ¤t_nodes_num); + + int quantize_nodes_count = 0; + int dequantize_nodes_count = 0; + int matmul_nodes_count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->Type() == "matmul") { + matmul_nodes_count++; + auto op_name = boost::get(op->GetAttr("name")); + EXPECT_EQ(boost::get(op->GetAttr("Scale_x")), scale) + << "Scale_x for node '" + op_name + "'."; + EXPECT_EQ(boost::get(op->GetAttr("Scale_y")), scale) + << "Scale_y for node '" + op_name + "'."; + EXPECT_EQ(boost::get(op->GetAttr("Scale_out")), scale) + << "Scale_out for node '" + op_name + "'."; + } else if (op->Type() == "quantize") { + quantize_nodes_count++; + } else if (op->Type() == "dequantize") { + dequantize_nodes_count++; + } + } + } + EXPECT_EQ(matmul_nodes_count, matmul_count); + EXPECT_EQ(quantize_nodes_count, quant_count); + EXPECT_EQ(dequantize_nodes_count, dequant_count); + EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); +} + +TEST(CpuQuantizePass, matmul) { + int matmul_count = 1; + int quant_count = 2; + int dequant_count = 3; + // 2 Quant + 2 IN + 1 DeQuant + 1 OUT + int added_nodes_count = 6; + MainTestMatmul(BuildProgramDescMatmul(), matmul_count, quant_count, + dequant_count, added_nodes_count, 2.0f * 127); +} + +TEST(CpuQuantizePass, matmul_not_quantized) { + int matmul_count = 1; + int quant_count = 0; + int dequant_count = 1; + // nothing change + int added_nodes_count = 0; + MainTestMatmul(BuildProgramDescMatmulNotQuantized(), matmul_count, + quant_count, dequant_count, added_nodes_count, 1.0f); +} } // namespace } // namespace ir diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc index 313d517311..9ff5ef133e 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -45,6 +45,10 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() { rules_["fc"]["Bias"] = ScaleAlgo::NONE; rules_["fc"]["Out"] = ScaleAlgo::KL; + rules_["matmul"]["X"] = ScaleAlgo::KL; + rules_["matmul"]["Y"] = ScaleAlgo::KL; + rules_["matmul"]["Out"] = ScaleAlgo::KL; + // Reshape2 does not perform calculation on the data and shapes are not // changed. Scale is calculated on input data and assign to Quantize and // Dequantize scale. -- GitLab