Unverified commit ce08fdcf authored by joanna.wozna.intel, committed by GitHub

Add support for INT8 matmul in C-API quantization (#23463)

* Integrate matmul with cpu_quantize_pass

test=develop

* Add matmul checking scales

test=develop

* Change condition of matmul quantization

test=develop

* Remove redundant var

test=develop
Parent c068512f

@@ -1349,6 +1349,27 @@ PDNode *patterns::Reshape::operator()() {
return reshape_out;
}
PDNode *patterns::Matmul::operator()() {
auto prev_op_x = pattern->NewNode(prev_op_x_repr())->assert_is_op();
auto prev_op_y = pattern->NewNode(prev_op_y_repr())->assert_is_op();
auto matmul_op = pattern->NewNode(matmul_op_repr())->assert_is_op("matmul");
auto matmul_in_x = pattern->NewNode(matmul_in_x_repr())
->AsInput()
->assert_is_op_input("matmul", "X");
auto matmul_in_y = pattern->NewNode(matmul_in_y_repr())
->AsInput()
->assert_is_op_input("matmul", "Y");
auto matmul_out = pattern->NewNode(matmul_out_repr())
->AsOutput()
->assert_is_op_output("matmul", "Out");
prev_op_x->LinksTo({matmul_in_x});
prev_op_y->LinksTo({matmul_in_y});
matmul_op->LinksFrom({matmul_in_x, matmul_in_y}).LinksTo({matmul_out});
return matmul_out;
}
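// A minimal consumer of this pattern, sketched from the detector API used
// throughout this diff (names mirror CPUQuantizePass::QuantizeMatmul below):
//
//   GraphPatternDetector gpd;
//   patterns::Matmul matmul_pattern{gpd.mutable_pattern(), "quantize"};
//   matmul_pattern();
//   gpd(graph, [&](const GraphPatternDetector::subgraph_t& subgraph,
//                  Graph* g) {
//     GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, matmul_pattern);
//     // rewrite or inspect the matched matmul subgraph here
//   });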
PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");

@@ -835,6 +835,22 @@ struct Reshape : public PatternBase {
PATTERN_DECL_NODE(next_op);
};
// Matmul op
// Forward pass for matmul.
// matmul_out is a result of the operator.
struct Matmul : public PatternBase {
Matmul(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "matmul") {}
PDNode* operator()();
PATTERN_DECL_NODE(prev_op_x);
PATTERN_DECL_NODE(prev_op_y);
PATTERN_DECL_NODE(matmul_in_x);
PATTERN_DECL_NODE(matmul_in_y);
PATTERN_DECL_NODE(matmul_op);
PATTERN_DECL_NODE(matmul_out);
};
// Concat op
// Forward pass for concat.
// concat_out is a result of the operator.

@@ -190,6 +190,16 @@ double CPUQuantizePass::GetScaleValueForNode(const Node* node,
return scale_data.second.data<double>()[0];
}
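// An op counts as dequantized if it is a dequantize op itself, or an INT8
// op marked with use_quantizer (its output carries a known scale).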
bool CPUQuantizePass::IsOpDequantized(const Node* node) const {
return node->Op()->Type() == "dequantize" ||
node->Op()->GetAttrIfExists<bool>("use_quantizer");
}
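// Symmetrically, a consumer counts as quantized if it is a quantize op or
// an INT8 op marked with use_quantizer.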
bool CPUQuantizePass::IsOpQuantized(const Node* node) const {
return node->Op()->Type() == "quantize" ||
node->Op()->GetAttrIfExists<bool>("use_quantizer");
}
void CPUQuantizePass::QuantizeConv(Graph* graph,
bool with_residual_data) const {
GraphPatternDetector gpd;
@@ -449,11 +459,8 @@ void CPUQuantizePass::QuantizeTranspose(Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, transpose_pattern);
GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, transpose_pattern);
// skip if prev op and next op are not quantized
if (!IsOpDequantized(prev_op) && !IsOpQuantized(next_op)) {
return;
}
GET_IR_NODE_FROM_SUBGRAPH(transpose_in, transpose_in, transpose_pattern);
@@ -500,11 +507,8 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, reshape_pattern);
GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, reshape_pattern);
// skip if prev op and next op are not quantized
if (!IsOpDequantized(prev_op) && !IsOpQuantized(next_op)) {
return;
}
@@ -530,6 +534,59 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const {
PrettyLogDetail("--- quantized %d reshape ops", quantize_reshape_count);
}
void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::Matmul matmul_pattern{pattern, name_scope_};
matmul_pattern();
int quantize_matmul_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "Quantize matmul op";
GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, matmul_pattern);
auto* matmul_op_desc = matmul_op->Op();
// skip if should not be quantized
if (!matmul_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
return;
}
GET_IR_NODE_FROM_SUBGRAPH(prev_op_x, prev_op_x, matmul_pattern);
GET_IR_NODE_FROM_SUBGRAPH(prev_op_y, prev_op_y, matmul_pattern);
// skip if prev ops are not quantized
if (!IsOpDequantized(prev_op_x) || !IsOpDequantized(prev_op_y)) {
return;
}
GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, matmul_pattern);
GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, matmul_pattern);
GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, matmul_pattern);
bool is_x_unsigned{false}, is_y_unsigned{false};
auto input_x_scale = GetScaleValueForNode(matmul_in_x, &is_x_unsigned);
auto input_y_scale = GetScaleValueForNode(matmul_in_y, &is_y_unsigned);
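// The pass applies a single signedness when inserting the quantize ops
// below, so mixed u8/s8 inputs are rejected here rather than handled.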
PADDLE_ENFORCE_EQ(
is_x_unsigned, is_y_unsigned,
platform::errors::InvalidArgument(
"Matmul inputs should have the same value of is_unsigned"));
QuantizeInput(g, matmul_op, matmul_in_x, "X", input_x_scale, is_x_unsigned,
"Scale_x");
QuantizeInput(g, matmul_op, matmul_in_y, "Y", input_y_scale, is_y_unsigned,
"Scale_y");
bool is_output_unsigned{false};
auto output_scale = GetScaleValueForNode(matmul_out, &is_output_unsigned);
DequantizeOutput(g, matmul_op, matmul_out, "Out", output_scale,
is_output_unsigned, "Scale_out");
++quantize_matmul_count;
};
gpd(graph, handler);
AddStatis(quantize_matmul_count);
PrettyLogDetail("--- quantized %d matmul ops", quantize_matmul_count);
}
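// The rewrite this produces, sketched (assuming both producers are
// dequantized and the matmul carries use_quantizer):
//
//   before:  prev_op_x -> X (fp32) --\
//                                     matmul (fp32) -> Out (fp32)
//            prev_op_y -> Y (fp32) --/
//
//   after:   prev_op_x -> X -> quantize(Scale_x) --\
//                                     matmul (int8) -> dequantize(Scale_out) -> Out (fp32)
//            prev_op_y -> Y -> quantize(Scale_y) --/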
void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
VLOG(3) << "Quantizing the graph.";
PADDLE_ENFORCE(graph);
@@ -545,6 +602,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
QuantizeTranspose(graph);
QuantizeFc(graph);
QuantizeReshape(graph);
QuantizeMatmul(graph);
}
} // namespace ir

@@ -58,6 +58,8 @@ class CPUQuantizePass : public FusePassBase {
void QuantizeReshape(Graph* graph) const;
void QuantizeMatmul(Graph* graph) const;
void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
double scale_to_one, bool is_unsigned,
std::string scale_attr_name = "") const;
@@ -76,6 +78,8 @@ class CPUQuantizePass : public FusePassBase {
LoDTensor GetScaleTensorForNode(const Node* node) const;
double GetScaleValueForNode(const Node* node,
bool* is_unsigned = nullptr) const;
bool IsOpDequantized(const Node* node) const;
bool IsOpQuantized(const Node* node) const;
const std::string name_scope_{"quantize"};
};

@@ -74,6 +74,14 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
op->SetInput("Input", {inputs[0]});
op->SetOutput("Output", {outputs[0]});
op->SetAttr("Scale", 1.0f);
} else if (type == "matmul") {
op->SetInput("X", {inputs[0]});
if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
op->SetOutput("Out", {outputs[0]});
op->SetAttr("use_quantizer", use_quantizer);
op->SetAttr("Scale_x", 1.0f);
op->SetAttr("Scale_y", 1.0f);
op->SetAttr("Scale_out", 1.0f);
}
}
@@ -513,6 +521,89 @@ TEST(CPUQuantizePass, check_scales) {
MainTestCheckScales(BuildProgramDescCheckScalesConv(), var_names, "a");
}
static const std::initializer_list<std::string> variable_names_matmul = {
"a", "b", "c", "d", "e", "f"};
ProgramDesc BuildProgramDescMatmul() {
ProgramDesc prog;
for (auto& v : variable_names_matmul) {
prog.MutableBlock(0)->Var(v);
}
SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true);
SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, true);
SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
return prog;
}
ProgramDesc BuildProgramDescMatmulNotQuantized() {
ProgramDesc prog;
for (auto& v : variable_names_matmul) {
prog.MutableBlock(0)->Var(v);
}
SetOp(&prog, "dropout", "Dropout", {"a"}, {"b"}, false);
SetOp(&prog, "dequantize", "Dequantize", {"c"}, {"d"}, true);
SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, true);
SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
return prog;
}
void MainTestMatmul(const ProgramDesc& prog, int matmul_count, int quant_count,
int dequant_count, int added_nodes_count, float scale) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
int original_nodes_num, current_nodes_num;
PreparePass(&graph, prog, variable_names_matmul, &original_nodes_num,
&current_nodes_num);
int quantize_nodes_count = 0;
int dequantize_nodes_count = 0;
int matmul_nodes_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
if (op->Type() == "matmul") {
matmul_nodes_count++;
auto op_name = boost::get<std::string>(op->GetAttr("name"));
EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_x")), scale)
<< "Scale_x for node '" + op_name + "'.";
EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_y")), scale)
<< "Scale_y for node '" + op_name + "'.";
EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_out")), scale)
<< "Scale_out for node '" + op_name + "'.";
} else if (op->Type() == "quantize") {
quantize_nodes_count++;
} else if (op->Type() == "dequantize") {
dequantize_nodes_count++;
}
}
}
EXPECT_EQ(matmul_nodes_count, matmul_count);
EXPECT_EQ(quantize_nodes_count, quant_count);
EXPECT_EQ(dequantize_nodes_count, dequant_count);
EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
}
TEST(CpuQuantizePass, matmul) {
int matmul_count = 1;
int quant_count = 2;
int dequant_count = 3;
// 2 quantize ops + 2 quantized input vars + 1 dequantize op + 1 dequantized output var
int added_nodes_count = 6;
MainTestMatmul(BuildProgramDescMatmul(), matmul_count, quant_count,
dequant_count, added_nodes_count, 2.0f * 127);
}
TEST(CpuQuantizePass, matmul_not_quantized) {
int matmul_count = 1;
int quant_count = 0;
int dequant_count = 1;
// nothing changes
int added_nodes_count = 0;
MainTestMatmul(BuildProgramDescMatmulNotQuantized(), matmul_count,
quant_count, dequant_count, added_nodes_count, 1.0f);
}
} // namespace
} // namespace ir

@@ -45,6 +45,10 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
rules_["fc"]["Bias"] = ScaleAlgo::NONE;
rules_["fc"]["Out"] = ScaleAlgo::KL;
rules_["matmul"]["X"] = ScaleAlgo::KL;
rules_["matmul"]["Y"] = ScaleAlgo::KL;
rules_["matmul"]["Out"] = ScaleAlgo::KL;
// Reshape2 does not perform calculation on the data and shapes are not
// changed. Scale is calculated on input data and assign to Quantize and
// Dequantize scale.