Add multiple quantize operators fuse (#22062)

5b2e98aa · joanna.wozna.intel · Tao Luo · 96980c22 · 5b2e98aa · 5b2e98aa
5 changed file
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1659,6 +1659,21 @@ PDNode *patterns::DequantAny::operator()() {
  return dequant_out;
 }

+PDNode *patterns::MultipleQuantize::operator()() {
+  auto *prev_out = pattern->NewNode(prev_out_repr())->AsOutput();
+
+  // find nodes that are inputs to quantize operators
+  prev_out->assert_more([&](Node *node) {
+    int counter = std::count_if(
+        node->outputs.begin(), node->outputs.end(), [&](Node const *iter) {
+          return iter && iter->IsOp() && iter->Op()->Type() == "quantize";
+        });
+    return (counter > 1);
+  });
+
+  return prev_out;
+}
+
 // a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a
 // b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b
 // ...

--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -1004,6 +1004,16 @@ struct DequantAny : public PatternBase {
  PATTERN_DECL_NODE(next_op);
 };

+// anyOp + more then one quantize op
+// This pattern is used for squashing multiple quantize with the same scale.
+struct MultipleQuantize : public PatternBase {
+  MultipleQuantize(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "multiple_quantize") {}
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(prev_out);
+};
+
 struct TransposeFlattenConcat : public PatternBase {
  TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope)
      : PatternBase(pattern, name_scope, "transpose_flatten_concat") {}

--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
@@ -228,6 +228,62 @@ void CPUQuantizeSquashPass::FcDequantSquash(Graph* graph) const {
                  found_fc_dequant_squash_count);
 }

+void CPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const {
+  GraphPatternDetector gpd;
+  patterns::MultipleQuantize multiple_quantize_pattern{gpd.mutable_pattern(),
+                                                       "multiple_quantize"};
+  multiple_quantize_pattern();
+
+  int found_multiple_quantize_squash_count = 0;
+  int removed_quantize = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "fuse multiple quantize ops";
+
+    GET_IR_NODE_FROM_SUBGRAPH(prev_out, prev_out, multiple_quantize_pattern);
+
+    auto* first_quant_op = *(std::find_if(
+        prev_out->outputs.begin(), prev_out->outputs.end(), [&](Node* node) {
+          return (node->IsOp() && node->Op()->Type() == "quantize");
+        }));
+    auto* first_quant_out = first_quant_op->outputs[0];
+    float scale = first_quant_op->Op()->GetAttrIfExists<float>("Scale");
+
+    PADDLE_ENFORCE_NE(scale, 0, platform::errors::InvalidArgument(
+                                    "Quantize scale should not be equal 0"));
+
+    for (int iter = prev_out->outputs.size() - 1; iter >= 0; iter--) {
+      auto quant_op = prev_out->outputs[iter];
+      if (quant_op->IsOp() && quant_op->Op()->Type() == "quantize" &&
+          quant_op->id() != first_quant_op->id() &&
+          quant_op->Op()->GetAttrIfExists<float>("Scale") == scale) {
+        auto quant_out = quant_op->outputs[0];
+        auto last_op = quant_out->outputs[0];
+
+        std::string last_op_input_name;
+        for (auto name : last_op->Op()->InputNames())
+          for (auto input_name : last_op->Op()->Input(name))
+            if (input_name == quant_out->Name()) last_op_input_name = name;
+
+        PADDLE_ENFORCE_NE(
+            last_op_input_name.empty(), true,
+            platform::errors::NotFound("Operator after quantize operator "
+                                       "should has quantize output as input"));
+        last_op->Op()->SetInput(
+            last_op_input_name,
+            std::vector<std::string>({first_quant_out->Name()}));
+        IR_NODE_LINK_TO(first_quant_out, last_op);
+        GraphSafeRemoveNodes(graph, {quant_op, quant_out});
+        removed_quantize++;
+      }
+    }
+    found_multiple_quantize_squash_count++;
+  };
+  gpd(graph, handler);
+  AddStatis(found_multiple_quantize_squash_count);
+  PrettyLogDetail("---    squashed %d quantize op", removed_quantize);
+}
+
 void CPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const {
  PADDLE_ENFORCE_NOT_NULL(
      graph,
@@ -240,6 +296,7 @@ void CPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const {
  DequantQuantSquash(graph, &nodes_keep_counter);
  ConvDequantSquash(graph);
  FcDequantSquash(graph);
+  MultipleQuantizeSquash(graph);
 }

 }  // namespace ir

--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
@@ -65,6 +65,11 @@ class CPUQuantizeSquashPass : public FusePassBase {
  */
  void FcDequantSquash(Graph* graph) const;

+  /*
+  *  Squash quantize if several quatize ops have the same scale
+  */
+  void MultipleQuantizeSquash(Graph* graph) const;
+
  const std::string name_scope_{"squash"};
 };


--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
@@ -230,6 +230,28 @@ ProgramDesc BuildConvDequantConvProgramDesc(bool use_mkldnn, float scale_out,
  return prog;
 }

+// a->concat->b
+// b->Quant1(Scale1)->c
+// b->Quant2(Scale2)->d
+// b->concat->e
+// c->fc->f
+// d->fc->g
+ProgramDesc BuildMultipleQuantizeProgramDesc(bool use_mkldnn, float first_scale,
+                                             float second_scale) {
+  ProgramDesc prog;
+  for (auto& v : variable_names) {
+    prog.MutableBlock(0)->Var(v);
+  }
+  SetOp(&prog, "concat", "Concat1", {"a"}, {"b"}, use_mkldnn);
+  SetOp(&prog, "quantize", "Quantize1", {"b"}, {"c"}, use_mkldnn, first_scale);
+  SetOp(&prog, "quantize", "Quantize2", {"b"}, {"d"}, use_mkldnn, second_scale);
+  SetOp(&prog, "concat", "Concat2", {"b"}, {"e"}, use_mkldnn);
+  SetOp(&prog, "fc", "Fc1", {"c", "w1"}, {"f"}, use_mkldnn, first_scale);
+  SetOp(&prog, "fc", "Fc2", {"d", "w2"}, {"g"}, use_mkldnn, second_scale);
+
+  return prog;
+}
+
 void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
                      const char* var_name) {
  auto x = scope->Var(var_name);
@@ -467,6 +489,34 @@ TEST(CpuQuantizeSquashPass, fc_dequant_more_than_one_op_after_dequant) {
      BuildFcDequantFcProgramDesc(use_mkldnn, scale_out, scale), "fc", false);
 }

+// a->Concat1->b
+// b->Concat2
+// b->Quatize1(Scale)->c
+// c->Fc1
+// c->Fc2
+TEST(CpuQuantizeSquashPass, quatize_with_same_scale) {
+  auto first_scale = 1.2345f;
+  auto second_scale = 1.2345f;
+  auto use_mkldnn = true;
+  // remove nodes: Quantize2 + d
+  auto remove_nodes = 1 + 1;
+  CountNodeTest(
+      BuildMultipleQuantizeProgramDesc(use_mkldnn, first_scale, second_scale),
+      remove_nodes);
+}
+
+// if scales are not the same, do not fuse
+TEST(CpuQuantizeSquashPass, quatize_with_different_scale) {
+  auto first_scale = 1.2345f;
+  auto second_scale = 1.5432f;
+  auto use_mkldnn = true;
+  // nothing change
+  auto remove_nodes = 0;
+  CountNodeTest(
+      BuildMultipleQuantizeProgramDesc(use_mkldnn, first_scale, second_scale),
+      remove_nodes);
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle