From d065b5bf2ba0c29c8488bfd4c36083eaf6620ca3 Mon Sep 17 00:00:00 2001
From: nhzlx <zlx_hg@163.com>
Date: Thu, 28 Mar 2019 10:08:47 +0000
Subject: [PATCH] Anakin ssd support refine trt first run add quant dequant
 fuse pass omit simplify_anakin_priorbox_detection template omit
 transpose_flatten_concat_fuse template test=develop

---
 paddle/fluid/framework/ir/CMakeLists.txt      |  19 +-
 ...cc => fillconstant_elementwisemul_fuse.cc} |  14 +-
 ...e.h => fillconstant_elementwisemul_fuse.h} |   4 +-
 .../framework/ir/graph_pattern_detector.cc    |  94 ++++++++--
 .../framework/ir/graph_pattern_detector.h     |  25 ++-
 .../ir/quant_conv2d_dequant_fuse_pass.cc      | 173 ++++++++++++++++++
 .../ir/quant_conv2d_dequant_fuse_pass.h       |  35 ++++
 ...ify_anakin_priorbox_detection_out_pass.cc} |  56 +++---
 ...lify_anakin_priorbox_detection_out_pass.h} |   1 -
 .../ir/transpose_flatten_concat_fuse_pass.cc  |  35 +---
 .../ir/transpose_flatten_concat_fuse_pass.h   |   3 +-
 .../anakin/convert/density_prior_box.cc       |  49 +++--
 .../inference/anakin/convert/op_converter.h   |   2 +-
 paddle/fluid/inference/anakin/op_teller.cc    |   2 +
 .../ir_passes/anakin_subgraph_pass.cc         |  10 +-
 .../ir_passes/tensorrt_subgraph_pass.cc       |   1 +
 .../ir_params_sync_among_devices_pass.cc      |   1 +
 .../fluid/inference/api/analysis_predictor.cc |   1 +
 .../inference/api/paddle_pass_builder.cc      |  27 ++-
 .../operators/tensorrt/tensorrt_engine_op.h   |  22 ++-
 20 files changed, 430 insertions(+), 144 deletions(-)
 rename paddle/fluid/framework/ir/{anakin_fillconstant_elementwisemul_fuse.cc => fillconstant_elementwisemul_fuse.cc} (82%)
 rename paddle/fluid/framework/ir/{anakin_fillconstant_elementwisemul_fuse.h => fillconstant_elementwisemul_fuse.h} (89%)
 create mode 100644 paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
 create mode 100644 paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h
 rename paddle/fluid/framework/ir/{simplify_anakin_detection_pattern_pass.cc => simplify_anakin_priorbox_detection_out_pass.cc} (84%)
 rename paddle/fluid/framework/ir/{simplify_anakin_detection_pattern_pass.h => simplify_anakin_priorbox_detection_out_pass.h} (98%)

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 81b8ffa83f..ba1d7379c5 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -68,21 +68,12 @@ pass_library(transpose_flatten_concat_fuse_pass inference)
 pass_library(identity_scale_op_clean_pass base)
 pass_library(sync_batch_norm_pass base)
 pass_library(runtime_context_cache_pass base)
-pass_library(simplify_anakin_detection_pattern_pass inference)
-pass_library(anakin_fillconstant_elementwisemul_fuse inference)
+pass_library(quant_conv2d_dequant_fuse_pass inference)
+pass_library(fillconstant_elementwisemul_fuse inference)
 
-# There may be many transpose-flatten structures in a model, and the output of
-# these structures will be used as inputs to the concat Op. This pattern will
-# be detected by our pass. The index here represents the number of structures in the
-# pattern. We use index 3 ~ 6, because these quantities of structures are
-# common in the models.
-foreach (index RANGE 2 6)
-   file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
-endforeach()
-
-foreach (index RANGE 2 6)
-   file(APPEND ${pass_file} "USE_PASS(simplify_anakin_detection_pattern_pass${index});\n")
-endforeach()
+if(ANAKIN_FOUND)
+pass_library(simplify_anakin_priorbox_detection_out_pass inference)
+endif()
 
 if(WITH_MKLDNN)
     pass_library(mkldnn_placement_pass base mkldnn)
diff --git a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.cc
similarity index 82%
rename from paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc
rename to paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.cc
index 39077f6420..915a2f62ba 100644
--- a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc
+++ b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.cc
@@ -15,7 +15,7 @@
 #include <memory>
 #include <string>
 
-#include "paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h"
+#include "paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 
 namespace paddle {
@@ -29,8 +29,8 @@ namespace ir {
   GET_IR_NODE(elementwise_mul);   \
   GET_IR_NODE(elementwise_mul_out);
 
-void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
-  const std::string pattern_name = "anakin_fillconstant_elementwisemul_fuse";
+void FillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
+  const std::string pattern_name = "fillconstant_elementwisemul_fuse";
   FusePassBase::Init(pattern_name, graph);
 
   GraphPatternDetector gpd;
@@ -39,8 +39,8 @@ void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
                 ->assert_is_op_input("elementwise_mul", "X")
                 ->AsInput();
 
-  patterns::AnakinFillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(),
-                                                         pattern_name);
+  patterns::FillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(),
+                                                   pattern_name);
   pattern(x);
 
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
@@ -79,5 +79,5 @@ void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
 }  // namespace framework
 }  // namespace paddle
 
-REGISTER_PASS(anakin_fillconstant_elementwisemul_fuse,
-              paddle::framework::ir::AnakinFillconstantElementwisemulFuse);
+REGISTER_PASS(fillconstant_elementwisemul_fuse,
+              paddle::framework::ir::FillconstantElementwisemulFuse);
diff --git a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h
similarity index 89%
rename from paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h
rename to paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h
index 14c07c5884..ab66fb4a46 100644
--- a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h
+++ b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h
@@ -21,9 +21,9 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-class AnakinFillconstantElementwisemulFuse : public FusePassBase {
+class FillconstantElementwisemulFuse : public FusePassBase {
  public:
-  virtual ~AnakinFillconstantElementwisemulFuse() {}
+  virtual ~FillconstantElementwisemulFuse() {}
 
  protected:
   void ApplyImpl(ir::Graph* graph) const override;
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 555fdc7b7a..8468f9ccc1 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1471,7 +1471,8 @@ PDNode *patterns::TransposeFlattenConcat::operator()(
 }
 
 PDNode *patterns::AnakinDetectionPattern::operator()(
-    std::vector<PDNode *> conv_in, int times) {
+    std::vector<PDNode *> conv_in, int times, std::string priorbox_type,
+    bool is_reshape) {
   // The times represents the repeat times of the
   // {prior_box, prior_box_loc_out, flatten, prior_box_var_out, reshape}
   const int kNumFields = 7;
@@ -1486,37 +1487,38 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
   const int kMultiClassSecondInputNmsOffset = times + 1;
 
   std::vector<PDNode *> nodes;
+  std::string op_after_priorbox = is_reshape ? "reshape2" : "flatten2";
 
   for (int i = 0; i < times; i++) {
     nodes.push_back(
         pattern->NewNode(GetNodeName("prior_box" + std::to_string(i)))
-            ->assert_is_op("density_prior_box"));
+            ->assert_is_op(priorbox_type));
     nodes.push_back(pattern->NewNode(GetNodeName("box_out" + std::to_string(i)))
-                        ->assert_is_op_output("density_prior_box", "Boxes")
-                        ->assert_is_op_input("reshape2", "X")
+                        ->assert_is_op_output(priorbox_type, "Boxes")
+                        ->assert_is_op_input(op_after_priorbox, "X")
                         ->AsIntermediate());
     nodes.push_back(
         pattern->NewNode(GetNodeName("reshape1" + std::to_string(i)))
-            ->assert_is_op("reshape2"));
+            ->assert_is_op(op_after_priorbox));
 
     nodes.push_back(
         pattern->NewNode(GetNodeName("reshape1_out" + std::to_string(i)))
-            ->assert_is_op_output("reshape2")
+            ->assert_is_op_output(op_after_priorbox)
             ->assert_is_op_nth_input("concat", "X", i)
             ->AsIntermediate());
 
     nodes.push_back(
         pattern->NewNode(GetNodeName("box_var_out" + std::to_string(i)))
-            ->assert_is_op_output("density_prior_box", "Variances")
-            ->assert_is_op_input("reshape2", "X")
+            ->assert_is_op_output(priorbox_type, "Variances")
+            ->assert_is_op_input(op_after_priorbox, "X")
             ->AsIntermediate());
     nodes.push_back(
         pattern->NewNode(GetNodeName("reshape2" + std::to_string(i)))
-            ->assert_is_op("reshape2"));
+            ->assert_is_op(op_after_priorbox));
 
     nodes.push_back(
         pattern->NewNode(GetNodeName("reshape2_out" + std::to_string(i)))
-            ->assert_is_op_output("reshape2")
+            ->assert_is_op_output(op_after_priorbox)
             ->assert_is_op_nth_input("concat", "X", i)
             ->AsIntermediate());
   }
@@ -1612,7 +1614,7 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
   return multiclass_nms_out;
 }
 
-PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()(
+PDNode *patterns::FillConstantElementWiseMulFuse::operator()(
     PDNode *elementwise_op_input) {
   auto fill_constant =
       pattern->NewNode(fill_constant_repr())->assert_is_op("fill_constant");
@@ -1635,6 +1637,76 @@ PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()(
   return elementwise_mul_out;
 }
 
+void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
+                                              const std::string &op_type,
+                                              const std::string &weight_name,
+                                              int times) {
+  const int kNumFields = 5;
+  const int kQuantizedWeightOffset = 0;
+  const int kQuantizedOpOffset = 1;
+  const int kQuantizedOpOutOffset = 2;
+  const int kDequantOpOffset = 3;
+  const int kDequantOpOutOffset = 4;
+  // the quant op always be one.
+  auto quant_op_in_scale =
+      pattern->NewNode(GetNodeName("quant_op_in_scale"))
+          ->assert_is_op_input("fake_quantize_range_abs_max", "InScale")
+          ->AsInput();
+  auto quant_op = pattern->NewNode(GetNodeName("quant_op"))
+                      ->assert_is_op("fake_quantize_range_abs_max");
+
+  auto quant_op_out_scale =
+      pattern->NewNode(GetNodeName("quant_op_out_scale"))
+          ->assert_is_op_output("fake_quantize_range_abs_max", "OutScale")
+          ->assert_is_op_input("fake_dequantize_max_abs", "Scale")
+          ->AsIntermediate();
+
+  auto quant_op_out =
+      pattern->NewNode(GetNodeName("quant_op_out"))
+          ->assert_is_op_output("fake_quantize_range_abs_max", "Out")
+          ->assert_is_op_input(op_type)
+          ->AsIntermediate();
+
+  // there are 'times' quantized and dequant op
+  std::vector<PDNode *> nodes;
+  for (int i = 0; i < times; i++) {
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("quantized_op_weight") + std::to_string(i))
+            ->assert_is_op_input(op_type, weight_name)
+            ->AsInput());
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("quantized_op") + std::to_string(i))
+            ->assert_is_op(op_type));
+
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("quantized_op_out") + std::to_string(i))
+            ->assert_is_op_output(op_type)
+            ->assert_is_op_input("fake_dequantize_max_abs", "X")
+            ->AsIntermediate());
+
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("dequant_op") + std::to_string(i))
+            ->assert_is_op("fake_dequantize_max_abs"));
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("dequant_op_out") + std::to_string(i))
+            ->assert_is_op_output("fake_dequantize_max_abs", "Out")
+            ->AsOutput());
+  }
+
+  quant_op->LinksFrom({quant_op_input, quant_op_in_scale});
+  quant_op_out->LinksFrom({quant_op});
+  for (int i = 0; i < times; i++) {
+    nodes[i * kNumFields + kQuantizedOpOffset]->LinksFrom(
+        {quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]});
+    nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom(
+        {nodes[i * kNumFields + kQuantizedOpOffset]});
+    nodes[i * kNumFields + kDequantOpOffset]->LinksFrom(
+        {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale});
+    nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom(
+        {nodes[i * kNumFields + kDequantOpOffset]});
+  }
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 130ddeac4c..a5ac3a0c37 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -848,7 +848,8 @@ struct AnakinDetectionPattern : public PatternBase {
   AnakinDetectionPattern(PDPattern* pattern, const std::string& name_scope)
       : PatternBase(pattern, name_scope, "anakin_detect_pattern") {}
 
-  PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);
+  PDNode* operator()(std::vector<PDNode*> conv_inputs, int times,
+                     std::string priorbox_type, bool is_reshape);
 
   std::string GetNodeName(const std::string& op_type) {
     return PDNodeName(name_scope_, repr_, id_, op_type);
@@ -859,9 +860,9 @@ struct AnakinDetectionPattern : public PatternBase {
   }
 };
 
-struct AnakinFillConstantElementWiseMulFuse : public PatternBase {
-  AnakinFillConstantElementWiseMulFuse(PDPattern* pattern,
-                                       const std::string& name_scope)
+struct FillConstantElementWiseMulFuse : public PatternBase {
+  FillConstantElementWiseMulFuse(PDPattern* pattern,
+                                 const std::string& name_scope)
       : PatternBase(pattern, name_scope,
                     "anakin_fillconstant_elementwisemul_fuse") {}
 
@@ -874,6 +875,22 @@ struct AnakinFillConstantElementWiseMulFuse : public PatternBase {
   PATTERN_DECL_NODE(elementwise_mul_out);
 };
 
+struct QuantDequantOpFuse : public PatternBase {
+  QuantDequantOpFuse(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "quant_dequant_fuse") {}
+
+  void operator()(PDNode* quant_op_input, const std::string& op_name,
+                  const std::string& weight_name, int times = 1);
+
+  std::string GetNodeName(const std::string& op_type) {
+    return PDNodeName(name_scope_, repr_, id_, op_type);
+  }
+
+  PDNode* GetPDNode(const std::string& op_type) {
+    return pattern->RetrieveNode(GetNodeName(op_type));
+  }
+};
+
 }  // namespace patterns
 
 // Link two ir::Nodes from each other.
diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
new file mode 100644
index 0000000000..7cab9c353d
--- /dev/null
+++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
@@ -0,0 +1,173 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
+                     std::string op_type) {
+  const std::string pattern_name = "quant_dequant_fuse";
+  //  FusePassBase::Init(pattern_name, graph);
+  const int kNumFields = 5;
+  const int kQuantizedWeightOffset = 0;
+  const int kQuantizedOpOffset = 1;
+  const int kQuantizedOpOutOffset = 2;
+  const int kDequantOpOffset = 3;
+  const int kDequantOpOutOffset = 4;
+
+  GraphPatternDetector gpd;
+  auto* x = gpd.mutable_pattern()
+                ->NewNode("x")
+                ->assert_is_op_input("fake_quantize_range_abs_max", "X")
+                ->AsInput();
+
+  std::string quantized_op_type = "";
+  std::string weight_name = "";
+  if (op_type == "conv2d") {
+    quantized_op_type = "conv2d";
+    weight_name = "Filter";
+  } else if (op_type == "conv2d_fusion") {
+    quantized_op_type = "conv2d_fusion";
+    weight_name = "Filter";
+  } else if (op_type == "mul") {
+    quantized_op_type = "mul";
+    weight_name = "Y";
+  } else if (op_type == "fc") {
+    quantized_op_type = "fc";
+    weight_name = "W";
+  } else {
+    PADDLE_ENFORCE(
+        "QuantDequantFuse: We only support conv2d, conv2d_fusion, fc, mul for "
+        "now.");
+  }
+
+  patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name);
+  pattern(x, quantized_op_type, weight_name, times);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    PADDLE_ENFORCE(subgraph.count(x));
+    auto* input_node = subgraph.at(x);
+    Node* quant_op_in_scale =
+        subgraph.at(pattern.GetPDNode("quant_op_in_scale"));
+    Node* quant_op = subgraph.at(pattern.GetPDNode("quant_op"));
+    Node* quant_op_out_scale =
+        subgraph.at(pattern.GetPDNode("quant_op_out_scale"));
+    Node* quant_op_out = subgraph.at(pattern.GetPDNode("quant_op_out"));
+
+    std::vector<Node*> nodes;
+    for (int i = 0; i < times; i++) {
+      nodes.push_back(subgraph.at(
+          pattern.GetPDNode("quantized_op_weight" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("quantized_op" + std::to_string(i))));
+      nodes.push_back(subgraph.at(
+          pattern.GetPDNode("quantized_op_out" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("dequant_op" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("dequant_op_out" + std::to_string(i))));
+    }
+
+    int bit_length = boost::get<int>(quant_op->Op()->GetAttr("bit_length"));
+    int range = ((1 << (bit_length - 1)) - 1);
+    // Prepare input scale
+    std::string input_scale_var_name = quant_op->Op()->Input("InScale").front();
+    PADDLE_ENFORCE(scope);
+    const LoDTensor& input_scale_tensor =
+        scope->FindVar(input_scale_var_name)->Get<LoDTensor>();
+
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(input_scale_tensor.place()));
+    const float* input_scale_data = input_scale_tensor.data<float>();
+    float input_scale = input_scale_data[0];
+    std::unordered_set<const Node*> delete_nodes;
+
+    for (int i = 0; i < times; i++) {
+      // max_range = (range * range) / weight_scale
+      float max_range = boost::get<float>(
+          nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr("max_range"));
+      float weight_scale = (range * range) / max_range;
+
+      auto base_op_desc =
+          *nodes[i * kNumFields + kQuantizedOpOffset]->Op()->Proto();
+      std::string new_input = input_node->Name();
+      std::string new_output =
+          nodes[i * kNumFields + kDequantOpOutOffset]->Name();
+
+      framework::OpDesc new_op_desc(base_op_desc, nullptr);
+      new_op_desc.SetType(quantized_op_type);
+
+      if (quantized_op_type == "conv2d" ||
+          quantized_op_type == "conv2d_fusion") {
+        new_op_desc.SetInput("Input", {new_input});
+        new_op_desc.SetOutput("Output", {new_output});
+      } else if (quantized_op_type == "fc") {
+        new_op_desc.SetInput("Input", {new_input});
+        new_op_desc.SetOutput("Out", {new_output});
+      } else if (quantized_op_type == "mul") {
+        new_op_desc.SetInput("X", {new_input});
+        new_op_desc.SetOutput("Out", {new_output});
+      }
+
+      new_op_desc.SetAttr("enable_int8", true);
+      new_op_desc.SetAttr("input_scale", input_scale);
+      new_op_desc.SetAttr("weight_scale", weight_scale);
+      new_op_desc.Flush();
+      auto* new_op = graph->CreateOpNode(&new_op_desc);
+      IR_NODE_LINK_TO(input_node, new_op);
+      IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset], new_op);
+      IR_NODE_LINK_TO(new_op, nodes[i * kNumFields + kDequantOpOutOffset]);
+      delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOffset]);
+      delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOutOffset]);
+      delete_nodes.insert(nodes[i * kNumFields + kDequantOpOffset]);
+    }
+
+    delete_nodes.insert(quant_op_in_scale);
+    delete_nodes.insert(quant_op);
+    delete_nodes.insert(quant_op_out);
+    delete_nodes.insert(quant_op_out_scale);
+    // Delete the unneeded nodes.
+    GraphSafeRemoveNodes(graph, delete_nodes);
+  };
+  gpd(graph, handler);
+}
+
+void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
+  const std::string pattern_name = "quant_dequant_fuse";
+  FusePassBase::Init(pattern_name, graph);
+
+  std::unordered_set<std::string> quantized_op_types = {"conv2d", "mul"};
+  auto* scope = param_scope();
+  for (auto& op_type : quantized_op_types) {
+    for (int i = 1; i <= 6; i++) {
+      RunQuantDequant(graph, scope, i, op_type);
+    }
+  }
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(quant_conv2d_dequant_fuse_pass,
+              paddle::framework::ir::QuantDequantFusePass);
diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h
new file mode 100644
index 0000000000..a61b34563a
--- /dev/null
+++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class QuantDequantFusePass : public FusePassBase {
+ public:
+  virtual ~QuantDequantFusePass() {}
+
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc b/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.cc
similarity index 84%
rename from paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
rename to paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.cc
index e1ddc44470..b3606e4d92 100644
--- a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
+++ b/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.cc
@@ -17,25 +17,24 @@
 
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h"
+#include "paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
-template <int times>
-void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
-    ir::Graph *graph) const {
+void RunSimplifyAnakinDetection(ir::Graph *graph, int times, bool is_density,
+                                bool is_reshape) {
   const std::string pattern_name =
       "simplify_anakin_detection_pattern_pass" + std::to_string(times);
-  FusePassBase::Init(pattern_name, graph);
+  std::string priorbox_type = is_density ? "density_prior_box" : "prior_box";
 
   GraphPatternDetector gpd;
   std::vector<PDNode *> input_nodes;
   for (int i = 0; i < times; i++) {
     input_nodes.push_back(gpd.mutable_pattern()
                               ->NewNode("x" + std::to_string(i))
-                              ->assert_is_op_input("density_prior_box", "Input")
+                              ->assert_is_op_input(priorbox_type, "Input")
                               ->AsInput());
   }
   input_nodes.push_back(gpd.mutable_pattern()
@@ -49,7 +48,7 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
                             ->AsInput());
 
   patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name);
-  pattern(input_nodes, times);
+  pattern(input_nodes, times, priorbox_type, is_reshape);
 
   auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                      Graph *g) {
@@ -119,8 +118,7 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
         boost::get<std::string>(box_coder_op->Op()->GetAttr("code_type"));
     bool box_normalized =
         boost::get<bool>(box_coder_op->Op()->GetAttr("box_normalized"));
-    // auto variance =
-    // boost::get<std::vector<float>>(box_coder_op->Op()->GetAttr("variance"));
+
     int background_label =
         boost::get<int>(multiclass_nms->Op()->GetAttr("background_label"));
     float score_threshold =
@@ -138,7 +136,6 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
           nodes[i * kNumFields + kPriorBoxLocOffset]->Name());
     }
 
-    // int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
     framework::OpDesc concat1_desc;
     concat1_desc.SetType("concat");
     concat1_desc.SetInput("X", concat1_input_names);
@@ -213,31 +210,24 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
   gpd(graph, handler);
 }
 
-template class SimplifyAnakinDetectionPatternPass<1>;
-template class SimplifyAnakinDetectionPatternPass<2>;
-template class SimplifyAnakinDetectionPatternPass<3>;
-template class SimplifyAnakinDetectionPatternPass<4>;
-template class SimplifyAnakinDetectionPatternPass<5>;
-template class SimplifyAnakinDetectionPatternPass<6>;
+void SimplifyAnakinDetectionPatternPass::ApplyImpl(ir::Graph *graph) const {
+  const int pattern_nums = 6;
+  const std::string pattern_name = "simplify_anakin_detection_pattern_pass";
+  FusePassBase::Init(pattern_name, graph);
+  std::vector<bool> options = {true, false};
+  for (const auto &is_density : options) {
+    for (const auto &is_reshape : options) {
+      for (int i = 1; i <= pattern_nums; i++) {
+        RunSimplifyAnakinDetection(graph, i, is_density, is_reshape);
+      }
+    }
+  }
+}
 
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
 
-REGISTER_PASS(simplify_anakin_detection_pattern_pass,
-              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<1>);
-
-REGISTER_PASS(simplify_anakin_detection_pattern_pass2,
-              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<2>);
-
-REGISTER_PASS(simplify_anakin_detection_pattern_pass3,
-              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<3>);
-
-REGISTER_PASS(simplify_anakin_detection_pattern_pass4,
-              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<4>);
-
-REGISTER_PASS(simplify_anakin_detection_pattern_pass5,
-              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<5>);
-
-REGISTER_PASS(simplify_anakin_detection_pattern_pass6,
-              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<6>);
+typedef paddle::framework::ir::SimplifyAnakinDetectionPatternPass
+    priorbox_pattern;
+REGISTER_PASS(simplify_anakin_priorbox_detection_out_pass, priorbox_pattern);
diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h b/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h
similarity index 98%
rename from paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h
rename to paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h
index e4a266cbe8..e882b9dc25 100644
--- a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h
+++ b/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h
@@ -26,7 +26,6 @@ namespace ir {
 // these structures will be used as inputs to the concat Op. This pattern will
 // be detected by our pass. The times here represents the repeat times of this
 // structure.
-template <int times>
 class SimplifyAnakinDetectionPatternPass : public FusePassBase {
  public:
   virtual ~SimplifyAnakinDetectionPatternPass() {}
diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
index 61c12d4b6e..a984a4942b 100644
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
@@ -25,11 +25,9 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-template <int times>
-void TransposeFlattenConcatFusePass<times>::ApplyImpl(ir::Graph *graph) const {
+void RunTransposeFlattenConcatFuse(ir::Graph *graph, int times) {
   const std::string pattern_name =
       "transpose_flatten" + std::to_string(times) + "_concat_fuse";
-  FusePassBase::Init(pattern_name, graph);
 
   GraphPatternDetector gpd;
   std::vector<PDNode *> input_nodes;
@@ -122,31 +120,18 @@ void TransposeFlattenConcatFusePass<times>::ApplyImpl(ir::Graph *graph) const {
   gpd(graph, handler);
 }
 
-template class TransposeFlattenConcatFusePass<1>;
-template class TransposeFlattenConcatFusePass<2>;
-template class TransposeFlattenConcatFusePass<3>;
-template class TransposeFlattenConcatFusePass<4>;
-template class TransposeFlattenConcatFusePass<5>;
-template class TransposeFlattenConcatFusePass<6>;
+void TransposeFlattenConcatFusePass::ApplyImpl(ir::Graph *graph) const {
+  const int pattern_nums = 6;
+  const std::string pattern_name = "transpose_flatten_concat_fuse";
+  FusePassBase::Init(pattern_name, graph);
+  for (int i = 1; i <= pattern_nums; i++) {
+    RunTransposeFlattenConcatFuse(graph, i);
+  }
+}
 
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
 
 REGISTER_PASS(transpose_flatten_concat_fuse_pass,
-              paddle::framework::ir::TransposeFlattenConcatFusePass<1>);
-
-REGISTER_PASS(transpose_flatten2_concat_fuse_pass,
-              paddle::framework::ir::TransposeFlattenConcatFusePass<2>);
-
-REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
-              paddle::framework::ir::TransposeFlattenConcatFusePass<3>);
-
-REGISTER_PASS(transpose_flatten4_concat_fuse_pass,
-              paddle::framework::ir::TransposeFlattenConcatFusePass<4>);
-
-REGISTER_PASS(transpose_flatten5_concat_fuse_pass,
-              paddle::framework::ir::TransposeFlattenConcatFusePass<5>);
-
-REGISTER_PASS(transpose_flatten6_concat_fuse_pass,
-              paddle::framework::ir::TransposeFlattenConcatFusePass<6>);
+              paddle::framework::ir::TransposeFlattenConcatFusePass);
diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
index 366d26d800..939a8c31e5 100644
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
+++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #pragma once
+#include <memory>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 
@@ -24,7 +26,6 @@ namespace ir {
 // these structures will be used as inputs to the concat Op. This pattern will
 // be detected by our pass. The times here represents the repeat times of this
 // structure.
-template <int times>
 class TransposeFlattenConcatFusePass : public FusePassBase {
  public:
   virtual ~TransposeFlattenConcatFusePass() {}
diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.cc b/paddle/fluid/inference/anakin/convert/density_prior_box.cc
index a55c153f99..35e02919aa 100644
--- a/paddle/fluid/inference/anakin/convert/density_prior_box.cc
+++ b/paddle/fluid/inference/anakin/convert/density_prior_box.cc
@@ -34,25 +34,41 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
   auto input_name = op_desc.Input("Input").front();
   auto image_name = op_desc.Input("Image").front();
   auto output_name = op_desc.Output("Boxes").front();
+  auto op_type = op_desc.Type();
+  auto op_name = op_type + ":" + op_desc.Output("Boxes").front();
 
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Boxes").front();
+  // only for density_prior_box
+  std::vector<float> fixed_sizes = {};
+  std::vector<float> fixed_ratios = {};
+  std::vector<int> densities = {};
 
-  auto fixed_sizes =
-      boost::get<std::vector<float>>(op_desc.GetAttr("fixed_sizes"));
-  auto fixed_ratios =
-      boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
-  auto densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
+  std::vector<float> min_sizes = {};
+  std::vector<float> max_sizes = {};
+  std::vector<float> aspect_ratios = {};
+  bool is_clip = false;
+  bool is_flip = false;
+
+  if (op_type == "density_prior_box") {
+    fixed_sizes =
+        boost::get<std::vector<float>>(op_desc.GetAttr("fixed_sizes"));
+    fixed_ratios =
+        boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
+    densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
+    is_clip = boost::get<bool>(op_desc.GetAttr("clip"));
+  } else if (op_type == "prior_box") {
+    min_sizes = boost::get<std::vector<float>>(op_desc.GetAttr("min_sizes"));
+    max_sizes = boost::get<std::vector<float>>(op_desc.GetAttr("max_sizes"));
+    aspect_ratios =
+        boost::get<std::vector<float>>(op_desc.GetAttr("aspect_ratios"));
+    is_clip = boost::get<bool>(op_desc.GetAttr("clip"));
+    is_flip = boost::get<bool>(op_desc.GetAttr("flip"));
+  }
   std::vector<float> dens;
   for (auto& ele : densities) {
     dens.push_back(static_cast<float>(ele));
   }
 
-  // lack flip
-  // auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
   auto variances = boost::get<std::vector<float>>(op_desc.GetAttr("variances"));
-  for (auto& ele : variances) {
-    LOG(INFO) << ele;
-  }
 
   // lack img_h, img_w
   auto step_h = boost::get<float>(op_desc.GetAttr("step_h"));
@@ -66,14 +82,14 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
   std::vector<float> temp_v = {};
 
   engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name});
-  engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", temp_v);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", temp_v);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", temp_v);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", min_sizes);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", max_sizes);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", aspect_ratios);
   engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_size", fixed_sizes);
   engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratio", fixed_ratios);
   engine_->AddOpAttr<PTuple<float>>(op_name, "density", dens);
-  engine_->AddOpAttr(op_name, "is_flip", static_cast<bool>(false));
-  engine_->AddOpAttr(op_name, "is_clip", static_cast<bool>(false));
+  engine_->AddOpAttr(op_name, "is_flip", is_flip);
+  engine_->AddOpAttr(op_name, "is_clip", is_clip);
   engine_->AddOpAttr<PTuple<float>>(op_name, "variance", variances);
   engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
   engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
@@ -88,3 +104,4 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
 }  // namespace paddle
 
 REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter);
+REGISTER_ANAKIN_OP_CONVERTER(prior_box, DensityPriorBoxOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h
index 4603681e1e..45db422174 100644
--- a/paddle/fluid/inference/anakin/convert/op_converter.h
+++ b/paddle/fluid/inference/anakin/convert/op_converter.h
@@ -48,7 +48,7 @@ class AnakinOpConverter {
     framework::OpDesc op_desc(op, nullptr);
     std::string op_type = op_desc.Type();
     AnakinOpConverter *it = nullptr;
-
+    if (op_type == "depthwise_conv2d") op_type = "conv2d";
     if (op_type == "reshape2") op_type = "reshape";
     if (op_type == "transpose2") op_type = "transpose";
     if (op_type == "flatten2") op_type = "flatten";
diff --git a/paddle/fluid/inference/anakin/op_teller.cc b/paddle/fluid/inference/anakin/op_teller.cc
index 90cf021de2..2042fb18ea 100644
--- a/paddle/fluid/inference/anakin/op_teller.cc
+++ b/paddle/fluid/inference/anakin/op_teller.cc
@@ -42,6 +42,8 @@ struct SimpleOpTypeSetTeller : public Teller {
     teller_set.insert("dropout");
     teller_set.insert("sigmoid");
     teller_set.insert("sum");
+    teller_set.insert("depthwise_conv2d");
+    teller_set.insert("prior_box");
   }
 
   bool operator()(const std::string& op_type,
diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
index 9e05aa5c16..38612d5cc3 100644
--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
@@ -37,14 +37,14 @@ using framework::ir::Node;
 
 void analysis::AnakinSubgraphPass::ApplyImpl(
     framework::ir::Graph *graph) const {
-  framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph.get());
+  framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph);
 
   auto teller = [](const framework::ir::Node *node) {
     if (!node->IsOp() || !node->Op()) return false;
     return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
   };
 
-  SubGraphFuser fuser(graph.get(), teller, 6 /* min_subgraph_size */);
+  SubGraphFuser fuser(graph, teller, 6 /* min_subgraph_size */);
   fuser();
 
   std::vector<std::string> graph_param_names =
@@ -56,10 +56,10 @@ void analysis::AnakinSubgraphPass::ApplyImpl(
 
   for (auto *node : graph->Nodes()) {
     if (node->IsOp() && !Agent(node).subgraph()->empty()) {
-      CreateAnakinOp(node, graph.get(), graph_param_names, &repetitive_params);
+      CreateAnakinOp(node, graph, graph_param_names, &repetitive_params);
       std::unordered_set<const Node *> nodes2remove(
           Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
-      framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+      framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
     }
   }
 
@@ -69,7 +69,7 @@ void analysis::AnakinSubgraphPass::ApplyImpl(
       nodes2remove.insert(node);
     }
   }
-  framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+  framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
   graph->Set(framework::ir::kRepetitiveParamAttr,
              new std::vector<std::string>(repetitive_params));
 }
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index ef5872c52c..019098a5dd 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -192,6 +192,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
           block_desc.Proto()->SerializeAsString());
   SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
   SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
+  SetAttr(op_desc->Proto(), "gpu_id", Get<int>("gpu_device_id"));
   SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
   SetAttr(op_desc->Proto(), "parameters", params);
 
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index d13ec7608c..1f27e80cf4 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -52,6 +52,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   for (auto &var_name : all_vars) {
     if (std::count(repetitive_params.begin(), repetitive_params.end(),
                    var_name)) {
+      scope->EraseVars({var_name});
       continue;
     }
     auto *var = scope->FindLocalVar(var_name);
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index f726056154..7d8e9fe8bf 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -886,4 +886,5 @@ USE_ANAKIN_CONVERTER(detection_out);
 USE_ANAKIN_CONVERTER(density_prior_box);
 USE_ANAKIN_CONVERTER(dropout);
 USE_ANAKIN_CONVERTER(sum);
+USE_ANAKIN_CONVERTER(prior_box);
 #endif
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 8ec32b3a0b..1d1d39e440 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -70,17 +70,15 @@ void GpuPassStrategy::EnableMKLDNN() {
 
 // The following passes works for Anakin sub-graph engine.
 const std::vector<std::string> kAnakinSubgraphPasses({
-    "infer_clean_graph_pass",                   //
-    "simplify_anakin_detection_pattern_pass5",  //
-    "simplify_anakin_detection_pattern_pass4",  //
-    "simplify_anakin_detection_pattern_pass3",  //
-    "simplify_anakin_detection_pattern_pass2",  //
-    "anakin_fillconstant_elementwisemul_fuse",  //
-    "fc_fuse_pass",                             //
-    "conv_elementwise_add_fuse_pass",           //
-    "conv_bn_fuse_pass",                        //
-    "conv_elementwise_add_fuse_pass",           //
-    "fc_gru_fuse_pass",                         //
+    "infer_clean_graph_pass",                       //
+    "simplify_anakin_priorbox_detection_out_pass",  //
+    "fillconstant_elementwisemul_fuse",             //
+    "fc_fuse_pass",                                 //
+    "conv_elementwise_add_fuse_pass",               //
+    "conv_bn_fuse_pass",                            //
+    "conv_elementwise_add_fuse_pass",               //
+    "fc_gru_fuse_pass",                             //
+    "quant_conv2d_dequant_fuse_pass",               //
     "anakin_subgraph_pass",
 });
 
@@ -97,13 +95,10 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
         "conv_elementwise_add2_act_fuse_pass",  //
         "conv_elementwise_add_fuse_pass",       //
         "runtime_context_cache_pass",           //
-#endif
+#endif                                          //
+        "transpose_flatten_concat_fuse_pass",
   });
 
-  for (int i = 6; i >= 2; i--) {
-    passes_.push_back("transpose_flatten" + std::to_string(i) +
-                      "_concat_fuse_pass");
-  }
   use_gpu_ = true;
 }
 
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index c366733124..8010bd8ecc 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -52,6 +52,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
   std::string engine_key_;
   std::string engine_serialized_data_;
   bool calibration_mode_;
+  int device_id_;
 
  public:
   TensorRTEngineOp(const std::string &type,
@@ -62,6 +63,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
     input_names_ = Inputs("Xs");
     max_batch_size_ = Attr<int>("max_batch_size");
     workspace_size_ = Attr<int>("workspace_size");
+    device_id_ = Attr<int>("gpu_id");
     enable_int8_ = Attr<bool>("enable_int8");
     calibration_data_ = Attr<std::string>("calibration_data");
     engine_key_ = Attr<std::string>("engine_key");
@@ -79,6 +81,17 @@ class TensorRTEngineOp : public framework::OperatorBase {
     if (enable_int8_ && calibration_data_.size()) {
       calibrator_.reset(new TRTInt8Calibrator(calibration_data_));
     }
+
+    if (!calibration_mode_) {
+      trt_engine_.reset(new inference::tensorrt::TensorRTEngine(
+          max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(),
+          device_id_));
+      PADDLE_ENFORCE(engine_serialized_data_.size(),
+                     "TRT serialized data should not be empty here,"
+                     "there must be error when generate serialized data in TRT "
+                     "subgraph detect pass.");
+      trt_engine_->Deserialize(engine_serialized_data_);
+    }
   }
 
  protected:
@@ -223,14 +236,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
   TensorRTEngine *GetEngine(const framework::Scope &scope,
                             const platform::Place &dev_place) const {
     if (!trt_engine_) {
-      trt_engine_.reset(new inference::tensorrt::TensorRTEngine(
-          max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(),
-          boost::get<platform::CUDAPlace>(dev_place).device));
-      if (!engine_serialized_data_.empty()) {
-        trt_engine_->Deserialize(engine_serialized_data_);
-      } else {
-        PrepareTRTEngine(scope, trt_engine_.get());
-      }
+      PrepareTRTEngine(scope, trt_engine_.get());
     }
     return trt_engine_.get();
   }
-- 
GitLab