From 61221ebc28a9cc6f953715b54838b810e06f8df9 Mon Sep 17 00:00:00 2001 From: Zhaolong Xing Date: Sat, 25 May 2019 12:17:54 +0800 Subject: [PATCH] TRT: Support set dynamic range in int8 mode. (#17524) * fluid int8 train and trt int8 predict align. trt int8 predict init op converter * 2. align fluid int8 train and trt int8 inference. enhance quant dequant fuse pass enhance op converter, trt engine, trt engine op, trt subgraph pass. * 3. add delete_quant_dequant_pass for trt test=develop * 4. add the missing file test=develop * 5. i modify the c++ interface, but forget to modify the pybind code fix the IS_TRT_VERSION_GE bug, and fix elementwise op converter test=develop --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../ir/delete_quant_dequant_op_pass.cc | 82 ++++++++++++++++++ .../ir/delete_quant_dequant_op_pass.h | 34 ++++++++ paddle/fluid/framework/ir/fc_fuse_pass.cc | 5 ++ .../framework/ir/graph_pattern_detector.cc | 85 ++++++++++++++++--- .../framework/ir/graph_pattern_detector.h | 17 +++- .../ir/quant_conv2d_dequant_fuse_pass.cc | 73 ++++++++++------ .../fluid/inference/anakin/convert/conv2d.cc | 7 +- .../inference/anakin/convert/conv2d_fusion.cc | 7 +- paddle/fluid/inference/anakin/convert/fc.cc | 7 +- paddle/fluid/inference/analysis/argument.h | 1 + .../inference/analysis/ir_pass_manager.cc | 2 + .../analysis/ir_passes/subgraph_util.cc | 4 +- .../analysis/ir_passes/subgraph_util.h | 2 +- .../ir_passes/tensorrt_subgraph_pass.cc | 11 ++- paddle/fluid/inference/api/analysis_config.cc | 15 ++-- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/api/paddle_analysis_config.h | 4 +- .../inference/api/paddle_pass_builder.cc | 18 ++++ .../fluid/inference/api/paddle_pass_builder.h | 1 + .../tensorrt/convert/activation_op.cc | 13 +-- .../tensorrt/convert/batch_norm_op.cc | 8 +- .../inference/tensorrt/convert/concat_op.cc | 8 +- .../inference/tensorrt/convert/conv2d_op.cc | 47 ++++++---- .../inference/tensorrt/convert/dropout_op.cc | 7 +- .../tensorrt/convert/elementwise_op.cc | 55 ++++++------ .../fluid/inference/tensorrt/convert/fc_op.cc | 71 +++++++++++----- .../tensorrt/convert/leaky_relu_op.cc | 10 +-- .../inference/tensorrt/convert/op_converter.h | 15 ++++ .../inference/tensorrt/convert/pad_op.cc | 8 +- .../inference/tensorrt/convert/pool2d_op.cc | 12 +-- .../inference/tensorrt/convert/prelu_op.cc | 9 +- .../inference/tensorrt/convert/softmax_op.cc | 10 ++- .../inference/tensorrt/convert/ut_helper.h | 3 +- paddle/fluid/inference/tensorrt/engine.cc | 79 ++++++++++++++++- paddle/fluid/inference/tensorrt/engine.h | 14 +++ paddle/fluid/inference/tensorrt/op_teller.cc | 2 +- paddle/fluid/inference/tensorrt/op_teller.h | 2 + .../operators/tensorrt/tensorrt_engine_op.h | 5 +- .../tensorrt/tensorrt_engine_op_test.cc | 2 + paddle/fluid/pybind/inference_api.cc | 2 +- 41 files changed, 563 insertions(+), 196 deletions(-) create mode 100644 paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc create mode 100644 paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 3210f3041a..bfba73c289 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -75,6 +75,7 @@ pass_library(runtime_context_cache_pass base) pass_library(quant_conv2d_dequant_fuse_pass inference) pass_library(fillconstant_elementwisemul_fuse inference) pass_library(shuffle_channel_detect_pass inference) +pass_library(delete_quant_dequant_op_pass inference) 
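# delete_quant_dequant_op_pass (registered above) removes the
# fake_quantize_dequantize_moving_average_abs_max ops inserted by quant-aware
# training and reconnects each consumer directly to the original tensor; see
# the pass implementation below.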
if(ANAKIN_FOUND) pass_library(simplify_anakin_priorbox_detection_out_pass inference) diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc new file mode 100644 index 0000000000..3d4df87ab7 --- /dev/null +++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(any_op_out); \ + GET_IR_NODE(quant_dequant_op_inscale); \ + GET_IR_NODE(quant_dequant_op); \ + GET_IR_NODE(quant_dequant_op_outscale); \ + GET_IR_NODE(quant_dequant_op_out); \ + GET_IR_NODE(any_op2); + +void DeleteQuantDequantOpPass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "delete_quantdequant_op_pattern"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + + patterns::DeleteQuantDequantOpPattern pattern(gpd.mutable_pattern(), + pattern_name); + pattern(); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + IR_NODE_LINK_TO(any_op_out, any_op2); + std::string any_op_out_name = any_op_out->Var()->Name(); + std::string quant_dequant_op_out_name = quant_dequant_op_out->Var()->Name(); + + auto* any_op2_desc = any_op2->Op(); + // auto input_args_names = any_op2_desc->InputArgumentNames(); + auto var_map = any_op2_desc->Inputs(); + + for (auto& name_m : var_map) { + if (std::find(name_m.second.begin(), name_m.second.end(), + quant_dequant_op_out_name) != name_m.second.end()) { + std::vector new_inputs; + for (auto& i_n : name_m.second) { + if (i_n != quant_dequant_op_out_name) { + new_inputs.push_back(i_n); + } + } + new_inputs.push_back(any_op_out_name); + any_op2_desc->SetInput(name_m.first, new_inputs); + any_op2_desc->Flush(); + } + } + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph, + {quant_dequant_op, quant_dequant_op_out, + quant_dequant_op_inscale, quant_dequant_op_outscale}); + }; + + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(delete_quant_dequant_op_pass, + paddle::framework::ir::DeleteQuantDequantOpPass); diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h new file mode 100644 index 0000000000..938ada6453 --- /dev/null +++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class DeleteQuantDequantOpPass : public FusePassBase { + public: + virtual ~DeleteQuantDequantOpPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 4691b9abfd..102fd38865 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -78,6 +78,11 @@ void FCFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("enable_int8", base_op_desc->GetAttr("enable_int8")); desc.SetAttr("input_scale", base_op_desc->GetAttr("input_scale")); desc.SetAttr("weight_scale", base_op_desc->GetAttr("weight_scale")); + if (base_op_desc->HasAttr("out_scale")) + desc.SetAttr("out_scale", base_op_desc->GetAttr("out_scale")); + auto elementwise_desc = elementwise_add->Op(); + if (elementwise_desc->HasAttr("out_scale")) + desc.SetAttr("out_scale", elementwise_desc->GetAttr("out_scale")); } desc.SetType("fc"); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index f0d47ad57f..d50ca63603 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1738,13 +1738,16 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, const std::string &op_type, const std::string &weight_name, int times, - const std::string &quant_type) { - const int kNumFields = 5; + const std::string &quant_type, + const std::string &dequant_type) { + int kNumFields = 5; const int kQuantizedWeightOffset = 0; const int kQuantizedOpOffset = 1; const int kQuantizedOpOutOffset = 2; const int kDequantOpOffset = 3; const int kDequantOpOutOffset = 4; + const int kDequantOpWeightScaleOffset = 5; + // the quant op always be one. 
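+  // Node handles for the repeated (quantized op -> dequant op) chains are
+  // later collected into a flat vector, so entry k of repetition i is
+  // addressed as nodes[i * kNumFields + k] using the offsets above.
+  // kNumFields is bumped to 6 only when dequant_type is
+  // "fake_channel_wise_dequantize_max_abs", which adds the per-channel
+  // weight-scale input at kDequantOpWeightScaleOffset.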
auto quant_op_in_scale = pattern->NewNode(GetNodeName("quant_op_in_scale")) ->assert_is_op_input(quant_type, "InScale") @@ -1752,11 +1755,19 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, auto quant_op = pattern->NewNode(GetNodeName("quant_op"))->assert_is_op(quant_type); - auto quant_op_out_scale = - pattern->NewNode(GetNodeName("quant_op_out_scale")) - ->assert_is_op_output(quant_type, "OutScale") - ->assert_is_op_input("fake_dequantize_max_abs", "Scale") - ->AsIntermediate(); + PDNode *quant_op_out_scale = nullptr; + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + kNumFields += 1; + quant_op_out_scale = pattern->NewNode(GetNodeName("quant_op_out_scale")) + ->assert_is_op_output(quant_type, "OutScale") + ->assert_is_op_nth_input(dequant_type, "Scales", 1) + ->AsIntermediate(); + } else { + quant_op_out_scale = pattern->NewNode(GetNodeName("quant_op_out_scale")) + ->assert_is_op_output(quant_type, "OutScale") + ->assert_is_op_input(dequant_type, "Scale") + ->AsIntermediate(); + } auto quant_op_out = pattern->NewNode(GetNodeName("quant_op_out")) ->assert_is_op_output(quant_type, "Out") @@ -1777,16 +1788,25 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, nodes.push_back( pattern->NewNode(GetNodeName("quantized_op_out") + std::to_string(i)) ->assert_is_op_output(op_type) - ->assert_is_op_input("fake_dequantize_max_abs", "X") + ->assert_is_op_input(dequant_type, "X") ->AsIntermediate()); nodes.push_back( pattern->NewNode(GetNodeName("dequant_op") + std::to_string(i)) - ->assert_is_op("fake_dequantize_max_abs")); + ->assert_is_op(dequant_type)); + nodes.push_back( pattern->NewNode(GetNodeName("dequant_op_out") + std::to_string(i)) - ->assert_is_op_output("fake_dequantize_max_abs", "Out") + ->assert_is_op_output(dequant_type, "Out") ->AsOutput()); + + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + nodes.push_back(pattern + ->NewNode(GetNodeName("dequant_channel_scale") + + std::to_string(i)) + ->assert_is_op_nth_input(dequant_type, "Scales", 0) + ->AsInput()); + } } quant_op->LinksFrom({quant_op_input, quant_op_in_scale}); @@ -1796,8 +1816,14 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, {quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]}); nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom( {nodes[i * kNumFields + kQuantizedOpOffset]}); - nodes[i * kNumFields + kDequantOpOffset]->LinksFrom( - {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale}); + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + nodes[i * kNumFields + kDequantOpOffset]->LinksFrom( + {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale, + nodes[i * kNumFields + kDequantOpWeightScaleOffset]}); + } else { + nodes[i * kNumFields + kDequantOpOffset]->LinksFrom( + {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale}); + } nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom( {nodes[i * kNumFields + kDequantOpOffset]}); } @@ -1834,6 +1860,41 @@ void patterns::ShuffleChannelPattern::operator()(PDNode *reshape1_in) { reshape2_out->LinksFrom({reshape2_op}); } +void patterns::DeleteQuantDequantOpPattern::operator()() { + auto any_op_out = + pattern->NewNode(any_op_out_repr()) + ->assert_is_op_input( + "fake_quantize_dequantize_moving_average_abs_max", "X") + ->AsInput(); + + auto quant_dequant_op_inscale = + pattern->NewNode(quant_dequant_op_inscale_repr()) + ->assert_is_op_input( + "fake_quantize_dequantize_moving_average_abs_max", "InScale") + 
->AsInput(); + auto quant_dequant_op = + pattern->NewNode(quant_dequant_op_repr()) + ->assert_is_op("fake_quantize_dequantize_moving_average_abs_max"); + + auto quant_dequant_out = + pattern->NewNode(quant_dequant_op_out_repr()) + ->assert_is_op_output( + "fake_quantize_dequantize_moving_average_abs_max", "Out") + ->AsIntermediate(); + + auto quant_dequant_op_outscale = + pattern->NewNode(quant_dequant_op_outscale_repr()) + ->assert_is_op_output( + "fake_quantize_dequantize_moving_average_abs_max", "OutScale") + ->AsOutput(); + auto any_op2 = pattern->NewNode(any_op2_repr())->assert_is_op()->AsOutput(); + + quant_dequant_op->LinksFrom({any_op_out, quant_dequant_op_inscale}); + quant_dequant_op_outscale->LinksFrom({quant_dequant_op}); + quant_dequant_out->LinksFrom({quant_dequant_op}); + any_op2->LinksFrom({quant_dequant_out}); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 7df2f5efc4..41f9d12858 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -954,7 +954,8 @@ struct QuantDequantOpFuse : public PatternBase { void operator()(PDNode* quant_op_input, const std::string& op_name, const std::string& weight_name, int times, - const std::string& quant_type); + const std::string& quant_type, + const std::string& dequant_type); std::string GetNodeName(const std::string& op_type) { return PDNodeName(name_scope_, repr_, id_, op_type); @@ -980,6 +981,20 @@ struct ShuffleChannelPattern : public PatternBase { PATTERN_DECL_NODE(reshape2_out); }; +struct DeleteQuantDequantOpPattern : public PatternBase { + DeleteQuantDequantOpPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "delete_quantdequant_op_pattern") {} + + void operator()(); + + PATTERN_DECL_NODE(any_op_out); + PATTERN_DECL_NODE(quant_dequant_op_inscale); + PATTERN_DECL_NODE(quant_dequant_op); + PATTERN_DECL_NODE(quant_dequant_op_outscale); + PATTERN_DECL_NODE(quant_dequant_op_out); + PATTERN_DECL_NODE(any_op2); +}; + } // namespace patterns // Link two ir::Nodes from each other. 
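// The DeleteQuantDequantOpPattern declared above matches
//
//   any_op_out -> fake_quantize_dequantize_moving_average_abs_max
//              -> quant_dequant_op_out -> any_op2
//   (with the InScale input and OutScale output of the quant/dequant op)
//
// and DeleteQuantDequantOpPass rewrites it to
//
//   any_op_out -> any_op2
//
// by replacing quant_dequant_op_out with any_op_out in any_op2's input list
// and removing the quant/dequant op together with its scale nodes.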
diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 017e3ef234..62fba440ed 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -25,16 +25,20 @@ namespace framework { namespace ir { void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, - const std::string& op_type, - const std::string& quant_type) { + const std::string& op_type, const std::string& quant_type, + const std::string& dequant_type) { const std::string pattern_name = "quant_dequant_fuse"; - // FusePassBase::Init(pattern_name, graph); - const int kNumFields = 5; + int kNumFields = 5; const int kQuantizedWeightOffset = 0; const int kQuantizedOpOffset = 1; const int kQuantizedOpOutOffset = 2; const int kDequantOpOffset = 3; const int kDequantOpOutOffset = 4; + const int kDequantOpWeightScaleOffset = 5; + + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + kNumFields += 1; + } GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() @@ -42,22 +46,14 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, ->assert_is_op_input(quant_type, "X") ->AsInput(); - std::string quantized_op_type = ""; + std::string quantized_op_type = op_type; std::string weight_name = ""; - if (op_type == "conv2d") { - quantized_op_type = "conv2d"; - weight_name = "Filter"; - } else if (op_type == "depthwise_conv2d") { - quantized_op_type = "depthwise_conv2d"; - weight_name = "Filter"; - } else if (op_type == "conv2d_fusion") { - quantized_op_type = "conv2d_fusion"; + if (op_type == "conv2d" || op_type == "depthwise_conv2d" || + op_type == "conv2d_fusion") { weight_name = "Filter"; } else if (op_type == "mul") { - quantized_op_type = "mul"; weight_name = "Y"; } else if (op_type == "fc") { - quantized_op_type = "fc"; weight_name = "W"; } else { PADDLE_ENFORCE( @@ -66,7 +62,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, } patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name); - pattern(x, quantized_op_type, weight_name, times, quant_type); + pattern(x, quantized_op_type, weight_name, times, quant_type, dequant_type); auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { @@ -91,6 +87,10 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, subgraph.at(pattern.GetPDNode("dequant_op" + std::to_string(i)))); nodes.push_back( subgraph.at(pattern.GetPDNode("dequant_op_out" + std::to_string(i)))); + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + nodes.push_back(subgraph.at( + pattern.GetPDNode("dequant_channel_scale" + std::to_string(i)))); + } } int bit_length = boost::get(quant_op->Op()->GetAttr("bit_length")); @@ -107,10 +107,31 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, std::unordered_set delete_nodes; for (int i = 0; i < times; i++) { - float max_range = boost::get( - nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr("max_range")); - float weight_scale = (range * range) / max_range; + std::vector weight_scale; + + // Get weight scale from dequant op. 
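+      // For fake_channel_wise_dequantize_max_abs the per-channel scales are
+      // read from the first "Scales" input tensor (one value per output
+      // channel); for fake_dequantize_max_abs a single scale is reconstructed
+      // from the op's max_range attribute as (range * range) / max_range,
+      // where range is derived from bit_length (127 for 8-bit quantization).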
+ if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + auto scales_name = + nodes[i * kNumFields + kDequantOpOffset]->Op()->Input("Scales"); + PADDLE_ENFORCE(scales_name.size() == 2); + const LoDTensor& channel_scale_tensor = + scope->FindVar(scales_name[0])->Get(); + PADDLE_ENFORCE( + paddle::platform::is_cpu_place(channel_scale_tensor.place())); + const float* channel_scale_data = channel_scale_tensor.data(); + for (int i = 0; i < channel_scale_tensor.numel(); i++) { + weight_scale.push_back(channel_scale_data[i]); + } + delete_nodes.insert( + nodes[i * kNumFields + kDequantOpWeightScaleOffset]); + } else { + float max_range = boost::get( + nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr( + "max_range")); + weight_scale.push_back((range * range) / max_range); + } + // create new op_desc auto base_op_desc = *nodes[i * kNumFields + kQuantizedOpOffset]->Op()->Proto(); std::string new_input = input_node->Name(); @@ -141,6 +162,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, IR_NODE_LINK_TO(input_node, new_op); IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset], new_op); IR_NODE_LINK_TO(new_op, nodes[i * kNumFields + kDequantOpOutOffset]); + delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOffset]); delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOutOffset]); delete_nodes.insert(nodes[i * kNumFields + kDequantOpOffset]); @@ -160,16 +182,19 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "quant_dequant_fuse"; FusePassBase::Init(pattern_name, graph); + std::unordered_set dequant_types = { + "fake_dequantize_max_abs", "fake_channel_wise_dequantize_max_abs"}; std::unordered_set quant_types = { "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"}; - std::unordered_set quantized_op_types = {"conv2d", "mul", "depthwise_conv2d"}; auto* scope = param_scope(); - for (auto& quant_type : quant_types) { - for (auto& op_type : quantized_op_types) { - for (int i = 6; i >= 1; i--) { - RunQuantDequant(graph, scope, i, op_type, quant_type); + for (auto& dequant_type : dequant_types) { + for (auto& quant_type : quant_types) { + for (auto& op_type : quantized_op_types) { + for (int i = 6; i >= 1; i--) { + RunQuantDequant(graph, scope, i, op_type, quant_type, dequant_type); + } } } } diff --git a/paddle/fluid/inference/anakin/convert/conv2d.cc b/paddle/fluid/inference/anakin/convert/conv2d.cc index 70e0adf5ea..26f78efa61 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d.cc +++ b/paddle/fluid/inference/anakin/convert/conv2d.cc @@ -70,7 +70,8 @@ void Conv2dOpConverter::operator()( if (enable_int8) { const float int8_range = 127.; float in_scale = boost::get(op_desc.GetAttr("input_scale")); - float weight_scale = boost::get(op_desc.GetAttr("weight_scale")); + auto weight_scale = + boost::get>(op_desc.GetAttr("weight_scale")); PBlock *weight1 = new PBlock(anakin_shape, ::anakin::AK_INT8); this->engine_->RegistBlock(weight1); @@ -91,8 +92,8 @@ void Conv2dOpConverter::operator()( weight1->d_tensor().copy_from(weight1->h_tensor()); this->engine_->AddOpAttr(op_name, "weight_1", *weight1); this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); - this->engine_->Graph()->SetWeightsScale(op_name, - {weight_scale / int8_range}, false); + this->engine_->Graph()->SetWeightsScale( + op_name, {weight_scale[0] / int8_range}, false); this->engine_->AddTensorScale(input_name, in_scale / int8_range); } else { auto *weight1 = pblock_from_tensor( diff --git 
a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc index a1568b8bde..f2e6003aa6 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc +++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc @@ -72,7 +72,8 @@ void Conv2dFusionOpConverter::operator()( if (enable_int8) { const float int8_range = 127.; float in_scale = boost::get(op_desc.GetAttr("input_scale")); - float weight_scale = boost::get(op_desc.GetAttr("weight_scale")); + auto weight_scale = + boost::get>(op_desc.GetAttr("weight_scale")); PBlock *weight1 = new PBlock(anakin_shape, ::anakin::AK_INT8); this->engine_->RegistBlock(weight1); @@ -93,8 +94,8 @@ void Conv2dFusionOpConverter::operator()( weight1->d_tensor().copy_from(weight1->h_tensor()); this->engine_->AddOpAttr(op_name, "weight_1", *weight1); this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); - this->engine_->Graph()->SetWeightsScale(op_name, - {weight_scale / int8_range}, false); + this->engine_->Graph()->SetWeightsScale( + op_name, {weight_scale[0] / int8_range}, false); this->engine_->AddTensorScale(input_name, in_scale / int8_range); } else { auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace()); diff --git a/paddle/fluid/inference/anakin/convert/fc.cc b/paddle/fluid/inference/anakin/convert/fc.cc index 0621e3377b..b64d0b84fd 100644 --- a/paddle/fluid/inference/anakin/convert/fc.cc +++ b/paddle/fluid/inference/anakin/convert/fc.cc @@ -76,7 +76,8 @@ void FcBaseOpConverter::operator()( ::anakin::saber::Shape anakin_shape(weight_shape); const float int8_range = 127.; float in_scale = boost::get(op_desc.GetAttr("input_scale")); - float weight_scale = boost::get(op_desc.GetAttr("weight_scale")); + auto weight_scale = + boost::get>(op_desc.GetAttr("weight_scale")); PBlock *weight1 = new PBlock(anakin_shape, ::anakin::AK_INT8); this->engine_->RegistBlock(weight1); @@ -95,8 +96,8 @@ void FcBaseOpConverter::operator()( weight1->d_tensor().copy_from(weight1->h_tensor()); this->engine_->AddOpAttr(op_name, "weight_1", *weight1); this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); - this->engine_->Graph()->SetWeightsScale(op_name, - {weight_scale / int8_range}, false); + this->engine_->Graph()->SetWeightsScale( + op_name, {weight_scale[0] / int8_range}, false); this->engine_->AddTensorScale(input_name, in_scale / int8_range); } else { auto *weight1 = pblock_from_vector(trans_weight_data, diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 66e8d8b528..590baf4ee3 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -164,6 +164,7 @@ struct Argument { AnalysisConfig::Precision); DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, bool); + DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape, anakin_max_shape_t); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 371118ffaf..e22f1cbd2e 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -87,7 +87,9 @@ void IRPassManager::CreatePasses(Argument *argument, bool enable_int8 = argument->tensorrt_precision_mode() == AnalysisConfig::Precision::kInt8; + bool use_calib_mode = argument->tensorrt_use_calib_mode(); pass->Set("enable_int8", new bool(enable_int8)); + 
pass->Set("use_calib_mode", new bool(use_calib_mode)); bool use_static_engine = argument->tensorrt_use_static_engine(); bool model_from_memory = argument->model_from_memory(); diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc index 8f7c6ac755..e16cce54c2 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -61,7 +61,7 @@ void RenameAndGetOutputs( std::set *output_names, std::unordered_map *output_name_map, const std::unordered_map &graph_var_map, - bool is_trt) { + bool trt_and_not_int8) { //// In the normal case, the paddle-trt exists bug when runing the googlenet. // When there are more than two convolutions of 1 * 1 with the same input, the // paddle-tensorrt will do the merging optimization, which fuse those conv @@ -121,7 +121,7 @@ void RenameAndGetOutputs( for (auto out_var : correspond_node->outputs) { var2id[out_var->Name()] = out_var->id(); } - if (op_desc.Type() == "conv2d" && is_trt) { + if (op_desc.Type() == "conv2d" && trt_and_not_int8) { auto input_var_name = op_desc.Input("Input").front(); auto filter_var_name = op_desc.Input("Filter").front(); auto out_var_name = op_desc.Output("Output").front(); diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h index bb44502782..444e1984cf 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h @@ -43,7 +43,7 @@ void RenameAndGetOutputs( std::set *output_names, std::unordered_map *output_name_map, const std::unordered_map &graph_var_map, - bool is_trt = true); + bool trt_and_not_int8 = false); } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 67650a352d..3fad263b05 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -149,6 +149,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp( graph_var_map[node->Name()] = node; } } + auto enable_int8 = Get("enable_int8"); + auto use_calib_mode = Get("use_calib_mode"); auto &subgraph_nodes = *Agent(node).subgraph(); // The following procedure is used to rename all the intermediate @@ -165,7 +167,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // it is either an OP's input or an OP's output. RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, &output_names_with_id, &output_names, &output_name_map, - graph_var_map); + graph_var_map, !enable_int8); // When tensorrt engine runs at the end of the operation, // output_mapping help us copy the data from the renamed ITensor @@ -196,7 +198,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp( SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); SetAttr(op_desc->Proto(), "parameters", params); - auto enable_int8 = Get("enable_int8"); auto use_static_engine = Get("use_static_engine"); auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, std::to_string(0)); @@ -204,13 +205,14 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // Get "" when there is no cached calibration table data. 
bool load_from_memory = Get("model_from_memory"); std::string calibration_data = ""; - if (enable_int8) { + if (enable_int8 && use_calib_mode) { calibration_data = GetTrtCalibTableData( Get("model_opt_cache_dir"), engine_key, enable_int8); } SetAttr(op_desc->Proto(), "calibration_data", calibration_data); SetAttr(op_desc->Proto(), "enable_int8", enable_int8); + SetAttr(op_desc->Proto(), "use_calib_mode", use_calib_mode); SetAttr(op_desc->Proto(), "engine_key", engine_key); std::string trt_engine_serialized_data = ""; SetAttr(op_desc->Proto(), "engine_serialized_data", @@ -222,7 +224,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp( } // When in int8 mode and calibration_mode, the program just produce the // calibration table data. - bool calibration_mode = (enable_int8 && calibration_data.size() == 0); + bool calibration_mode = + (enable_int8 && calibration_data.size() == 0 && use_calib_mode); if (calibration_mode) { // calibraion mode means generate int8 calibration table data process. return; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 4fe0c48d8f..67c5d2c0bd 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/platform/gpu_info.h" namespace paddle { +extern const std::vector kTRTSubgraphPasses; extern const std::vector kAnakinSubgraphPasses; PassStrategy *AnalysisConfig::pass_builder() const { @@ -105,6 +106,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(tensorrt_min_subgraph_size_); CP_MEMBER(tensorrt_precision_mode_); CP_MEMBER(trt_use_static_engine_); + CP_MEMBER(trt_use_calib_mode_); // MKLDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -177,7 +179,8 @@ std::shared_ptr AnalysisConfig::mkldnn_quantizer_config() void AnalysisConfig::EnableTensorRtEngine( int workspace_size, int max_batch_size, int min_subgraph_size, - AnalysisConfig::Precision precision_mode, bool use_static) { + AnalysisConfig::Precision precision_mode, bool use_static, + bool use_calib_mode) { #ifdef PADDLE_WITH_CUDA if (!use_gpu()) { LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; @@ -190,6 +193,7 @@ void AnalysisConfig::EnableTensorRtEngine( tensorrt_min_subgraph_size_ = min_subgraph_size; tensorrt_precision_mode_ = precision_mode; trt_use_static_engine_ = use_static; + trt_use_calib_mode_ = use_calib_mode; Update(); #else @@ -228,13 +232,10 @@ void AnalysisConfig::Update() { } if (use_tensorrt_) { - const auto &passes = pass_builder_->AllPasses(); - if (std::find(passes.begin(), passes.end(), "tensorrt_subgraph_pass") == - std::end(passes)) { - // Append after the Affine_channel_conv_fuse pass. 
- pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); + pass_builder()->ClearPasses(); + for (const auto &pass : kTRTSubgraphPasses) { + pass_builder()->AppendPass(pass); } - pass_builder()->DeletePass("runtime_context_cache_pass"); } if (use_mkldnn_) { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index e57d3a8045..ef874646eb 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -385,6 +385,7 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); + argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_); } if (config_.anakin_engine_enabled()) { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index ebe289322b..8067cd777d 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -142,7 +142,8 @@ struct AnalysisConfig { void EnableTensorRtEngine(int workspace_size = 1 << 20, int max_batch_size = 1, int min_subgraph_size = 3, Precision precision = Precision::kFloat32, - bool use_static = false); + bool use_static = false, + bool use_calib_mode = false); /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } @@ -266,6 +267,7 @@ struct AnalysisConfig { int tensorrt_min_subgraph_size_{3}; Precision tensorrt_precision_mode_; bool trt_use_static_engine_; + bool trt_use_calib_mode_; // memory reuse related. bool enable_memory_optim_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 2bad89cdb3..3dc9814d0d 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -70,6 +70,24 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::ClearPasses() { passes_.clear(); } +const std::vector kTRTSubgraphPasses({ + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "quant_conv2d_dequant_fuse_pass", // + "delete_quant_dequant_op_pass", // + // "fc_fuse_pass", // + "tensorrt_subgraph_pass", // + "conv_bn_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // +#endif // + "transpose_flatten_concat_fuse_pass", +}); + // The following passes works for Anakin sub-graph engine. 
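// A minimal caller-side sketch of the extended EnableTensorRtEngine API
// declared above (argument values and the model path are placeholders; the
// other AnalysisConfig calls are the existing setup methods):
//
//   paddle::AnalysisConfig config;
//   config.SetModel("/path/to/model");            // placeholder path
//   config.EnableUseGpu(100 /*MB*/, 0 /*gpu id*/);
//   config.EnableTensorRtEngine(1 << 20 /*workspace*/, 1 /*max batch*/,
//                               3 /*min subgraph*/,
//                               paddle::AnalysisConfig::Precision::kInt8,
//                               false /*use_static*/,
//                               false /*use_calib_mode*/);
//
// With use_calib_mode == false the int8 engine relies on the quant-aware
// training scales propagated by kTRTSubgraphPasses above rather than on a
// calibration table.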
const std::vector kAnakinSubgraphPasses({ "infer_clean_graph_pass", // diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 057e7dc65d..1a3430530f 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -142,6 +142,7 @@ class GpuPassStrategy : public PassStrategy { virtual ~GpuPassStrategy() = default; }; +extern const std::vector kTRTSubgraphPasses; extern const std::vector kAnakinSubgraphPasses; } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index 0b756534ec..5c2454fa9a 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -43,12 +43,13 @@ class ActivationOpConverter : public OpConverter { engine_, Activation, *const_cast(input_tensor), op_pair->second); auto output_name = op_desc.Output("Out")[0]; - layer->setName((op_type_ + " (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode) { // the test framework can not determine which is the - // output, so place the declaration inside. - engine_->DeclareOutput(output_name); + + RreplenishLayerAndOutput(layer, op_type_, {output_name}, test_mode); + if (op_desc.HasAttr("out_scale")) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif } } diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index d017bac66d..d948868464 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -116,18 +116,12 @@ class BatchNormOpConverter : public OpConverter { scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Y").front(); - layer->setName(("batch_norm (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); engine_->weight_map[op_desc.Input("Bias").front()] = std::move(combile_bias_tensor); engine_->weight_map[op_desc.Input("Scale").front()] = std::move(combile_scale_tensor); - engine_->SetITensor(output_name, layer->getOutput(0)); - - if (test_mode) { - engine_->DeclareOutput(output_name); - } + RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index 525ba9dc34..ec771850ed 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -42,13 +42,7 @@ class ConcatOpConverter : public OpConverter { axis = axis - 1; // Remove batch dim layer->setAxis(axis); auto output_name = op_desc.Output("Out")[0]; - layer->setName(("concat (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode) { // the test framework can not determine which is the - // output, so place the declaration inside. 
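-      // The per-converter boilerplate (setName, SetITensor and the test-mode
-      // DeclareOutput) is now centralized in OpConverter::RreplenishLayerAndOutput,
-      // added in op_converter.h further below.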
- engine_->DeclareOutput(output_name); - } + RreplenishLayerAndOutput(layer, "concat", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 39a99a21ea..73bfa800f0 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -32,25 +32,31 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, PADDLE_ENFORCE(engine != nullptr); auto* X = engine->GetITensor(op_desc.Input("Input").front()); - - // Declare weights auto* Y_v = scope.FindVar(op_desc.Input("Filter").front()); PADDLE_ENFORCE_NOT_NULL(Y_v); auto* Y_t = Y_v->GetMutable(); + float* weight_data = nullptr; + bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); + + if (enable_int8) { +#if IS_TRT_VERSION_GE(5000) + float in_scale = boost::get(op_desc.GetAttr("input_scale")); + auto weight_scale = + boost::get>(op_desc.GetAttr("weight_scale")); + weight_data = engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t, + true, weight_scale); + engine->SetTensorDynamicRange(X, in_scale); +#endif + } else { + weight_data = + engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t, false); + } - platform::CPUPlace cpu_place; - std::unique_ptr weight_tensor( - new framework::LoDTensor()); - weight_tensor->Resize(Y_t->dims()); - TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); - - auto* weight_data = weight_tensor->mutable_data(cpu_place); - - PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); - const int n_output = weight_tensor->dims()[0]; - const int n_input = weight_tensor->dims()[1]; - const int filter_h = weight_tensor->dims()[2]; - const int filter_w = weight_tensor->dims()[3]; + PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL); + const int n_output = Y_t->dims()[0]; + const int n_input = Y_t->dims()[1]; + const int filter_h = Y_t->dims()[2]; + const int filter_w = Y_t->dims()[3]; const int groups = boost::get(op_desc.GetAttr("groups")); const std::vector dilations = boost::get>(op_desc.GetAttr("dilations")); @@ -66,7 +72,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, static_cast(weight_data), - static_cast(weight_tensor->numel())}; + static_cast(Y_t->numel())}; TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; auto* layer = fadd_layer(const_cast(X), n_output, n_input, @@ -80,11 +86,16 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, auto output_name = op_desc.Output("Output").front(); layer->setName((name + " (Output: " + output_name + ")").c_str()); - engine->weight_map[op_desc.Input("Filter").front()] = - std::move(weight_tensor); layer->getOutput(0)->setName(output_name.c_str()); engine->SetITensor(output_name, layer->getOutput(0)); +#if IS_TRT_VERSION_GE(5000) + if (enable_int8) { + float output_scale = boost::get(op_desc.GetAttr("out_scale")); + engine->SetTensorDynamicRange(layer->getOutput(0), output_scale); + } +#endif + if (test_mode) { engine->DeclareOutput(output_name); } diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index ddbc724e3b..71177e5e66 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -55,11 +55,8 @@ class DropoutOpConverter : public OpConverter { engine_->weight_map[op_desc.Output("Out").front() + 
"_dropout"] = std::move(weight_tensor); auto output_name = op_desc.Output("Out")[0]; - layer->setName(("dropout (Output: " + output_name + ")").c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode) { - engine_->DeclareOutput(output_name); - } + + RreplenishLayerAndOutput(layer, "dropout", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 0c5a1a6ef1..a888b0803d 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -55,17 +55,13 @@ class ElementwiseWeightOpConverter : public OpConverter { auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); PADDLE_ENFORCE_NOT_NULL(Y_v); auto* Y_t = Y_v->GetMutable(); + float* weight_data = nullptr; + weight_data = + engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t, false); - platform::CPUPlace cpu_place; - std::unique_ptr weight_tensor( - new framework::LoDTensor()); - weight_tensor->Resize(Y_t->dims()); - TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); - auto* weight_data = - weight_tensor->mutable_data(platform::CPUPlace()); auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; - std::vector dims_y = framework::vectorize2int(weight_tensor->dims()); + std::vector dims_y = framework::vectorize2int(Y_t->dims()); if (static_cast(dims_y.size()) == dims_x.nbDims + 1) { if (dims_y[0] == 1) dims_y.erase(dims_y.begin()); } @@ -92,9 +88,9 @@ class ElementwiseWeightOpConverter : public OpConverter { PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!"); } - TensorRTEngine::Weight shift_weights{ - nvinfer1::DataType::kFLOAT, static_cast(weight_data), - weight_tensor->memory_size() / sizeof(float)}; + TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + static_cast(Y_t->numel())}; TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, @@ -112,14 +108,13 @@ class ElementwiseWeightOpConverter : public OpConverter { } auto output_name = op_desc.Output("Out")[0]; - layer->setName( - ("elementwise_" + op_type_ + "(Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor); - engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode) { // the test framework can not determine which is the - // output, so place the declaration inside. - engine_->DeclareOutput(output_name); + RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name}, + test_mode); + if (op_desc.HasAttr("out_scale")) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif } } @@ -138,6 +133,7 @@ class ElementwiseTensorOpConverter : public OpConverter { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. 
framework::OpDesc op_desc(op, nullptr); + nvinfer1::ILayer* layer = nullptr; PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight @@ -153,13 +149,11 @@ class ElementwiseTensorOpConverter : public OpConverter { if (CheckDims(dims_x, dims_y)) { // The two input tensor should have the same dims VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer"; - nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( + nvinfer1::IElementWiseLayer* elet_layer = TRT_ENGINE_ADD_LAYER( engine_, ElementWise, *const_cast(X), *const_cast(Y), op_pair->second); - layer->setName(("elementwise (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); + layer = elet_layer; } else { VLOG(3) << "Convert a fluid elementwise op to TensorRT " "ElementWisePluginLayer"; @@ -168,17 +162,18 @@ class ElementwiseTensorOpConverter : public OpConverter { new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis); plugin->AddInput(X); plugin->AddInput(Y); - nvinfer1::IPluginLayer* layer = engine_->AddPlugin( + nvinfer1::IPluginLayer* plugin_layer = engine_->AddPlugin( const_cast(plugin->GetInputs().data()), 2, reinterpret_cast(plugin)); - layer->setName(("elementwise (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); + layer = plugin_layer; } - if (test_mode) { // the test framework can not determine which is the - // output, so place the declaration inside. - engine_->DeclareOutput(output_name); + RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode); + if (op_desc.HasAttr("out_scale")) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif } } diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 42dcd68e40..fb7b89b189 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -53,33 +53,47 @@ class FcOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias"; - framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight - PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto input_names = op_desc.InputNames(); + bool with_bias = input_names.size() >= 3; + std::string w_name = "Y"; + std::string i_name = "X"; + if (with_bias) { + w_name = "W"; + i_name = "Input"; + } // Declare inputs - auto* X = engine_->GetITensor(op_desc.Input("X").front()); + auto* X = engine_->GetITensor(op_desc.Input(i_name).front()); // Declare weights - auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); + auto* Y_v = scope.FindVar(op_desc.Input(w_name).front()); PADDLE_ENFORCE_NOT_NULL(Y_v); auto* Y_t = Y_v->GetMutable(); // This may trigger a GPU->CPU copy, because TRT's weight can only be // assigned from CPU memory, that can't be avoided. 
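    // The copy (and, in int8 mode, the dequantization of the int8-range weights
    // back to float using weight_scale) is now done by
    // TensorRTEngine::GetWeightCPUData. For the fused "fc" op the weight input
    // is named "W" and the feature input "Input" (with an extra "Bias" input),
    // while the plain "mul" op keeps "Y" and "X".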
- platform::CPUPlace cpu_place; - framework::LoDTensor weight_tensor; - weight_tensor.Resize(Y_t->dims()); - TensorCopySync((*Y_t), cpu_place, &weight_tensor); - - auto* weight_data = weight_tensor.mutable_data(platform::CPUPlace()); + float* weight_data = nullptr; + bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); + if (enable_int8) { +#if IS_TRT_VERSION_GE(5000) + float in_scale = boost::get(op_desc.GetAttr("input_scale")); + auto weight_scale = + boost::get>(op_desc.GetAttr("weight_scale")); + weight_data = engine_->GetWeightCPUData(op_desc.Input(w_name).front(), + Y_t, true, weight_scale); + engine_->SetTensorDynamicRange(X, in_scale); +#endif + } else { + weight_data = + engine_->GetWeightCPUData(op_desc.Input(w_name).front(), Y_t, false); + } - PADDLE_ENFORCE_EQ(weight_tensor.dims().size(), 2UL); // a matrix - size_t n_output = weight_tensor.dims()[1]; + PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL); // a matrix + size_t n_output = Y_t->dims()[1]; std::unique_ptr tmp(new framework::LoDTensor()); - tmp->Resize(weight_tensor.dims()); + tmp->Resize(Y_t->dims()); memcpy(tmp->mutable_data(platform::CPUPlace()), weight_data, Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float)); @@ -100,19 +114,32 @@ class FcOpConverter : public OpConverter { // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just // handle `mul`, leave `add` as another layer. // DEBUG - TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + float* bias_data = nullptr; + int bias_num = 0; + if (with_bias) { + auto* b_v = scope.FindVar(op_desc.Input("Bias").front()); + auto* b_t = b_v->GetMutable(); + bias_data = + engine_->GetWeightCPUData(op_desc.Input("Bias").front(), b_t, false); + bias_num = b_t->numel(); + } + TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, + static_cast(bias_data), + static_cast(bias_num)}; auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *const_cast(X), n_output, tmp_weight.get(), bias.get()); + engine_->weight_map[op_desc.Input(w_name).front()] = std::move(tmp); auto output_name = op_desc.Output("Out").front(); - layer->setName(("fc (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); - engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp); - if (test_mode) { - engine_->DeclareOutput(output_name); + + RreplenishLayerAndOutput(layer, "fc", {output_name}, test_mode); + if (enable_int8) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index 3f6ed04c46..7753fda06c 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -76,15 +76,9 @@ class LeakyReluOpConverter : public OpConverter { engine_->weight_map.end()); engine_->weight_map[alpha_name] = std::move(alpha_tensor); - std::string layer_name = "leaky_relu (Output: "; auto output_name = op_desc.Output("Out")[0]; - output_layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, output_layer->getOutput(0)); - layer_name += output_name; - if (test_mode) { - engine_->DeclareOutput(output_name); - } - output_layer->setName((layer_name + ")").c_str()); + RreplenishLayerAndOutput(output_layer, "leaky_relu", {output_name}, + test_mode); } 
}; diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 55515569ea..96a722dc89 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -172,6 +172,21 @@ class OpConverter { engine->FreezeNetwork(); } + void RreplenishLayerAndOutput( + nvinfer1::ILayer* layer, const std::string& layer_type, + const std::vector& output_tensor_names, + bool test_mode = false) { + size_t num_out = output_tensor_names.size(); + for (size_t i = 0; i < num_out; i++) { + layer->getOutput(i)->setName(output_tensor_names[i].c_str()); + engine_->SetITensor(output_tensor_names[i], layer->getOutput(i)); + if (test_mode) { + engine_->DeclareOutput(output_tensor_names[i]); + } + } + layer->setName( + (layer_type + " (Output: " + output_tensor_names[0] + ")").c_str()); + } void SetEngine(TensorRTEngine* engine) { engine_ = engine; } virtual ~OpConverter() {} diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index 4afcb0aece..bcd2166728 100644 --- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -51,13 +51,7 @@ class PadOpConverter : public OpConverter { PADDLE_ENFORCE(layer != nullptr); auto output_name = op_desc.Output("Out")[0]; - engine_->SetITensor(output_name, layer->getOutput(0)); - layer->setName(("scale (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - if (test_mode) { // the test framework can not determine which is the - // output, so place the declaration inside. - engine_->DeclareOutput(output_name); - } + RreplenishLayerAndOutput(layer, "pad", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 1d0d83d1f3..1752c52c3f 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -148,11 +148,13 @@ class Pool2dOpConverter : public OpConverter { } auto output_name = op_desc.Output("Out")[0]; - layer->setName(("pool2d (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode) { - engine_->DeclareOutput(output_name); + RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode); + + if (op_desc.HasAttr("out_scale")) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 2ae804106e..01bcd03e52 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -58,15 +58,8 @@ class PReluOpConverter : public OpConverter { engine_->weight_map[op_desc.Input("Alpha")[0]] = std::move(alpha_tensor_temp); - std::string layer_name = "prelu (Output: "; auto output_name = op_desc.Output("Out")[0]; - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); - layer_name += output_name; - if (test_mode) { - engine_->DeclareOutput(output_name); - } - layer->setName((layer_name + ")").c_str()); + RreplenishLayerAndOutput(layer, "prelu", {output_name}, test_mode); } }; diff 
--git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 80bfb2d190..b0ae169412 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -34,9 +34,13 @@ class SoftMaxOpConverter : public OpConverter { *const_cast(input1)); auto output_name = op_desc.Output("Out")[0]; - engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode) { - engine_->DeclareOutput(output_name); + RreplenishLayerAndOutput(layer, "softmax", {output_name}, test_mode); + + if (op_desc.HasAttr("out_scale")) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 1856060cec..388d83d834 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -40,8 +40,7 @@ namespace tensorrt { * Get a random float value between [low, high] */ float random(float low, float high) { - static std::random_device rd; - static std::mt19937 mt(rd()); + static std::mt19937 mt(100); std::uniform_real_distribution dist(low, high); return dist(mt); } diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index fddf5f11c2..c5ac6f3841 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -53,10 +53,40 @@ void TensorRTEngine::FreezeNetwork() { infer_builder_->setMaxWorkspaceSize(max_workspace_); if (enable_int8_) { infer_builder_->setInt8Mode(true); - PADDLE_ENFORCE( - calibrator_ != nullptr, - "The precision mode is 'INT8', the calibrator should not be nullptr"); - infer_builder_->setInt8Calibrator(calibrator_); + if (calibrator_) { + infer_builder_->setInt8Calibrator(calibrator_); + } else { + infer_builder_->setInt8Calibrator(nullptr); + +#if IS_TRT_VERSION_GE(5000) + infer_builder_->setStrictTypeConstraints(true); + for (auto &quant_range : quant_dynamic_range_) { + auto tensor = quant_range.first; + float range = quant_range.second; + tensor->setDynamicRange(-range, range); + } + + std::unordered_set all_t; + for (int i = 0; i < infer_network_->getNbLayers(); i++) { + auto layer = infer_network_->getLayer(i); + for (int j = 0; j < layer->getNbOutputs(); j++) { + all_t.insert(layer->getOutput(j)); + } + } + for (int i = 0; i < infer_network_->getNbInputs(); i++) { + all_t.insert(infer_network_->getInput(i)); + } + + for (auto &t : all_t) { + if (!quant_dynamic_range_.count(t)) { + LOG(WARNING) + << "We are in trt int8 mode(not calibration), scale not setted" + << " for tensor " << t->getName() + << ", this might be ok when trt does not need this range"; + } + } +#endif + } } infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_)); @@ -133,6 +163,47 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { runtime_batch_ = batch_size; } +float *TensorRTEngine::GetWeightCPUData(const std::string &name, + framework::Tensor *weight_tensor, + bool enable_int8, + const std::vector &scale) { + auto w_dims = weight_tensor->dims(); + platform::CPUPlace cpu_place; + PADDLE_ENFORCE(!weight_map.count(name), + "During TRT Op converter: We set weight %s with the same name " + "twice into the weight_map", + name); + weight_map[name].reset(new framework::Tensor()); + 
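  // The remainder of this helper copies the weight to a CPU tensor owned by
  // weight_map and, when enable_int8 is set, converts the int8-range values
  // back to float (after checking they lie in [-128, 127]): each element is
  // multiplied by scale / 127, using a single scale for fc/mul weights or one
  // scale per output channel (scale[i / inner_size] with
  // inner_size = w_dims[1] * w_dims[2] * w_dims[3]) for 4-D conv filters.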
+  weight_map[name]->Resize(weight_tensor->dims());
+  TensorCopySync(*weight_tensor, cpu_place, weight_map[name].get());
+  float *weight_data = weight_map[name]->mutable_data<float>(cpu_place);
+
+  if (enable_int8) {
+    // when the op is fc, scale's size should be 1
+    // when the op is conv, the scale's size should be w_dims[0]
+    bool valid_scale_size =
+        (scale.size() == 1 || scale.size() == static_cast<size_t>(w_dims[0]));
+    PADDLE_ENFORCE(valid_scale_size, "TRT int8 quant: invalid scale size");
+    for (int i = 0; i < weight_tensor->numel(); i++) {
+      bool is_valid_int8 =
+          ((weight_data[i] >= -128) && (weight_data[i] <= 127));
+      PADDLE_ENFORCE(is_valid_int8,
+                     "We are in trt subgraph int8 mode, the weight of conv "
+                     "should be in range [-128, 127]");
+      if (scale.size() == 1) {
+        weight_data[i] *= (scale[0] / 127);
+      } else {
+        PADDLE_ENFORCE(w_dims.size() == 4,
+                       "TRT int8 quant: We only use the channel quant for "
+                       "conv op, so the weight dims should be 4.");
+        int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
+        weight_data[i] *= (scale[i / inner_size] / 127);
+      }
+    }
+  }
+  return weight_data;
+}
+
 int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
 
 nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 657dfd9355..0396b084b8 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -18,8 +18,10 @@ limitations under the License. */
 #include
 #include
 #include
+#include
 #include
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
@@ -131,6 +133,13 @@ class TensorRTEngine {
   int GetDeviceId() { return device_id_; }
   nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
                                     int num_inputs, plugin::PluginTensorRT*);
+  void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) {
+    quant_dynamic_range_[tensor] = range;
+  }
+
+  float* GetWeightCPUData(const std::string& name,
+                          framework::Tensor* weight_tensor, bool enable_int8,
+                          const std::vector<float>& scale = {});
 
   // A pointer to CPU memory is needed of the TRT weight.
   // Before TRT runs, fluid loads weight into GPU storage.
@@ -184,8 +193,13 @@ class TensorRTEngine {
   infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
   infer_ptr<nvinfer1::IExecutionContext> infer_context_;
   infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
+  std::unordered_map<nvinfer1::ITensor*, float> quant_dynamic_range_;
 };  // class TensorRTEngine
 
+#define IS_TRT_VERSION_GE(version)                       \
+  ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \
+    NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version)
+
 // Add a layer__ into engine__ with args ARGS.
 // For example:
 //
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 9fecad6eb3..8a5aed5d43 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -32,7 +32,7 @@ struct SimpleOpTypeSetTeller : public Teller {
       {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
        "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
        "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
-       "conv2d_transpose", "leaky_relu"}};
+       "conv2d_transpose", "leaky_relu", "fc"}};
 };
 
 bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h
index b98f052bf2..3363d77af8 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.h
+++ b/paddle/fluid/inference/tensorrt/op_teller.h
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #pragma once
+#include
 #include
+#include
 #include
 #include "paddle/fluid/framework/op_desc.h"
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 7f470924b3..1c32368e9d 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -48,6 +48,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
   int workspace_size_;
   std::unique_ptr<TRTInt8Calibrator> calibrator_;
   bool enable_int8_;
+  bool use_calib_mode_;
   std::string calibration_data_;
   std::string engine_key_;
   std::string engine_serialized_data_;
@@ -65,6 +66,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
     workspace_size_ = Attr<int>("workspace_size");
     device_id_ = Attr<int>("gpu_id");
     enable_int8_ = Attr<bool>("enable_int8");
+    use_calib_mode_ = Attr<bool>("use_calib_mode");
     calibration_data_ = Attr<std::string>("calibration_data");
     engine_key_ = Attr<std::string>("engine_key");
     engine_serialized_data_ = Attr<std::string>("engine_serialized_data");
@@ -75,7 +77,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
     }
     // calibration_mode is true means we need to
     // generate the calibration table data.
-    calibration_mode_ = (enable_int8_ && calibration_data_.size() == 0);
+    calibration_mode_ =
+        (enable_int8_ && calibration_data_.size() == 0 && use_calib_mode_);
     VLOG(4) << "calibration_mode: " << calibration_mode_;
 
     if (enable_int8_ && calibration_data_.size()) {
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index cc4d8d6e6f..b39508a34d 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -104,6 +104,7 @@ TEST(TensorRTEngineOp, manual) {
   engine_op_desc.SetAttr("engine_key", std::string("a_engine"));
   engine_op_desc.SetAttr("calibration_data", std::string(""));
   engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
+  engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z0"}));
   engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
@@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   engine_op_desc.SetAttr("engine_key", std::string("b_engine"));
   engine_op_desc.SetAttr("calibration_data", std::string(""));
   engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
+  engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z3"}));
   engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index b650225c64..8ec9806f5f 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -229,7 +229,7 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
            py::arg("min_subgraph_size") = 3,
            py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
-           py::arg("use_static") = true)
+           py::arg("use_static") = true, py::arg("use_calib_mode") = false)
      .def("enable_anakin_engine", &AnalysisConfig::EnableAnakinEngine,
           py::arg("max_batch_size") = 1,
           py::arg("max_input_shape") =
-- 
GitLab
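
Usage note (illustrative, not part of the patch): after this change the TensorRT subgraph can run in int8 either through the calibrator as before (use_calib_mode = true) or by taking per-tensor dynamic ranges from a quant-aware-trained model (use_calib_mode = false). The sketch below shows how a caller might exercise the second path from C++; the model directory is hypothetical, and the EnableTensorRTEngine argument order is assumed to mirror the pybind defaults shown above, so it should be verified against paddle_analysis_config.h.

  #include "paddle/fluid/inference/api/paddle_inference_api.h"

  int main() {
    paddle::AnalysisConfig config;
    config.SetModel("quantized_model_dir");   // hypothetical model directory
    config.EnableUseGpu(100 /*MB memory pool*/, 0 /*GPU id*/);
    // Int8 without calibration: the scales recorded by the quant/dequant
    // passes are set as dynamic ranges, so no calibration table is built.
    config.EnableTensorRTEngine(1 << 20 /*workspace_size*/,
                                1 /*max_batch_size*/, 3 /*min_subgraph_size*/,
                                paddle::AnalysisConfig::Precision::kInt8,
                                false /*use_static*/, false /*use_calib_mode*/);
    auto predictor =
        paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(config);
    return predictor != nullptr ? 0 : 1;
  }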