From 17188e8d5173ff86464603370b49b9531fd6f3b2 Mon Sep 17 00:00:00 2001
From: Wilber
Date: Mon, 23 Aug 2021 19:04:29 +0800
Subject: [PATCH] trt convert ut add dynamic_shape and int8, etc. (#35061)

---
 paddle/fluid/framework/ir/graph_helper.cc     |   1 +
 paddle/fluid/framework/ir/graph_viz_pass.cc   |  28 +++
 .../unittests/ir/inference/auto_scan_test.py  |  42 +++-
 .../unittests/ir/inference/program_config.py  | 183 ++++++++++++++++++
 .../ir/inference/test_trt_convert_conv2d.py   |  30 ++-
 .../ir/inference/trt_layer_auto_scan_test.py  |  33 +++-
 python/paddle/static/io.py                    |   8 +-
 7 files changed, 308 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index 652ce77d844..a73bc487c92 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -535,6 +535,7 @@ void GraphToProgram(const Graph &graph, ProgramDesc *program,
       block = program_pb.add_blocks();
       block->set_idx(idx);
+      block->set_parent_idx(kRootBlockIndex);
       GraphToBlock(*graph.GetSubGraph(idx), block, sort_kind);
     }
   } else {
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
index d8f90d5a757..f2c711fb6f0 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include <fstream>
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_printer.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/dot.h"
 
 namespace paddle {
@@ -44,6 +47,31 @@ void GraphVizPass::ApplyImpl(ir::Graph* graph) const {
                 "Can not open file %s for printing the graph.", graph_viz_path));
   std::ostream& sout = *fout;
 
+  // Serialize only the model file.
+  std::string program_path;
+  std::size_t found1 = graph_viz_path.find("_ir_");
+  std::size_t found2 = graph_viz_path.find(".dot");
+  if (found1 != std::string::npos && found2 != std::string::npos) {
+    ProgramDesc program_desc;
+    GraphToProgram(*graph, &program_desc);
+    // TODO(wilber): GraphToProgram seems to have bugs.
+    for (size_t i = 0; i < program_desc.Size(); ++i) {
+      for (size_t j = 0; j < program_desc.Block(i).OpSize(); ++j) {
+        if (program_desc.Block(i).Op(j)->Type() == "tensorrt_engine") {
+          program_desc.Block(i).Op(j)->RemoveAttr("sub_block");
+        }
+      }
+    }
+    std::string program_bytes = program_desc.Proto()->SerializeAsString();
+    // rename from "17_ir_fc_fuse_pass.dot" to "fc_fuse_pass.pdmodel"
+    program_path =
+        graph_viz_path.substr(found1 + 4, found2 - found1 - 4) + ".pdmodel";
+    std::ofstream file(program_path.c_str(), std::ios::binary);
+    file.write(program_bytes.c_str(), program_bytes.size());
+    file.close();
+    VLOG(3) << "serialize program to " << program_path;
+  }
+
   std::unordered_map<const ir::Node*, std::string> node2dot;
   Dot dot;
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
index 7d749cca5c2..3a899a2e5e0 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
@@ -15,6 +15,7 @@
 import numpy as np
 import unittest
 import abc
+import os
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import NumpyArrayInitializer
@@ -22,14 +23,13 @@ import paddle.fluid.core as core
 from paddle import compat as cpt
 import paddle.inference as paddle_infer
 from typing import Optional, List, Callable, Dict, Any, Set
-from program_config import TensorConfig, OpConfig, ProgramConfig, create_fake_model
+from program_config import TensorConfig, OpConfig, ProgramConfig, create_fake_model, create_quant_model
 
 
 class AutoScanTest(unittest.TestCase):
     def __init__(self, methodName='runTest'):
         paddle.enable_static()
         super(AutoScanTest, self).__init__(methodName)
-        self.threshold = 1e-5
 
     @abc.abstractmethod
     def sample_program_configs(self) -> List[ProgramConfig]:
@@ -56,11 +56,31 @@ class AutoScanTest(unittest.TestCase):
             input_tensor.copy_from_cpu(feed_data[name])
         predictor.run()
         result = {}
-        for out_name in prog_config.outputs:
-            result[out_name] = predictor.get_output_handle(
-                out_name).copy_to_cpu()
+        for out_name, o_name in zip(prog_config.outputs,
+                                    predictor.get_output_names()):
+            result[out_name] = predictor.get_output_handle(o_name).copy_to_cpu()
         return result
 
+    def assert_op_size(self, trt_engine_num, paddle_op_num):
+        cur_path = os.path.dirname(__file__)
+        last_passed_program = os.path.join(
+            cur_path, 'transpose_flatten_concat_fuse_pass.pdmodel')
+        model_bytes = paddle.static.load_from_file(last_passed_program)
+        pg = paddle.static.deserialize_program(model_bytes)
+        main_block = pg.desc.block(0)
+        op_size = main_block.op_size()
+        op_types = [
+            main_block.op(i).type() == 'tensorrt_engine' for i in range(op_size)
+        ]
+        trt_engine_size = sum(op_types)
+        paddle_op_size = op_size - trt_engine_size
+        self.assertTrue(trt_engine_size == trt_engine_num,
+                        'expected trt_engine_num is {}, but got {}!'.format(
+                            trt_engine_num, trt_engine_size))
+        self.assertTrue(paddle_op_size == paddle_op_num,
+                        'expected paddle_op_num is {}, but got {}!'.format(
+                            paddle_op_num, paddle_op_size))
+
     def assert_tensors_near(self,
                             threshold: float,
                             tensors: List[Dict[str, np.array]]):
@@ -73,9 +93,15 @@ class AutoScanTest(unittest.TestCase):
                     first[key], arr, atol=threshold),
                 "Output has diff between GPU and TensorRT. ")
 
-    def run_test(self):
+    def run_test(self,
+                 trt_engine_num: int,
+                 paddle_op_num: int,
+                 threshold=1e-5,
+                 quant=False):
         for prog_config in self.sample_program_configs():
             model, params = create_fake_model(prog_config)
+            if quant:
+                model, params = create_quant_model(model, params)
             for batch_size in self.batch_size_set:
                 feed_data = {}
                 for name, tensor_config in prog_config.inputs.items():
@@ -88,5 +114,5 @@ class AutoScanTest(unittest.TestCase):
                     results.append(
                         self.run_test_config(model, params, prog_config,
                                              pred_config, feed_data))
-                self.assert_tensors_near(
-                    threshold=self.threshold, tensors=results)
+                self.assert_tensors_near(threshold=threshold, tensors=results)
+                self.assert_op_size(trt_engine_num, paddle_op_num)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py
index 399501618b6..6b465f4b2af 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py
@@ -21,6 +21,11 @@ from paddle import compat as cpt
 from paddle.fluid.initializer import NumpyArrayInitializer
 from paddle.fluid.framework import convert_np_dtype_to_dtype_
 
+from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
+from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
+from paddle.fluid.framework import IrGraph, IrNode, Operator
+from paddle.fluid.executor import global_scope
+
 
 class TensorConfig:
     '''
@@ -160,3 +165,181 @@ def create_fake_model(program_config):
     executor.run(util_program)
     params = scope.find_var("out_var_0").get_bytes()
     return model, params
+
+
+def create_quant_model(model,
+                       params,
+                       activation_quantize_type='moving_average_abs_max',
+                       weight_quantize_type='channel_wise_abs_max',
+                       save=False):
+    place = paddle.CUDAPlace(0)
+    scope = global_scope()
+    exe = paddle.static.Executor(place)
+    [inference_program, feed_target_names,
+     fetch_targets] = paddle.static.load_inference_model(
+         path_prefix=None,
+         executor=exe,
+         model_filename=model,
+         params_filename=params)
+    graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
+
+    transform_pass = QuantizationTransformPass(
+        scope=scope,
+        place=place,
+        activation_quantize_type=activation_quantize_type,
+        weight_quantize_type=weight_quantize_type)
+    transform_pass.apply(graph)
+
+    out_scale_op_list = [
+        "conv2d",
+        "depthwise_conv2d",
+        "mul",
+        "matmul",
+        "relu",
+        "leaky_relu",
+        "relu6",
+        "sigmoid",
+        "tanh",
+        "prelu",
+        "swish",
+        "softmax",
+        "batch_norm",
+        "layer_norm",
+        "elementwise_add",
+        "pool2d",
+        "reshape2",
+        "transpose2",
+        "concat",
+        "elementwise_mul",
+        "scale",
+        "slice",
+        "hard_swish",
+        "hard_sigmoid",
+        "conv2d_transpose",
+        "gru",
+        "bilinear_interp",
+        "nearest_interp",
+        "trilinear_interp",
+        "flatten",
+        "flatten2",
+        "transpose",
+        "pad2d",
+        "reshape",
+        "layer_norm",
+    ]
+    op_real_in_out_name = {
+        "conv2d": [["Input", "Filter"], ["Output"]],
+        "depthwise_conv2d": [["Input", "Filter"], ["Output"]],
+        "conv2d_transpose": [["Input", "Filter"], ["Output"]],
+        "mul": [["X", "Y"], ["Out"]],
+        "matmul": [["X", "Y"], ["Out"]],
+        "pool2d": [["X"], ["Out"]],
+        "elementwise_add": [["X", "Y"], ["Out"]],
+        "concat": [["X"], ["Out"]],
+        "softmax": [["X"], ["Out"]],
+        "argmax": [["X"], ["Out"]],
+        "transpose": [["X"], ["Out"]],
+        "equal": [["X", "Y"], ["Out"]],
+        "gather": [["X"], ["Out"]],
+        "greater_equal": [["X", "Y"], ["Out"]],
+        "greater_than": [["X", "Y"], ["Out"]],
+        "less_equal": [["X", "Y"], ["Out"]],
+        "less_than": [["X", "Y"], ["Out"]],
+        "mean": [["X"], ["Out"]],
+        "not_equal": [["X", "Y"], ["Out"]],
+        "reshape": [["X"], ["Out"]],
+        "reshape2": [["X"], ["Out"]],
+        "transpose2": [["X"], ["Out"]],
+        "bilinear_interp": [["X"], ["Out"]],
+        "nearest_interp": [["X"], ["Out"]],
+        "trilinear_interp": [["X"], ["Out"]],
+        "slice": [["Input"], ["Out"]],
+        "squeeze": [["X"], ["Out"]],
+        "elementwise_sub": [["X", "Y"], ["Out"]],
+        "relu": [["X"], ["Out"]],
+        "relu6": [["X"], ["Out"]],
+        "leaky_relu": [["X"], ["Out"]],
+        "prelu": [["X"], ["Out"]],
+        "tanh": [["X"], ["Out"]],
+        "swish": [["X"], ["Out"]],
+        "dropout": [["X"], ["Out"]],
+        "batch_norm": [["X"], ["Y"]],
+        "layer_norm": [["X"], ["Y"]],
+        "sigmoid": [["X"], ["Out"]],
+        "elementwise_mul": [["X", "Y"], ["Out"]],
+        "scale": [["X"], ["Out"]],
+        "hard_swish": [["X"], ["Out"]],
+        "hard_sigmoid": [["X"], ["Out"]],
+        "gru": [["Input", "Weight"], ["Hidden"]],
+        "lstm": [["Input", "Weight"], ["Hidden"]],
+        "pad2d": [["X"], ["Out"]],
+        "flatten": [["X"], ["Out"]],
+        "flatten2": [["X"], ["Out"]],
+    }
+
+    def _get_op_output_var_names(op):
+        """Get the output variable names of the given op."""
+        assert isinstance(op, (IrNode, Operator)), \
+            "The input op should be IrNode or Operator."
+        var_names = []
+        op_name = op.name() if isinstance(op, IrNode) \
+            else op.type
+        if op_name not in op_real_in_out_name:
+            return []
+
+        name_list = op_real_in_out_name[op_name][1]
+        for name in name_list:
+            var_name = op.output(name)
+            if isinstance(var_name, list):
+                var_names.extend(var_name)
+            else:
+                var_names.append(var_name)
+        return var_names
+
+    op_nodes = graph.all_op_nodes()
+    for op_node in op_nodes:
+        if op_node.name() in out_scale_op_list:
+            var_names = _get_op_output_var_names(op_node)
+            for var_name in var_names:
+                in_node = graph._find_node_by_name(op_node.outputs, var_name)
+                if in_node.dtype() not in \
+                    [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]:
+                    continue
+
+                op_node.op()._set_attr("out_threshold", 3.0)
+
+    # Freeze graph for inference, but the weights of fc/conv are still float type.
+    freeze_pass = QuantizationFreezePass(
+        scope=scope, place=place, weight_quantize_type=weight_quantize_type)
+    freeze_pass.apply(graph)
+
+    main_program = graph.to_program()
+
+    # modify fake_quantize_moving_average_abs_max(InScale) and fake_channel_wise_dequantize_max_abs(Scales)
+    op_nodes = graph.all_op_nodes()
+    for op_node in op_nodes:
+        if op_node.name() == 'fake_quantize_moving_average_abs_max':
+            var_name = op_node.input("InScale")[0]
+            tensor = scope.var(var_name).get_tensor()
+            tensor.set(np.array([1], dtype=np.float32), place)
+        elif op_node.name() == 'fake_channel_wise_dequantize_max_abs':
+            var_name = op_node.input("Scales")[0]
+            tensor = scope.var(var_name).get_tensor()
+            tensor.set(np.ones(tensor.shape(), dtype=np.float32), place)
+
+    if save:
+        fluid.io.save_inference_model(
+            'test_inference_model',
+            feed_target_names,
+            fetch_targets,
+            exe,
+            main_program=main_program)
+
+    feed_vars = [
+        main_program.global_block().var(name) for name in feed_target_names
+    ]
+    serialized_program = paddle.static.serialize_program(
+        feed_vars, fetch_targets, program=main_program)
+    serialized_params = paddle.static.serialize_persistables(
+        feed_vars, fetch_targets, executor=exe, program=main_program)
+    return serialized_program, serialized_params
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py
index 98c3367b3f2..d3f1eca2bf6 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py
@@ -15,6 +15,7 @@
 from trt_layer_auto_scan_test import TrtLayerAutoScanTest
 from program_config import TensorConfig
 import numpy as np
+import paddle.inference as paddle_infer
 
 
 class TrtConvertConv2dTest(TrtLayerAutoScanTest):
@@ -59,8 +60,33 @@ class TrtConvertConv2dTest(TrtLayerAutoScanTest):
         self.program_inputs = {"input_data": input_data}
         self.program_outputs = ["relu_output_data"]
 
-    def test_check_output(self):
-        self.run_test()
+    def test_check_fp32_output(self):
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        # The fused tensorrt engine num is 1, and the paddle op num is 2 (feed and fetch).
+        self.run_test(trt_engine_num=1, paddle_op_num=2, threshold=1e-5)
+
+    def test_check_fp16_output(self):
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        self.run_test(trt_engine_num=1, paddle_op_num=2, threshold=1e-2)
+
+    def test_dynamic_shape_fp32_check_output(self):
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
+        self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
+        self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]}
+        self.run_test(trt_engine_num=1, paddle_op_num=2, threshold=1e-5)
+
+    def test_dynamic_shape_fp16_check_output(self):
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
+        self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
+        self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]}
+        self.run_test(trt_engine_num=1, paddle_op_num=2, threshold=1e-2)
+
+    def test_trt_int8_check_output(self):
+        self.trt_param.precision = paddle_infer.PrecisionType.Int8
+        self.run_test(
+            trt_engine_num=1, paddle_op_num=2, quant=True, threshold=1e-1)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py
index 589916ad390..bf6fc7a24a3 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py
@@ -16,6 +16,7 @@ import numpy as np
 import unittest
 import itertools
 import abc
+import logging
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
@@ -26,6 +27,9 @@ from typing import *
 from program_config import TensorConfig, OpConfig, ProgramConfig
 from auto_scan_test import AutoScanTest
 
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(filename)s - %(message)s")
+
 
 class TrtLayerAutoScanTest(AutoScanTest):
     class TensorRTParam:
@@ -42,6 +46,18 @@ class TrtLayerAutoScanTest(AutoScanTest):
             self.use_static = use_static
             self.use_calib_mode = use_calib_mode
 
+    class DynamicShapeParam:
+        '''
+        Prepare TensorRT subgraph engine dynamic shape parameters.
+ ''' + + def __init__(self, min_input_shape, max_input_shape, optim_input_shape, + disable_trt_plugin_fp16): + self.min_input_shape = min_input_shape + self.max_input_shape = max_input_shape + self.optim_input_shape = optim_input_shape + self.disable_trt_plugin_fp16 = disable_trt_plugin_fp16 + def __init__(self, methodName='runTest'): super(TrtLayerAutoScanTest, self).__init__(methodName) self.trt_param = self.TensorRTParam( @@ -51,6 +67,7 @@ class TrtLayerAutoScanTest(AutoScanTest): precision=paddle_infer.PrecisionType.Float32, use_static=False, use_calib_mode=False) + self.dynamic_shape = self.DynamicShapeParam({}, {}, {}, False) def update_program_input_and_weight_with_attr(self, op_attr_list): raise NotImplementedError @@ -96,6 +113,7 @@ class TrtLayerAutoScanTest(AutoScanTest): config = paddle_infer.Config() config.enable_use_gpu(100, 0) if use_trt: + config.switch_ir_debug() config.enable_tensorrt_engine( max_batch_size=self.trt_param.max_batch_size, workspace_size=self.trt_param.workspace_size, @@ -103,13 +121,22 @@ class TrtLayerAutoScanTest(AutoScanTest): precision_mode=precision_mode, use_static=self.trt_param.use_static, use_calib_mode=self.trt_param.use_calib_mode) + if len(self.dynamic_shape.min_input_shape + ) != 0 and self.dynamic_shape.min_input_shape.keys( + ) == self.dynamic_shape.max_input_shape.keys( + ) and self.dynamic_shape.min_input_shape.keys( + ) == self.dynamic_shape.opt_input_shape.keys(): + config.set_trt_dynamic_shape_info( + self.dynamic_shape.min_input_shape, + self.dynamic_shape.max_input_shape, + self.dynamic_shape.opt_input_shape, + self.dynamic_shape.disable_trt_plugin_fp16) return config @abc.abstractmethod def sample_predictor_configs(self): + logging.info('--------- gpu inference ---------') yield self.create_program_config(use_trt=False) + logging.info('--------- trt inference ---------') yield self.create_program_config( use_trt=True, precision_mode=self.trt_param.precision) - if self.trt_param.precision == paddle_infer.PrecisionType.Float32: - yield self.create_program_config( - use_trt=True, precision_mode=paddle_infer.PrecisionType.Half) diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py index d053ec4a803..d251e273bef 100644 --- a/python/paddle/static/io.py +++ b/python/paddle/static/io.py @@ -757,7 +757,7 @@ def load_inference_model(path_prefix, executor, **kwargs): "params_filename cannot be None when path_prefix is None.") load_dirname = '' program_bytes = model_filename - params_filename = params_filename + params_bytes = params_filename # load from file else: # check and norm path_prefix @@ -795,12 +795,12 @@ def load_inference_model(path_prefix, executor, **kwargs): program_bytes = load_from_file(model_path) load_dirname = os.path.dirname(params_path) params_filename = os.path.basename(params_path) + # load params data + params_path = os.path.join(load_dirname, params_filename) + params_bytes = load_from_file(params_path) # deserialize bytes to program program = deserialize_program(program_bytes) - # load params data - params_path = os.path.join(load_dirname, params_filename) - params_bytes = load_from_file(params_path) # deserialize bytes to params deserialize_persistables(program, params_bytes, executor) -- GitLab