From 17188e8d5173ff86464603370b49b9531fd6f3b2 Mon Sep 17 00:00:00 2001
From: Wilber
Date: Mon, 23 Aug 2021 19:04:29 +0800
Subject: [PATCH] trt convert ut add dynamic_shape and int8, etc. (#35061)

---
 paddle/fluid/framework/ir/graph_helper.cc     |   1 +
 paddle/fluid/framework/ir/graph_viz_pass.cc   |  28 +++
 .../unittests/ir/inference/auto_scan_test.py  |  42 +++-
 .../unittests/ir/inference/program_config.py  | 183 ++++++++++++++++++
 .../ir/inference/test_trt_convert_conv2d.py   |  30 ++-
 .../ir/inference/trt_layer_auto_scan_test.py  |  33 +++-
 python/paddle/static/io.py                    |   8 +-
 7 files changed, 308 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index 652ce77d844..a73bc487c92 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -535,6 +535,7 @@ void GraphToProgram(const Graph &graph, ProgramDesc *program,
       block = program_pb.add_blocks();
       block->set_idx(idx);
+      block->set_parent_idx(kRootBlockIndex);
       GraphToBlock(*graph.GetSubGraph(idx), block, sort_kind);
     }
   } else {
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
index d8f90d5a757..f2c711fb6f0 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include <fstream>
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_printer.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/dot.h"
 
 namespace paddle {
@@ -44,6 +47,31 @@ void GraphVizPass::ApplyImpl(ir::Graph* graph) const {
                 "Can not open file %s for printing the graph.", graph_viz_path));
   std::ostream& sout = *fout;
 
+  // Serialize only the model file.
+  std::string program_path;
+  std::size_t found1 = graph_viz_path.find("_ir_");
+  std::size_t found2 = graph_viz_path.find(".dot");
+  if (found1 != std::string::npos && found2 != std::string::npos) {
+    ProgramDesc program_desc;
+    GraphToProgram(*graph, &program_desc);
+    // TODO(wilber): GraphToProgram seems to have bugs.
+    for (size_t i = 0; i < program_desc.Size(); ++i) {
+      for (size_t j = 0; j < program_desc.Block(i).OpSize(); ++j) {
+        if (program_desc.Block(i).Op(j)->Type() == "tensorrt_engine") {
+          program_desc.Block(i).Op(j)->RemoveAttr("sub_block");
+        }
+      }
+    }
+    std::string program_bytes = program_desc.Proto()->SerializeAsString();
+    // rename from "17_ir_fc_fuse_pass.dot" to "fc_fuse_pass.pdmodel"
+    program_path =
+        graph_viz_path.substr(found1 + 4, found2 - found1 - 4) + ".pdmodel";
+    std::ofstream file(program_path.c_str(), std::ios::binary);
+    file.write(program_bytes.c_str(), program_bytes.size());
+    file.close();
+    VLOG(3) << "serialize program to " << program_path;
+  }
+
   std::unordered_map<const ir::Node*, std::string> node2dot;
   Dot dot;
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
index 7d749cca5c2..3a899a2e5e0 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
@@ -15,6 +15,7 @@
 import numpy as np
 import unittest
 import abc
+import os
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import NumpyArrayInitializer
@@ -22,14 +23,13 @@ import paddle.fluid.core as core
 from paddle import compat as cpt
 import paddle.inference as paddle_infer
 from typing import Optional, List, Callable, Dict, Any, Set
-from program_config import TensorConfig, OpConfig, ProgramConfig, create_fake_model
+from program_config import TensorConfig, OpConfig, ProgramConfig, create_fake_model, create_quant_model
 
 
 class AutoScanTest(unittest.TestCase):
     def __init__(self, methodName='runTest'):
         paddle.enable_static()
         super(AutoScanTest, self).__init__(methodName)
-        self.threshold = 1e-5
 
     @abc.abstractmethod
     def sample_program_configs(self) -> List[ProgramConfig]:
@@ -56,11 +56,31 @@ class AutoScanTest(unittest.TestCase):
             input_tensor.copy_from_cpu(feed_data[name])
         predictor.run()
         result = {}
-        for out_name in prog_config.outputs:
-            result[out_name] = predictor.get_output_handle(
-                out_name).copy_to_cpu()
+        for out_name, o_name in zip(prog_config.outputs,
+                                    predictor.get_output_names()):
+            result[out_name] = predictor.get_output_handle(o_name).copy_to_cpu()
         return result
 
+    def assert_op_size(self, trt_engine_num, paddle_op_num):
+        cur_path = os.path.dirname(__file__)
+        last_passed_program = os.path.join(
+            cur_path, 'transpose_flatten_concat_fuse_pass.pdmodel')
+        model_bytes = paddle.static.load_from_file(last_passed_program)
+        pg = paddle.static.deserialize_program(model_bytes)
+        main_block = pg.desc.block(0)
+        op_size = main_block.op_size()
+        op_types = [
+            main_block.op(i).type() == 'tensorrt_engine' for i in range(op_size)
+        ]
+        trt_engine_size = sum(op_types)
+        paddle_op_size = op_size - trt_engine_size
+        self.assertTrue(trt_engine_size == trt_engine_num,
+                        'expected trt_engine_num is {}, but got {}!'.format(
+                            trt_engine_num, trt_engine_size))
+        self.assertTrue(paddle_op_size == paddle_op_num,
+                        'expected paddle_op_num is {}, but got {}!'.format(
+                            paddle_op_num, paddle_op_size))
+
     def assert_tensors_near(self,
                             threshold: float,
                             tensors: List[Dict[str, np.array]]):
@@ -73,9 +93,15 @@ class AutoScanTest(unittest.TestCase):
                     first[key], arr, atol=threshold),
                 "Output has diff between GPU and TensorRT. ")
 
-    def run_test(self):
+    def run_test(self,
+                 trt_engine_num: int,
+                 paddle_op_num: int,
+                 threshold=1e-5,
+                 quant=False):
         for prog_config in self.sample_program_configs():
             model, params = create_fake_model(prog_config)
+            if quant:
+                model, params = create_quant_model(model, params)
             for batch_size in self.batch_size_set:
                 feed_data = {}
                 for name, tensor_config in prog_config.inputs.items():
@@ -88,5 +114,5 @@ class AutoScanTest(unittest.TestCase):
                     results.append(
                         self.run_test_config(model, params, prog_config,
                                              pred_config, feed_data))
-                self.assert_tensors_near(
-                    threshold=self.threshold, tensors=results)
+                self.assert_tensors_near(threshold=threshold, tensors=results)
+                self.assert_op_size(trt_engine_num, paddle_op_num)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py
index 399501618b6..6b465f4b2af 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py
@@ -21,6 +21,11 @@ from paddle import compat as cpt
 from paddle.fluid.initializer import NumpyArrayInitializer
 from paddle.fluid.framework import convert_np_dtype_to_dtype_
 
+from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
+from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
+from paddle.fluid.framework import IrGraph, IrNode, Operator
+from paddle.fluid.executor import global_scope
+
 
 class TensorConfig:
     '''
@@ -160,3 +165,181 @@ def create_fake_model(program_config):
     executor.run(util_program)
     params = scope.find_var("out_var_0").get_bytes()
     return model, params
+
+
+def create_quant_model(model,
+                       params,
+                       activation_quantize_type='moving_average_abs_max',
+                       weight_quantize_type='channel_wise_abs_max',
+                       save=False):
+    place = paddle.CUDAPlace(0)
+    scope = global_scope()
+    exe = paddle.static.Executor(place)
+    [inference_program, feed_target_names,
+     fetch_targets] = paddle.static.load_inference_model(
+         path_prefix=None,
+         executor=exe,
+         model_filename=model,
+         params_filename=params)
+    graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
+
+    transform_pass = QuantizationTransformPass(
+        scope=scope,
+        place=place,
+        activation_quantize_type=activation_quantize_type,
+        weight_quantize_type=weight_quantize_type)
+    transform_pass.apply(graph)
+
+    out_scale_op_list = [
+        "conv2d",
+        "depthwise_conv2d",
+        "mul",
+        "matmul",
+        "relu",
+        "leaky_relu",
+        "relu6",
+        "sigmoid",
+        "tanh",
+        "prelu",
+        "swish",
+        "softmax",
+        "batch_norm",
+        "layer_norm",
+        "elementwise_add",
+        "pool2d",
+        "reshape2",
+        "transpose2",
+        "concat",
+        "elementwise_mul",
+        "scale",
+        "slice",
+        "hard_swish",
+        "hard_sigmoid",
+        "conv2d_transpose",
+        "gru",
+        "bilinear_interp",
+        "nearest_interp",
+        "trilinear_interp",
+        "flatten",
+        "flatten2",
+        "transpose",
+        "pad2d",
+        "reshape",
+        "layer_norm",
+    ]
+    op_real_in_out_name = {
+        "conv2d": [["Input", "Filter"], ["Output"]],
+        "depthwise_conv2d": [["Input", "Filter"], ["Output"]],
+        "conv2d_transpose": [["Input", "Filter"], ["Output"]],
+        "mul": [["X", "Y"], ["Out"]],
+        "matmul": [["X", "Y"], ["Out"]],
+        "pool2d": [["X"], ["Out"]],
+        "elementwise_add": [["X", "Y"], ["Out"]],
+        "concat": [["X"], ["Out"]],
+        "softmax": [["X"], ["Out"]],
+        "argmax": [["X"], ["Out"]],
+        "transpose": [["X"], ["Out"]],
+        "equal": [["X", "Y"], ["Out"]],
+        "gather": [["X"], ["Out"]],
+        "greater_equal": [["X", "Y"], ["Out"]],
+        "greater_than": [["X", "Y"], ["Out"]],
+        "less_equal": [["X", "Y"], ["Out"]],
+        "less_than": [["X", "Y"], ["Out"]],
+        "mean": [["X"], ["Out"]],
+        "not_equal": [["X", "Y"], ["Out"]],
+        "reshape": [["X"], ["Out"]],
+        "reshape2": [["X"], ["Out"]],
+        "transpose2": [["X"], ["Out"]],
+        "bilinear_interp": [["X"], ["Out"]],
+        "nearest_interp": [["X"], ["Out"]],
+        "trilinear_interp": [["X"], ["Out"]],
+        "slice": [["Input"], ["Out"]],
+        "squeeze": [["X"], ["Out"]],
+        "elementwise_sub": [["X", "Y"], ["Out"]],
+        "relu": [["X"], ["Out"]],
+        "relu6": [["X"], ["Out"]],
+        "leaky_relu": [["X"], ["Out"]],
+        "prelu": [["X"], ["Out"]],
+        "tanh": [["X"], ["Out"]],
+        "swish": [["X"], ["Out"]],
+        "dropout": [["X"], ["Out"]],
+        "batch_norm": [["X"], ["Y"]],
+        "layer_norm": [["X"], ["Y"]],
+        "sigmoid": [["X"], ["Out"]],
+        "elementwise_mul": [["X", "Y"], ["Out"]],
+        "scale": [["X"], ["Out"]],
+        "hard_swish": [["X"], ["Out"]],
+        "hard_sigmoid": [["X"], ["Out"]],
+        "gru": [["Input", "Weight"], ["Hidden"]],
+        "lstm": [["Input", "Weight"], ["Hidden"]],
+        "pad2d": [["X"], ["Out"]],
+        "flatten": [["X"], ["Out"]],
+        "flatten2": [["X"], ["Out"]],
+    }
+
+    def _get_op_output_var_names(op):
+        """Get the output variable names of the given op."""
+        assert isinstance(op, (IrNode, Operator)), \
+            "The input op should be IrNode or Operator."
+        var_names = []
+        op_name = op.name() if isinstance(op, IrNode) \
+            else op.type
+        if op_name not in op_real_in_out_name:
+            return []
+
+        name_list = op_real_in_out_name[op_name][1]
+        for name in name_list:
+            var_name = op.output(name)
+            if isinstance(var_name, list):
+                var_names.extend(var_name)
+            else:
+                var_names.append(var_name)
+        return var_names
+
+    op_nodes = graph.all_op_nodes()
+    for op_node in op_nodes:
+        if op_node.name() in out_scale_op_list:
+            var_names = _get_op_output_var_names(op_node)
+            for var_name in var_names:
+                in_node = graph._find_node_by_name(op_node.outputs, var_name)
+                if in_node.dtype() not in \
+                    [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]:
+                    continue
+
+                op_node.op()._set_attr("out_threshold", 3.0)
+
+    # Freeze graph for inference, but the weights of fc/conv are still float type.
+    freeze_pass = QuantizationFreezePass(
+        scope=scope, place=place, weight_quantize_type=weight_quantize_type)
+    freeze_pass.apply(graph)
+
+    main_program = graph.to_program()
+
+    # modify fake_quantize_moving_average_abs_max(InScale) and fake_channel_wise_dequantize_max_abs(Scales)
+    op_nodes = graph.all_op_nodes()
+    for op_node in op_nodes:
+        if op_node.name() == 'fake_quantize_moving_average_abs_max':
+            var_name = op_node.input("InScale")[0]
+            tensor = scope.var(var_name).get_tensor()
+            tensor.set(np.array([1], dtype=np.float32), place)
+        elif op_node.name() == 'fake_channel_wise_dequantize_max_abs':
+            var_name = op_node.input("Scales")[0]
+            tensor = scope.var(var_name).get_tensor()
+            tensor.set(np.ones(tensor.shape(), dtype=np.float32), place)
+
+    if save:
+        fluid.io.save_inference_model(
+            'test_inference_model',
+            feed_target_names,
+            fetch_targets,
+            exe,
+            main_program=main_program)
+
+    feed_vars = [
+        main_program.global_block().var(name) for name in feed_target_names
+    ]
+    serialized_program = paddle.static.serialize_program(
+        feed_vars, fetch_targets, program=main_program)
+    serialized_params = paddle.static.serialize_persistables(
+        feed_vars, fetch_targets, executor=exe, program=main_program)
+    return serialized_program, serialized_params
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py
index 98c3367b3f2..d3f1eca2bf6 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py
@@ -15,6 +15,7 @@
 from trt_layer_auto_scan_test import TrtLayerAutoScanTest
 from program_config import TensorConfig
 import numpy as np
+import paddle.inference as paddle_infer
 
 
 class TrtConvertConv2dTest(TrtLayerAutoScanTest):
@@ -59,8 +60,33 @@ class TrtConvertConv2dTest(TrtLayerAutoScanTest):
         self.program_inputs = {"input_data": input_data}
         self.program_outputs = ["relu_output_data"]
 
-    def test_check_output(self):
-        self.run_test()
+    def test_check_fp32_output(self):
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        # The fused tensorrt engine num is 1, and the paddle op num is 2 (feed and fetch).
+        self.run_test(trt_engine_num=1, paddle_op_num=2, threshold=1e-5)
+
+    def test_check_fp16_output(self):
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        self.run_test(trt_engine_num=1, paddle_op_num=2, threshold=1e-2)
+
+    def test_dynamic_shape_fp32_check_output(self):
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
+        self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
+        self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]}
+        self.run_test(trt_engine_num=1, paddle_op_num=2, threshold=1e-5)
+
+    def test_dynamic_shape_fp16_check_output(self):
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
+        self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
+        self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]}
+        self.run_test(trt_engine_num=1, paddle_op_num=2, threshold=1e-2)
+
+    def test_trt_int8_check_output(self):
+        self.trt_param.precision = paddle_infer.PrecisionType.Int8
+        self.run_test(
+            trt_engine_num=1, paddle_op_num=2, quant=True, threshold=1e-1)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py
index 589916ad390..bf6fc7a24a3 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py
@@ -16,6 +16,7 @@ import numpy as np
 import unittest
 import itertools
 import abc
+import logging
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
@@ -26,6 +27,9 @@ from typing import *
 from program_config import TensorConfig, OpConfig, ProgramConfig
 from auto_scan_test import AutoScanTest
 
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(filename)s - %(message)s")
+
 
 class TrtLayerAutoScanTest(AutoScanTest):
     class TensorRTParam:
@@ -42,6 +46,18 @@ class TrtLayerAutoScanTest(AutoScanTest):
             self.use_static = use_static
             self.use_calib_mode = use_calib_mode
 
+    class DynamicShapeParam:
+        '''
+        Prepare TensorRT subgraph engine dynamic shape parameters.
+ ''' + + def __init__(self, min_input_shape, max_input_shape, optim_input_shape, + disable_trt_plugin_fp16): + self.min_input_shape = min_input_shape + self.max_input_shape = max_input_shape + self.optim_input_shape = optim_input_shape + self.disable_trt_plugin_fp16 = disable_trt_plugin_fp16 + def __init__(self, methodName='runTest'): super(TrtLayerAutoScanTest, self).__init__(methodName) self.trt_param = self.TensorRTParam( @@ -51,6 +67,7 @@ class TrtLayerAutoScanTest(AutoScanTest): precision=paddle_infer.PrecisionType.Float32, use_static=False, use_calib_mode=False) + self.dynamic_shape = self.DynamicShapeParam({}, {}, {}, False) def update_program_input_and_weight_with_attr(self, op_attr_list): raise NotImplementedError @@ -96,6 +113,7 @@ class TrtLayerAutoScanTest(AutoScanTest): config = paddle_infer.Config() config.enable_use_gpu(100, 0) if use_trt: + config.switch_ir_debug() config.enable_tensorrt_engine( max_batch_size=self.trt_param.max_batch_size, workspace_size=self.trt_param.workspace_size, @@ -103,13 +121,22 @@ class TrtLayerAutoScanTest(AutoScanTest): precision_mode=precision_mode, use_static=self.trt_param.use_static, use_calib_mode=self.trt_param.use_calib_mode) + if len(self.dynamic_shape.min_input_shape + ) != 0 and self.dynamic_shape.min_input_shape.keys( + ) == self.dynamic_shape.max_input_shape.keys( + ) and self.dynamic_shape.min_input_shape.keys( + ) == self.dynamic_shape.opt_input_shape.keys(): + config.set_trt_dynamic_shape_info( + self.dynamic_shape.min_input_shape, + self.dynamic_shape.max_input_shape, + self.dynamic_shape.opt_input_shape, + self.dynamic_shape.disable_trt_plugin_fp16) return config @abc.abstractmethod def sample_predictor_configs(self): + logging.info('--------- gpu inference ---------') yield self.create_program_config(use_trt=False) + logging.info('--------- trt inference ---------') yield self.create_program_config( use_trt=True, precision_mode=self.trt_param.precision) - if self.trt_param.precision == paddle_infer.PrecisionType.Float32: - yield self.create_program_config( - use_trt=True, precision_mode=paddle_infer.PrecisionType.Half) diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py index d053ec4a803..d251e273bef 100644 --- a/python/paddle/static/io.py +++ b/python/paddle/static/io.py @@ -757,7 +757,7 @@ def load_inference_model(path_prefix, executor, **kwargs): "params_filename cannot be None when path_prefix is None.") load_dirname = '' program_bytes = model_filename - params_filename = params_filename + params_bytes = params_filename # load from file else: # check and norm path_prefix @@ -795,12 +795,12 @@ def load_inference_model(path_prefix, executor, **kwargs): program_bytes = load_from_file(model_path) load_dirname = os.path.dirname(params_path) params_filename = os.path.basename(params_path) + # load params data + params_path = os.path.join(load_dirname, params_filename) + params_bytes = load_from_file(params_path) # deserialize bytes to program program = deserialize_program(program_bytes) - # load params data - params_path = os.path.join(load_dirname, params_filename) - params_bytes = load_from_file(params_path) # deserialize bytes to params deserialize_persistables(program, params_bytes, executor) -- GitLab