Unverified commit 17188e8d, authored by W Wilber, committed by GitHub

trt convert ut add dynamic_shape and int8, etc. (#35061)

Parent a95db6a7
......@@ -535,6 +535,7 @@ void GraphToProgram(const Graph &graph, ProgramDesc *program,
block = program_pb.add_blocks();
block->set_idx(idx);
block->set_parent_idx(kRootBlockIndex);
GraphToBlock(*graph.GetSubGraph(idx), block, sort_kind);
}
} else {
......
......@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_printer.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/dot.h"
namespace paddle {
......@@ -44,6 +47,31 @@ void GraphVizPass::ApplyImpl(ir::Graph* graph) const {
"Can not open file %s for printing the graph.", graph_viz_path));
std::ostream& sout = *fout;
// Serialize only the model file (no parameters).
std::string program_path;
std::size_t found1 = graph_viz_path.find("_ir_");
std::size_t found2 = graph_viz_path.find(".dot");
if (found1 != std::string::npos && found2 != std::string::npos) {
ProgramDesc program_desc;
GraphToProgram(*graph, &program_desc);
// TODO(wilber): GraphToProgram seems to have bugs.
for (size_t i = 0; i < program_desc.Size(); ++i) {
for (size_t j = 0; j < program_desc.Block(i).OpSize(); ++j) {
if (program_desc.Block(i).Op(j)->Type() == "tensorrt_engine") {
program_desc.Block(i).Op(j)->RemoveAttr("sub_block");
}
}
}
std::string program_bytes = program_desc.Proto()->SerializeAsString();
// rename from "17_ir_fc_fuse_pass.dot" to "fc_fuse_pass.pdmodel"
program_path =
graph_viz_path.substr(found1 + 4, found2 - found1 - 4) + ".pdmodel";
std::ofstream file(program_path.c_str(), std::ios::binary);
file.write(program_bytes.c_str(), program_bytes.size());
file.close();
VLOG(3) << "serialize program to " << program_path;
}
std::unordered_map<const ir::Node*, std::string> node2dot;
Dot dot;
......
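The .pdmodel dumped here can be inspected from Python with the same APIs this patch uses in auto_scan_test.py. A minimal sketch, assuming a dump named fc_fuse_pass.pdmodel exists (the actual filename depends on which IR pass produced it):

    import paddle

    paddle.enable_static()
    # Load the raw protobuf bytes written by GraphVizPass.
    model_bytes = paddle.static.load_from_file('fc_fuse_pass.pdmodel')
    # Deserialize the bytes into a Program and list its ops.
    program = paddle.static.deserialize_program(model_bytes)
    for op in program.global_block().ops:
        print(op.type)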
......@@ -15,6 +15,7 @@
import numpy as np
import unittest
import abc
import os
import paddle
import paddle.fluid as fluid
from paddle.fluid.initializer import NumpyArrayInitializer
......@@ -22,14 +23,13 @@ import paddle.fluid.core as core
from paddle import compat as cpt
import paddle.inference as paddle_infer
from typing import Optional, List, Callable, Dict, Any, Set
from program_config import TensorConfig, OpConfig, ProgramConfig, create_fake_model
from program_config import TensorConfig, OpConfig, ProgramConfig, create_fake_model, create_quant_model
class AutoScanTest(unittest.TestCase):
def __init__(self, methodName='runTest'):
paddle.enable_static()
super(AutoScanTest, self).__init__(methodName)
self.threshold = 1e-5
@abc.abstractmethod
def sample_program_configs(self) -> List[ProgramConfig]:
......@@ -56,11 +56,31 @@ class AutoScanTest(unittest.TestCase):
input_tensor.copy_from_cpu(feed_data[name])
predictor.run()
result = {}
for out_name in prog_config.outputs:
result[out_name] = predictor.get_output_handle(
out_name).copy_to_cpu()
for out_name, o_name in zip(prog_config.outputs,
predictor.get_output_names()):
result[out_name] = predictor.get_output_handle(o_name).copy_to_cpu()
return result
def assert_op_size(self, trt_engine_num, paddle_op_num):
cur_path = os.path.dirname(__file__)
last_passed_program = os.path.join(
cur_path, 'transpose_flatten_concat_fuse_pass.pdmodel')
model_bytes = paddle.static.load_from_file(last_passed_program)
pg = paddle.static.deserialize_program(model_bytes)
main_block = pg.desc.block(0)
op_size = main_block.op_size()
op_types = [
main_block.op(i).type() == 'tensorrt_engine' for i in range(op_size)
]
trt_engine_size = sum(op_types)
paddle_op_size = op_size - trt_engine_size
self.assertTrue(trt_engine_size == trt_engine_num,
'expected trt_engine_num is {}, but got {}!'.format(
trt_engine_num, trt_engine_size))
self.assertTrue(paddle_op_size == paddle_op_num,
'expected paddle_op_num is {}, but got {}!'.format(
paddle_op_num, paddle_op_size))
def assert_tensors_near(self,
threshold: float,
tensors: List[Dict[str, np.array]]):
......@@ -73,9 +93,15 @@ class AutoScanTest(unittest.TestCase):
first[key], arr, atol=threshold),
"Output has diff between GPU and TensorRT. ")
def run_test(self):
def run_test(self,
trt_engine_num: int,
paddle_op_num: int,
threshold=1e-5,
quant=False):
for prog_config in self.sample_program_configs():
model, params = create_fake_model(prog_config)
if quant:
model, params = create_quant_model(model, params)
for batch_size in self.batch_size_set:
feed_data = {}
for name, tensor_config in prog_config.inputs.items():
......@@ -88,5 +114,5 @@ class AutoScanTest(unittest.TestCase):
results.append(
self.run_test_config(model, params, prog_config,
pred_config, feed_data))
self.assert_tensors_near(
threshold=self.threshold, tensors=results)
self.assert_tensors_near(threshold=threshold, tensors=results)
self.assert_op_size(trt_engine_num, paddle_op_num)
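For reference, a derived test drives the new run_test signature the way the conv2d test further down does; a minimal sketch (class name and expected counts are illustrative):

    class TrtConvertExampleTest(TrtLayerAutoScanTest):
        def test_check_fp32_output(self):
            self.trt_param.precision = paddle_infer.PrecisionType.Float32
            # Expect one fused tensorrt_engine op plus the feed and fetch ops.
            self.run_test(trt_engine_num=1, paddle_op_num=2, threshold=1e-5)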
......@@ -21,6 +21,11 @@ from paddle import compat as cpt
from paddle.fluid.initializer import NumpyArrayInitializer
from paddle.fluid.framework import convert_np_dtype_to_dtype_
from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
from paddle.fluid.framework import IrGraph, IrNode, Operator
from paddle.fluid.executor import global_scope
class TensorConfig:
'''
......@@ -160,3 +165,181 @@ def create_fake_model(program_config):
executor.run(util_program)
params = scope.find_var("out_var_0").get_bytes()
return model, params
def create_quant_model(model,
params,
activation_quantize_type='moving_average_abs_max',
weight_quantize_type='channel_wise_abs_max',
save=False):
place = paddle.CUDAPlace(0)
scope = global_scope()
exe = paddle.static.Executor(place)
[inference_program, feed_target_names,
fetch_targets] = paddle.static.load_inference_model(
path_prefix=None,
executor=exe,
model_filename=model,
params_filename=params)
graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
transform_pass = QuantizationTransformPass(
scope=scope,
place=place,
activation_quantize_type=activation_quantize_type,
weight_quantize_type=weight_quantize_type)
transform_pass.apply(graph)
out_scale_op_list = [
"conv2d",
"depthwise_conv2d",
"mul",
"matmul",
"relu",
"leaky_relu",
"relu6",
"sigmoid",
"tanh",
"prelu",
"swish",
"softmax",
"batch_norm",
"layer_norm",
"elementwise_add",
"pool2d",
"reshape2",
"transpose2",
"concat",
"elementwise_mul",
"scale",
"slice",
"hard_swish",
"hard_sigmoid",
"conv2d_transpose",
"gru",
"bilinear_interp",
"nearest_interp",
"trilinear_interp",
"flatten",
"flatten2",
"transpose",
"pad2d",
"reshape",
"layer_norm",
]
op_real_in_out_name = {
"conv2d": [["Input", "Filter"], ["Output"]],
"depthwise_conv2d": [["Input", "Filter"], ["Output"]],
"conv2d_transpose": [["Input", "Filter"], ["Output"]],
"mul": [["X", "Y"], ["Out"]],
"matmul": [["X", "Y"], ["Out"]],
"pool2d": [["X"], ["Out"]],
"elementwise_add": [["X", "Y"], ["Out"]],
"concat": [["X"], ["Out"]],
"softmax": [["X"], ["Out"]],
"argmax": [["X"], ["Out"]],
"transpose": [["X"], ["Out"]],
"equal": [["X", "Y"], ["Out"]],
"gather": [["X"], ["Out"]],
"greater_equal": [["X", "Y"], ["Out"]],
"greater_than": [["X", "Y"], ["Out"]],
"less_equal": [["X", "Y"], ["Out"]],
"less_than": [["X", "Y"], ["Out"]],
"mean": [["X"], ["Out"]],
"not_equal": [["X", "Y"], ["Out"]],
"reshape": [["X"], ["Out"]],
"reshape2": [["X"], ["Out"]],
"transpose2": [["X"], ["Out"]],
"bilinear_interp": [["X"], ["Out"]],
"nearest_interp": [["X"], ["Out"]],
"trilinear_interp": [["X"], ["Out"]],
"slice": [["Input"], ["Out"]],
"squeeze": [["X"], ["Out"]],
"elementwise_sub": [["X", "Y"], ["Out"]],
"relu": [["X"], ["Out"]],
"relu6": [["X"], ["Out"]],
"leaky_relu": [["X"], ["Out"]],
"prelu": [["X"], ["Out"]],
"tanh": [["X"], ["Out"]],
"swish": [["X"], ["Out"]],
"dropout": [["X"], ["Out"]],
"batch_norm": [["X"], ["Y"]],
"layer_norm": [["X"], ["Y"]],
"sigmoid": [["X"], ["Out"]],
"elementwise_mul": [["X", "Y"], ["Out"]],
"scale": [["X"], ["Out"]],
"hard_swish": [["X"], ["Out"]],
"hard_sigmoid": [["X"], ["Out"]],
"gru": [["Input", "Weight"], ["Hidden"]],
"lstm": [["Input", "Weight"], ["Hidden"]],
"pad2d": [["X"], ["Out"]],
"flatten": [["X"], ["Out"]],
"flatten2": [["X"], ["Out"]],
}
def _get_op_output_var_names(op):
""" """
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
var_names = []
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in op_real_in_out_name:
return []
name_list = op_real_in_out_name[op_name][1]
for name in name_list:
var_name = op.output(name)
if isinstance(var_name, list):
var_names.extend(var_name)
else:
var_names.append(var_name)
return var_names
op_nodes = graph.all_op_nodes()
for op_node in op_nodes:
if op_node.name() in out_scale_op_list:
var_names = _get_op_output_var_names(op_node)
for var_name in var_names:
in_node = graph._find_node_by_name(op_node.outputs, var_name)
if in_node.dtype() not in \
[core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]:
continue
op_node.op()._set_attr("out_threshold", 3.0)
# Freeze the graph for inference; the weights of fc/conv remain in float.
freeze_pass = QuantizationFreezePass(
scope=scope, place=place, weight_quantize_type=weight_quantize_type)
freeze_pass.apply(graph)
main_program = graph.to_program()
# Fill dummy scale values for fake_quantize_moving_average_abs_max (InScale) and fake_channel_wise_dequantize_max_abs (Scales).
op_nodes = graph.all_op_nodes()
for op_node in op_nodes:
if op_node.name() == 'fake_quantize_moving_average_abs_max':
var_name = op_node.input("InScale")[0]
tensor = scope.var(var_name).get_tensor()
tensor.set(np.array([1], dtype=np.float32), place)
elif op_node.name() == 'fake_channel_wise_dequantize_max_abs':
var_name = op_node.input("Scales")[0]
tensor = scope.var(var_name).get_tensor()
tensor.set(np.ones(tensor.shape(), dtype=np.float32), place)
if save:
fluid.io.save_inference_model(
'test_inference_model',
feed_target_names,
fetch_targets,
exe,
main_program=main_program)
feed_vars = [
main_program.global_block().var(name) for name in feed_target_names
]
serialized_program = paddle.static.serialize_program(
feed_vars, fetch_targets, program=main_program)
serialized_params = paddle.static.serialize_persistables(
feed_vars, fetch_targets, executor=exe, program=main_program)
return serialized_program, serialized_params
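A minimal usage sketch of create_quant_model, mirroring the quant branch of run_test above (it requires a CUDA device, since the function hard-codes paddle.CUDAPlace(0)):

    # model/params are the serialized program and parameter bytes
    # produced by create_fake_model(prog_config).
    model, params = create_fake_model(prog_config)
    # Insert fake quant/dequant ops, freeze the graph, and fill dummy scales.
    model, params = create_quant_model(model, params)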
......@@ -15,6 +15,7 @@
from trt_layer_auto_scan_test import TrtLayerAutoScanTest
from program_config import TensorConfig
import numpy as np
import paddle.inference as paddle_infer
class TrtConvertConv2dTest(TrtLayerAutoScanTest):
......@@ -59,8 +60,33 @@ class TrtConvertConv2dTest(TrtLayerAutoScanTest):
self.program_inputs = {"input_data": input_data}
self.program_outputs = ["relu_output_data"]
def test_check_output(self):
self.run_test()
def test_check_fp32_output(self):
self.trt_param.precision = paddle_infer.PrecisionType.Float32
# The fused TensorRT engine num is 1, and the paddle op num is 2 (feed and fetch).
self.run_test(trt_engine_num=1, paddle_op_num=2, threshold=1e-5)
def test_check_fp16_output(self):
self.trt_param.precision = paddle_infer.PrecisionType.Half
self.run_test(trt_engine_num=1, paddle_op_num=2, threshold=1e-2)
def test_dynamic_shape_fp32_check_output(self):
self.trt_param.precision = paddle_infer.PrecisionType.Float32
self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]}
self.run_test(trt_engine_num=1, paddle_op_num=2, threshold=1e-5)
def test_dynamic_shape_fp16_check_output(self):
self.trt_param.precision = paddle_infer.PrecisionType.Half
self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]}
self.run_test(trt_engine_num=1, paddle_op_num=2, threshold=1e-2)
def test_trt_int8_check_output(self):
self.trt_param.precision = paddle_infer.PrecisionType.Int8
self.run_test(
trt_engine_num=1, paddle_op_num=2, quant=True, threshold=1e-1)
if __name__ == "__main__":
......
......@@ -16,6 +16,7 @@ import numpy as np
import unittest
import itertools
import abc
import logging
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
......@@ -26,6 +27,9 @@ from typing import *
from program_config import TensorConfig, OpConfig, ProgramConfig
from auto_scan_test import AutoScanTest
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(filename)s - %(message)s")
class TrtLayerAutoScanTest(AutoScanTest):
class TensorRTParam:
......@@ -42,6 +46,18 @@ class TrtLayerAutoScanTest(AutoScanTest):
self.use_static = use_static
self.use_calib_mode = use_calib_mode
class DynamicShapeParam:
'''
Prepare TensorRT subgraph engine dynamic shape parameters.
'''
def __init__(self, min_input_shape, max_input_shape, opt_input_shape,
disable_trt_plugin_fp16):
self.min_input_shape = min_input_shape
self.max_input_shape = max_input_shape
self.opt_input_shape = opt_input_shape
self.disable_trt_plugin_fp16 = disable_trt_plugin_fp16
def __init__(self, methodName='runTest'):
super(TrtLayerAutoScanTest, self).__init__(methodName)
self.trt_param = self.TensorRTParam(
......@@ -51,6 +67,7 @@ class TrtLayerAutoScanTest(AutoScanTest):
precision=paddle_infer.PrecisionType.Float32,
use_static=False,
use_calib_mode=False)
self.dynamic_shape = self.DynamicShapeParam({}, {}, {}, False)
def update_program_input_and_weight_with_attr(self, op_attr_list):
raise NotImplementedError
......@@ -96,6 +113,7 @@ class TrtLayerAutoScanTest(AutoScanTest):
config = paddle_infer.Config()
config.enable_use_gpu(100, 0)
if use_trt:
config.switch_ir_debug()
config.enable_tensorrt_engine(
max_batch_size=self.trt_param.max_batch_size,
workspace_size=self.trt_param.workspace_size,
......@@ -103,13 +121,22 @@ class TrtLayerAutoScanTest(AutoScanTest):
precision_mode=precision_mode,
use_static=self.trt_param.use_static,
use_calib_mode=self.trt_param.use_calib_mode)
if len(self.dynamic_shape.min_input_shape) != 0 and \
self.dynamic_shape.min_input_shape.keys() == self.dynamic_shape.max_input_shape.keys() and \
self.dynamic_shape.min_input_shape.keys() == self.dynamic_shape.opt_input_shape.keys():
config.set_trt_dynamic_shape_info(
self.dynamic_shape.min_input_shape,
self.dynamic_shape.max_input_shape,
self.dynamic_shape.opt_input_shape,
self.dynamic_shape.disable_trt_plugin_fp16)
return config
@abc.abstractmethod
def sample_predictor_configs(self):
logging.info('--------- gpu inference ---------')
yield self.create_program_config(use_trt=False)
logging.info('--------- trt inference ---------')
yield self.create_program_config(
use_trt=True, precision_mode=self.trt_param.precision)
if self.trt_param.precision == paddle_infer.PrecisionType.Float32:
yield self.create_program_config(
use_trt=True, precision_mode=paddle_infer.PrecisionType.Half)
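For reference, the TensorRT branch above amounts to the following standalone config setup; a sketch with illustrative values for a single input named "input_data":

    import paddle.inference as paddle_infer

    config = paddle_infer.Config()
    config.enable_use_gpu(100, 0)
    # Dump a .pdmodel after each IR pass so assert_op_size can inspect it.
    config.switch_ir_debug()
    config.enable_tensorrt_engine(
        max_batch_size=4,
        workspace_size=1 << 30,
        min_subgraph_size=0,
        precision_mode=paddle_infer.PrecisionType.Float32,
        use_static=False,
        use_calib_mode=False)
    # Set dynamic shape info only when min/max/opt cover the same inputs.
    config.set_trt_dynamic_shape_info(
        {"input_data": [1, 3, 32, 32]},  # min_input_shape
        {"input_data": [4, 3, 64, 64]},  # max_input_shape
        {"input_data": [1, 3, 64, 64]},  # opt_input_shape
        False)  # disable_trt_plugin_fp16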
......@@ -757,7 +757,7 @@ def load_inference_model(path_prefix, executor, **kwargs):
"params_filename cannot be None when path_prefix is None.")
load_dirname = ''
program_bytes = model_filename
params_filename = params_filename
params_bytes = params_filename
# load from file
else:
# check and norm path_prefix
......@@ -795,12 +795,12 @@ def load_inference_model(path_prefix, executor, **kwargs):
program_bytes = load_from_file(model_path)
load_dirname = os.path.dirname(params_path)
params_filename = os.path.basename(params_path)
# deserialize bytes to program
program = deserialize_program(program_bytes)
# load params data
params_path = os.path.join(load_dirname, params_filename)
params_bytes = load_from_file(params_path)
# deserialize bytes to program
program = deserialize_program(program_bytes)
# deserialize bytes to params
deserialize_persistables(program, params_bytes, executor)
......
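This reordering matters for the in-memory branch of load_inference_model (path_prefix=None), where the *_filename arguments carry serialized bytes rather than file paths. A usage sketch of that branch, as create_quant_model above uses it (model_bytes/params_bytes stand for bytes such as those from create_fake_model):

    import paddle

    paddle.enable_static()
    exe = paddle.static.Executor(paddle.CPUPlace())
    # With path_prefix=None, model_filename/params_filename are
    # interpreted as in-memory serialized bytes, not file names.
    program, feed_names, fetch_targets = paddle.static.load_inference_model(
        path_prefix=None,
        executor=exe,
        model_filename=model_bytes,
        params_filename=params_bytes)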