From 7987a9052fcd213eb632de82bf1092a8d0e038df Mon Sep 17 00:00:00 2001
From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com>
Date: Mon, 10 Oct 2022 15:13:51 +0800
Subject: [PATCH] [Paddle-TRT] support new quant format from slim (#46022)

---
 .../ir/delete_quant_dequant_linear_op_pass.cc |  10 +-
 .../inference/api/paddle_pass_builder.cc      |   4 +-
 .../tensorrt/convert/matmul_v2_op.cc          |  30 ++++-
 paddle/fluid/inference/tensorrt/op_teller.cc  |   1 +
 .../inference/test_trt_convert_matmul_v2.py   | 122 ++++++++++++++++++
 5 files changed, 153 insertions(+), 14 deletions(-)

diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc
index ee7a2a72233..e049d1e950a 100644
--- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc
+++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc
@@ -111,9 +111,6 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
     }
 */
     std::unordered_set<const Node*> nodes2rm = {};
-    int bit_length =
-        PADDLE_GET_CONST(int, quantize_linear_op->Op()->GetAttr("bit_length"));
-    int range = ((1 << (bit_length - 1)) - 1);
 
     // Get input scale from tensor
     const LoDTensor& input_scale_tensor =
@@ -124,7 +121,7 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
         platform::errors::InvalidArgument(
             "Input scale tensor's place should be CPU."));
     const float* input_scale_data = input_scale_tensor.data<float>();
-    float input_scale = input_scale_data[0] / range;
+    float input_scale = input_scale_data[0];
 
     int nums_any_ops = dequantize_linear_op_out->outputs.size();
     for (int i = 0; i < nums_any_ops; ++i) {
@@ -138,8 +135,9 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
       IR_NODE_LINK_TO(quantize_linear_op_x,
                       dequantize_linear_op_out->outputs[i]);
     }
-
-    nodes2rm.insert(quantize_linear_op_scale);
+    // Forbid removing the scale tensor when it is shared between ops
+    if (quantize_linear_op_scale->outputs.size() <= 1UL)
+      nodes2rm.insert(quantize_linear_op_scale);
     nodes2rm.insert(quantize_linear_op);
     nodes2rm.insert(quantize_linear_op_out);
     nodes2rm.insert(dequantize_linear_op);
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 222c90703a5..133c9b363a4 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -84,8 +84,7 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
 void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
 
 const std::vector<std::string> kTRTSubgraphPasses({
-      "identity_scale_op_clean_pass",             //
-      "adaptive_pool2d_convert_global_pass",      //
+      "adaptive_pool2d_convert_global_pass",      //
       "shuffle_channel_detect_pass",              //
       "quant_conv2d_dequant_fuse_pass",           //
       "delete_fill_constant_op_pass",             //
@@ -93,6 +92,7 @@ const std::vector<std::string> kTRTSubgraphPasses({
       "delete_quant_dequant_filter_op_pass",      //
       "delete_weight_dequant_linear_op_pass",     //
       "delete_quant_dequant_linear_op_pass",      //
+      "identity_scale_op_clean_pass",             //
       "add_support_int8_pass",                    //
       // "fc_fuse_pass",                          //
       "simplify_with_basic_ops_pass",             //
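For context on the scale change above (a sketch, not part of the patch; the values are illustrative): with the old slim output the scale tensor held the calibrated abs-max threshold, so the pass recovered the float step size by dividing by range = 2^(bit_length - 1) - 1. Assuming the new slim format stores the per-tensor step size directly, both the division and the bit_length attribute read become unnecessary:

    import numpy as np

    bit_length = 8
    range_ = (1 << (bit_length - 1)) - 1  # 127 for int8
    abs_max = 2.54                        # assumed calibrated threshold

    # Old format: the tensor stored abs_max; the pass divided by range_.
    input_scale_old = np.float32(abs_max) / range_

    # New format: the tensor stores the step size (abs_max / 127) itself,
    # so the pass reads input_scale_data[0] as-is.
    input_scale_new = np.float32(abs_max / range_)

    assert np.isclose(input_scale_old, input_scale_new)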
diff --git a/paddle/fluid/inference/tensorrt/convert/matmul_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/matmul_v2_op.cc
index c6f5a42a7da..e87b2844373 100644
--- a/paddle/fluid/inference/tensorrt/convert/matmul_v2_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/matmul_v2_op.cc
@@ -37,9 +37,9 @@ class MatMulV2OpConverter : public OpConverter {
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope,
                   bool test_mode) override {
-    VLOG(3) << "convert a fluid matmul_v2 op to tensorrt matmul layer ";
+    VLOG(3) << "convert a matmul_v2 op to a tensorrt IMatrixMultiplyLayer ";
     framework::OpDesc op_desc(op, nullptr);
-    nvinfer1::ILayer* layer = nullptr;
+    nvinfer1::IMatrixMultiplyLayer* layer = nullptr;
 
     // Declare inputs
     auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
@@ -61,8 +61,9 @@ class MatMulV2OpConverter : public OpConverter {
                               : nvinfer1::MatrixOperation::kNONE;
 
     int one_num = 0;
+    bool all_matrix = dims_x.nbDims >= 2 && dims_y.nbDims >= 2;
     nvinfer1::ITensor* new_shape_tensor = nullptr;
-    if (dims_x.nbDims < dims_y.nbDims) {
+    if (dims_x.nbDims < dims_y.nbDims && all_matrix) {
       one_num = dims_y.nbDims - dims_x.nbDims;
       new_shape_tensor = Shape(input1);
       std::vector<int32_t> one_vec(one_num, 1);
@@ -80,7 +81,7 @@ class MatMulV2OpConverter : public OpConverter {
                                      *input2,
                                      matrix_operation_Y);
 
-    } else if (dims_x.nbDims > dims_y.nbDims) {
+    } else if (dims_x.nbDims > dims_y.nbDims && all_matrix) {
       one_num = dims_x.nbDims - dims_y.nbDims;
       new_shape_tensor = Shape(input2);
       std::vector<int32_t> one_vec(one_num, 1);
@@ -105,9 +106,26 @@ class MatMulV2OpConverter : public OpConverter {
                                      *input2,
                                      matrix_operation_Y);
     }
-    VLOG(3) << "Convert a fluid matmul_v2_op_float to TensorRT ";
+    if (dims_x.nbDims == 1)
+      layer->setOperation(0, nvinfer1::MatrixOperation::kVECTOR);
+    if (dims_y.nbDims == 1)
+      layer->setOperation(1, nvinfer1::MatrixOperation::kVECTOR);
+    nvinfer1::ILayer* final_layer = static_cast<nvinfer1::ILayer*>(layer);
+    // When vec * vec, trt produces a scalar, so to be consistent with
+    // paddle, we need to add a reshape.
+    if (dims_x.nbDims == 1 && dims_y.nbDims == 1) {
+      auto reshape_layer =
+          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0));
+      nvinfer1::Dims reshape_dim;
+      reshape_dim.nbDims = 1;
+      reshape_dim.d[0] = 1;
+      reshape_layer->setReshapeDimensions(reshape_dim);
+      final_layer = static_cast<nvinfer1::ILayer*>(reshape_layer);
+    }
+    VLOG(3) << "Convert a matmul_v2_op to TensorRT ";
 
-    RreplenishLayerAndOutput(layer, "matmul_v2_op", {output_name}, test_mode);
+    RreplenishLayerAndOutput(
+        final_layer, "matmul_v2_op", {output_name}, test_mode);
   }
 };
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 5d056bb0da8..18db0944f26 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -46,6 +46,7 @@ struct SimpleOpTypeSetTeller : public Teller {
 #if IS_TRT_VERSION_GE(7000)
     teller_set.insert("tile");
     teller_set.insert("flatten_contiguous_range");
+    int8_teller_set.insert("flatten_contiguous_range");
     teller_set.insert("rnn");
     int8_teller_set.insert("rnn");
     teller_set.insert("fill_constant_batch_size_like");
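For context on the converter change above (again a sketch, not part of the patch): with both operands marked kVECTOR, TensorRT's IMatrixMultiplyLayer emits a rank-0 scalar, while paddle.matmul on two 1-D tensors returns a shape-[1] tensor; the appended Shuffle layer reshapes the scalar to [1] to match. numpy shows the same rank collapse:

    import numpy as np

    x = np.random.rand(50).astype(np.float32)  # 1-D input X
    y = np.random.rand(50).astype(np.float32)  # 1-D input Y

    out = np.matmul(x, y)  # dot product: rank-0 scalar, like TRT
    print(np.shape(out))   # ()

    # paddle expects shape [1] for vec * vec, hence the Shuffle reshape:
    print(np.reshape(out, [1]).shape)  # (1,)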
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul_v2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul_v2.py
index b5d94ebfe3c..d895872db4b 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul_v2.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul_v2.py
@@ -193,5 +193,127 @@ class TrtConvertMatmulTest_dynamic2(TrtLayerAutoScanTest):
         self.run_test()
 
 
+class TrtConvertMatmulTest_dynamic3(TrtLayerAutoScanTest):
+
+    def sample_program_configs(self):
+
+        def generate_input(shape):
+            return np.random.random(shape).astype(np.float32)
+
+        # case0: mat * vec
+        # case1: vec * mat
+        # case2: vec * vec
+        for case in [0, 1, 2]:
+            for batch in range(20, 23):
+                for trans_x in [False, True]:
+                    for trans_y in [False, True]:
+                        self.case = case
+                        input1_shape = []
+                        input2_shape = []
+                        if case == 0:
+                            input1_shape = [batch, 50]
+                            input2_shape = [50]
+                        elif case == 1:
+                            input1_shape = [50]
+                            input2_shape = [50, batch]
+                        elif case == 2:
+                            input1_shape = [50]
+                            input2_shape = [50]
+                        if (case == 0 or case == 1):
+                            dics = [{
+                                "trans_x": False,
+                                "trans_y": False,
+                            }]
+                        elif (case == 2):
+                            dics = [{
+                                "trans_x": trans_x,
+                                "trans_y": trans_y,
+                            }]
+                        ops_config = [{
+                            "op_type": "matmul_v2",
+                            "op_inputs": {
+                                "X": ["input1_data"],
+                                "Y": ["input2_data"]
+                            },
+                            "op_outputs": {
+                                "Out": ["output_data"]
+                            },
+                            "op_attrs": dics[0]
+                        }]
+                        ops = self.generate_op_config(ops_config)
+
+                        program_config = ProgramConfig(
+                            ops=ops,
+                            weights={},
+                            inputs={
+                                "input1_data":
+                                TensorConfig(data_gen=partial(
+                                    generate_input, input1_shape)),
+                                "input2_data":
+                                TensorConfig(data_gen=partial(
+                                    generate_input, input2_shape))
+                            },
+                            outputs=["output_data"])
+
+                        yield program_config
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+
+        def generate_dynamic_shape():
+            if (self.case == 0):
+                self.dynamic_shape.min_input_shape = {
+                    "input1_data": [20, 50],
+                    "input2_data": [50]
+                }
+                self.dynamic_shape.max_input_shape = {
+                    "input1_data": [30, 50],
+                    "input2_data": [50]
+                }
+                self.dynamic_shape.opt_input_shape = {
+                    "input1_data": [25, 50],
+                    "input2_data": [50]
+                }
+            elif (self.case == 1):
+                self.dynamic_shape.min_input_shape = {
+                    "input2_data": [50, 20],
+                    "input1_data": [50]
+                }
+                self.dynamic_shape.max_input_shape = {
+                    "input2_data": [50, 30],
+                    "input1_data": [50]
+                }
+                self.dynamic_shape.opt_input_shape = {
+                    "input2_data": [50, 25],
+                    "input1_data": [50]
+                }
+            elif (self.case == 2):
+                self.dynamic_shape.min_input_shape = {
+                    "input2_data": [30],
+                    "input1_data": [50]
+                }
+                self.dynamic_shape.max_input_shape = {
+                    "input2_data": [50],
+                    "input1_data": [50]
+                }
+                self.dynamic_shape.opt_input_shape = {
+                    "input2_data": [50],
+                    "input1_data": [50]
+                }
+
+        generate_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), (1, 3), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), (1, 3), 1e-5
+
+    def add_skip_trt_case(self):
+        pass
+
+    def test(self):
+        self.add_skip_trt_case()
+        self.run_test()
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab
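As a quick cross-check of the three new test cases outside the harness (a sketch with numpy stand-ins, not part of the patch), the shape combinations and expected output ranks are:

    import numpy as np

    batch = 20
    cases = {
        0: ((batch, 50), (50,)),  # mat * vec -> (batch,)
        1: ((50,), (50, batch)),  # vec * mat -> (batch,)
        2: ((50,), (50,)),        # vec * vec -> rank 0; paddle yields [1]
    }
    for case, (shape_x, shape_y) in cases.items():
        x = np.random.random(shape_x).astype(np.float32)
        y = np.random.random(shape_y).astype(np.float32)
        print(case, np.shape(np.matmul(x, y)))
    # Expected: 0 (20,)   1 (20,)   2 ()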