[Paddle-TRT] support new quant format from slim (#46022)

7987a905 · zhoutianzi666 · GitHub · 6e4cba14 · 7987a905 · 7987a905
5 changed files
--- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc
+++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc
@@ -111,9 +111,6 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
    }
    */
    std::unordered_set<const Node*> nodes2rm = {};
-    int bit_length =
-        PADDLE_GET_CONST(int, quantize_linear_op->Op()->GetAttr("bit_length"));
-    int range = ((1 << (bit_length - 1)) - 1);

    // Get input scale from tensor
    const LoDTensor& input_scale_tensor =
@@ -124,7 +121,7 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
        platform::errors::InvalidArgument(
            "Input scale tensor's place should be CPU."));
    const float* input_scale_data = input_scale_tensor.data<float>();
-    float input_scale = input_scale_data[0] / range;
+    float input_scale = input_scale_data[0];

    int nums_any_ops = dequantize_linear_op_out->outputs.size();
    for (int i = 0; i < nums_any_ops; ++i) {
@@ -138,7 +135,8 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
      IR_NODE_LINK_TO(quantize_linear_op_x,
                      dequantize_linear_op_out->outputs[i]);
    }
-
+    // Only remove the scale tensor when it is not shared with other ops
+    if (quantize_linear_op_scale->outputs.size() <= 1UL)
      nodes2rm.insert(quantize_linear_op_scale);
    nodes2rm.insert(quantize_linear_op);
    nodes2rm.insert(quantize_linear_op_out);

--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -84,7 +84,6 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
 void PaddlePassBuilder::ClearPasses() { passes_.clear(); }

 const std::vector<std::string> kTRTSubgraphPasses({
-  "identity_scale_op_clean_pass",              //
  "adaptive_pool2d_convert_global_pass",       //
      "shuffle_channel_detect_pass",           //
      "quant_conv2d_dequant_fuse_pass",        //
@@ -93,6 +92,7 @@ const std::vector<std::string> kTRTSubgraphPasses({
      "delete_quant_dequant_filter_op_pass",   //
      "delete_weight_dequant_linear_op_pass",  //
      "delete_quant_dequant_linear_op_pass",   //
+      "identity_scale_op_clean_pass",          //
      "add_support_int8_pass",                 //
      // "fc_fuse_pass",                        //
      "simplify_with_basic_ops_pass",  //

--- a/paddle/fluid/inference/tensorrt/convert/matmul_v2_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/matmul_v2_op.cc
@@ -37,9 +37,9 @@ class MatMulV2OpConverter : public OpConverter {
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope,
                  bool test_mode) override {
-    VLOG(3) << "convert a fluid matmul_v2 op to tensorrt matmul layer ";
+    VLOG(3) << "convert a matmul_v2 op to tensorrt IMatrixMultiplyLayer layer ";
    framework::OpDesc op_desc(op, nullptr);
-    nvinfer1::ILayer* layer = nullptr;
+    nvinfer1::IMatrixMultiplyLayer* layer = nullptr;

    // Declare inputs
    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
@@ -61,8 +61,9 @@ class MatMulV2OpConverter : public OpConverter {
                    : nvinfer1::MatrixOperation::kNONE;

    int one_num = 0;
+    bool all_matrix = dims_x.nbDims >= 2 && dims_y.nbDims >= 2;
    nvinfer1::ITensor* new_shape_tensor = nullptr;
-    if (dims_x.nbDims < dims_y.nbDims) {
+    if (dims_x.nbDims < dims_y.nbDims && all_matrix) {
      one_num = dims_y.nbDims - dims_x.nbDims;
      new_shape_tensor = Shape(input1);
      std::vector<int32_t> one_vec(one_num, 1);
@@ -80,7 +81,7 @@ class MatMulV2OpConverter : public OpConverter {
                                   *input2,
                                   matrix_operation_Y);

-    } else if (dims_x.nbDims > dims_y.nbDims) {
+    } else if (dims_x.nbDims > dims_y.nbDims && all_matrix) {
      one_num = dims_x.nbDims - dims_y.nbDims;
      new_shape_tensor = Shape(input2);
      std::vector<int32_t> one_vec(one_num, 1);
@@ -105,9 +106,26 @@ class MatMulV2OpConverter : public OpConverter {
                                   *input2,
                                   matrix_operation_Y);
    }
-    VLOG(3) << "Convert a fluid matmul_v2_op_float to TensorRT ";
+    if (dims_x.nbDims == 1)
+      layer->setOperation(0, nvinfer1::MatrixOperation::kVECTOR);
+    if (dims_y.nbDims == 1)
+      layer->setOperation(1, nvinfer1::MatrixOperation::kVECTOR);
+    nvinfer1::ILayer* final_layer = static_cast<nvinfer1::ILayer*>(layer);
+    // For vec * vec, TRT produces a scalar; to stay consistent with Paddle,
+    // we need to add a reshape to shape [1].
+    if (dims_x.nbDims == 1 && dims_y.nbDims == 1) {
+      auto reshape_layer =
+          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0));
+      nvinfer1::Dims reshape_dim;
+      reshape_dim.nbDims = 1;
+      reshape_dim.d[0] = 1;
+      reshape_layer->setReshapeDimensions(reshape_dim);
+      final_layer = static_cast<nvinfer1::ILayer*>(reshape_layer);
+    }
+    VLOG(3) << "Convert a matmul_v2_op to TensorRT ";

-    RreplenishLayerAndOutput(layer, "matmul_v2_op", {output_name}, test_mode);
+    RreplenishLayerAndOutput(
+        final_layer, "matmul_v2_op", {output_name}, test_mode);
  }
 };


--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -46,6 +46,7 @@ struct SimpleOpTypeSetTeller : public Teller {
 #if IS_TRT_VERSION_GE(7000)
    teller_set.insert("tile");
    teller_set.insert("flatten_contiguous_range");
+    int8_teller_set.insert("flatten_contiguous_range");
    teller_set.insert("rnn");
    int8_teller_set.insert("rnn");
    teller_set.insert("fill_constant_batch_size_like");

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul_v2.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul_v2.py
@@ -193,5 +193,127 @@ class TrtConvertMatmulTest_dynamic2(TrtLayerAutoScanTest):
        self.run_test()


+class TrtConvertMatmulTest_dynamic3(TrtLayerAutoScanTest):
+
+    def sample_program_configs(self):
+
+        def generate_input(shape):
+            return np.random.random(shape).astype(np.float32)
+
+        # case0: mat * vec
+        # case1: vec * mat
+        # case2: vec * vec
+        for case in [0, 1, 2]:
+            for batch in range(20, 23):
+                for trans_x in [False, True]:
+                    for trans_y in [False, True]:
+                        self.case = case
+                        input1_shape = []
+                        input2_shape = []
+                        if case == 0:
+                            input1_shape = [batch, 50]
+                            input2_shape = [50]
+                        elif case == 1:
+                            input1_shape = [50]
+                            input2_shape = [50, batch]
+                        elif case == 2:
+                            input1_shape = [50]
+                            input2_shape = [50]
+                        if (case == 0 or case == 1):
+                            dics = [{
+                                "trans_x": False,
+                                "trans_y": False,
+                            }]
+                        elif (case == 2):
+                            dics = [{
+                                "trans_x": trans_x,
+                                "trans_y": trans_y,
+                            }]
+                        ops_config = [{
+                            "op_type": "matmul_v2",
+                            "op_inputs": {
+                                "X": ["input1_data"],
+                                "Y": ["input2_data"]
+                            },
+                            "op_outputs": {
+                                "Out": ["output_data"]
+                            },
+                            "op_attrs": dics[0]
+                        }]
+                        ops = self.generate_op_config(ops_config)
+
+                        program_config = ProgramConfig(
+                            ops=ops,
+                            weights={},
+                            inputs={
+                                "input1_data":
+                                TensorConfig(data_gen=partial(
+                                    generate_input, input1_shape)),
+                                "input2_data":
+                                TensorConfig(data_gen=partial(
+                                    generate_input, input2_shape))
+                            },
+                            outputs=["output_data"])
+
+                        yield program_config
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+
+        def generate_dynamic_shape():
+            if (self.case == 0):
+                self.dynamic_shape.min_input_shape = {
+                    "input1_data": [20, 50],
+                    "input2_data": [50]
+                }
+                self.dynamic_shape.max_input_shape = {
+                    "input1_data": [30, 50],
+                    "input2_data": [50]
+                }
+                self.dynamic_shape.opt_input_shape = {
+                    "input1_data": [25, 50],
+                    "input2_data": [50]
+                }
+            elif (self.case == 1):
+                self.dynamic_shape.min_input_shape = {
+                    "input2_data": [50, 20],
+                    "input1_data": [50]
+                }
+                self.dynamic_shape.max_input_shape = {
+                    "input2_data": [50, 30],
+                    "input1_data": [50]
+                }
+                self.dynamic_shape.opt_input_shape = {
+                    "input2_data": [50, 25],
+                    "input1_data": [50]
+                }
+            elif (self.case == 2):
+                self.dynamic_shape.min_input_shape = {
+                    "input2_data": [30],
+                    "input1_data": [50]
+                }
+                self.dynamic_shape.max_input_shape = {
+                    "input2_data": [50],
+                    "input1_data": [50]
+                }
+                self.dynamic_shape.opt_input_shape = {
+                    "input2_data": [50],
+                    "input1_data": [50]
+                }
+
+        generate_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), (1, 3), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), (1, 3), 1e-5
+
+    def add_skip_trt_case(self):
+        pass
+
+    def test(self):
+        self.add_skip_trt_case()
+        self.run_test()
+
+
 if __name__ == "__main__":
    unittest.main()