[Paddle-TRT] matmul_v2 support (#44918)

* Support matmul_v2 in PaddleTensorRT

[Paddle-TRT] matmul_v2 support (#44918)
* Support matmul_v2 in PaddleTensorRT
aee4f8ab · zhoutianzi666 · GitHub · da10fb3b · aee4f8ab · aee4f8ab
5 changed files
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -2107,6 +2107,7 @@ USE_TRT_CONVERTER(transpose2);
 USE_TRT_CONVERTER(flatten);
 USE_TRT_CONVERTER(flatten_contiguous_range);
 USE_TRT_CONVERTER(matmul);
+USE_TRT_CONVERTER(matmul_v2);
 USE_TRT_CONVERTER(conv2d);
 USE_TRT_CONVERTER(relu);
 USE_TRT_CONVERTER(exp);

--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -3,6 +3,7 @@ list(
  APPEND
  CONVERT_FILES
  matmul_op.cc
+  matmul_v2_op.cc
  conv2d_op.cc
  fc_op.cc
  pool2d_op.cc

--- a/paddle/fluid/inference/tensorrt/convert/matmul_v2_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/matmul_v2_op.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h"
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+/*
+ * MatMulV2OpConverter maps a matmul_v2 op onto TensorRT's
+ * IMatrixMultiplyLayer. The layer doesn't have weights; X and Y are both
+ * fetched from the engine as ITensors.
+ *
+ * If the operands differ in rank, the lower-rank one is reshaped (through a
+ * Shuffle layer driven by a shape tensor) so that its shape is prefixed with
+ * 1s until both ranks agree.
+ * NOTE(review): rank-1 operands get the same leading-1 padding as any other
+ * rank; confirm this agrees with matmul_v2's handling of vectors.
+ */
+class MatMulV2OpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope,
+                  bool test_mode) override {
+    VLOG(3) << "convert a fluid matmul_v2 op to tensorrt matmul layer ";
+    framework::OpDesc op_desc(op, nullptr);
+    // Operands, output name and transpose attributes of the op.
+    auto* x_tensor = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto* y_tensor = engine_->GetITensor(op_desc.Input("Y")[0]);
+    const int x_rank = x_tensor->getDimensions().nbDims;
+    const int y_rank = y_tensor->getDimensions().nbDims;
+    const bool trans_x = PADDLE_GET_CONST(bool, op_desc.GetAttr("trans_x"));
+    const bool trans_y = PADDLE_GET_CONST(bool, op_desc.GetAttr("trans_y"));
+    auto output_name = op_desc.Output("Out")[0];
+    // Translate a transpose flag into the TensorRT matrix operation.
+    auto to_matrix_op = [](bool transposed) {
+      if (transposed) {
+        return nvinfer1::MatrixOperation::kTRANSPOSE;
+      }
+      return nvinfer1::MatrixOperation::kNONE;
+    };
+    const nvinfer1::MatrixOperation x_operation = to_matrix_op(trans_x);
+    const nvinfer1::MatrixOperation y_operation = to_matrix_op(trans_y);
+    // Reshape `tensor` so that its runtime shape gains `pad` leading 1s.
+    // The target shape is built as a shape tensor (ones ++ shape(tensor)) and
+    // fed to a Shuffle layer as its second input.
+    auto prepend_unit_dims = [this](nvinfer1::ITensor* tensor,
+                                    int pad) -> nvinfer1::ITensor* {
+      auto* current_shape = Shape(tensor);
+      std::vector<int32_t> unit_dims(pad, 1);
+      auto* unit_dims_tensor = Add1DConstantLayer(unit_dims);
+      auto* padded_shape = Concat(
+          std::vector<nvinfer1::ITensor*>{unit_dims_tensor, current_shape});
+      auto* shuffle = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *tensor);
+      shuffle->setInput(1, *padded_shape);
+      return shuffle->getOutput(0);
+    };
+    // Only the lower-rank operand (if any) is padded; the other is used as is.
+    nvinfer1::ITensor* lhs = x_tensor;
+    nvinfer1::ITensor* rhs = y_tensor;
+    if (x_rank < y_rank) {
+      lhs = prepend_unit_dims(x_tensor, y_rank - x_rank);
+    } else if (x_rank > y_rank) {
+      rhs = prepend_unit_dims(y_tensor, x_rank - y_rank);
+    }
+    nvinfer1::ILayer* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, MatrixMultiply, *lhs, x_operation, *rhs, y_operation);
+    VLOG(3) << "Convert a fluid matmul_v2_op_float to TensorRT ";
+    RreplenishLayerAndOutput(layer, "matmul_v2_op", {output_name}, test_mode);
+  }
+};
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+REGISTER_TRT_OP_CONVERTER(matmul_v2, MatMulV2OpConverter);
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -326,6 +326,20 @@ struct SimpleOpTypeSetTeller : public Teller {
      }
    }
+    if (op_type == "matmul_v2") {
+      if (!with_dynamic_shape) {
+        return false;
+      }
+      auto* block = desc.Block();
+      if (block == nullptr) {
+        VLOG(3) << "The block desc is nullptr, we can't continue to analyze. "
+                   "Developers need to check whether block_desc is passed in "
+                   "the pass.";
+        return false;
+      }
+      return true;
+    }
    if (op_type == "matmul") {
      auto* block = desc.Block();
      if (block == nullptr) {
@@ -2081,6 +2095,7 @@ struct SimpleOpTypeSetTeller : public Teller {
  std::unordered_set<std::string> int8_teller_set{
      "mul",
      "matmul",
+      "matmul_v2",
      "conv2d",
      "conv2d_fusion",
      "pool2d",
@@ -2190,6 +2205,7 @@ struct SimpleOpTypeSetTeller : public Teller {
  std::unordered_set<std::string> teller_set{
      "mul",
      "matmul",
+      "matmul_v2",
      "conv2d",
      "conv2d_fusion",
      "pool2d",

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul_v2.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul_v2.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons
+from program_config import TensorConfig, ProgramConfig
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+from typing import Optional, List, Callable, Dict, Any, Set
+import unittest
+import os
+class TrtConvertMatmulTest_dynamic(TrtLayerAutoScanTest):
+    def sample_program_configs(self):
+        def generate_input(shape):
+            return np.random.random(shape).astype(np.float32)
+        for batch in [10, 11, 12, 13, 14, 15]:
+            for trans_x in [False]:
+                for trans_y in [False]:
+                    input1_shape = [batch, 64, 350, 75]
+                    input2_shape = [75, 25]
+                    dics = [{
+                        "trans_x": trans_x,
+                        "trans_y": trans_y,
+                    }]
+                    ops_config = [{
+                        "op_type": "matmul_v2",
+                        "op_inputs": {
+                            "X": ["input1_data"],
+                            "Y": ["input2_data"]
+                        },
+                        "op_outputs": {
+                            "Out": ["output_data"]
+                        },
+                        "op_attrs": dics[0]
+                    }]
+                    ops = self.generate_op_config(ops_config)
+                    program_config = ProgramConfig(
+                        ops=ops,
+                        weights={},
+                        inputs={
+                            "input1_data":
+                            TensorConfig(
+                                data_gen=partial(generate_input, input1_shape)),
+                            "input2_data":
+                            TensorConfig(
+                                data_gen=partial(generate_input, input2_shape))
+                        },
+                        outputs=["output_data"])
+                    yield program_config
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+        def generate_dynamic_shape(attrs):
+            self.dynamic_shape.min_input_shape = {
+                "input1_data": [10, 64, 350, 75],
+                "input2_data": [75, 25]
+            }
+            self.dynamic_shape.max_input_shape = {
+                "input1_data": [100, 64, 350, 75],
+                "input2_data": [75, 25]
+            }
+            self.dynamic_shape.opt_input_shape = {
+                "input1_data": [15, 64, 350, 75],
+                "input2_data": [75, 25]
+            }
+        attrs = [
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
+        ]
+        # The output has little diff between gpu and trt in CI-Windows-Inference
+        tol_fp32 = 1e-5
+        tol_half = 1e-5
+        if (os.name == 'nt'):
+            tol_fp32 = 1e-3
+            tol_half = 1e-3
+        # for dynamic_shape
+        generate_dynamic_shape(attrs)
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), (1, 3), tol_fp32
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), (1, 3), tol_half
+    def add_skip_trt_case(self):
+        pass
+    def test(self):
+        self.add_skip_trt_case()
+        self.run_test()
+class TrtConvertMatmulTest_dynamic2(TrtLayerAutoScanTest):
+    def sample_program_configs(self):
+        def generate_input(shape):
+            return np.random.random(shape).astype(np.float32)
+        for batch in [10, 11, 12, 13, 14, 15]:
+            for trans_x in [False]:
+                for trans_y in [False]:
+                    input1_shape = [60, 40]
+                    input2_shape = [batch, 40, 90]
+                    dics = [{
+                        "trans_x": trans_x,
+                        "trans_y": trans_y,
+                    }]
+                    ops_config = [{
+                        "op_type": "matmul_v2",
+                        "op_inputs": {
+                            "X": ["input1_data"],
+                            "Y": ["input2_data"]
+                        },
+                        "op_outputs": {
+                            "Out": ["output_data"]
+                        },
+                        "op_attrs": dics[0]
+                    }]
+                    ops = self.generate_op_config(ops_config)
+                    program_config = ProgramConfig(
+                        ops=ops,
+                        weights={},
+                        inputs={
+                            "input1_data":
+                            TensorConfig(
+                                data_gen=partial(generate_input, input1_shape)),
+                            "input2_data":
+                            TensorConfig(
+                                data_gen=partial(generate_input, input2_shape))
+                        },
+                        outputs=["output_data"])
+                    yield program_config
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+        def generate_dynamic_shape(attrs):
+            self.dynamic_shape.min_input_shape = {
+                "input1_data": [60, 40],
+                "input2_data": [10, 40, 90]
+            }
+            self.dynamic_shape.max_input_shape = {
+                "input1_data": [60, 40],
+                "input2_data": [20, 40, 90]
+            }
+            self.dynamic_shape.opt_input_shape = {
+                "input1_data": [60, 40],
+                "input2_data": [15, 40, 90]
+            }
+        attrs = [
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
+        ]
+        # The output has little diff between gpu and trt in CI-Windows-Inference
+        tol_fp32 = 1e-5
+        tol_half = 1e-5
+        if (os.name == 'nt'):
+            tol_fp32 = 1e-3
+            tol_half = 1e-3
+        # for dynamic_shape
+        generate_dynamic_shape(attrs)
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), (1, 3), tol_fp32
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), (1, 3), tol_half
+    def add_skip_trt_case(self):
+        pass
+    def test(self):
+        self.add_skip_trt_case()
+        self.run_test()
+if __name__ == "__main__":
+    unittest.main()