[cherry pick] add cast trt convert (#44837)

* add cast trt convert * skip cast trt convert when input dtype is bool * code format * fix bug * update unittest * fix bug

[cherry pick] add cast trt convert (#44837)
* add cast trt convert * skip cast trt convert when input dtype is bool * code format * fix bug * update unittest * fix bug
7cdce09b · ccrrong · GitHub · 627e5bd5 · 7cdce09b · 7cdce09b
5 changed files
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1793,6 +1793,7 @@ USE_TRT_CONVERTER(multiclass_nms3);
 USE_TRT_CONVERTER(nearest_interp);
 USE_TRT_CONVERTER(nearest_interp_v2);
 USE_TRT_CONVERTER(bilinear_interp_v2);
+USE_TRT_CONVERTER(cast);
 USE_TRT_CONVERTER(reshape);
 USE_TRT_CONVERTER(reduce_sum);
 USE_TRT_CONVERTER(gather_nd);

--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
 # Add TRT tests
-nv_library(tensorrt_converter
-           SRCS matmul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
-                batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc group_norm_op.cc
-                pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc
-                shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc flatten_contiguous_range_op.cc
-                emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc
-                gather_op.cc
-		bilinear_interp_v2_op.cc
-                anchor_generator_op.cc
-                yolo_box_op.cc
-                roi_align_op.cc
-                affine_channel_op.cc
-                multiclass_nms_op.cc
-                multiclass_nms3_op.cc
-                nearest_interp_op.cc
-                reshape_op.cc
-                reduce_op.cc
-                gather_nd_op.cc
-                tile_op.cc
-                conv3d_op.cc
-                mish_op.cc
-                nearest_interp_v2_op.cc
-                pool3d_op.cc
-                deformable_conv_op.cc
-                preln_emb_eltwise_layernorm.cc
-		strided_slice_op.cc
-                preln_skip_layernorm.cc
-		roll_op.cc
-           DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
+nv_library(
+  tensorrt_converter
+  SRCS matmul_op.cc
+       conv2d_op.cc
+       fc_op.cc
+       pool2d_op.cc
+       elementwise_op.cc
+       batch_norm_op.cc
+       activation_op.cc
+       softmax_op.cc
+       concat_op.cc
+       dropout_op.cc
+       group_norm_op.cc
+       pad_op.cc
+       split_op.cc
+       prelu_op.cc
+       leaky_relu_op.cc
+       gelu_op.cc
+       layer_norm_op.cc
+       multihead_matmul_op.cc
+       shuffle_channel_op.cc
+       swish_op.cc
+       instance_norm_op.cc
+       stack_op.cc
+       transpose_op.cc
+       flatten_op.cc
+       flatten_contiguous_range_op.cc
+       emb_eltwise_layernorm.cc
+       skip_layernorm.cc
+       scale_op.cc
+       slice_op.cc
+       hard_sigmoid_op.cc
+       hard_swish_op.cc
+       clip_op.cc
+       gather_op.cc
+       bilinear_interp_v2_op.cc
+       cast_op.cc
+       anchor_generator_op.cc
+       yolo_box_op.cc
+       roi_align_op.cc
+       affine_channel_op.cc
+       multiclass_nms_op.cc
+       multiclass_nms3_op.cc
+       nearest_interp_op.cc
+       reshape_op.cc
+       reduce_op.cc
+       gather_nd_op.cc
+       tile_op.cc
+       conv3d_op.cc
+       mish_op.cc
+       nearest_interp_v2_op.cc
+       pool3d_op.cc
+       deformable_conv_op.cc
+       preln_emb_eltwise_layernorm.cc
+       strided_slice_op.cc
+       preln_skip_layernorm.cc
+       roll_op.cc
+  DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto
+       op_registry)

-nv_test(test_op_converter SRCS test_op_converter.cc DEPS
-  paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter)
+nv_test(
+  test_op_converter
+  SRCS test_op_converter.cc
+  DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_engine
+       tensorrt_converter)
--- a/paddle/fluid/inference/tensorrt/convert/cast_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/cast_op.cc
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Converter that lowers a fluid `cast` op onto a TensorRT Identity layer and
+// overrides the layer's output type according to the `out_dtype` attribute.
+class CastOpConverter : public OpConverter {
+ public:
+  // Reads input "X" and attribute "out_dtype" from `op`, adds an Identity
+  // layer to the engine, and registers its output under the op's "Out" name.
+  // `scope` is not consulted by this converter.
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope,
+                  bool test_mode) override {
+    VLOG(3) << "convert a cast op to tensorrt";
+    framework::OpDesc cast_desc(op, nullptr);
+
+    auto* src_tensor = engine_->GetITensor(cast_desc.Input("X")[0]);
+    auto target_dtype = BOOST_GET_CONST(int, cast_desc.GetAttr("out_dtype"));
+
+    auto* identity = TRT_ENGINE_ADD_LAYER(engine_, Identity, *src_tensor);
+
+    // Map the fluid dtype code onto the matching TensorRT data type.
+    if (target_dtype == 2) {  // INT32 = 2
+      identity->getOutput(0)->setType(nvinfer1::DataType::kINT32);
+    } else if (target_dtype == 4) {  // FP16 = 4
+      identity->getOutput(0)->setType(nvinfer1::DataType::kHALF);
+    } else if (target_dtype == 5) {  // FP32 = 5
+      identity->getOutput(0)->setType(nvinfer1::DataType::kFLOAT);
+    } else {
+      // Any other code is only reported; the layer keeps its default type.
+      LOG(ERROR) << "Unable to convert a fluid data type(" << target_dtype
+                 << ") to a nvinfer DataType";
+    }
+
+    RreplenishLayerAndOutput(
+        identity, "cast", {cast_desc.Output("Out")[0]}, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(cast, CastOpConverter);
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -49,7 +49,8 @@ struct SimpleOpTypeSetTeller : public Teller {
 #endif
  }

-  bool operator()(const std::string& op_type, const framework::OpDesc& desc,
+  bool operator()(const std::string& op_type,
+                  const framework::OpDesc& desc,
                  bool use_no_calib_int8) override {
    if (use_no_calib_int8) {
      return int8_teller_set.count(op_type);
@@ -111,6 +112,7 @@ struct SimpleOpTypeSetTeller : public Teller {
      "mish",
      "nearest_interp_v2",
      "bilinear_interp_v2",
+      "cast",
      "pool3d",
      "deformable_conv",
      "relu6",
@@ -175,6 +177,7 @@ struct SimpleOpTypeSetTeller : public Teller {
      "mish",
      "bilinear_interp_v2",
      "nearest_interp_v2",
+      "cast",
      "pool3d",
      "deformable_conv",
      "relu6",
@@ -191,7 +194,8 @@ struct SimpleOpTypeSetTeller : public Teller {
      "multiclass_nms3"};
 };

-bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
+bool OpTeller::Tell(const framework::ir::Node* node,
+                    bool use_no_calib_int8,
                    bool with_dynamic_shape) {
  const std::string op_type = node->Op()->Type();
  const framework::OpDesc desc = *node->Op();
@@ -706,8 +710,8 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
    }

    if (op_type == "nearest_interp") {
-      std::vector<std::string> attrs{"interp_method", "align_corners", "scale",
-                                     "out_h", "out_w"};
+      std::vector<std::string> attrs{
+          "interp_method", "align_corners", "scale", "out_h", "out_w"};
      for (auto const attr : attrs) {
        if (!desc.HasAttr(attr)) return false;
      }
@@ -747,9 +751,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
    }

    if (op_type == "nearest_interp_v2") {
-      std::vector<std::string> attrs{"data_layout",   "interp_method",
-                                     "align_corners", "scale",
-                                     "out_h",         "out_w"};
+      std::vector<std::string> attrs{"data_layout",
+                                     "interp_method",
+                                     "align_corners",
+                                     "scale",
+                                     "out_h",
+                                     "out_w"};
      for (auto const attr : attrs) {
        if (!desc.HasAttr(attr)) return false;
      }
@@ -775,9 +782,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
    }

    if (op_type == "bilinear_interp_v2") {
-      std::vector<std::string> attrs{"data_layout",   "interp_method",
-                                     "align_corners", "scale",
-                                     "out_h",         "out_w"};
+      std::vector<std::string> attrs{"data_layout",
+                                     "interp_method",
+                                     "align_corners",
+                                     "scale",
+                                     "out_h",
+                                     "out_w"};
      for (auto const attr : attrs) {
        if (!desc.HasAttr(attr)) {
          VLOG(3) << "The op_type " << op_type << " doesn't have the attr "
@@ -882,8 +892,8 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
    }

    if (op_type == "batch_norm") {
-      const std::vector<std::string> bn_inputs = {"X", "Bias", "Mean", "Scale",
-                                                  "Variance"};
+      const std::vector<std::string> bn_inputs = {
+          "X", "Bias", "Mean", "Scale", "Variance"};
      for (unsigned int i = 0; i < bn_inputs.size(); i++) {
        if (desc.Input(bn_inputs[i]).size() != 1) {
          VLOG(3) << "Invalid " << bn_inputs[i]
@@ -1458,8 +1468,10 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
                   "the roi_align will change the batch size.";
        return false;
      }
-      std::vector<std::string> attrs{"pooled_height", "pooled_width",
-                                     "spatial_scale", "sampling_ratio",
+      std::vector<std::string> attrs{"pooled_height",
+                                     "pooled_width",
+                                     "spatial_scale",
+                                     "sampling_ratio",
                                     "aligned"};
      for (auto const attr : attrs) {
        if (!desc.HasAttr(attr)) return false;
@@ -1641,10 +1653,10 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
          auto x_var_name = desc.Input("X")[0];
          auto* x_var_desc = block->FindVar(x_var_name);
          const auto x_shape = x_var_desc->GetShape();
-          int input_num = std::accumulate(x_shape.begin() + 1, x_shape.end(), 1,
-                                          std::multiplies<int>());
-          int shape_num = std::accumulate(shape.begin() + 1, shape.end(), 1,
-                                          std::multiplies<int>());
+          int input_num = std::accumulate(
+              x_shape.begin() + 1, x_shape.end(), 1, std::multiplies<int>());
+          int shape_num = std::accumulate(
+              shape.begin() + 1, shape.end(), 1, std::multiplies<int>());
          if (input_num == shape_num) {
            return true;
          }
@@ -1751,6 +1763,36 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
    }
 #endif

+    if (op_type == "cast") {
+// trt 6015 result in Windows ppyolo_mbv3 TRT fp32 diff
+#if !IS_TRT_VERSION_GE(7000)
+      return false;
+#endif
+      if (!(desc.HasAttr("in_dtype") && desc.HasAttr("out_dtype"))) {
+        VLOG(3) << "the " << op_type
+                << " does not have attr (in_dtype or out_dtype)";
+        return false;
+      }
+      // dtype codes: 0 = BOOL, 2 = INT32, 4 = FP16, 5 = FP32.
+      int in_dtype = BOOST_GET_CONST(int, desc.GetAttr("in_dtype"));
+      int out_dtype = BOOST_GET_CONST(int, desc.GetAttr("out_dtype"));
+      if ((in_dtype == 4 || in_dtype == 5) && out_dtype == 4) {
+        VLOG(3) << "unsupport data type conversion";
+        return false;
+      }
+      if (in_dtype == 0) {
+        VLOG(3) << "do not support input data type as bool now";
+        return false;
+      }
+      // Bool input is rejected above, so it must not be advertised below.
+      if (!((in_dtype == 5 || in_dtype == 4 || in_dtype == 2) &&
+            (out_dtype == 5 || out_dtype == 4 || out_dtype == 2))) {
+        VLOG(3) << "only valid conversions are: "
+                   "(kFLOAT | kHALF | kINT32) -> (kFLOAT | kHALF | kINT32)";
+        return false;
+      }
+    }
+
    if (op_type == "conv3d" || op_type == "conv3d_transpose") {
      if (desc.HasAttr("padding_algorithm")) {
        std::string padding_algorithm =

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cast.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cast.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons
+from program_config import TensorConfig, ProgramConfig
+import unittest
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+from typing import Optional, List, Callable, Dict, Any, Set
+
+
+class TrtConvertCastTest(TrtLayerAutoScanTest):
+    """Auto-scan test for the TensorRT converter of the `cast` op.
+
+    Dtype codes used in the op attributes: 0 = bool, 2 = int32, 4 = float16,
+    5 = float32. Code 6 is sampled as well, but it is never accepted by
+    `is_program_valid`.
+    """
+
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        """Return True only for dtype pairs this test treats as convertible.
+
+        A program is rejected when TensorRT is older than 7.0, when the input
+        is bool, when a float16/float32 input is cast to float16, or when
+        either dtype is outside {int32, float16, float32}.
+        """
+        # presumably (major, minor, patch) -- confirm against paddle_infer.
+        ver = paddle_infer.get_trt_compile_version()
+        # Use the patch component for the last term, not the major one again.
+        if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000:
+            return False
+
+        cast_attrs = program_config.ops[0].attrs
+        in_dtype = cast_attrs['in_dtype']
+        out_dtype = cast_attrs['out_dtype']
+        if in_dtype == 0:
+            return False
+        if in_dtype in [4, 5] and out_dtype == 4:
+            return False
+        if in_dtype not in [2, 4, 5] or out_dtype not in [2, 4, 5]:
+            return False
+        return True
+
+    def sample_program_configs(self):
+        """Yield one single-op `cast` program per (in_dtype, out_dtype) pair."""
+
+        def generate_input(dtype_code):
+            # All-ones tensor of shape [1, 3, 64, 64] in the requested dtype.
+            if dtype_code == 0:
+                # `np.bool` was removed in NumPy 1.24; `np.bool_` is the
+                # portable spelling of the boolean scalar type.
+                return np.ones([1, 3, 64, 64]).astype(np.bool_)
+            elif dtype_code == 2:
+                return np.ones([1, 3, 64, 64]).astype(np.int32)
+            elif dtype_code == 4:
+                return np.ones([1, 3, 64, 64]).astype(np.float16)
+            else:
+                # Every other code (5 and 6 here) gets float32 data.
+                return np.ones([1, 3, 64, 64]).astype(np.float32)
+
+        for in_dtype in [0, 2, 4, 5, 6]:
+            for out_dtype in [0, 2, 4, 5, 6]:
+                cast_attrs = {"in_dtype": in_dtype, "out_dtype": out_dtype}
+
+                ops_config = [{
+                    "op_type": "cast",
+                    "op_inputs": {
+                        "X": ["input_data"]
+                    },
+                    "op_outputs": {
+                        "Out": ["cast_output_data"]
+                    },
+                    "op_attrs": cast_attrs
+                }]
+                ops = self.generate_op_config(ops_config)
+
+                program_config = ProgramConfig(
+                    ops=ops,
+                    weights={},
+                    inputs={
+                        "input_data":
+                        TensorConfig(data_gen=partial(generate_input, in_dtype))
+                    },
+                    outputs=["cast_output_data"])
+
+                yield program_config
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+        """Yield (config, (trt_engine_num, paddle_op_num), tolerance) tuples.
+
+        Four configurations are produced: static and dynamic shape, each in
+        Float32 (tolerance 1e-5) and Half (tolerance 1e-2) precision.
+        """
+
+        def generate_dynamic_shape(attrs):
+            self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 64, 64]}
+            self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
+            self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]}
+
+        def clear_dynamic_shape():
+            self.dynamic_shape.min_input_shape = {}
+            self.dynamic_shape.max_input_shape = {}
+            self.dynamic_shape.opt_input_shape = {}
+
+        def generate_trt_nodes_num(attrs, dynamic_shape):
+            # The expected node counts do not depend on attrs or shape mode.
+            return 1, 2
+
+        attrs = [op.attrs for op in program_config.ops]
+
+        # for static_shape
+        clear_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False), 1e-2
+
+        # for dynamic_shape
+        generate_dynamic_shape(attrs)
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-2
+
+    def test(self):
+        self.run_test()
+
+
+if __name__ == "__main__":
+    unittest.main()