[inference][trt] Upgrade expand_v2, cast, and nearest_interp converters for Stable Diffusion (#48998)

* update nearest_interp, expand_v2, cast for stable diffusion
* update nearest_interp, expand_v2, cast for stable diffusion
* correct shape rank
* Update expand_v2_op.cc

[inference][trt] Upgrade expand_v2, cast, and nearest_interp converters for Stable Diffusion (#48998)
* update nearest_interp, expand_v2, cast for stable diffusion
* update nearest_interp, expand_v2, cast for stable diffusion
* correct shape rank
* Update expand_v2_op.cc
5defefd6 · Zhang Jun · GitHub · 351d37d9 · 5defefd6 · 5defefd6
6 changed files
--- a/paddle/fluid/inference/tensorrt/convert/expand_v2_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/expand_v2_op.cc
@@ -33,21 +33,41 @@ class ExpandV2OpConverter : public OpConverter {
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope,
                  bool test_mode) override {
+    VLOG(3) << "convert a paddle expand_v2 op to trt expand layer.";
    framework::OpDesc op_desc(op, nullptr);
    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto inputs = op_desc.Inputs();
    auto input_dims = input->getDimensions();
    auto output_name = op_desc.Output("Out")[0];
    auto rank = input_dims.nbDims;
+
+    nvinfer1::ITensor* shape_tensor = nullptr;
+    int32_t shape_rank = 0;
+    if (inputs.find("Shape") != inputs.end() &&
+        op_desc.Input("Shape").size() >= 1) {
+      shape_tensor = engine_->GetITensor(op_desc.Input("Shape")[0]);
+      shape_rank = shape_tensor->getDimensions().d[0];
+    } else if (inputs.find("expand_shapes_tensor") != inputs.end() &&
+               op_desc.Input("expand_shapes_tensor").size() >= 1) {
+      int shape_size = op_desc.Input("expand_shapes_tensor").size();
+      std::vector<nvinfer1::ITensor*> shape_tensors;
+      for (int i = 0; i < shape_size; ++i) {
+        shape_tensors.push_back(
+            engine_->GetITensor(op_desc.Input("expand_shapes_tensor")[i]));
+      }
+      shape_tensor = Concat(shape_tensors);
+      shape_rank = shape_size;
+    } else {
      std::vector<int32_t> shape =
          PADDLE_GET_CONST(std::vector<int32_t>, op_desc.GetAttr("shape"));
-    int32_t nbDims_num = shape.size();
+      shape_tensor = Add1DConstantLayer(shape, output_name + "_shape_tensor_");
+      shape_rank = shape.size();
+    }

-    auto* shape_tensor =
-        Add1DConstantLayer(shape, output_name + "_shape_tensor_");
    nvinfer1::ITensor* input_shape_tensor;
-    if (rank < nbDims_num) {
+    if (rank < shape_rank) {
      auto* one_rank_tensor =
-          Add1DConstantLayer(std::vector<int32_t>(nbDims_num - rank, 1),
+          Add1DConstantLayer(std::vector<int32_t>(shape_rank - rank, 1),
                             output_name + "_one_rank_tensor_");
      auto in_shape_tensor = Shape(input);
      std::vector<nvinfer1::ITensor*> itensors;
@@ -61,16 +81,16 @@ class ExpandV2OpConverter : public OpConverter {
    auto* shuffle = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
    shuffle->setInput(1, *input_shape_tensor);

-    std::vector<int32_t> start_vec(nbDims_num, 0);
+    std::vector<int32_t> start_vec(shape_rank, 0);
    nvinfer1::Dims start;
-    start.nbDims = nbDims_num;
-    for (int32_t i = 0; i < nbDims_num; ++i) {
+    start.nbDims = shape_rank;
+    for (int32_t i = 0; i < shape_rank; ++i) {
      start.d[i] = start_vec[i];
    }
    nvinfer1::Dims size;
-    size.nbDims = nbDims_num;
+    size.nbDims = shape_rank;
    nvinfer1::Dims stride;
-    stride.nbDims = nbDims_num;
+    stride.nbDims = shape_rank;

    auto starts_tensor =
        Add1DConstantLayer(start_vec, output_name + "_start_tensor_");

--- a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc
@@ -12,15 +12,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"

-namespace paddle {
-namespace framework {
-class Scope;
-namespace proto {
-class OpDesc;
-}  // namespace proto
-}  // namespace framework
-}  // namespace paddle
-
 namespace paddle {
 namespace inference {
 namespace tensorrt {
@@ -34,6 +25,7 @@ class NearestInterpolateOpConverter : public OpConverter {

    framework::OpDesc op_desc(op, nullptr);

+    auto inputs = op_desc.Inputs();
    std::string input_name = op_desc.Input("X").front();
    std::string output_name = op_desc.Output("Out").front();


--- a/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc
@@ -38,6 +38,7 @@ class NearestInterpolateV2OpConverter : public OpConverter {
    std::string output_name = op_desc.Output("Out").front();

    auto input = engine_->GetITensor(input_name);
+    auto inputs = op_desc.Inputs();

    auto data_layout = phi::StringToDataLayout(
        PADDLE_GET_CONST(std::string, op_desc.GetAttr("data_layout")));
@@ -73,9 +74,23 @@ class NearestInterpolateV2OpConverter : public OpConverter {
      scale_w =
          static_cast<float>(out_w) / static_cast<float>(in_dim.d[w_axis]);
    } else {
+      if (scale.size() >= 2) {
        scale_h = scale[0];
        scale_w = scale[1];
      }
+    }
+
+    // Priority: Input(SizeTensor) > attr(out_h/out_w) > attr(scale)
+    nvinfer1::ITensor* outsize_tensor = nullptr;
+    if (engine_->with_dynamic_shape() &&
+        inputs.find("SizeTensor") != inputs.end()) {
+      if (op_desc.Input("SizeTensor").size() >= 2) {
+        auto* outsize_h = engine_->GetITensor(op_desc.Input("SizeTensor")[0]);
+        auto* outsize_w = engine_->GetITensor(op_desc.Input("SizeTensor")[1]);
+        outsize_tensor =
+            Concat(std::vector<nvinfer1::ITensor*>{outsize_h, outsize_w});
+      }
+    }

    if (engine_->with_dynamic_shape()) {
      scales.push_back(1.f);
@@ -94,7 +109,27 @@ class NearestInterpolateV2OpConverter : public OpConverter {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Data layout must be NCHW or NHWC."));
    }
+
+    if (engine_->with_dynamic_shape()) {
+      if (outsize_tensor != nullptr) {
+        std::vector<nvinfer1::ITensor*> outsize_itensors;
+        auto* input_shape = Shape(input);
+        outsize_itensors.push_back(GetEleTensorOfShape(input_shape, 0));
+
+        if (data_layout == phi::DataLayout::kNCHW) {
+          outsize_itensors.push_back(GetEleTensorOfShape(input_shape, 1));
+          outsize_itensors.push_back(outsize_tensor);
+        } else if (data_layout == phi::DataLayout::kNHWC) {
+          outsize_itensors.push_back(outsize_tensor);
+          outsize_itensors.push_back(GetEleTensorOfShape(input_shape, 3));
+        }
+        layer->setInput(1, *Concat(outsize_itensors));
+      } else {
        layer->setScales(scales.data(), scales.size());
+      }
+    } else {
+      layer->setScales(scales.data(), scales.size());
+    }

    RreplenishLayerAndOutput(
        layer, "nearest_interp_v2", {output_name}, test_mode);

--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -676,7 +676,7 @@ struct SimpleOpTypeSetTeller : public Teller {
      if (!has_attrs) return false;
    }

-    if (op_type == "arg_max") {
+    if (op_type == "arg_max" || op_type == "arg_min") {
      if (!desc.HasAttr("axis", /*with_attr_var=*/false)) {
        VLOG(3) << "Skip to convert into TRT while found Attribute('axis') is "
                   "Variable type in arg_max.";
@@ -691,21 +691,6 @@ struct SimpleOpTypeSetTeller : public Teller {
      if (axis == 0 || flatten || (dtype != 2 && dtype != 3)) return false;
    }

-    if (op_type == "arg_min") {
-      if (!desc.HasAttr("axis", /*with_attr_var=*/false)) {
-        VLOG(3) << "Skip to convert into TRT while found Attribute('axis') is "
-                   "Variable type in arg_min.";
-        return false;
-      }
-
-      int axis = desc.HasAttr("axis")
-                     ? PADDLE_GET_CONST(int64_t, desc.GetAttr("axis"))
-                     : -1;
-      bool flatten = PADDLE_GET_CONST(bool, desc.GetAttr("flatten"));
-      int dtype = PADDLE_GET_CONST(int, desc.GetAttr("dtype"));
-      if (axis == 0 || flatten || dtype != 2) return false;
-    }
-
    if (op_type == "affine_channel") {
      if (!desc.HasAttr("data_layout")) return false;
      auto data_layout = phi::StringToDataLayout(
@@ -836,6 +821,14 @@ struct SimpleOpTypeSetTeller : public Teller {
      auto interp_method =
          PADDLE_GET_CONST(std::string, desc.GetAttr("interp_method"));
      if (interp_method != "nearest") return false;
+
+      auto resize_inputs = desc.Inputs();
+      if (with_dynamic_shape &&
+          resize_inputs.find("SizeTensor") != resize_inputs.end() &&
+          desc.Input("SizeTensor").size() == 2) {
+        return true;
+      }
+
      auto scale = PADDLE_GET_CONST(std::vector<float>, desc.GetAttr("scale"));
      auto out_h = PADDLE_GET_CONST(int, desc.GetAttr("out_h"));
      auto out_w = PADDLE_GET_CONST(int, desc.GetAttr("out_w"));
@@ -2292,7 +2285,8 @@ struct SimpleOpTypeSetTeller : public Teller {
        }
      }
 #endif
-      if (!((in_dtype == 5 || in_dtype == 4 || in_dtype == 2) &&
+      if (!((in_dtype == 5 || in_dtype == 4 || in_dtype == 3 ||
+             in_dtype == 2) &&
            (out_dtype == 5 || out_dtype == 4 || out_dtype == 2))) {
        VLOG(3) << "only valid conversions are: "
                   "(kFLOAT | kHALF | kINT32) -> (kFLOAT | kHALF | kINT32)";
@@ -2411,18 +2405,6 @@ struct SimpleOpTypeSetTeller : public Teller {
      if (!desc.HasAttr("shape")) {
        return false;
      }
-      auto expand_v2_inputs = desc.Inputs();
-      if (expand_v2_inputs.find("Shape") != expand_v2_inputs.end()) {
-        if (desc.Input("Shape").size() >= 1) {
-          return false;
-        }
-      }
-      if (expand_v2_inputs.find("expand_shapes_tensor") !=
-          expand_v2_inputs.end()) {
-        if (desc.Input("expand_shapes_tensor").size() >= 1) {
-          return false;
-        }
-      }
    }

    if (use_no_calib_int8) {

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_expand_v2.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_expand_v2.py
@@ -249,9 +249,9 @@ class TrtConvertExpandV2Test2(TrtLayerAutoScanTest):
        generate_dynamic_shape()
        self.trt_param.precision = paddle_infer.PrecisionType.Float32
        # fill_constant will be folded by constant folding pass!
-        yield self.create_inference_config(), (0, 3), 1e-5
+        yield self.create_inference_config(), (1, 2), 1e-5
        self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), (0, 3), 1e-3
+        yield self.create_inference_config(), (1, 2), 1e-3

    def add_skip_trt_case(self):
        pass
@@ -393,9 +393,9 @@ class TrtConvertExpandV2Test3(TrtLayerAutoScanTest):
        generate_dynamic_shape()
        self.trt_param.precision = paddle_infer.PrecisionType.Float32
        # fill_constant will be folded by constant folding pass!
-        yield self.create_inference_config(), (0, 3), 1e-5
+        yield self.create_inference_config(), (1, 2), 1e-5
        self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), (0, 3), 1e-3
+        yield self.create_inference_config(), (1, 2), 1e-3

    def add_skip_trt_case(self):
        pass

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py
@@ -30,10 +30,16 @@ class TrtConvertNearestInterpV2Test(TrtLayerAutoScanTest):
        def generate_input():
            return np.ones([1, 3, 32, 32]).astype(np.float32)

+        def generate_weight():
+            return np.array([64]).astype(np.int32)
+
        ops_config = [
            {
                "op_type": "nearest_interp_v2",
-                "op_inputs": {"X": ["input_data"]},
+                "op_inputs": {
+                    "X": ["input_data"],
+                    "SizeTensor": ["size_tensor_data0", "size_tensor_data1"],
+                },
                "op_outputs": {"Out": ["interp_output_data"]},
                "op_attrs": {
                    "data_layout": "NCHW",
@@ -51,7 +57,10 @@ class TrtConvertNearestInterpV2Test(TrtLayerAutoScanTest):
        ops = self.generate_op_config(ops_config)
        program_config = ProgramConfig(
            ops=ops,
-            weights={},
+            weights={
+                "size_tensor_data0": TensorConfig(data_gen=generate_weight),
+                "size_tensor_data1": TensorConfig(data_gen=generate_weight),
+            },
            inputs={"input_data": TensorConfig(data_gen=generate_input)},
            outputs=["interp_output_data"],
        )