[Paddle-TRT][Cherry-Pick]Rewrite strided_slice converter using shape tensor (#47153)

* stride_to_24
* fix CI failing

[Paddle-TRT][Cherry-Pick]Rewrite strided_slice converter using shape tensor (#47153)
* stride_to_24
* fix CI failing
68c4ac31 · zhoutianzi666 · GitHub · 09b19233 · 68c4ac31 · 68c4ac31
3 changed files
--- a/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc
@@ -14,33 +14,23 @@ limitations under the License. */

 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"

-namespace paddle {
-namespace framework {
-class Scope;
-namespace proto {
-class OpDesc;
-}  // namespace proto
-}  // namespace framework
-}  // namespace paddle
-
 namespace paddle {
 namespace inference {
 namespace tensorrt {

-/*
- * Stack converter from fluid to tensorRT.
- */
 class StridedSliceOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope,
                  bool test_mode) override {
-    VLOG(4) << "convert fluid StridedSlice op to tensorrt Slice layer";
-
+    VLOG(4) << "convert strided_slice op to tensorrt layer";
    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
    auto* input = engine_->GetITensor(op_desc.Input("Input")[0]);
-    nvinfer1::Dims input_dims = input->getDimensions();
    auto output_name = op_desc.Output("Out")[0];
+
+    // phi only allow axes[i] >= 0 && <rank, so we need not deal with minus
+    // axes[i]
    std::vector<int> axes =
        PADDLE_GET_CONST(std::vector<int>, op_desc.GetAttr("axes"));
    std::vector<int> starts =
@@ -49,119 +39,148 @@ class StridedSliceOpConverter : public OpConverter {
        PADDLE_GET_CONST(std::vector<int>, op_desc.GetAttr("ends"));
    std::vector<int> strides =
        PADDLE_GET_CONST(std::vector<int>, op_desc.GetAttr("strides"));
-    int axes_size = axes.size();
-    nvinfer1::Dims start;
-    nvinfer1::Dims stride;
-    nvinfer1::Dims size;
-    start.nbDims = input_dims.nbDims;
-    stride.nbDims = input_dims.nbDims;
-    size.nbDims = input_dims.nbDims;
-    for (int i = 0; i < input_dims.nbDims; i++) {
-      start.d[i] = 0;
-      stride.d[i] = 1;
-      size.d[i] = input_dims.d[i];
-    }
+    std::vector<int> decrease_axises =
+        PADDLE_GET_CONST(std::vector<int>, op_desc.GetAttr("decrease_axis"));

+    auto input_dims = input->getDimensions();
    if (!engine_->with_dynamic_shape()) {
-      for (int i = 0; i < axes_size; i++) {
-        start.d[axes[i] - 1] = starts[i];
-      }
-      for (int i = 0; i < axes_size; i++) {
-        stride.d[axes[i] - 1] = strides[i];
-      }
-      for (int i = 0; i < axes_size; ++i) {
-        int dim = size.d[axes[i] - 1];
-        if (dim > 0) {
-          int start = starts[i] < 0 ? (starts[i] + dim) : starts[i];
-          int end = ends[i] < 0 ? (ends[i] + dim) : ends[i];
-          int stride = std::abs(strides[i]);
-          start = std::max(start, 0);
-          end = std::max(end, 0);
-          end = std::min(end, dim);
-          size.d[axes[i] - 1] = (std::abs(end - start) + stride - 1) / stride;
-        }
-      }
-      auto* layer =
-          TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, start, size, stride);
-      RreplenishLayerAndOutput(
-          layer, "strided_slice", {output_name}, test_mode);
+      // notice that input shape is [CHW] without batch axis when input has
+      // static shape
+      for (size_t i = input_dims.nbDims; i > 0; i--) {
+        input_dims.d[i] = input_dims.d[i - 1];
+      }
+      input_dims.d[0] = 1;  // fake batchsize, not useful here
+      for (size_t i = 0; i < axes.size(); i++) {
+        if (starts[i] < 0) {
+          starts[i] = std::max(starts[i] + input_dims.d[axes[i]], 0);
+        }
+        if (ends[i] < 0) {
+          ends[i] = std::max(ends[i] + input_dims.d[axes[i]], 0);
+        }
+        ends[i] = std::min(ends[i], input_dims.d[axes[i]]);
+        PADDLE_ENFORCE_GT(
+            ends[i],
+            starts[i],
+            platform::errors::InvalidArgument(
+                "Attr(ends) should be greater than attr(starts) in "
+                "slice op. But received ends = %d, starts = %d.",
+                ends[i],
+                starts[i]));
+      }
+    }
+
+    nvinfer1::ILayer* layer = nullptr;
+    if (engine_->with_dynamic_shape()) {
+      auto nchw_input_dims = input->getDimensions();
+      nvinfer1::Dims trt_start_dims;
+      trt_start_dims.nbDims = nchw_input_dims.nbDims;
+      memset(trt_start_dims.d, 0, sizeof(int32_t) * nchw_input_dims.nbDims);
+      nvinfer1::Dims trt_size_dims = trt_start_dims;
+      nvinfer1::Dims trt_end_dims = trt_start_dims;
+      nvinfer1::Dims trt_step_dims = trt_start_dims;
+      for (int i = 0; i < trt_step_dims.nbDims; i++) trt_step_dims.d[i] = 1;
+      // input : [N,C,H,W]
+      bool has_neg_indices = false;
+      for (size_t i = 0; i < axes.size(); i++) {
+        int trt_axis = axes[i];
+        trt_start_dims.d[trt_axis] = starts[i];
+        trt_end_dims.d[trt_axis] = ends[i];
+        trt_step_dims.d[axes[i]] = strides[i];
+        if (starts[i] < 0 || ends[i] < 0) has_neg_indices = true;
+      }
+      auto* shape_tensor = Shape(input);
+      auto* start_tensor = Add1DConstantLayer(trt_start_dims);
+      if (has_neg_indices) {
+        start_tensor = FixNegIndices(shape_tensor, start_tensor);
+      }
+
+      std::vector<nvinfer1::ITensor*> end_vec_tensor;
+      for (int i = 0; i < trt_end_dims.nbDims; i++) {
+        end_vec_tensor.push_back(GetEleTensorOfShape(shape_tensor, i));
+      }
+
+      for (size_t i = 0; i < axes.size(); i++) {
+        int trt_axis = axes[i];
+        if (ends[i] >= 0) {
+          end_vec_tensor[trt_axis] = Add1DConstantLayer(ends[i]);
+        } else {
+          end_vec_tensor[trt_axis] =
+              Sum(end_vec_tensor[trt_axis], Add1DConstantLayer(ends[i]));
+        }
+      }
+
+      auto* size_tensor =
+          Sub(start_tensor, Min(Concat(end_vec_tensor), shape_tensor));
+      auto zero_t =
+          Add1DConstantLayer(std::vector<int>(nchw_input_dims.nbDims, 0));
+      auto step_tensor = Add1DConstantLayer(trt_step_dims);
+      size_tensor = Sub(zero_t, FloorDiv(size_tensor, step_tensor));
+
+      layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Slice, *input, trt_start_dims, trt_size_dims, trt_step_dims);
+      layer->setInput(1, *start_tensor);
+      layer->setInput(2, *size_tensor);
+      layer->setInput(3, *step_tensor);
+
+      if (decrease_axises.size() > 0) {
+        std::vector<int32_t> gather_indices;
+        for (int i = 0; i < trt_size_dims.nbDims; i++) {
+          if (decrease_axises.end() !=
+              std::find(decrease_axises.begin(), decrease_axises.end(), i))
+            continue;
+          gather_indices.push_back(i);
+        }
+        if (gather_indices.empty())
+          gather_indices.push_back(decrease_axises[0]);
+        auto real_size_tensor = Gather(size_tensor, gather_indices);
+        layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0));
+        layer->setInput(1, *real_size_tensor);
+      }
    } else {
-      for (int i = 0; i < axes_size; i++) {
-        start.d[axes[i]] = starts[i];
-      }
-      for (int i = 0; i < axes_size; i++) {
-        stride.d[axes[i]] = strides[i];
-      }
-      for (int i = 0; i < axes_size; ++i) {
-        int dim = size.d[axes[i]];
-        if (dim > 0) {
-          int start = starts[i] < 0 ? (starts[i] + dim) : starts[i];
-          int end = ends[i] < 0 ? (ends[i] + dim) : ends[i];
-          int stride = std::abs(strides[i]);
-          start = std::max(start, 0);
-          end = std::max(end, 0);
-          end = std::min(end, dim);
-          size.d[axes[i]] = (std::abs(end - start) + stride - 1) / stride;
-        }
-      }
-
-      auto create_weights = [&](const std::vector<int>& data,
-                                const std::string& type) -> int* {
-        std::unique_ptr<framework::Tensor> tmp_tensor(new framework::Tensor());
-        int data_size = data.size();
-        tmp_tensor->Resize({data_size});
-        auto* tmp_data = tmp_tensor->mutable_data<int>(platform::CPUPlace());
-        for (int i = 0; i < data_size; i++) {
-          tmp_data[i] = data[i];
-        }
-
-        engine_->SetWeights(output_name + "_add_slice_op_" + type,
-                            std::move(tmp_tensor));
-        return tmp_data;
-      };
-
-      std::vector<int> const_weight(input_dims.nbDims, 0);
-      for (int i = 0; i < axes_size; i++) {
-        int dim = input_dims.d[axes[i]];
-        int start = starts[i] < 0 ? (starts[i] + dim) : starts[i];
-        int end = ends[i] < 0 ? (ends[i] + dim) : ends[i];
-        int stride = std::abs(strides[i]);
-        start = std::max(start, 0);
-        end = std::max(end, 0);
-        end = std::min(end, dim);
-        const_weight[axes[i]] =
-            dim - ((std::abs(end - start) + stride - 1) / stride);
-      }
-
-      int* weight_data = create_weights(const_weight, "size");
-
-      TensorRTEngine::Weight weight{nvinfer1::DataType::kINT32,
-                                    static_cast<void*>(weight_data),
-                                    static_cast<size_t>(input_dims.nbDims)};
-
-      int input_dim_size = input_dims.nbDims;
-      nvinfer1::Dims input_shape;
-      input_shape.nbDims = 1;
-      input_shape.d[0] = input_dim_size;
-
-      auto const_layer =
-          TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get());
-
-      auto shape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shape, *input);
-      // slice layer
-      auto* layer =
-          TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, start, size, stride);
-      // elementwise layer for get size tensor
-      auto size_layer =
-          TRT_ENGINE_ADD_LAYER(engine_,
-                               ElementWise,
-                               *shape_layer->getOutput(0),
-                               *const_layer->getOutput(0),
-                               nvinfer1::ElementWiseOperation::kSUB);
-      layer->setInput(2, *size_layer->getOutput(0));
-      RreplenishLayerAndOutput(
-          layer, "strided_slice", {output_name}, test_mode);
+      auto chw_input_dims = input->getDimensions();
+      nvinfer1::Dims trt_start_dims;
+      trt_start_dims.nbDims = chw_input_dims.nbDims;
+      memset(trt_start_dims.d, 0, sizeof(int32_t) * chw_input_dims.nbDims);
+      nvinfer1::Dims trt_size_dims = chw_input_dims;
+      nvinfer1::Dims trt_step_dims;
+      trt_step_dims.nbDims = chw_input_dims.nbDims;
+      for (int i = 0; i < trt_step_dims.nbDims; i++) trt_step_dims.d[i] = 1;
+
+      // input : [C,H,W]
+      for (size_t i = 0; i < axes.size(); i++) {
+        int trt_axis = axes[i] - 1;
+        trt_start_dims.d[trt_axis] = starts[i];
+        trt_size_dims.d[trt_axis] =
+            (ends[i] - starts[i] + strides[i] - 1) / strides[i];
+        trt_step_dims.d[trt_axis] = strides[i];
+      }
+      layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Slice, *input, trt_start_dims, trt_size_dims, trt_step_dims);
+      nvinfer1::Dims real_trt_size_dims;
+      real_trt_size_dims.nbDims = 0;
+
+      if (decrease_axises.size() > 0) {
+        for (size_t i = 0; i < decrease_axises.size(); i++) {
+          decrease_axises[i]--;
+        }
+        for (int i = 0; i < trt_size_dims.nbDims; i++) {
+          if (decrease_axises.end() !=
+              std::find(decrease_axises.begin(), decrease_axises.end(), i))
+            continue;
+          real_trt_size_dims.d[real_trt_size_dims.nbDims] = trt_size_dims.d[i];
+          real_trt_size_dims.nbDims++;
+        }
+        if (real_trt_size_dims.nbDims == 0) {
+          real_trt_size_dims.nbDims = 1;
+          real_trt_size_dims.d[0] = 1;
+        }
+        auto reshape_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0));
+        reshape_layer->setReshapeDimensions(real_trt_size_dims);
+        layer = static_cast<nvinfer1::ILayer*>(reshape_layer);
+      }
    }
+    RreplenishLayerAndOutput(layer, "strided_slice", {output_name}, test_mode);
  }
 };


--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -503,6 +503,18 @@ class TensorRTEngineOp : public framework::OperatorBase {
      // convert input and copy to TRT engine's buffer
      auto &t =
          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
+      PADDLE_ENFORCE_GT(
+          t.numel(),
+          0,
+          phi::errors::InvalidArgument(
+              "The input tensor named %s of trt-subgraph must "
+              "have >0 elements, but now have %d elements. "
+              "It's likely that this tensor is connected to a Concat op inside "
+              "a trt-subgraph, "
+              "try to ues API to forbid this op into trt-subgraph.",
+              x,
+              t.numel()));
+
      // check the input_tensor
      if (!platform::is_gpu_place(t.place())) {
        framework::Tensor out;

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py
@@ -34,7 +34,7 @@ class TrtConvertStridedSliceTest(TrtLayerAutoScanTest):
    def sample_program_configs(self):

        def generate_input1(attrs: List[Dict[str, Any]]):
-            return np.ones([1, 56, 56, 192]).astype(np.float32)
+            return np.random.random([1, 56, 56, 192]).astype(np.float32)

        for axes in [[1, 2]]:
            for starts in [[1, 1]]:
@@ -130,5 +130,88 @@ class TrtConvertStridedSliceTest(TrtLayerAutoScanTest):
        self.run_test()


+class TrtConvertStridedSliceTest2(TrtLayerAutoScanTest):
+
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        return True
+
+    def sample_program_configs(self):
+
+        def generate_input1(attrs: List[Dict[str, Any]]):
+            return np.random.random([1, 56, 56, 192]).astype(np.float32)
+
+        for axes in [[1, 2], [2, 3], [1, 3]]:
+            for starts in [[-10, 1], [-10, 20], [-10, 15], [-10, 16], [-10,
+                                                                       20]]:
+                for ends in [[-9, 10000], [-9, -1], [-9, 40]]:
+                    for decrease_axis in [[]]:
+                        for infer_flags in [[1, 1]]:
+                            for strides in [[2, 2]]:
+                                dics = [{
+                                    "axes": axes,
+                                    "starts": starts,
+                                    "ends": ends,
+                                    "decrease_axis": [axes[0]],
+                                    "infer_flags": infer_flags,
+                                    "strides": strides
+                                }]
+
+                                ops_config = [{
+                                    "op_type": "strided_slice",
+                                    "op_inputs": {
+                                        "Input": ["input_data"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["slice_output_data"]
+                                    },
+                                    "op_attrs": dics[0]
+                                }]
+                                ops = self.generate_op_config(ops_config)
+
+                                program_config = ProgramConfig(
+                                    ops=ops,
+                                    weights={},
+                                    inputs={
+                                        "input_data":
+                                        TensorConfig(data_gen=partial(
+                                            generate_input1, dics))
+                                    },
+                                    outputs=["slice_output_data"])
+
+                                yield program_config
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+
+        def generate_dynamic_shape():
+            self.dynamic_shape.min_input_shape = {
+                "input_data": [1, 56, 56, 192]
+            }
+            self.dynamic_shape.max_input_shape = {
+                "input_data": [8, 100, 100, 200]
+            }
+            self.dynamic_shape.opt_input_shape = {
+                "input_data": [4, 56, 56, 192]
+            }
+
+        def clear_dynamic_shape():
+            self.dynamic_shape.min_input_shape = {}
+            self.dynamic_shape.max_input_shape = {}
+            self.dynamic_shape.opt_input_shape = {}
+
+        # for static_shape
+        clear_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), (1, 2), 1e-5
+
+        # for dynamic_shape
+        generate_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), (1, 2), 1e-5
+
+    def test(self):
+        self.run_test()
+
+
 if __name__ == "__main__":
    unittest.main()