未验证 提交 e25e86f4 编写于 作者: Z Zhang Jun 提交者: GitHub

[inference][trt] optimize set_value and top_k op (#54372)

* set_value update

* support ValueTensor's rank != Input's rank & update top_k

* update range to avoid coredump

* fix addShape error

* Dims definition differ between 7.2 and 8.0+

* Update test_trt_convert_top_k_v2.py

* update top_k

* Update test_trt_convert_top_k_v2.py
上级 34cfbe79
......@@ -373,6 +373,13 @@ class OpConverter {
engine->ClearWeights();
}
// Convert `input` to `dtype` by inserting an Identity layer whose output
// type is forced to the requested type. Returns the casted tensor.
nvinfer1::ITensor* Cast(nvinfer1::ITensor* input, nvinfer1::DataType dtype) {
  auto* identity = TRT_ENGINE_ADD_LAYER(engine_, Identity, *input);
  // Request the conversion on the layer and pin the tensor's type as well,
  // so the builder cannot elide the cast.
  identity->setOutputType(0, dtype);
  auto* casted = identity->getOutput(0);
  casted->setType(dtype);
  return casted;
}
// rank(result) = rank(input)
nvinfer1::ITensor* Gather(nvinfer1::ITensor* input,
const std::vector<int32_t> indices,
......@@ -384,6 +391,59 @@ class OpConverter {
return result;
}
// Insert a size-1 dimension at each position listed in `axis` (axes are
// expected to be non-negative). The new shape is built at network-build
// time by gathering from the concatenation [input_shape, 1]: the sentinel
// index `nbDims` selects the appended constant 1.
nvinfer1::ITensor* Unsqueeze(nvinfer1::ITensor* input,
                             const std::vector<int32_t> axis) {
  const auto in_dims = input->getDimensions();
  // Deduplicate the requested axes. NOTE(review): with more than one axis
  // the insertion order follows the unordered_set's iteration order, which
  // shifts later insert positions — confirm callers pass a single axis or
  // axes where this is acceptable.
  const std::unordered_set<int32_t> uniq_axes(axis.begin(), axis.end());
  // Gather indices: 0..nbDims-1 pick the original extents; `nbDims` picks
  // the appended 1.
  std::vector<int32_t> gather_ids(in_dims.nbDims);
  std::iota(gather_ids.begin(), gather_ids.end(), 0);
  for (const auto& a : uniq_axes) {
    gather_ids.insert(gather_ids.begin() + a, in_dims.nbDims);
  }
  // Shape source: a runtime Shape tensor under dynamic shape, otherwise a
  // constant built from the static dims.
  nvinfer1::ITensor* shape_tensor{nullptr};
  if (engine_->with_dynamic_shape()) {
    shape_tensor = Shape(input);
  } else {
    shape_tensor = Add1DConstantLayer(in_dims);
  }
  auto* extended_shape = Concat(
      std::vector<nvinfer1::ITensor*>{shape_tensor, Add1DConstantLayer(1)});
  auto* unsqueezed_shape =
      TRT_ENGINE_ADD_LAYER(engine_,
                           Gather,
                           *extended_shape,
                           *Add1DConstantLayer(gather_ids),
                           0)
          ->getOutput(0);
  return Reshape(input, unsqueezed_shape);
}
// Remove the dimensions listed in `axis` by gathering only the surviving
// dimension indices out of the input's shape and reshaping to the result.
// Axes are expected to be non-negative; the squeezed extents are assumed
// to be 1 (not checked here).
nvinfer1::ITensor* Squeeze(nvinfer1::ITensor* input,
                           const std::vector<int32_t> axis) {
  const auto in_dims = input->getDimensions();
  // Keep every dimension index that is not scheduled for removal.
  std::vector<int32_t> kept;
  kept.reserve(in_dims.nbDims);
  for (int32_t i = 0; i < in_dims.nbDims; ++i) {
    if (std::find(axis.begin(), axis.end(), i) == axis.end()) {
      kept.push_back(i);
    }
  }
  // Shape source: runtime Shape tensor under dynamic shape, constant
  // otherwise.
  nvinfer1::ITensor* shape_tensor{nullptr};
  if (engine_->with_dynamic_shape()) {
    shape_tensor = Shape(input);
  } else {
    shape_tensor = Add1DConstantLayer(in_dims);
  }
  auto* squeezed_shape =
      TRT_ENGINE_ADD_LAYER(
          engine_, Gather, *shape_tensor, *Add1DConstantLayer(kept), 0)
          ->getOutput(0);
  return Reshape(input, squeezed_shape);
}
// paddle allows negative index
// for axis length = 5, paddle allows [-5, 4]
nvinfer1::ITensor* FixNegIndices(nvinfer1::ITensor* input_shape,
......@@ -406,7 +466,23 @@ class OpConverter {
nvinfer1::ITensor* newShape,
const std::string& name = "") {
auto* shuffle = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
shuffle->setInput(1, *newShape);
if (engine_->with_dynamic_shape()) {
shuffle->setInput(1, *newShape);
} else {
auto shape = newShape->getDimensions();
shuffle->setReshapeDimensions(shape);
}
if (name != "") {
shuffle->setName(name.c_str());
}
return shuffle->getOutput(0);
}
nvinfer1::ITensor* Reshape(nvinfer1::ITensor* input,
nvinfer1::Dims shape,
const std::string& name = "") {
auto* shuffle = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
shuffle->setReshapeDimensions(shape);
if (name != "") {
shuffle->setName(name.c_str());
}
......
......@@ -46,8 +46,13 @@ class RangeOpConverter : public OpConverter {
quotient_tensor = fquotient_tensor;
}
auto number_tensor = Max(Sub(zero_tensor, quotient_tensor), zero_tensor);
auto* start1 = engine_->GetITensor(op_desc.Input("Start")[0], true);
auto* start1 = engine_->GetITensor(op_desc.Input("Start")[0]);
#if IS_TRT_VERSION_LT(8000)
nvinfer1::Dims start_dims{0, {1}, { nvinfer1::DimensionType::kSPATIAL }};
#else
nvinfer1::Dims start_dims{0, {1}};
#endif
start1 = Reshape(start1, start_dims);
layer = TRT_ENGINE_ADD_LAYER(
engine_, Fill, nvinfer1::Dims{}, nvinfer1::FillOperation::kLINSPACE);
layer->setInput(0, *number_tensor);
......
......@@ -24,16 +24,6 @@ limitations under the License. */
} \
} while (0)
namespace paddle {
namespace framework {
class Scope;
namespace proto {
class OpDesc;
} // namespace proto
} // namespace framework
} // namespace paddle
namespace paddle {
namespace inference {
namespace tensorrt {
......@@ -55,6 +45,14 @@ class SetValueConverter : public OpConverter {
auto* inputs = engine_->GetITensor(op_desc.Input("Input")[0]);
auto* updates = engine_->GetITensor(op_desc.Input("ValueTensor")[0]);
const auto decrease_axes = PADDLE_GET_CONST(
std::vector<int64_t>, op_desc.GetAttr("decrease_axes"));
std::vector<int32_t> decr_axes{decrease_axes.begin(), decrease_axes.end()};
auto value_rank = updates->getDimensions().nbDims;
auto input_rank = inputs->getDimensions().nbDims;
if (decrease_axes.size() > 0 && value_rank != input_rank) {
updates = Unsqueeze(updates, decr_axes);
}
int64_t axes = 0;
int64_t starts = 0;
......@@ -115,39 +113,14 @@ class SetValueConverter : public OpConverter {
indices.insert(indices.end(), axes_index.begin(), axes_index.end());
}
nvinfer1::Dims indice_dims = update_dims;
// create a tensor to store data
std::vector<int> indice_dim_vec;
for (int i = 0; i < update_dims.nbDims; i++) {
indice_dim_vec.emplace_back(update_dims.d[i]);
}
auto indice_tensor_dims = phi::make_ddim(indice_dim_vec);
std::unique_ptr<phi::DenseTensor> indice_tensor(
std::make_unique<phi::DenseTensor>());
indice_tensor->Resize(indice_tensor_dims);
auto* dev_ctx = static_cast<phi::CPUContext*>(
platform::DeviceContextPool::Instance().Get(platform::CPUPlace()));
auto* weight_data = dev_ctx->template HostAlloc<int>(indice_tensor.get());
memcpy(weight_data, indices.data(), sizeof(int) * indice_tensor->numel());
TensorRTEngine::Weight weight{
nvinfer1::DataType::kINT32,
static_cast<void*>(weight_data),
static_cast<size_t>(indice_tensor->numel())};
auto output_name = op_desc.Output("Out")[0];
engine_->SetWeights("set_value_index_" + output_name,
std::move(indice_tensor));
auto const_layer =
TRT_ENGINE_ADD_LAYER(engine_, Constant, indice_dims, weight.get());
const auto const_layer = AddConstantLayer(
indices.data(), update_dims, "set_value_index_" + output_name);
auto* layer = TRT_ENGINE_ADD_LAYER(engine_,
Scatter,
*inputs,
*const_layer->getOutput(0),
*const_layer,
*updates,
nvinfer1::ScatterMode::kELEMENT);
......
......@@ -33,77 +33,71 @@ class TopKOpConverter : public OpConverter {
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope,
bool test_mode) override {
VLOG(3) << "convert a top_k op to tensorrt TopK layer";
// Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange.
VLOG(3) << "convert a top_k op to tensorrt layer";
framework::OpDesc op_desc(op, nullptr);
auto* input_tensor = engine_->GetITensor(op_desc.Input("X")[0]);
const int k = op_desc.HasAttr("k")
? PADDLE_GET_CONST(int, op_desc.GetAttr("k"))
: 1.0f;
nvinfer1::Dims input_dims = input_tensor->getDimensions();
int axis = input_dims.nbDims;
nvinfer1::ITopKLayer* layer =
TRT_ENGINE_ADD_LAYER(engine_,
TopK,
*input_tensor,
nvinfer1::TopKOperation::kMAX,
k,
1 << (axis - 1));
std::vector<std::string> output_names;
output_names.push_back(op_desc.Output("Out").front());
output_names.push_back(op_desc.Output("Indices").front());
RreplenishLayerAndOutput(layer, "top_k", output_names, test_mode);
}
};
class TopKv2OpConverter : public OpConverter {
public:
TopKv2OpConverter() {}
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope,
bool test_mode) override {
// Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange.
framework::OpDesc op_desc(op, nullptr);
auto* input_tensor = engine_->GetITensor(op_desc.Input("X")[0]);
const int k = op_desc.HasAttr("k")
? PADDLE_GET_CONST(int, op_desc.GetAttr("k"))
: 1.0f;
const int axis = op_desc.HasAttr("axis")
? PADDLE_GET_CONST(int, op_desc.GetAttr("axis"))
: 1.0f;
const int k =
op_desc.HasAttr("k") ? PADDLE_GET_CONST(int, op_desc.GetAttr("k")) : 1;
int axis = op_desc.HasAttr("axis")
? PADDLE_GET_CONST(int, op_desc.GetAttr("axis"))
: -1;
const bool largest =
op_desc.HasAttr("largest")
? PADDLE_GET_CONST(bool, op_desc.GetAttr("largest"))
: true;
auto flag =
largest ? nvinfer1::TopKOperation::kMAX : nvinfer1::TopKOperation::kMIN;
auto input_dims = input_tensor->getDimensions();
auto input_rank = input_dims.nbDims;
// 1d needs expand to 2d
bool expand_to_2d = (input_rank == 1);
if (engine_->with_dynamic_shape() && expand_to_2d) {
input_tensor = Unsqueeze(input_tensor, std::vector<int32_t>{1});
}
// The TopK layer does not accept INT32 input here: cast INT32 input to
// FLOAT before TopK, and cast the values back to INT32 afterwards.
nvinfer1::DataType type = input_tensor->getType();
bool cast = (type == nvinfer1::DataType::kINT32);
if (cast) {
input_tensor = Cast(input_tensor, nvinfer1::DataType::kFLOAT);
}
nvinfer1::ITopKLayer* layer = nullptr;
if (axis == -1) {
nvinfer1::Dims input_dims = input_tensor->getDimensions();
layer = TRT_ENGINE_ADD_LAYER(
engine_, TopK, *input_tensor, flag, k, 1 << (input_dims.nbDims - 1));
} else {
if (engine_->with_dynamic_shape()) {
layer = TRT_ENGINE_ADD_LAYER(
engine_, TopK, *input_tensor, flag, k, 1 << axis);
} else {
layer = TRT_ENGINE_ADD_LAYER(
engine_, TopK, *input_tensor, flag, k, 1 << (axis - 1));
}
if (axis > 0 && !engine_->with_dynamic_shape()) {
axis -= 1;
}
std::vector<std::string> output_names;
output_names.push_back(op_desc.Output("Out").front());
output_names.push_back(op_desc.Output("Indices").front());
if (axis < 0) axis += input_rank;
layer =
TRT_ENGINE_ADD_LAYER(engine_, TopK, *input_tensor, flag, k, 1 << axis);
nvinfer1::ITensor* values = layer->getOutput(0);
nvinfer1::ITensor* indices = layer->getOutput(1);
// un-expand to 1d
if (engine_->with_dynamic_shape() && expand_to_2d) {
values = Squeeze(values, std::vector<int32_t>{1});
indices = Squeeze(indices, std::vector<int32_t>{1});
}
// cast back
if (cast) {
values = Cast(values, nvinfer1::DataType::kINT32);
}
auto out_name = op_desc.Output("Out").front();
auto indices_name = op_desc.Output("Indices").front();
values->setName(out_name.c_str());
engine_->SetITensor(out_name.c_str(), values);
indices->setName(indices_name.c_str());
engine_->SetITensor(indices_name.c_str(), indices);
RreplenishLayerAndOutput(layer, "top_k_v2", output_names, test_mode);
layer->setName(
("top_k (Output: " + out_name + "," + indices_name + ")").c_str());
}
};
} // namespace tensorrt
......@@ -111,4 +105,4 @@ class TopKv2OpConverter : public OpConverter {
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(top_k, TopKOpConverter);
REGISTER_TRT_OP_CONVERTER(top_k_v2, TopKv2OpConverter);
REGISTER_TRT_OP_CONVERTER(top_k_v2, TopKOpConverter);
......@@ -2402,6 +2402,22 @@ struct SimpleOpTypeSetTeller : public Teller {
#if !IS_TRT_VERSION_GE(8200)
return false;
#endif
auto inputs = desc.Inputs();
if (inputs.find("StartsTensorList") != inputs.end()) {
if (desc.Input("StartsTensorList").size() >= 1) {
return false;
}
}
if (inputs.find("EndsTensorList") != inputs.end()) {
if (desc.Input("EndsTensorList").size() >= 1) {
return false;
}
}
if (inputs.find("StepsTensorList") != inputs.end()) {
if (desc.Input("StepsTensorList").size() >= 1) {
return false;
}
}
if (!(desc.HasAttr("axes") && desc.HasAttr("starts") &&
desc.HasAttr("steps"))) {
VLOG(3) << "the " << op_type
......@@ -2409,52 +2425,22 @@ struct SimpleOpTypeSetTeller : public Teller {
"starts or steps)";
return false;
}
auto* block = desc.Block();
auto input_name = desc.Input("Input")[0];
auto* input_desc = block->FindVar(input_name);
const auto input_shape = input_desc->GetShape();
auto update_name = desc.Input("ValueTensor")[0];
auto* update_desc = block->FindVar(update_name);
const auto update_shape = update_desc->GetShape();
if (update_shape.size() != input_shape.size()) return false;
}
if (op_type == "top_k_v2" || op_type == "top_k") {
auto* block = desc.Block();
auto x_var_name = desc.Input("X")[0];
if (block == nullptr) {
VLOG(3) << "The block desc is nullptr, we can't continue to analyze. "
"Developers need to check whether block_desc is passed in "
"the pass.";
return false;
}
auto* x_var_desc = block->FindVar(x_var_name);
auto x_dtype = x_var_desc->GetDataType();
if (!(x_dtype == framework::proto::VarType::FP32 ||
x_dtype == framework::proto::VarType::FP16)) {
return false;
}
const auto x_shape = x_var_desc->GetShape();
if (x_shape.size() == 1) {
VLOG(3) << "top_k/top_k_v2 does not support 1-dimensional input in "
"tensorrt";
return false;
}
if (desc.HasAttr("axis")) {
int axis = PADDLE_GET_CONST(int, desc.GetAttr("axis"));
if (axis == 0) {
if (!with_dynamic_shape && axis == 0) {
VLOG(3) << "top_k_v2 does not support axis == 0 in "
"tensorrt";
"tensorrt static shape.";
return false;
}
}
if (desc.HasAttr("sorted")) {
bool sorted = PADDLE_GET_CONST(bool, desc.GetAttr("sorted"));
if (!sorted) {
VLOG(3) << "top_k_v2 does not support results not sorted in "
VLOG(3) << op_type
<< " does not support results not sorted in "
"tensorrt";
return false;
}
......
......@@ -107,7 +107,7 @@ class TrtConvertActivationTest(TrtLayerAutoScanTest):
self.dynamic_shape.opt_input_shape = {}
def generate_trt_nodes_num(attrs, dynamic_shape):
if self.dims == 1:
if not dynamic_shape and self.dims == 1:
return 0, 4
return 1, 3
......
......@@ -23,7 +23,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest
import paddle.inference as paddle_infer
class TrtConvertActivationTest(TrtLayerAutoScanTest):
class TrtConvertTopKV2Test(TrtLayerAutoScanTest):
def is_program_valid(self, program_config: ProgramConfig) -> bool:
inputs = program_config.inputs
attrs = [
......@@ -31,6 +31,10 @@ class TrtConvertActivationTest(TrtLayerAutoScanTest):
]
if len(inputs['input_data'].shape) <= attrs[0]['axis']:
return False
axis = attrs[0]['axis']
axis = axis if axis >= 0 else axis + len(inputs['input_data'].shape)
if inputs['input_data'].shape[axis] <= attrs[0]['k']:
return False
return True
def sample_program_configs(self):
......@@ -49,11 +53,12 @@ class TrtConvertActivationTest(TrtLayerAutoScanTest):
for dims in [1, 2, 3, 4]:
for batch in [1, 4]:
for k in [1, 3]:
for axis in [-1, 1, 2, 3]:
for axis in [-1, 1, 0, 2, 3]:
for largest in [True, False]:
for sort in [True, False]:
self.dims = dims
self.sort = sort
self.axis = axis
dics = [
{
"k": k,
......@@ -120,7 +125,7 @@ class TrtConvertActivationTest(TrtLayerAutoScanTest):
"input_data": [4, 32, 32, 32]
}
self.dynamic_shape.opt_input_shape = {
"input_data": [1, 3, 32, 32]
"input_data": [4, 3, 32, 32]
}
def clear_dynamic_shape():
......@@ -129,7 +134,7 @@ class TrtConvertActivationTest(TrtLayerAutoScanTest):
self.dynamic_shape.opt_input_shape = {}
def generate_trt_nodes_num(attrs, dynamic_shape):
if self.dims == 1:
if not dynamic_shape and (self.dims == 1 or self.axis == 0):
return 0, 4
if not self.sort:
return 0, 4
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册