[Inference Tensorrt] Add attr for trt engine and handle the input seq problem...

[Inference Tensorrt] Add attr for trt engine and handle the input seq problem for ernie var len. (#33575) (#33622)

[Inference Tensorrt] Add attr for trt engine and handle the input seq problem...
[Inference Tensorrt] Add attr for trt engine and handle the input seq problem for ernie var len. (#33575) (#33622)
8e163f92 · Wilber · GitHub · c3807f9e · 8e163f92 · 8e163f92
7 changed files
--- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -36,6 +36,8 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
    framework::OpDesc op_desc(op, nullptr);
    auto word_id_name = op_desc.Input("WordId").front();
    auto pos_id_name = op_desc.Input("PosId").front();
+    engine_->Set("ernie_pos_name", new std::string(pos_id_name));
    auto sent_id_name = op_desc.Input("SentId").front();
    auto word_emb_name = op_desc.Input("WordEmbedding").front();
    auto pos_emb_name = op_desc.Input("PosEmbedding").front();

--- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -191,9 +191,15 @@ class MultiheadMatMulOpConverter : public OpConverter {
        std::vector<nvinfer1::ITensor*> plugin_inputs;
        plugin_inputs.emplace_back(fc_layer->getOutput(0));
        plugin_inputs.emplace_back(mask_tensor);
+        if (engine_->Has("ernie_pos_name")) {
+          plugin_inputs.emplace_back(
+              engine_->GetITensor(engine_->Get<std::string>("ernie_pos_name")));
+        } else {
          plugin_inputs.emplace_back(engine_->GetITensor(
-            engine_->network()->getInput(2)->getName()));  // cu_seqlens,
+              engine_->network()
-                                                           // eval_placeholder_2
+                  ->getInput(2)
+                  ->getName()));  // cu_seqlens, eval_placeholder_2
+        }
        auto max_seqlen_tensor =
            engine_->GetITensor(engine_->network()->getInput(3)->getName());
        auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(

--- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc
@@ -76,9 +76,16 @@ class SliceOpConverter : public OpConverter {
        std::vector<nvinfer1::ITensor*> plugin_inputs;
        // plugin_inputs.emplace_back(trans_layer->getOutput(0));
        plugin_inputs.emplace_back(input);
-        plugin_inputs.emplace_back(engine_->GetITensor(
-            engine_->network()->getInput(2)->getName()));  // cu_seqlens,
+        std::string pos_name;
-                                                           // eval_placeholder_2
+        if (engine_->Has("ernie_pos_name")) {
+          pos_name = engine_->Get<std::string>("ernie_pos_name");
+        } else {
+          // hard code for compatibility
+          pos_name = engine_->network()->getInput(2)->getName();
+        }
+        plugin_inputs.emplace_back(
+            engine_->GetITensor(pos_name));  // cu_seqlens, eval_placeholder_2
        // bool ban_fp16 = engine_->disable_trt_plugin_fp16();
        plugin::SpecialSlicePluginDynamic* plugin =

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -202,7 +202,15 @@ class TensorRTEngine {
    dy::initLibNvInferPlugins(&logger, "");
  }
-  ~TensorRTEngine() {}
+  // Runs the registered deleter of every stored attribute that has one, then
+  // empties both attribute tables.
+  ~TensorRTEngine() {
+    for (auto& attr : attrs_) {
+      auto deleter = attr_dels_.find(attr.first);
+      if (deleter == attr_dels_.end()) {
+        // No deleter registered for this attribute; nothing to release here.
+        continue;
+      }
+      deleter->second();
+    }
+    attrs_.clear();
+    attr_dels_.clear();
+  }
  // Add an input and set its name, data type and dimension.
  nvinfer1::ITensor* DeclareInput(const std::string& name,
@@ -386,6 +394,82 @@ class TensorRTEngine {
  }
 #endif
+  // Returns true when an attribute named `attr_name` is currently stored.
+  bool Has(const std::string& attr_name) const {
+    return attrs_.find(attr_name) != attrs_.end();
+  }
+  // Removes the attribute `attr_name`. If a deleter is registered for it, the
+  // deleter is invoked and dropped first. Does nothing when the attribute is
+  // not present.
+  void Erase(const std::string& attr_name) {
+    auto attr_it = attrs_.find(attr_name);
+    if (attr_it == attrs_.end()) {
+      return;
+    }
+    auto del_it = attr_dels_.find(attr_name);
+    if (del_it != attr_dels_.end()) {
+      del_it->second();
+      attr_dels_.erase(del_it);
+    }
+    attrs_.erase(attr_it);
+  }
+  // Set a pointer to the attribute. Engine takes ownership of the attribute.
+  //
+  // If an attribute with the same name is already stored it is replaced. When
+  // the engine owned the previous value (a deleter is registered for it), that
+  // value is released before the new one is stored, so overwriting does not
+  // leak it.
+  template <typename AttrType>
+  void Set(const std::string& attr_name, AttrType* attr) {
+    auto existing = attrs_.find(attr_name);
+    if (existing != attrs_.end()) {
+      VLOG(3) << "Setting the attribute " << attr_name << " for trt engine "
+              << this;
+      auto old_del = attr_dels_.find(attr_name);
+      if (old_del != attr_dels_.end()) {
+        // Re-registering the very same owned pointer: keep the current entry.
+        // Running the deleter here would leave a dangling pointer stored.
+        AttrType** held = boost::any_cast<AttrType*>(&existing->second);
+        if (held != nullptr && *held == attr) {
+          return;
+        }
+        old_del->second();
+        attr_dels_.erase(old_del);
+      }
+    }
+    attrs_[attr_name] = attr;
+    // The deleter captures the typed pointer so the right destructor runs.
+    attr_dels_[attr_name] = [attr, attr_name]() {
+      VLOG(3) << "deleting " << attr_name;
+      delete attr;
+    };
+  }
+  // Stores `attr` under `attr_name` without registering a deleter, so the
+  // engine never frees it; the caller should delete the attribute. The name
+  // must not already be in use.
+  template <typename AttrType>
+  void SetNotOwned(const std::string& attr_name, AttrType* attr) {
+    const auto already_set = attrs_.count(attr_name);
+    PADDLE_ENFORCE_EQ(
+        already_set, 0,
+        platform::errors::AlreadyExists(
+            "Attribute %s already set in trt engine.", attr_name));
+    attrs_[attr_name] = attr;
+  }
+  // Returns a reference to the attribute previously stored under `attr_name`.
+  // Raises InvalidArgument when the name is unknown, or when the stored value
+  // is not an AttrType*.
+  template <typename AttrType>
+  AttrType& Get(const std::string& attr_name) const {
+    PADDLE_ENFORCE_NE(attrs_.find(attr_name), attrs_.end(),
+                      platform::errors::InvalidArgument(
+                          "Attribute %s not found in trt engine.", attr_name));
+    const boost::any& stored = attrs_.at(attr_name);
+    try {
+      return *boost::any_cast<AttrType*>(stored);
+    } catch (boost::bad_any_cast&) {
+      // Give a few common pointer types readable names for the error text;
+      // everything else falls back to type_info::name().
+      auto TypeToString = [](const std::type_info& info) -> std::string {
+        const std::type_index idx(info);
+        if (idx == std::type_index(typeid(bool*))) {
+          return "bool";
+        }
+        if (idx == std::type_index(typeid(int*))) {
+          return "int";
+        }
+        if (idx == std::type_index(typeid(const int*))) {
+          return "const int";
+        }
+        if (idx == std::type_index(typeid(std::string*))) {
+          return "std::string";
+        }
+        return info.name();
+      };
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Invalid type for attritube %s, expected: %s, actual: %s.", attr_name,
+          TypeToString(typeid(AttrType*)), TypeToString(stored.type())));
+    }
+  }
 private:
  // Each ICudaEngine object is bound to a specific GPU when it is instantiated,
  // ensure that the thread is associated with the correct device by calling
@@ -441,6 +525,9 @@ class TensorRTEngine {
  infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
  std::unordered_map<nvinfer1::ITensor*, float> quant_dynamic_range_;
+  std::unordered_map<std::string, boost::any> attrs_;
+  std::unordered_map<std::string, std::function<void(void)>> attr_dels_;
  // For dynamic shape
  bool with_dynamic_shape_{false};
  infer_ptr<nvinfer1::INetworkDefinition> infer_networkv2_;

--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -91,6 +91,15 @@ TEST_F(TensorRTEngineTest, add_layer) {
  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+  LOG(INFO) << "Set attr";
+  engine_->Set("test_attr", new std::string("test_attr"));
+  if (engine_->Has("test_attr")) {
+    auto attr_val = engine_->Get<std::string>("test_attr");
+    engine_->Erase("test_attr");
+  }
+  std::string *attr_key = new std::string("attr_key");
+  engine_->SetNotOwned("attr1", attr_key);
  LOG(INFO) << "to execute";
  engine_->Execute(1, &buffers, ctx_->stream());
@@ -99,6 +108,8 @@ TEST_F(TensorRTEngineTest, add_layer) {
  LOG(INFO) << "to checkout output";
  ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3);
+  delete attr_key;
 }
 TEST_F(TensorRTEngineTest, add_layer_multi_dim) {

--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -33,6 +33,7 @@
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/tests/api/config_printer.h"
 #include "paddle/fluid/inference/tests/test_helper.h"

--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 namespace paddle {
@@ -143,5 +144,136 @@ TEST(AnalysisPredictor, fp16) {
 #endif
 }
+// ernie_varlen
+// Builds a predictor for FLAGS_infer_model with GPU, memory optimisation, the
+// TensorRT engine (kHalf precision), dynamic-shape ranges for four inputs and
+// TensorRT OSS enabled.
+std::shared_ptr<paddle_infer::Predictor> InitPredictor() {
+  paddle_infer::Config config;
+  config.SetModel(FLAGS_infer_model);
+  config.EnableUseGpu(100, 0);
+  // Open the memory optim.
+  config.EnableMemoryOptim();
+  // Batch and sequence-length limits used to derive the shape ranges below.
+  const int max_batch = 32;
+  const int max_single_seq_len = 128;
+  const int opt_single_seq_len = 64;
+  const int min_batch_seq_len = 1;
+  const int max_batch_seq_len = 512;
+  const int opt_batch_seq_len = 256;
+  const std::string name0 = "read_file_0.tmp_0";
+  const std::string name1 = "read_file_0.tmp_1";
+  const std::string name2 = "read_file_0.tmp_2";
+  const std::string name3 = "read_file_0.tmp_4";
+  // Set the input's min, max, opt shape
+  std::map<std::string, std::vector<int>> min_input_shape = {
+      {name0, {min_batch_seq_len}},
+      {name1, {min_batch_seq_len}},
+      {name2, {1}},
+      {name3, {1, 1, 1}}};
+  std::map<std::string, std::vector<int>> max_input_shape = {
+      {name0, {max_batch_seq_len}},
+      {name1, {max_batch_seq_len}},
+      {name2, {max_batch + 1}},
+      {name3, {1, max_single_seq_len, 1}}};
+  std::map<std::string, std::vector<int>> opt_input_shape = {
+      {name0, {opt_batch_seq_len}},
+      {name1, {opt_batch_seq_len}},
+      {name2, {max_batch + 1}},
+      {name3, {1, opt_single_seq_len, 1}}};
+  // only kHalf supported
+  config.EnableTensorRtEngine(
+      1 << 30, 1, 5, paddle_infer::Config::Precision::kHalf, false, false);
+  // ernie varlen must be used with dynamic shape
+  config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
+                                opt_input_shape);
+  // ernie varlen must be used with oss
+  config.EnableTensorRtOSS();
+  return paddle_infer::CreatePredictor(config);
+}
+// Feeds one fixed batch of two sentences (71 values in total) to `predictor`,
+// runs it, and copies the first output tensor into `out_data`.
+void run(paddle_infer::Predictor* predictor, std::vector<float>* out_data) {
+  const int batch_size = 2;
+  const int total_len = 71;
+  const int max_single_len = 128;
+  int32_t first_input[total_len] = {
+      // sentence 1
+      1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321, 4095, 1902, 4,
+      134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2, 75, 201, 340, 9, 14, 44,
+      486, 218, 1140, 279, 12043, 2,
+      // sentence 2
+      101, 2054, 2234, 2046, 2486, 2044, 1996, 2047, 4552, 2001, 9536, 1029,
+      102, 2004, 1997, 2008, 2154, 1010, 1996, 2047, 4552, 9536, 2075, 1996,
+      2117, 3072, 2234, 2046, 2486, 1012, 102,
+  };
+  int32_t second_input[total_len] = {
+      // sentence 1
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      // sentence 2
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1};
+  // shape info of this batch
+  int32_t batch_shape_info[3] = {0, 40, 71};
+  // max_single_len represents the max sentence length of all the sentences;
+  // only the length of this input is useful, its data means nothing.
+  int32_t max_len_placeholder[max_single_len] = {0};
+  auto in_names = predictor->GetInputNames();
+  // first input
+  auto in0 = predictor->GetInputHandle(in_names[0]);
+  in0->Reshape({total_len});
+  in0->CopyFromCpu(first_input);
+  // second input
+  auto in1 = predictor->GetInputHandle(in_names[1]);
+  in1->Reshape({total_len});
+  in1->CopyFromCpu(second_input);
+  // third input
+  auto in2 = predictor->GetInputHandle(in_names[2]);
+  in2->Reshape({batch_size + 1});
+  in2->CopyFromCpu(batch_shape_info);
+  // fourth input
+  auto in3 = predictor->GetInputHandle(in_names[3]);
+  in3->Reshape({1, max_single_len, 1});
+  in3->CopyFromCpu(max_len_placeholder);
+  CHECK(predictor->Run());
+  auto out_names = predictor->GetOutputNames();
+  auto out_handle = predictor->GetOutputHandle(out_names[0]);
+  std::vector<int> out_shape = out_handle->shape();
+  // Element count is the product of all output dimensions.
+  int out_num = 1;
+  for (int dim : out_shape) {
+    out_num *= dim;
+  }
+  out_data->resize(out_num);
+  out_handle->CopyToCpu(out_data->data());
+}
+// Runs the ernie var-len predictor once and compares its output with the
+// reference values. The body is compiled only when IS_TRT_VERSION_GE(7234)
+// holds.
+TEST(AnalysisPredictor, ernie_varlen) {
+#if IS_TRT_VERSION_GE(7234)
+  auto predictor = InitPredictor();
+  std::vector<float> out_data;
+  run(predictor.get(), &out_data);
+  std::vector<float> ref_data{0.59814,  0.219882, 0.181978,
+                              0.359796, 0.577414, 0.0627908};
+  // Guard the element-wise comparison: without it an empty output would pass
+  // silently and a longer one would index past the end of ref_data.
+  ASSERT_EQ(ref_data.size(), out_data.size());
+  float near_tolerance = 1e-3;
+  for (size_t i = 0; i < out_data.size(); i++) {
+    EXPECT_NEAR(ref_data[i], out_data[i], near_tolerance);
+  }
+#endif
+}
 }  // namespace inference
 }  // namespace paddle