[Paddle-Inference] add Paddle Trt config: with_interleaved (#38884)

* add Paddle Trt config: with_interleaved

[Paddle-Inference] add Paddle Trt config: with_interleaved (#38884)
* add Paddle Trt config: with_interleaved
dccdc719 · Wangzheee · GitHub · 7f123456 · dccdc719 · dccdc719
15 changed files
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -212,6 +212,7 @@ struct Argument {
                      bool);
  DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool);
  DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool);
+  DECL_ARGUMENT_FIELD(tensorrt_with_interleaved, TensorRtWithInterleaved, bool);
  DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path,
                      TensorRtShapeRangeInfoPath, std::string);
  DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, TensorRtTunedDynamicShape,

--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -108,6 +108,8 @@ void IRPassManager::CreatePasses(Argument *argument,
      pass->Set("enable_int8", new bool(enable_int8));
      pass->Set("use_calib_mode", new bool(use_calib_mode));
      pass->Set("use_oss", new bool(argument->tensorrt_use_oss()));
+      pass->Set("with_interleaved",
+                new bool(argument->tensorrt_with_interleaved()));
      pass->Set("precision_mode",
                new AnalysisConfig::Precision(precision_mode));


--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -369,6 +369,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
                  Get<int>("gpu_device_id"), min_input_shape, max_input_shape,
                  opt_input_shape, disable_trt_plugin_fp16);
  trt_engine->SetUseOSS(Get<bool>("use_oss"));
+  trt_engine->SetWithInterleaved(Get<bool>("with_interleaved"));
  trt_engine->SetUseDLA(Get<bool>("trt_use_dla"));
  trt_engine->SetDLACore(Get<int>("trt_dla_core"));


--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -189,6 +189,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
  CP_MEMBER(trt_use_static_engine_);
  CP_MEMBER(trt_use_calib_mode_);
  CP_MEMBER(trt_use_oss_);
+  CP_MEMBER(trt_with_interleaved_);
  CP_MEMBER(trt_tuned_dynamic_shape_);
  CP_MEMBER(trt_allow_build_at_runtime_);
  CP_MEMBER(collect_shape_range_info_);
@@ -864,6 +865,8 @@ std::string AnalysisConfig::Summary() {
                                                        : "false"});

      os.InsertRow({"tensorrt_use_oss", trt_use_oss_ ? "true" : "false"});
+      os.InsertRow({"tensorrt_with_interleaved",
+                    trt_with_interleaved_ ? "true" : "false"});
      os.InsertRow({"tensorrt_use_dla", trt_use_dla_ ? "true" : "false"});
      if (trt_use_dla_) {
        os.InsertRow({"tensorrt_dla_core", std::to_string(trt_dla_core_)});

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -605,6 +605,7 @@ void AnalysisPredictor::PrepareArgument() {
    argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
    argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_);
    argument_.SetTensorRtUseOSS(config_.trt_use_oss_);
+    argument_.SetTensorRtWithInterleaved(config_.trt_with_interleaved_);
    argument_.SetMinInputShape(config_.min_input_shape_);
    argument_.SetMaxInputShape(config_.max_input_shape_);
    argument_.SetOptimInputShape(config_.optim_input_shape_);
@@ -1603,5 +1604,11 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p,
 #endif
  return false;
 }
+void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c,
+                                            bool with_interleaved) {
+#ifdef PADDLE_WITH_CUDA
+  c->trt_with_interleaved_ = with_interleaved;
+#endif
+}
 }  // namespace experimental
 }  // namespace paddle_infer
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -796,6 +796,7 @@ struct PD_INFER_DECL AnalysisConfig {
  bool trt_use_static_engine_{false};
  bool trt_use_calib_mode_{true};
  bool trt_use_oss_{false};
+  bool trt_with_interleaved_{false};
  bool trt_use_dla_{false};
  int trt_dla_core_{0};
  std::map<std::string, std::vector<int>> min_input_shape_{};
@@ -883,6 +884,7 @@ struct PD_INFER_DECL AnalysisConfig {
  // So we release the memory when the predictor is set up.
  mutable bool is_valid_{true};
  std::string opt_cache_dir_;
+  friend class paddle_infer::experimental::InternalUtils;
 };

 }  // namespace paddle
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -405,3 +405,24 @@ PD_INFER_DECL std::shared_ptr<framework::Cipher> MakeCipher(
    const std::string& config_file);

 }  // namespace paddle
+
+// forward declaration
+using cudaStream_t = struct CUstream_st*;
+using hipStream_t = struct ihipStream_t*;
+
+namespace paddle_infer {
+class Predictor;
+using Config = paddle::AnalysisConfig;
+namespace experimental {
+class PD_INFER_DECL InternalUtils {
+ public:
+  // Note: Can only be used under thread_local semantics.
+  static bool RunWithExternalStream(paddle_infer::Predictor* pred,
+                                    cudaStream_t stream);
+  static bool RunWithExternalStream(paddle_infer::Predictor* pred,
+                                    hipStream_t stream);
+  static void UpdateConfigInterleaved(paddle_infer::Config* c,
+                                      bool with_interleaved);
+};
+}  // namespace experimental
+}  // namespace paddle_infer
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -41,27 +41,11 @@ limitations under the License. */
 /// \since 2.0.0-beta
 ///

-// forward declation
-using cudaStream_t = struct CUstream_st*;
-using hipStream_t = struct ihipStream_t*;
-
 namespace paddle_infer {

 using PrecisionType = paddle::AnalysisConfig::Precision;
 using Config = paddle::AnalysisConfig;

-class Predictor;
-namespace experimental {
-class PD_INFER_DECL InternalUtils {
- public:
-  // Note: Can only be used under thread_local semantics.
-  static bool RunWithExternalStream(paddle_infer::Predictor* pred,
-                                    cudaStream_t stream);
-  static bool RunWithExternalStream(paddle_infer::Predictor* pred,
-                                    hipStream_t stream);
-};
-}  // namespace experimental
-
 ///
 /// \class Predictor
 ///

--- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
@@ -45,7 +45,7 @@ class BatchNormOpConverter : public OpConverter {
    auto* Scale_v = scope.FindVar(op_desc.Input("Scale").front());
    auto* Variance_v = scope.FindVar(op_desc.Input("Variance").front());
    const float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon"));
-
+    auto output_name = op_desc.Output("Y").front();
    PADDLE_ENFORCE_NOT_NULL(
        Bias_v,
        platform::errors::NotFound(
@@ -145,6 +145,10 @@ class BatchNormOpConverter : public OpConverter {
      expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X);
      expand_layer->setReshapeDimensions(expand_shape);
      X = expand_layer->getOutput(0);
+      expand_layer->getOutput(0)->setName(
+          ("reshape_before_batchnorm_out: " + output_name).c_str());
+      expand_layer->setName(
+          ("BN_Shuffle: (Output: " + output_name + ")").c_str());
    }

    layer = TRT_ENGINE_ADD_LAYER(engine_, ScaleNd, *X,
@@ -152,12 +156,13 @@ class BatchNormOpConverter : public OpConverter {
                                 shift_weights.get(), scale_weights.get(),
                                 power_weights.get(), dynamic_shape_offset);

-    auto output_name = op_desc.Output("Y").front();
    engine_->SetWeights(op_desc.Input("Bias").front(),
                        std::move(combile_bias_tensor));
    engine_->SetWeights(op_desc.Input("Scale").front(),
                        std::move(combile_scale_tensor));
    if (x_dim.nbDims < 3 + dynamic_shape_offset) {
+      layer->getOutput(0)->setName("batch_norm_out");
+      layer->setName(("BN: ScaleNd: (Output: " + output_name + ")").c_str());
      nvinfer1::Dims squeeze_shape;
      squeeze_shape.nbDims = x_dim.nbDims;
      for (int i = 0; i < squeeze_shape.nbDims; i++) {
@@ -166,11 +171,13 @@ class BatchNormOpConverter : public OpConverter {
      squeeze_layer =
          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0)));
      squeeze_layer->setReshapeDimensions(squeeze_shape);
-      layer = static_cast<nvinfer1::ILayer*>(squeeze_layer);
-    }
+      RreplenishLayerAndOutput(squeeze_layer, "batchnorm_add_scale",
+                               {output_name}, test_mode);
+    } else {
      RreplenishLayerAndOutput(layer, "batchnorm_add_scale", {output_name},
                               test_mode);
    }
+  }
 };

 }  // namespace tensorrt

--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -50,6 +50,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
                                        op_desc.Input("Y").front().c_str()));
    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
    float* weight_data = nullptr;
+    auto output_name = op_desc.Output("Out")[0];
    weight_data =
        engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t, false);
    nvinfer1::Dims dims_x = X->getDimensions();
@@ -80,6 +81,10 @@ class ElementwiseWeightOpConverter : public OpConverter {
        expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X);
        expand_layer->setReshapeDimensions(expand_shape);
        X = expand_layer->getOutput(0);
+        expand_layer->getOutput(0)->setName(
+            ("elementwise_reshape_out: " + output_name).c_str());
+        expand_layer->setName(
+            ("Elewise: Shuffle: (Output: " + output_name + ")").c_str());
      }
      if (op_type_ == "add") {
        nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
@@ -101,11 +106,12 @@ class ElementwiseWeightOpConverter : public OpConverter {
        squeeze_layer =
            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0)));
        squeeze_layer->setReshapeDimensions(squeeze_shape);
-        layer = static_cast<nvinfer1::ILayer*>(squeeze_layer);
+        RreplenishLayerAndOutput(squeeze_layer, "elementwise_" + op_type_,
+                                 {output_name}, test_mode);
+      } else {
+        RreplenishLayerAndOutput(layer, "elementwise_" + op_type_,
+                                 {output_name}, test_mode);
      }
-      auto output_name = op_desc.Output("Out")[0];
-      RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name},
-                               test_mode);
      if (op_desc.HasAttr("enable_int8")) {
 #if IS_TRT_VERSION_GE(5000)
        CHECK(op_desc.HasAttr("X_scale"));

--- a/paddle/fluid/inference/tensorrt/convert/gather_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/gather_op.cc
@@ -56,6 +56,8 @@ class GatherOpConverter : public OpConverter {
    index_shape.d[0] = -1;

    reshape_layer->setReshapeDimensions(index_shape);
+    reshape_layer->setName(
+        ("Gather: Shuffle: (Output: " + output_name + ")").c_str());

    auto layer = TRT_ENGINE_ADD_LAYER(engine_, Gather, *input_tensor,
                                      *reshape_layer->getOutput(0), axis);

--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -144,8 +144,9 @@ class OpConverter {
    it->SetEngine(engine);
    (*it)(op, scope, test_mode);

-    bool has_out_scale = op_desc.HasAttr("out_threshold");
-    if (has_out_scale) {
+    size_t output_num = op_desc.OutputNames().size();
+    if (output_num == 1) {  // The number of outputs is 1
+      if (op_desc.HasAttr("out_threshold")) {
        float out_scale =
            BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
        std::string output_name = "";
@@ -167,6 +168,21 @@ class OpConverter {
        VLOG(1) << "Set out scale = " << out_scale << " for tensor "
                << output_name << ".";
      }
+    } else if (output_num > 1) {  // The number of outputs is greater than 1
+      for (size_t i = 0; i < output_num; ++i) {
+        if (op_desc.HasAttr("out_" + std::to_string(i) + "_threshold")) {
+          float out_scale = BOOST_GET_CONST(
+              float,
+              op_desc.GetAttr("out_" + std::to_string(i) + "_threshold"));
+          std::string output_name =
+              op_desc.Output(op_desc.OutputNames()[i]).front();
+          auto* output_itensor = engine->GetITensor(output_name);
+          engine->SetTensorDynamicRange(output_itensor, out_scale);
+          VLOG(1) << "Set out scale = " << out_scale << " for tensor "
+                  << output_name << ".";
+        }
+      }
+    }
  }

  // Convert a fluid block to tensorrt network, NOTE it just convert operators,

--- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc
@@ -89,21 +89,34 @@ class ScaleOpConverter : public OpConverter {
      expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
      expand_layer->setReshapeDimensions(expand_shape);
      input = expand_layer->getOutput(0);
+      expand_layer->getOutput(0)->setName(
+          ("before_reshape_out: " + out_name).c_str());
+      expand_layer->setName(
+          ("Scale: before_reshape (Output: " + out_name + ")").c_str());
    }

    if (bias_after_scale) {
      layer = TRT_ENGINE_ADD_LAYER(
          engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM,
          shift_weights.get(), scale_weights.get(), power_weights.get());
+      layer->getOutput(0)->setName(
+          ("bias_after_scale_out: " + out_name).c_str());
+      layer->setName(("Scale: scale (Output: " + out_name + ")").c_str());
    } else {
      // add bias
      layer = TRT_ENGINE_ADD_LAYER(
          engine_, Scale, *(input), nvinfer1::ScaleMode::kUNIFORM,
          shift_weights.get(), power_weights.get(), power_weights.get());
+      layer->getOutput(0)->setName(
+          ("bias_before_scale：bias_out: " + out_name).c_str());
+      layer->setName(("Scale: scale_bias (Output: " + out_name + ")").c_str());
      // mul scale
      layer = TRT_ENGINE_ADD_LAYER(
          engine_, Scale, *(layer->getOutput(0)), nvinfer1::ScaleMode::kUNIFORM,
          power_weights.get(), scale_weights.get(), power_weights.get());
+      layer->getOutput(0)->setName(
+          ("bias_before_scale：scale_out: " + out_name).c_str());
+      layer->setName(("Scale: scale_scale (Output: " + out_name + ")").c_str());
    }

    PADDLE_ENFORCE_EQ(layer != nullptr, true,
@@ -119,6 +132,9 @@ class ScaleOpConverter : public OpConverter {
          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0)));
      squeeze_layer->setReshapeDimensions(squeeze_shape);
      layer = static_cast<nvinfer1::ILayer*>(squeeze_layer);
+      layer->getOutput(0)->setName(("after_reshape_out: " + out_name).c_str());
+      layer->setName(
+          ("Scale: Shuffle_reshape (Output: " + out_name + ")").c_str());
    }
    RreplenishLayerAndOutput(layer, "scale", {out_name}, test_mode);
  }

--- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc
@@ -30,10 +30,11 @@ class SliceOpConverter : public OpConverter {
    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs
    auto* input = engine_->GetITensor(op_desc.Input("Input")[0]);
+    auto output_name = op_desc.Output("Out")[0];

+    float out_scale = 1;
    if (op_desc.HasAttr("out_threshold")) {
-      float out_scale =
-          BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+      out_scale = BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
      engine_->SetTensorDynamicRange(input, out_scale);
    }

@@ -71,12 +72,22 @@ class SliceOpConverter : public OpConverter {

    nvinfer1::ILayer* layer = nullptr;
    if (engine_->with_dynamic_shape()) {
-#if IS_TRT_VERSION_GE(6000)
      if (engine_->use_oss() && engine_->with_ernie()) {
        std::vector<nvinfer1::ITensor*> plugin_inputs;
-        // plugin_inputs.emplace_back(trans_layer->getOutput(0));
+        if (engine_->with_interleaved()) {
+          auto* shuffler_slice = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+          nvinfer1::Permutation transpose_embed{2, 1, 0, 3};
+          shuffler_slice->setSecondTranspose(transpose_embed);
+          engine_->SetTensorDynamicRange(shuffler_slice->getOutput(0),
+                                         out_scale);
+          shuffler_slice->setName(
+              ("SpecialSlice_interleaved: Shuffle: (Output: " + output_name +
+               ")")
+                  .c_str());
+          plugin_inputs.emplace_back(shuffler_slice->getOutput(0));
+        } else {
          plugin_inputs.emplace_back(input);
-
+        }
        std::string pos_name;
        if (engine_->Has("ernie_pos_name")) {
          pos_name = engine_->Get<std::string>("ernie_pos_name");
@@ -99,11 +110,6 @@ class SliceOpConverter : public OpConverter {
            new plugin::SlicePluginDynamic(starts, ends, axes, with_fp16);
        layer = engine_->AddDynamicPlugin(&input, 1, plugin);
      }
-#else
-      PADDLE_THROW(platform::errors::Fatal(
-          "You are running the TRT Dynamic Shape mode, need to confirm that "
-          "your TRT version is no less than 6.0"));
-#endif
    } else {
      bool with_fp16 =
          engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
@@ -111,8 +117,6 @@ class SliceOpConverter : public OpConverter {
          new plugin::SlicePlugin(starts, ends, axes, with_fp16);
      layer = engine_->AddPlugin(&input, 1, plugin);
    }
-
-    auto output_name = op_desc.Output("Out")[0];
    RreplenishLayerAndOutput(layer, "slice", {output_name}, test_mode);
  }
 };

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -407,6 +407,9 @@ class TensorRTEngine {
  void SetUseDLA(bool use_dla) { use_dla_ = use_dla; }
  void SetDLACore(int dla_core) { dla_core_ = dla_core; }
  void SetWithErnie(bool with_ernie) { with_ernie_ = with_ernie; }
+  void SetWithInterleaved(bool with_interleaved) {
+    with_interleaved_ = with_interleaved;
+  }

  void ClearWeights() {
    for (auto& weight_pair : weight_map) {
@@ -480,6 +483,7 @@ class TensorRTEngine {

  bool use_oss() { return use_oss_; }
  bool with_ernie() { return with_ernie_; }
+  bool with_interleaved() { return with_interleaved_; }
  bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; }
  bool with_dynamic_shape() { return with_dynamic_shape_; }
  AnalysisConfig::Precision precision() { return precision_; }
@@ -612,6 +616,7 @@ class TensorRTEngine {
  bool use_dla_{false};
  int dla_core_{0};
  bool with_ernie_{false};
+  bool with_interleaved_{false};
  nvinfer1::ILogger& logger_;

  // max data size for the buffers.