[inference][trt] Add TRT sparse weights switch (#53562)

4a69a536 · Zhang Jun · GitHub · 04e5e7b7 · 4a69a536 · 4a69a536
11 changed files
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -250,6 +250,9 @@ struct Argument {
                      TensorRtAllowBuildAtRuntime,
                      bool);
  DECL_ARGUMENT_FIELD(tensorrt_use_inspector, TensorRtUseInspector, bool);
+  DECL_ARGUMENT_FIELD(tensorrt_use_sparse_weights,
+                      TensorRtUseSparseWeights,
+                      bool);

  DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
  DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);

--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -213,6 +213,8 @@ void IRPassManager::CreatePasses(Argument *argument,
      pass->Set("use_static_engine", new bool(use_static_engine));
      pass->Set("model_from_memory", new bool(argument->model_from_memory()));
      pass->Set("use_inspector", new bool(argument->tensorrt_use_inspector()));
+      pass->Set("use_sparse_weights",
+                new bool(argument->tensorrt_use_sparse_weights()));

      // tuned trt dynamic_shape
      pass->Set("trt_shape_range_info_path",

--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -523,6 +523,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
  op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime);
  op_desc->SetAttr("shape_range_info_path", shape_range_info_path);
  op_desc->SetAttr("use_inspector", Get<bool>("use_inspector"));
+  op_desc->SetAttr("use_sparse_weights", Get<bool>("use_sparse_weights"));
  op_desc->SetAttr("model_precision", Get<int>("model_precision"));
  op_desc->SetAttr("with_dynamic_shape", with_dynamic_shape);

@@ -614,17 +615,14 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
    opt_input_shape = {};
  }

-  auto to_major_version = [&](int full_version) -> float {
-    return (full_version / 100) / 10.0;
-  };
-  const float compile_time_trt_version = to_major_version(TRT_VERSION);
-  const float run_time_trt_version =
-      to_major_version(tensorrt::GetInferLibVersion());
-  if (compile_time_trt_version != run_time_trt_version) {
+  const float trt_compile_version = tensorrt::TrtMajorVersion(TRT_VERSION);
+  const float trt_runtime_version =
+      tensorrt::TrtMajorVersion(tensorrt::GetInferLibVersion());
+  if (trt_compile_version != trt_runtime_version) {
    LOG_FIRST_N(WARNING, 1)
        << "The Paddle Inference library is compiled with "
-        << compile_time_trt_version << " version TensorRT, "
-        << "but the runtime TensorRT you are using is " << run_time_trt_version
+        << trt_compile_version << " version TensorRT, "
+        << "but the runtime TensorRT you are using is " << trt_runtime_version
        << " version. "
           "This might cause serious compatibility issues. We strongly "
           "recommend using the same TRT version at runtime.";
@@ -666,6 +664,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
  trt_engine->SetUseDLA(Get<bool>("trt_use_dla"));
  trt_engine->SetDLACore(Get<int>("trt_dla_core"));
  trt_engine->SetUseInspector(Get<bool>("use_inspector"));
+  trt_engine->SetUseSparseWeights(Get<bool>("use_sparse_weights"));
  trt_engine->SetWithErnie(
      graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
      graph->Has(framework::ir::kMultiheadMatmulPass));

--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -451,6 +451,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
  CP_MEMBER(collect_shape_range_info_);
  CP_MEMBER(shape_range_info_path_);
  CP_MEMBER(trt_use_inspector_);
+  CP_MEMBER(trt_use_sparse_weights_);
  CP_MEMBER(trt_engine_memory_sharing_);
  CP_MEMBER(trt_engine_memory_sharing_identifier_);
  // Dlnne related
@@ -805,6 +806,10 @@ void AnalysisConfig::EnableTensorRtDLA(int dla_core) {

 void AnalysisConfig::EnableTensorRtInspector() { trt_use_inspector_ = true; }

+void AnalysisConfig::EnableTensorRtSparseWeights() {
+  trt_use_sparse_weights_ = true;
+}
+
 void AnalysisConfig::Exp_DisableTensorRtOPs(
    const std::vector<std::string> &ops) {
  trt_disabled_ops_.insert(trt_disabled_ops_.end(), ops.begin(), ops.end());

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1358,6 +1358,7 @@ void AnalysisPredictor::PrepareArgument() {
    argument_->SetTensorRtAllowBuildAtRuntime(
        config_.trt_allow_build_at_runtime());
    argument_->SetTensorRtUseInspector(config_.trt_use_inspector_);
+    argument_->SetTensorRtUseSparseWeights(config_.trt_use_sparse_weights_);
    argument_->SetTrtEngineMemorySharing(config_.trt_engine_memory_sharing());
  }


--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -742,6 +742,9 @@ struct PD_INFER_DECL AnalysisConfig {
  void EnableTensorRtInspector();
  bool tensorrt_inspector_enabled() { return trt_use_inspector_; }

+  void EnableTensorRtSparseWeights();
+  bool tensorrt_sparse_weights_enabled() { return trt_use_sparse_weights_; }
+
  void EnableDlnne(
      int min_subgraph_size = 3,
      int max_batch_size = 1,
@@ -1118,6 +1121,7 @@ struct PD_INFER_DECL AnalysisConfig {
  // tune to get dynamic_shape info.
  bool trt_tuned_dynamic_shape_{false};
  bool trt_use_inspector_{false};
+  bool trt_use_sparse_weights_{false};

  // In CollectShapeInfo mode, we will collect the shape information of
  // all intermediate tensors in the compute graph and calculate the

--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -207,12 +207,6 @@ void TensorRTEngine::FreezeNetwork() {
  infer_builder_config_->setMaxWorkspaceSize(max_workspace_);
 #endif

-#if IS_TRT_VERSION_GE(8500)
-  infer_builder_config_->setPreviewFeature(
-      nvinfer1::PreviewFeature::kFASTER_DYNAMIC_SHAPES_0805, true);
-#else
-#endif
-
  bool enable_fp16 = (precision_ == AnalysisConfig::Precision::kHalf);
  if (enable_fp16) {
    bool support_fp16 = infer_builder_->platformHasFastFp16();
@@ -363,6 +357,7 @@ void TensorRTEngine::FreezeNetwork() {
                   "opt_shape, false /*disable_trt_plugin_fp16*/)'";
    }
  }
+
 #if IS_TRT_VERSION_GE(8200)
  if (use_inspector_) {
    infer_builder_config_->setProfilingVerbosity(
@@ -374,7 +369,9 @@ void TensorRTEngine::FreezeNetwork() {
  infer_engine_.reset(infer_builder_->buildEngineWithConfig(
      *network(), *infer_builder_config_));
 #else
+  if (use_sparse_weights_) {
    infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS);
+  }
  ihost_memory_.reset(infer_builder_->buildSerializedNetwork(
      *network(), *infer_builder_config_));
  infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -739,6 +739,9 @@ class TensorRTEngine {
  void GetEngineInfo();

  void SetUseInspector(bool use_inspector) { use_inspector_ = use_inspector; }
+  void SetUseSparseWeights(bool use_sparse_weights) {
+    use_sparse_weights_ = use_sparse_weights;
+  }
  void SetScope(const framework::Scope& scope) { scope_ = &scope; }

  void SetContextMemorySharing(bool context_memory_sharing) {
@@ -827,6 +830,7 @@ class TensorRTEngine {
 #endif
  std::mutex mutex_;
  bool use_inspector_;
+  bool use_sparse_weights_{false};

 public:
  thread_local static int predictor_id_per_thread;

--- a/paddle/fluid/inference/tensorrt/helper.h
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -96,6 +96,10 @@ static std::tuple<int, int, int> GetTrtCompileVersion() {
      NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH};
 }

+static float TrtMajorVersion(int full_version) {
+  return (full_version / 100) / 10.0;
+}
+
 template <typename T>
 struct Destroyer {
  void operator()(T* x) {

--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -879,6 +879,10 @@ void BindAnalysisConfig(py::module *m) {
           &AnalysisConfig::EnableTensorRtInspector)
      .def("tensorrt_inspector_enabled",
           &AnalysisConfig::tensorrt_inspector_enabled)
+      .def("enable_tensorrt_sparse_weights",
+           &AnalysisConfig::EnableTensorRtSparseWeights)
+      .def("tensorrt_sparse_weights_enabled",
+           &AnalysisConfig::tensorrt_sparse_weights_enabled)
      .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
      .def("enable_dlnne",
           &AnalysisConfig::EnableDlnne,

--- a/test/ir/inference/test_trt_inference_predictor.py
+++ b/test/ir/inference/test_trt_inference_predictor.py
@@ -84,6 +84,8 @@ class BackendPaddle:
        # enable memory optim
        if not self.args.enable_tune:
            config.enable_memory_optim()
+        if self.args.enable_trt_sparse_weights:
+            config.enable_tensorrt_sparse_weights()

        config.set_cpu_math_library_num_threads(self.args.cpu_threads)
        config.switch_ir_optim(True)
@@ -258,6 +260,9 @@ def parse_args():
    parser.add_argument('--enable_dynamic_shape', type=str2bool, default=True)
    parser.add_argument('--enable_tune', type=str2bool, default=False)
    parser.add_argument('--enable_profile', type=str2bool, default=False)
+    parser.add_argument(
+        '--enable_trt_sparse_weights', type=str2bool, default=False
+    )
    parser.add_argument('--enable_benchmark', type=str2bool, default=True)
    parser.add_argument('--save_result', type=str2bool, default=False)
    parser.add_argument('--return_result', type=str2bool, default=False)
@@ -308,6 +313,13 @@ def run_infer(model_path):
    backend.load(conf)
    backend.predict()

+    # Run the predictor again with TensorRT sparse weights enabled.
+    conf.enable_tune = False
+    conf.enable_trt_sparse_weights = True
+    backend = BackendPaddle()
+    backend.load(conf)
+    backend.predict()
+

 class ConvBNLayer(paddle.nn.Layer):
    def __init__(