Unverified · Commit bdce552b authored by Zhang Jun, committed by GitHub

update trt workspace size param (#44469)

* update trt workspace size param

* update

* update

* update

* use int64_t

* use int64_t

* update

* update
Parent 54d98963
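The motivation is plain integer arithmetic: workspace sizes are byte counts, and a 32-bit `int` tops out just above 2 GiB, so a workspace of 2 GiB or more cannot pass through an `int` parameter without truncation. A minimal standalone sketch of the failure mode this change avoids (not part of the commit itself):

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // 1 << 20 bytes = 1 MiB (the old default); 1 << 30 bytes = 1 GiB (the new one).
  std::cout << (1 << 20) << " " << (1 << 30) << "\n";  // 1048576 1073741824

  // A 4 GiB workspace does not fit in a 32-bit int: shifting 1 << 32 as an
  // int is undefined behavior, so the value must be built as a 64-bit type.
  int64_t four_gib = int64_t{1} << 32;
  std::cout << four_gib << "\n";                         // 4294967296
  std::cout << static_cast<int32_t>(four_gib) << "\n";   // wraps to 0 on
                                                         // two's-complement targets
  return 0;
}
```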
@@ -216,7 +216,7 @@ struct Argument {
   DECL_ARGUMENT_FIELD(tensorrt_use_dla, TensorRtUseDLA, bool);
   DECL_ARGUMENT_FIELD(tensorrt_dla_core, TensorRtDLACore, int);
   DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
-  DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
+  DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int64_t);
   DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
   DECL_ARGUMENT_FIELD(tensorrt_disabled_ops,
                       TensorRtDisabledOPs,
......
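For readers unfamiliar with `argument.h`: `DECL_ARGUMENT_FIELD` is a declaration macro, so retyping its third argument retypes the stored field and its accessors in one place. A hypothetical simplification of how such a macro might expand (the real Paddle macro also tracks whether the field has been set and differs in detail):

```cpp
#include <cstdint>

// Hypothetical sketch of a DECL_ARGUMENT_FIELD-style macro, not Paddle's.
#define DECL_ARGUMENT_FIELD(field__, Field__, type__)     \
 public:                                                  \
  type__& field__() { return field__##_; }                \
  void Set##Field__(const type__& x) { field__##_ = x; }  \
                                                          \
 private:                                                 \
  type__ field__##_;

struct Argument {
  // Retyping the third macro argument retypes the member and both accessors.
  DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int64_t);
};

int main() {
  Argument arg;
  arg.SetTensorRtWorkspaceSize(int64_t{1} << 32);  // 4 GiB now fits
  return arg.tensorrt_workspace_size() > 0 ? 0 : 1;
}
```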
@@ -133,7 +133,8 @@ void IRPassManager::CreatePasses(Argument *argument,
                                  argument->bfloat16_enabled_op_types()));
 #endif
     } else if (pass_name == "tensorrt_subgraph_pass") {
-      pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
+      pass->Set("workspace_size",
+                new int64_t(argument->tensorrt_workspace_size()));
       pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
       pass->Set("min_subgraph_size",
                 new int(argument->tensorrt_min_subgraph_size()));
......
@@ -378,7 +379,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   op_desc->SetBlockAttr("sub_block", new_block);
   op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString());
   op_desc->SetAttr("max_batch_size", max_batch_size);
-  op_desc->SetAttr("workspace_size", Get<int>("workspace_size"));
+  op_desc->SetAttr("workspace_size", Get<int64_t>("workspace_size"));
   op_desc->SetAttr("gpu_id", Get<int>("gpu_device_id"));
   op_desc->SetAttr("output_name_mapping", output_mapping);
   op_desc->SetAttr("origin_output_dims", renamed_output_dims);
@@ -499,7 +500,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
       inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
           .Create(engine_key + std::to_string(predictor_id),
                   max_batch_size,
-                  Get<int>("workspace_size"),
+                  Get<int64_t>("workspace_size"),
                   precision_mode,
                   calibrator.get(),
                   Get<int>("gpu_device_id"),
......
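Note that the `new int64_t(...)` in `CreatePasses` and the `Get<int64_t>(...)` in the subgraph pass have to change together: pass attributes are stored type-erased, and reading one back with a different `T` than it was set with fails at runtime rather than being converted. A freestanding sketch of that contract (illustrative only, not Paddle's actual `Pass` attribute code):

```cpp
#include <any>
#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <string>

// Illustrative only: attributes are stored type-erased, so Get<T> must
// name exactly the type that was given to Set.
class PassAttrs {
 public:
  template <typename T>
  void Set(const std::string& name, T* value) {
    attrs_[name] = std::shared_ptr<T>(value);
  }

  template <typename T>
  T Get(const std::string& name) const {
    // std::any_cast throws std::bad_any_cast on a type mismatch, e.g. if
    // the value was Set as int64_t* but read back with Get<int>.
    return *std::any_cast<std::shared_ptr<T>>(attrs_.at(name));
  }

 private:
  std::map<std::string, std::any> attrs_;
};

int main() {
  PassAttrs pass;
  pass.Set("workspace_size", new int64_t(int64_t{1} << 32));  // 4 GiB
  std::cout << pass.Get<int64_t>("workspace_size") << "\n";   // OK: 4294967296
  // pass.Get<int>("workspace_size");  // would throw std::bad_any_cast
  return 0;
}
```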
@@ -517,7 +517,7 @@ MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const {
 }

 void AnalysisConfig::EnableTensorRtEngine(
-    int workspace_size,
+    int64_t workspace_size,
     int max_batch_size,
     int min_subgraph_size,
     AnalysisConfig::Precision precision_mode,
......
@@ -523,7 +523,7 @@ struct PD_INFER_DECL AnalysisConfig {
   /// quantization).
   ///
   ///
-  void EnableTensorRtEngine(int workspace_size = 1 << 20,
+  void EnableTensorRtEngine(int64_t workspace_size = 1 << 30,
                             int max_batch_size = 1,
                             int min_subgraph_size = 3,
                             Precision precision = Precision::kFloat32,
@@ -967,7 +967,7 @@ struct PD_INFER_DECL AnalysisConfig {
   bool use_tensorrt_{false};
   // For workspace_size, refer it from here:
   // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
-  int tensorrt_workspace_size_{1 << 30};
+  int64_t tensorrt_workspace_size_{1 << 30};
   // While TensorRT allows an engine optimized for a given max batch size
   // to run at any smaller size, the performance for those smaller
   // sizes may not be as well-optimized. Therefore, Max batch is best
......
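Two things change at this API surface: the parameter type, and the default, which moves from `1 << 20` bytes (1 MiB) to `1 << 30` bytes (1 GiB), matching the `tensorrt_workspace_size_{1 << 30}` member initializer that already existed. A hedged usage sketch of the widened parameter (the include path, model directory, and GPU settings below are placeholders, not taken from the diff):

```cpp
#include "paddle_inference_api.h"  // header name/path may vary by install

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("model_dir");  // placeholder model directory
  config.EnableUseGpu(/*memory_pool_init_size_mb=*/100, /*device_id=*/0);

  // A 4 GiB workspace: representable only because the parameter is int64_t.
  config.EnableTensorRtEngine(/*workspace_size=*/int64_t{1} << 32,
                              /*max_batch_size=*/1,
                              /*min_subgraph_size=*/3);
  return 0;
}
```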
@@ -214,7 +214,7 @@ PADDLE_CAPI_EXPORT extern bool PD_SpecifyInputName(

 PADDLE_CAPI_EXPORT extern void PD_EnableTensorRtEngine(
     PD_AnalysisConfig* config,
-    int workspace_size,
+    int64_t workspace_size,
     int max_batch_size,
     int min_subgraph_size,
     Precision precision,
......
@@ -243,7 +243,7 @@ bool PD_SpecifyInputName(const PD_AnalysisConfig* config) {
 }

 void PD_EnableTensorRtEngine(PD_AnalysisConfig* config,
-                             int workspace_size,
+                             int64_t workspace_size,
                              int max_batch_size,
                              int min_subgraph_size,
                              Precision precision,
......
@@ -219,7 +219,7 @@ PD_Bool PD_ConfigIrOptim(__pd_keep PD_Config* pd_config) {
 }

 void PD_ConfigEnableTensorRtEngine(__pd_keep PD_Config* pd_config,
-                                   int32_t workspace_size,
+                                   int64_t workspace_size,
                                    int32_t max_batch_size,
                                    int32_t min_subgraph_size,
                                    PD_PrecisionType precision,
......
@@ -329,7 +329,7 @@ PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIrOptim(
 ///
 PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtEngine(
     __pd_keep PD_Config* pd_config,
-    int32_t workspace_size,
+    int64_t workspace_size,
     int32_t max_batch_size,
     int32_t min_subgraph_size,
     PD_PrecisionType precision,
......
@@ -79,7 +79,7 @@ class TRTConvertValidation {
   TRTConvertValidation(int max_batch_size,
                        const std::unordered_set<std::string>& parameters,
                        framework::Scope& scope,  // NOLINT
-                       int workspace_size = 1 << 10,
+                       int64_t workspace_size = 1 << 30,
                        bool if_add_batch = true)
       : parameters_(parameters),
         scope_(scope),
......
@@ -206,7 +206,7 @@ class TensorRTEngine {
   TensorRTEngine(
       int max_batch,
-      int max_workspace,
+      int64_t max_workspace,
       AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32,
       TRTInt8Calibrator* calibrator = nullptr,
       int device_id = 0,
@@ -672,7 +672,7 @@ class TensorRTEngine {
   // the runtime batch size
   static int runtime_batch_;
   // the max memory size the engine uses
-  int max_workspace_;
+  int64_t max_workspace_;
   AnalysisConfig::Precision precision_;
   TRTInt8Calibrator* calibrator_;
@@ -767,7 +767,7 @@ class TRTEngineManager {
   TensorRTEngine* Create(
       std::string name,
       int max_batch,
-      int max_workspace,
+      int64_t max_workspace,
       AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32,
       TRTInt8Calibrator* calibrator = nullptr,
       int device_id = 0,
......
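`max_workspace_` eventually reaches TensorRT's builder configuration, whose workspace setter takes a `std::size_t`, so carrying the value as `int64_t` end to end avoids truncating it before TensorRT ever sees it. A sketch of that hand-off, assuming the pre-8.4 `nvinfer1::IBuilderConfig` API (the helper name is illustrative, not Paddle's):

```cpp
#include <cstdint>

#include <NvInfer.h>  // TensorRT headers; requires a TensorRT install

// Illustrative helper: forward a 64-bit workspace budget to the builder.
// With a 32-bit parameter upstream, any budget >= 2 GiB would have been
// mangled long before this call was made.
void SetWorkspace(nvinfer1::IBuilderConfig* builder_config,
                  int64_t max_workspace_bytes) {
  builder_config->setMaxWorkspaceSize(
      static_cast<std::size_t>(max_workspace_bytes));
}
```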
@@ -34,7 +34,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
         "engine_key",
         "The engine_key here is used to distinguish different TRT Engines");
     AddAttr<int>("max_batch_size", "the maximum batch size.");
-    AddAttr<int>("workspace_size", "the workspace size.");
+    AddAttr<int64_t>("workspace_size", "the workspace size.").AsExtra();
     AddAttr<framework::BlockDesc *>("sub_block", "the trt block");
     AddAttr<bool>("enable_int8", "whether swith to int8 mode");
     AddComment("TensorRT engine operator.");
......
@@ -177,7 +177,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
   std::vector<std::string> runtime_input_names_;
   mutable TensorRTEngine *trt_engine_{nullptr};
   int max_batch_size_;
-  int workspace_size_;
+  int64_t workspace_size_;
   std::unique_ptr<TRTInt8Calibrator> calibrator_;
   bool enable_int8_;
   bool enable_fp16_;
@@ -207,7 +207,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
       : framework::OperatorBase(type, inputs, outputs, attrs) {
     input_names_ = Inputs("Xs");
     max_batch_size_ = Attr<int>("max_batch_size");
-    workspace_size_ = Attr<int>("workspace_size");
+    workspace_size_ = Attr<int64_t>("workspace_size");
     device_id_ = Attr<int>("gpu_id");
     enable_int8_ = Attr<bool>("enable_int8");
     enable_fp16_ = Attr<bool>("enable_fp16");
......
@@ -107,7 +107,7 @@ void DynamicShapeTest(bool allow_build_at_runtime) {
   engine_op_desc.SetBlockAttr("sub_block", &block_desc);
   engine_op_desc.SetAttr("max_batch_size", static_cast<int>(2));
-  engine_op_desc.SetAttr("workspace_size", static_cast<int>(1 << 20));
+  engine_op_desc.SetAttr("workspace_size", static_cast<int64_t>(1 << 20));
   engine_op_desc.SetAttr("parameters", std::vector<std::string>({}));
   engine_op_desc.SetAttr("engine_key", std::string("a_engine"));
   engine_op_desc.SetAttr("calibration_engine_key",
@@ -259,7 +259,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   engine_op_desc.SetBlockAttr("sub_block", &block_desc);
   engine_op_desc.SetAttr("max_batch_size", static_cast<int>(batch_size));
-  engine_op_desc.SetAttr("workspace_size", static_cast<int>(1 << 20));
+  engine_op_desc.SetAttr("workspace_size", static_cast<int64_t>(1 << 20));
   engine_op_desc.SetAttr("parameters",
                          std::vector<std::string>({"y0", "y1", "y2", "y3"}));
   engine_op_desc.SetAttr("engine_key", std::string("b_engine"));
......
@@ -687,7 +687,7 @@ void BindAnalysisConfig(py::module *m) {
       .def("specify_input_name", &AnalysisConfig::specify_input_name)
       .def("enable_tensorrt_engine",
            &AnalysisConfig::EnableTensorRtEngine,
-           py::arg("workspace_size") = 1 << 20,
+           py::arg("workspace_size") = 1 << 30,
            py::arg("max_batch_size") = 1,
            py::arg("min_subgraph_size") = 3,
            py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
......
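The pybind default moves in lockstep, so Python callers that omit `workspace_size` now get 1 GiB instead of 1 MiB. A minimal pybind11 sketch of the same binding pattern (the module and function names are invented for illustration):

```cpp
#include <cstdint>

#include <pybind11/pybind11.h>

namespace py = pybind11;

// Illustrative stand-in for EnableTensorRtEngine: an int64_t parameter
// with a keyword default, so demo.enable_trt() returns 1 << 30.
int64_t EnableTrt(int64_t workspace_size) { return workspace_size; }

PYBIND11_MODULE(demo, m) {
  m.def("enable_trt", &EnableTrt, py::arg("workspace_size") = 1 << 30);
}
```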