[Inference] Add tuned trt_dynamic_shape mode. (#34806)

7c96efed · Wilber · GitHub · f5e430c5 · 7c96efed · 7c96efed
26 changed file
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -212,6 +212,12 @@ struct Argument {
                      bool);
  DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool);
  DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool);
+  DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path,
+                      TensorRtShapeRangeInfoPath, std::string);
+  DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, TensorRtTunedDynamicShape,
+                      bool);
+  DECL_ARGUMENT_FIELD(tensorrt_allow_build_at_runtime,
+                      TensorRtAllowBuildAtRuntime, bool);

  DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
  DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);

--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -146,6 +146,14 @@ void IRPassManager::CreatePasses(Argument *argument,
      pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
      pass->Set("use_static_engine", new bool(use_static_engine));
      pass->Set("model_from_memory", new bool(argument->model_from_memory()));
+
+      // tuned trt dynamic_shape
+      pass->Set("trt_shape_range_info_path",
+                new std::string(argument->tensorrt_shape_range_info_path()));
+      pass->Set("trt_tuned_dynamic_shape",
+                new bool(argument->tensorrt_tuned_dynamic_shape()));
+      pass->Set("trt_allow_build_at_runtime",
+                new bool(argument->tensorrt_allow_build_at_runtime()));
      pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
                                       argument->max_input_shape()));
      pass->Set("min_input_shape", new std::map<std::string, std::vector<int>>(
@@ -153,17 +161,17 @@ void IRPassManager::CreatePasses(Argument *argument,
      pass->Set("optim_input_shape",
                new std::map<std::string, std::vector<int>>(
                    argument->optim_input_shape()));
-      bool with_dynamic_shape = argument->max_input_shape().size() > 0 &&
-                                argument->min_input_shape().size() > 0 &&
-                                argument->optim_input_shape().size() > 0;
+      bool with_dynamic_shape = (argument->max_input_shape().size() > 0 &&
+                                 argument->min_input_shape().size() > 0 &&
+                                 argument->optim_input_shape().size() > 0) ||
+                                argument->tensorrt_tuned_dynamic_shape();
      pass->Set("with_dynamic_shape", new bool(with_dynamic_shape));
      pass->Set("trt_disabled_ops", new std::vector<std::string>(
                                        argument->tensorrt_disabled_ops()));
      pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla()));
      pass->Set("trt_dla_core", new int(argument->tensorrt_dla_core()));
      // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will
-      // not
-      // run fp16.
+      // not run fp16.
      pass->Set("disable_trt_plugin_fp16",
                new bool(argument->disable_trt_plugin_fp16()));
    } else if (pass_name == "dlnne_subgraph_pass") {

--- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
 cc_library(subgraph_util SRCS subgraph_util.cc DEPS subgraph_detector)

 if (WITH_GPU AND TENSORRT_FOUND)
-  cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_util tensorrt_op_teller)
+  cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_util tensorrt_op_teller infer_io_utils)

  set(analysis_deps ${analysis_deps}
          subgraph_util tensorrt_subgraph_pass

--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -22,6 +22,7 @@
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/op_teller.h"
+#include "paddle/fluid/inference/utils/io_utils.h"

 namespace paddle {
 namespace inference {
@@ -197,6 +198,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
  auto opt_input_shape =
      Get<std::map<std::string, std::vector<int>>>("optim_input_shape");

+  auto allow_build_at_runtime = Get<bool>("trt_allow_build_at_runtime");
+  auto shape_range_info_path = Get<std::string>("trt_shape_range_info_path");
+  auto trt_tuned_dynamic_shape = Get<bool>("trt_tuned_dynamic_shape");
+  int max_batch_size = Get<int>("max_batch_size");
+  if (trt_tuned_dynamic_shape) {
+    VLOG(1) << "trt dynamic_shape deserialize from " << shape_range_info_path;
+    inference::DeserializeShapeRangeInfo(shape_range_info_path,
+                                         &min_input_shape, &max_input_shape,
+                                         &opt_input_shape);
+  }
+
  // The following procedure is used to rename all the intermediate
  // variables and the output variables of the subgraph.
  // Why we do this?
@@ -242,12 +254,14 @@ void TensorRtSubgraphPass::CreateTensorRTOp(

  op_desc->SetBlockAttr("sub_block", new_block);
  op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString());
-  op_desc->SetAttr("max_batch_size", Get<int>("max_batch_size"));
+  op_desc->SetAttr("max_batch_size", max_batch_size);
  op_desc->SetAttr("workspace_size", Get<int>("workspace_size"));
  op_desc->SetAttr("gpu_id", Get<int>("gpu_device_id"));
  op_desc->SetAttr("output_name_mapping", output_mapping);
  op_desc->SetAttr("origin_output_dims", renamed_output_dims);
  op_desc->SetAttr("parameters", params);
+  op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime);
+  op_desc->SetAttr("shape_range_info_path", shape_range_info_path);

  // we record all inputs' shapes in attr to check if they are consistent
  // with the real inputs' shapes retrieved from scope when trt runs.
@@ -259,6 +273,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
  }

  auto use_static_engine = Get<bool>("use_static_engine");
+  op_desc->SetAttr("use_static_engine", use_static_engine);
+  if (use_static_engine)
+    op_desc->SetAttr("model_opt_cache_dir",
+                     Get<std::string>("model_opt_cache_dir"));
+
  // TODO(NHZlX)
  // There are models with the same structure but the different parameters,
  // when running in the 'use_serialize' mode, there is a bug.
@@ -266,12 +285,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
  // So we use seperate engine keys in serialization and calibration.
  auto engine_key = GenerateEngineKey(
      input_names_with_id, output_names_with_id, std::to_string(0),
-      std::to_string(Get<int>("max_batch_size")),
+      std::to_string(max_batch_size),
      std::to_string(static_cast<int>(precision_mode)), false);
-  auto calibration_engine_key = GenerateEngineKey(
-      input_names_with_id, output_names_with_id, std::to_string(0),
-      std::to_string(Get<int>("max_batch_size")),
-      std::to_string(static_cast<int>(precision_mode)), true);
+  auto calibration_engine_key =
+      GenerateEngineKey(input_names_with_id, output_names_with_id,
+                        std::to_string(0), std::to_string(max_batch_size),
+                        std::to_string(static_cast<int>(precision_mode)), true);
  auto predictor_id = Get<int>("predictor_id");

  // Get "" when there is no cached calibration table data.
@@ -345,11 +364,10 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
  bool disable_trt_plugin_fp16 = Get<bool>("disable_trt_plugin_fp16");
  tensorrt::TensorRTEngine *trt_engine =
      inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
-          .Create(engine_key + std::to_string(predictor_id),
-                  Get<int>("max_batch_size"), Get<int>("workspace_size"),
-                  precision_mode, calibrator.get(), Get<int>("gpu_device_id"),
-                  min_input_shape, max_input_shape, opt_input_shape,
-                  disable_trt_plugin_fp16);
+          .Create(engine_key + std::to_string(predictor_id), max_batch_size,
+                  Get<int>("workspace_size"), precision_mode, calibrator.get(),
+                  Get<int>("gpu_device_id"), min_input_shape, max_input_shape,
+                  opt_input_shape, disable_trt_plugin_fp16);
  trt_engine->SetUseOSS(Get<bool>("use_oss"));
  trt_engine->SetUseDLA(Get<bool>("trt_use_dla"));
  trt_engine->SetDLACore(Get<int>("trt_dla_core"));

--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -55,10 +55,17 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
  // We get all the vars from local_scope instead of the ProgramDesc.
  // Because there exists the case that new parameter variables are not added to
  // the program in the analysis pass.
+  bool reserve_cpu_weights = false;
+  if (argument->tensorrt_allow_build_at_runtime_valid() &&
+      argument->tensorrt_allow_build_at_runtime()) {
+    reserve_cpu_weights = true;
+  }
  for (auto &var_name : all_vars) {
    if (std::count(repetitive_params.begin(), repetitive_params.end(),
                   var_name)) {
-      scope->EraseVars({var_name});
+      if (!reserve_cpu_weights) {
+        scope->EraseVars({var_name});
+      }
      continue;
    }
    auto *var = scope->FindLocalVar(var_name);

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -49,7 +49,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
 endif()

 cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} 
-          zero_copy_tensor ir_pass_manager op_compatible_info)
+          zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils)

 cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api)


--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -158,6 +158,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
  CP_MEMBER(trt_use_static_engine_);
  CP_MEMBER(trt_use_calib_mode_);
  CP_MEMBER(trt_use_oss_);
+  CP_MEMBER(trt_tuned_dynamic_shape_);
+  CP_MEMBER(trt_allow_build_at_runtime_);
+  CP_MEMBER(collect_shape_range_info_);
+  CP_MEMBER(shape_range_info_path_);
  // Dlnne related
  CP_MEMBER(use_dlnne_);
  CP_MEMBER(dlnne_min_subgraph_size_);
@@ -653,8 +657,8 @@ float AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
 #endif
 }

-void AnalysisConfig::EnableMemoryOptim() {
-  enable_memory_optim_ = true;
+void AnalysisConfig::EnableMemoryOptim(bool x) {
+  enable_memory_optim_ = x;
  Update();
 }

@@ -783,6 +787,9 @@ std::string AnalysisConfig::Summary() {
      // dynamic_shape
      os.InsertRow({"tensorrt_enable_dynamic_shape",
                    min_input_shape_.empty() ? "false" : "true"});
+      os.InsertRow({"tensorrt_tuned_dynamic_shape", trt_tuned_dynamic_shape_
+                                                        ? shape_range_info_path_
+                                                        : "false"});

      os.InsertRow({"tensorrt_use_oss", trt_use_oss_ ? "true" : "false"});
      os.InsertRow({"tensorrt_use_dla", trt_use_dla_ ? "true" : "false"});
@@ -812,8 +819,46 @@ std::string AnalysisConfig::Summary() {
  os.InsertRow({"memory_optim", enable_memory_optim_ ? "true" : "false"});
  os.InsertRow({"enable_profile", with_profile_ ? "true" : "false"});
  os.InsertRow({"enable_log", with_glog_info_ ? "true" : "false"});
+  os.InsertRow({"collect_shape_range_info",
+                collect_shape_range_info_ ? shape_range_info_path_ : "false"});

  return os.PrintTable();
 }

+void AnalysisConfig::CollectShapeRangeInfo(
+    const std::string &shape_range_info_path) {
+  LOG(INFO) << "In CollectShapeInfo mode, we will disable optimizations and "
+               "collect the shape information of "
+            << "all intermediate tensors in the compute graph and calculate "
+               "the min_shape, max_shape and opt_shape.";
+  collect_shape_range_info_ = true;
+  PADDLE_ENFORCE_EQ(shape_range_info_path.empty(), false,
+                    platform::errors::InvalidArgument(
+                        "The shape_range_info_path should not be empty, please "
+                        "re-check the argument."));
+  shape_range_info_path_ = shape_range_info_path;
+}
+
+const std::string &AnalysisConfig::shape_range_info_path() {
+  return shape_range_info_path_;
+}
+
+bool AnalysisConfig::shape_range_info_collected() {
+  return collect_shape_range_info_;
+}
+
+void AnalysisConfig::EnableTunedTensorRtDynamicShape(
+    const std::string &shape_range_info_path, bool allow_build_at_runtime) {
+  shape_range_info_path_ = shape_range_info_path;
+  trt_allow_build_at_runtime_ = allow_build_at_runtime;
+  trt_tuned_dynamic_shape_ = true;
+}
+
+bool AnalysisConfig::tuned_tensorrt_dynamic_shape() {
+  return trt_tuned_dynamic_shape_;
+}
+
+bool AnalysisConfig::trt_allow_build_at_runtime() {
+  return trt_allow_build_at_runtime_;
+}
 }  // namespace paddle
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -13,7 +13,9 @@
 // limitations under the License.

 #include "paddle/fluid/inference/api/analysis_predictor.h"
+
 #include <glog/logging.h>
+
 #include <algorithm>
 #include <fstream>
 #include <memory>
@@ -21,6 +23,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/extension/include/ext_op_meta_info.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
@@ -34,6 +37,7 @@
 #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
@@ -570,6 +574,11 @@ void AnalysisPredictor::PrepareArgument() {
    argument_.SetMaxInputShape(config_.max_input_shape_);
    argument_.SetOptimInputShape(config_.optim_input_shape_);
    argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_);
+    argument_.SetTensorRtShapeRangeInfoPath(config_.shape_range_info_path());
+    argument_.SetTensorRtTunedDynamicShape(
+        config_.tuned_tensorrt_dynamic_shape());
+    argument_.SetTensorRtAllowBuildAtRuntime(
+        config_.trt_allow_build_at_runtime());
  }

  if (config_.dlnne_enabled()) {
@@ -915,6 +924,11 @@ bool AnalysisPredictor::ZeroCopyRun() {
 #endif

  executor_->Run();
+
+  if (config_.shape_range_info_collected()) {
+    CollectShapeRangeInfo();
+  }
+
  // Fix TensorArray reuse not cleaned bug.
  tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
  tensor_array_batch_cleaner_.ResetTensorArray();
@@ -934,6 +948,78 @@ bool AnalysisPredictor::ZeroCopyRun() {
  return true;
 }

+void AnalysisPredictor::CollectShapeRangeInfo() {
+  // if use gpu, sync first.
+  if (config_.use_gpu()) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    paddle::platform::DeviceContextPool &pool =
+        paddle::platform::DeviceContextPool::Instance();
+    auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_);
+    auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
+        pool.Get(gpu_place));
+#ifdef PADDLE_WITH_HIP
+    hipStreamSynchronize(dev_ctx->stream());
+#else
+    cudaStreamSynchronize(dev_ctx->stream());
+#endif
+#endif
+  }
+
+  std::vector<std::string> var_names = sub_scope_->LocalVarNames();
+  for (const auto &name : var_names) {
+    auto *var = sub_scope_->GetVar(name);
+    if (!var->IsType<framework::LoDTensor>()) {
+      continue;
+    }
+    framework::DDim dim = var->Get<framework::LoDTensor>().dims();
+    std::vector<int32_t> shape(dim.size());
+    for (size_t i = 0; i < shape.size(); ++i) shape[i] = dim[i];
+    shape_info_[name].emplace_back(shape);
+  }
+}
+
+void AnalysisPredictor::StatisticShapeRangeInfo() {
+  std::map<std::string, std::vector<int32_t>> min_shapes;
+  std::map<std::string, std::vector<int32_t>> max_shapes;
+  std::map<std::string, std::vector<int32_t>> opt_shapes;
+  for (auto it : shape_info_) {
+    auto name = it.first;
+    auto shapes = it.second;
+
+    std::vector<int32_t> min_shape(shapes[0].begin(), shapes[0].end());
+    std::vector<int32_t> max_shape(shapes[0].begin(), shapes[0].end());
+    std::vector<int32_t> opt_shape(shapes[0].begin(), shapes[0].end());
+
+    auto ShapeMaxFreq = [](const std::map<int32_t, int32_t> &m) -> int32_t {
+      std::vector<std::pair<int32_t, int32_t>> counter;
+      for (auto &it : m) counter.push_back(it);
+      std::sort(
+          counter.begin(), counter.end(),
+          [](std::pair<int32_t, int32_t> &a, std::pair<int32_t, int32_t> &b) {
+            return a.second > b.second;
+          });
+      return counter[0].first;
+    };
+
+    for (size_t d = 0; d < shapes[0].size(); ++d) {
+      std::map<int32_t, int32_t> counter;
+      for (size_t i = 0; i < shapes.size(); ++i) {
+        counter[shapes[i][d]] += 1;
+        if (shapes[i][d] < min_shape[d]) min_shape[d] = shapes[i][d];
+        if (shapes[i][d] > max_shape[d]) max_shape[d] = shapes[i][d];
+      }
+      opt_shape[d] = ShapeMaxFreq(counter);
+    }
+
+    min_shapes[name] = min_shape;
+    max_shapes[name] = max_shape;
+    opt_shapes[name] = opt_shape;
+  }
+
+  inference::SerializeShapeRangeInfo(config_.shape_range_info_path(),
+                                     min_shapes, max_shapes, opt_shapes);
+}
+
 bool AnalysisPredictor::LoadProgramDesc() {
  // Initialize the inference program
  std::string filename;
@@ -1140,6 +1226,10 @@ AnalysisPredictor::~AnalysisPredictor() {
  }
 #endif

+  if (config_.shape_range_info_collected()) {
+    StatisticShapeRangeInfo();
+  }
+
  memory::Release(place_);
 }


--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -91,6 +91,10 @@ class AnalysisPredictor : public PaddlePredictor {
  /// \param[in] AnalysisConfig config
  ///
  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {
+    if (config_.shape_range_info_collected()) {
+      config_.SwitchIrOptim(false);
+      config_.EnableMemoryOptim(false);
+    }
    predictor_id_ = inference::GetUniqueId();
  }
  ///
@@ -377,6 +381,10 @@ class AnalysisPredictor : public PaddlePredictor {
  FRIEND_TEST(AnalysisPredictor, with_gpu);
 #endif

+ private:
+  void StatisticShapeRangeInfo();
+  void CollectShapeRangeInfo();
+
 private:
  AnalysisConfig config_;
  Argument argument_;
@@ -419,6 +427,8 @@ class AnalysisPredictor : public PaddlePredictor {
 private:
  // Some status here that help to determine the status inside the predictor.
  bool status_is_cloned_{false};
+
+  std::map<std::string, std::vector<std::vector<int32_t>>> shape_info_;
 };

 }  // namespace paddle
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -19,8 +19,10 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
 #include "paddle/fluid/platform/cpu_info.h"

 DEFINE_string(dirname, "", "dirname to tests.");
@@ -32,6 +34,8 @@ TEST(AnalysisPredictor, analysis_off) {
  config.SetModel(FLAGS_dirname);
  config.SwitchIrOptim(false);
  LOG(INFO) << config.Summary();
+  LOG(INFO) << "Shape Info collected: " << config.shape_range_info_collected()
+            << ", path: " << config.shape_range_info_path();

  auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
  auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
@@ -86,10 +90,6 @@ TEST(AnalysisPredictor, analysis_on) {
  std::vector<PaddleTensor> outputs;
  ASSERT_TRUE(predictor->Run(inputs, &outputs));

-  for (auto& output : outputs) {
-    LOG(INFO) << inference::DescribeTensor(output);
-  }
-
  // compare with NativePredictor
  auto naive_predictor =
      CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
@@ -139,6 +139,54 @@ TEST(AnalysisPredictor, ZeroCopy) {
  predictor->TryShrinkMemory();
 }

+TEST(AnalysisPredictor, CollectShapeRangeInfo) {
+  AnalysisConfig config;
+  config.SetModel(FLAGS_dirname);
+  config.SwitchUseFeedFetchOps(false);
+  config.EnableUseGpu(100, 0);
+  config.CollectShapeRangeInfo(FLAGS_dirname + "/shape_range.pbtxt");
+  LOG(INFO) << config.Summary();
+  AnalysisConfig config2(config);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config2);
+
+  auto w0 = predictor->GetInputTensor("firstw");
+  auto w1 = predictor->GetInputTensor("secondw");
+  auto w2 = predictor->GetInputTensor("thirdw");
+  auto w3 = predictor->GetInputTensor("forthw");
+
+  w0->Reshape({4, 1});
+  w1->Reshape({4, 1});
+  w2->Reshape({4, 1});
+  w3->Reshape({4, 1});
+
+  auto* w0_data = w0->mutable_data<int64_t>(PaddlePlace::kCPU);
+  auto* w1_data = w1->mutable_data<int64_t>(PaddlePlace::kCPU);
+  auto* w2_data = w2->mutable_data<int64_t>(PaddlePlace::kCPU);
+  auto* w3_data = w3->mutable_data<int64_t>(PaddlePlace::kCPU);
+
+  for (int i = 0; i < 4; i++) {
+    w0_data[i] = i;
+    w1_data[i] = i;
+    w2_data[i] = i;
+    w3_data[i] = i;
+  }
+
+  predictor->ZeroCopyRun();
+
+  auto out = predictor->GetOutputTensor("fc_1.tmp_2");
+  PaddlePlace place;
+  int size = 0;
+  out->data<float>(&place, &size);
+  LOG(INFO) << "output size: " << size / sizeof(float);
+  // TODO(wilber): check for windows
+  // std::map<std::string, std::vector<int32_t>> min_shape;
+  // std::map<std::string, std::vector<int32_t>> max_shape;
+  // std::map<std::string, std::vector<int32_t>> opt_shape;
+  // inference::DeserializeShapeRangeInfo(FLAGS_dirname + "/shape_range.pbtxt",
+  //                                     &min_shape, &max_shape, &opt_shape);
+  // ASSERT_EQ(min_shape.size(), 14u);
+}
+
 TEST(AnalysisPredictor, Clone) {
  AnalysisConfig config;
  config.SetModel(FLAGS_dirname);

--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -380,6 +380,50 @@ struct PD_INFER_DECL AnalysisConfig {
  bool tensorrt_dynamic_shape_enabled() const {
    return !min_input_shape_.empty();
  }
+  ///
+  /// \brief Enable tuned tensorrt dynamic shape.
+  ///
+  /// \param shape_range_info_path the path to shape_info file got in
+  /// CollectShapeInfo
+  /// mode.
+  /// \param allow_build_at_runtime allow build trt engine at runtime.
+  ///
+  void EnableTunedTensorRtDynamicShape(const std::string& shape_range_info_path,
+                                       bool allow_build_at_runtime = true);
+
+  ///
+  /// \brief A boolean state telling whether to use tuned tensorrt dynamic
+  /// shape.
+  ///
+  bool tuned_tensorrt_dynamic_shape();
+
+  ///
+  /// \brief A boolean state telling whether to allow building trt engine at
+  /// runtime.
+  ///
+  bool trt_allow_build_at_runtime();
+
+  ///
+  /// \brief Collect shape info of all tensors in compute graph.
+  ///
+  /// \param shape_range_info_path the path to save shape info.
+  ///
+  void CollectShapeRangeInfo(const std::string& shape_range_info_path);
+
+  ///
+  /// \brief the shape info path in CollectShapeInfo mode.
+  ///
+  /// \return the shape info path.
+  ///
+  const std::string& shape_range_info_path();
+
+  ///
+  /// \brief A boolean state telling whether to collect shape info.
+  ///
+  /// \return bool Whether to collect shape info.
+  ///
+  bool shape_range_info_collected();
+
  ///
  /// \brief Prevent ops running in Paddle-TRT
  /// NOTE: just experimental, not an official stable API, easy to be broken.
@@ -573,7 +617,9 @@ struct PD_INFER_DECL AnalysisConfig {
  /// \brief Turn on memory optimize
  /// NOTE still in development.
  ///
-  void EnableMemoryOptim();
+  /// \param x Whether to enable memory optimize.
+  ///
+  void EnableMemoryOptim(bool x = true);
  ///
  /// \brief A boolean state telling whether the memory optimization is
  /// activated.
@@ -693,6 +739,15 @@ struct PD_INFER_DECL AnalysisConfig {
  std::map<std::string, std::vector<int>> optim_input_shape_{};
  std::vector<std::string> trt_disabled_ops_{};
  bool disable_trt_plugin_fp16_{false};
+  bool trt_allow_build_at_runtime_{false};
+  // tune to get dynamic_shape info.
+  bool trt_tuned_dynamic_shape_{false};
+
+  // In CollectShapeInfo mode, we will collect the shape information of
+  // all intermediate tensors in the compute graph and calculate the
+  // min_shape, max_shape and opt_shape and save in shape_range_info_path_;
+  bool collect_shape_range_info_{false};
+  std::string shape_range_info_path_;

  // dlnne related.
  bool use_dlnne_{false};

--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -196,6 +196,10 @@ void TensorRTEngine::FreezeNetwork() {
 #if IS_TRT_VERSION_GE(6000)
    LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode.";
    for (auto &input : min_input_shape_) {
+      VLOG(4) << "TRT dynamic_shape set " << input.first
+              << " min: " << Vec2Str(input.second)
+              << ", max: " << Vec2Str(max_input_shape_[input.first])
+              << ", opt: " << Vec2Str(optim_input_shape_[input.first]);
      optim_profile_->setDimensions(
          input.first.c_str(), nvinfer1::OptProfileSelector::kMIN,
          Vec2TRT_Dims(input.second, input.first, true));

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -32,6 +32,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/utils/any.h"

 namespace paddle {
@@ -226,6 +227,7 @@ class TensorRTEngine {
                     const std::string& name);
  // Set the itensor_map_[name] as the network's output, and set its name.
  void DeclareOutput(const std::string& name);
+  void ClearTensorMap() { itensor_map_.clear(); }

  void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
  // Get an ITensor called name.
@@ -244,6 +246,16 @@ class TensorRTEngine {
    }
    return infer_context_[tid].get();
  }
+  void ResetContext() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    const std::thread::id tid = std::this_thread::get_id();
+    PADDLE_ENFORCE_NOT_NULL(
+        infer_engine_,
+        platform::errors::InvalidArgument(
+            "You should build engine first and then set the context."));
+    infer_context_[tid].reset(nullptr);
+    infer_context_.erase(tid);
+  }

  nvinfer1::IHostMemory* Serialize() {
    PADDLE_ENFORCE_NOT_NULL(
@@ -364,6 +376,55 @@ class TensorRTEngine {
  ShapeMapType min_input_shape() { return min_input_shape_; }
  ShapeMapType max_input_shape() { return max_input_shape_; }
  ShapeMapType optim_input_shape() { return optim_input_shape_; }
+
+  bool AdjustDynamicShapeRange(const ShapeMapType& runtime_input_shape,
+                               std::vector<std::string>* changed) {
+    bool ret = false;
+    changed->clear();
+    for (const auto& it : runtime_input_shape) {
+      auto name = it.first;
+      auto input_shape = it.second;
+      PADDLE_ENFORCE_EQ(
+          min_input_shape_.count(name), true,
+          platform::errors::InvalidArgument(
+              "TRT dynamic_shape min_input_shape %s not found.", name));
+      PADDLE_ENFORCE_EQ(min_input_shape_[name].size(), input_shape.size(),
+                        platform::errors::InvalidArgument(
+                            "TRT dynamic_shape min_input_shape %s size not "
+                            "equal, the min_input_shape[%s].size()=%d"
+                            ", but the runtime_input_shape[%s].size()=%d.",
+                            name, name, min_input_shape_[name].size(), name,
+                            input_shape.size()));
+      auto bak_min_shape = min_input_shape_[name];
+      auto bak_max_shape = max_input_shape_[name];
+      bool min_change = false;
+      bool max_change = false;
+      for (size_t d = 0; d < input_shape.size(); ++d) {
+        if (input_shape[d] < min_input_shape_[name][d]) {
+          ret = true;
+          min_change = true;
+          min_input_shape_[name][d] = input_shape[d];
+        }
+        if (input_shape[d] > max_input_shape_[name][d]) {
+          ret = true;
+          max_change = true;
+          max_input_shape_[name][d] = input_shape[d];
+        }
+      }
+
+      if (min_change)
+        LOG(INFO) << "refactor shape range: " << name << ", min_shape from "
+                  << Vec2Str(bak_min_shape) << " to "
+                  << Vec2Str(min_input_shape_[name]);
+      if (max_change)
+        LOG(INFO) << "refactor shape range: " << name << ", max_shape from "
+                  << Vec2Str(bak_max_shape) << " to "
+                  << Vec2Str(max_input_shape_[name]);
+      if (min_change || max_change) changed->push_back(name);
+    }
+    return ret;
+  }
+
  bool use_oss() { return use_oss_; }
  bool with_ernie() { return with_ernie_; }
  bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; }

--- a/paddle/fluid/inference/tensorrt/helper.h
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -154,6 +154,16 @@ inline void PrintITensorShape(nvinfer1::ITensor* X) {
  std::cout << "]\n";
 }

+template <typename T>
+inline std::string Vec2Str(const std::vector<T>& vec) {
+  std::ostringstream os;
+  os << "(";
+  for (size_t i = 0; i < vec.size() - 1; ++i) {
+    os << vec[i] << ",";
+  }
+  os << vec[vec.size() - 1] << ")";
+  return os.str();
+}
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
@@ -143,6 +143,70 @@ void TestDynamic2() {
  }
 }

+void TestTunedDynamic() {
+  std::string model_dir =
+      FLAGS_infer_model + "/complex_model_dynamic/complex_model_dynamic2";
+  AnalysisConfig config_tuned;
+  const std::string shape_range = "shape_range.pbtxt";
+  config_tuned.EnableUseGpu(100, 0);
+  config_tuned.SetModel(model_dir + "/model", model_dir + "/params");
+  config_tuned.SwitchUseFeedFetchOps(false);
+  config_tuned.CollectShapeRangeInfo(shape_range);
+
+  int batch_size = 1;
+  auto predictor_tuned = CreatePaddlePredictor(config_tuned);
+
+  auto check_func = [batch_size](PaddlePredictor *predictor) {
+    int channels = 3;
+    int height = 5;
+    int width = 5;
+    int input_num = channels * height * width * 1;
+
+    float *input = new float[input_num];
+    memset(input, 0, input_num * sizeof(float));
+    auto input_names = predictor->GetInputNames();
+    auto input_t = predictor->GetInputTensor(input_names[0]);
+    input_t->Reshape({batch_size, channels, height, width});
+    input_t->copy_from_cpu(input);
+
+    auto input_t1 = predictor->GetInputTensor(input_names[1]);
+    input_t1->Reshape({batch_size, 2, 1, 1});
+    std::vector<float> first;
+    for (int i = 0; i < batch_size * 2; i++) first.push_back(1.0);
+    input_t1->copy_from_cpu(first.data());
+
+    auto input_t2 = predictor->GetInputTensor(input_names[2]);
+    input_t2->Reshape({batch_size, 2, 1, 1});
+    input_t2->copy_from_cpu(first.data());
+
+    ASSERT_TRUE(predictor->ZeroCopyRun());
+
+    std::vector<float> out_data;
+    auto output_names = predictor->GetOutputNames();
+    auto output_t = predictor->GetOutputTensor(output_names[0]);
+    std::vector<int> output_shape = output_t->shape();
+    int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                  std::multiplies<int>());
+    out_data.resize(out_num);
+    output_t->copy_to_cpu(out_data.data());
+  };
+  check_func(predictor_tuned.get());
+
+  // check tuned_dynamic_shape
+  AnalysisConfig config;
+  config.EnableUseGpu(100, 0);
+  std::string cache_dir = "tuned_cache";
+  config.SetOptimCacheDir(cache_dir);
+  delete_cache_files(cache_dir);
+  config.SetModel(model_dir + "/model", model_dir + "/params");
+  config.SwitchUseFeedFetchOps(false);
+  config.EnableTunedTensorRtDynamicShape(shape_range, true);
+  config.EnableTensorRtEngine(1 << 30, batch_size, 0,
+                              AnalysisConfig::Precision::kFloat32, true, false);
+  auto test_predictor = CreatePaddlePredictor(config);
+  check_func(test_predictor.get());
+}
+
 TEST(AnalysisPredictor, trt_dynamic) { TestDynamic(true); }
 TEST(AnalysisPredictor, trt_static) { TestDynamic(false); }
 TEST(AnalysisPredictor, trt_memory_serialize) {
@@ -153,5 +217,7 @@ TEST(AnalysisPredictor, trt_memory_serialize) {
 }
 TEST(AnalysisPredictor, trt_dynamic2) { TestDynamic2(); }

+TEST(AnalysisPredictor, trt_tuned_dynamic) { TestTunedDynamic(); }
+
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
@@ -47,6 +47,24 @@ TEST(AnalysisPredictor, use_gpu) {
  }
 }

+TEST(AnalysisPredictor, collect_shape_range) {
+  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
+  AnalysisConfig config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(model_dir);
+  config.CollectShapeRangeInfo("shape_range.pbtxt");
+
+  std::vector<std::vector<PaddleTensor>> inputs_all;
+  auto predictor = CreatePaddlePredictor(config);
+  SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
+
+  std::vector<PaddleTensor> outputs;
+  for (auto &input : inputs_all) {
+    ASSERT_TRUE(predictor->Run(input, &outputs));
+    predictor->ClearIntermediateTensor();
+  }
+}
+
 }  // namespace inference
 }  // namespace paddle


--- a/paddle/fluid/inference/utils/CMakeLists.txt
+++ b/paddle/fluid/inference/utils/CMakeLists.txt
 cc_library(benchmark SRCS benchmark.cc DEPS enforce)
 cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
-cc_library(infer_io_utils SRCS io_utils.cc DEPS paddle_inference_api lod_tensor)
+cc_library(infer_io_utils SRCS io_utils.cc DEPS paddle_inference_api lod_tensor shape_range_info_proto)
 cc_test(infer_io_utils_tester SRCS io_utils_tester.cc DEPS infer_io_utils)
 cc_library(table_printer SRCS table_printer.cc)
 cc_test(test_table_printer SRCS table_printer_tester.cc DEPS table_printer)
+
+proto_library(shape_range_info_proto SRCS shape_range_info.proto)
--- a/paddle/fluid/inference/utils/io_utils.cc
+++ b/paddle/fluid/inference/utils/io_utils.cc
@@ -13,7 +13,15 @@
 // limitations under the License.

 #include "paddle/fluid/inference/utils/io_utils.h"
+
+#include <fcntl.h>
+
+#include <utility>
+
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/text_format.h"
 #include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/utils/shape_range_info.pb.h"

 namespace paddle {
 namespace inference {
@@ -157,5 +165,102 @@ void DeserializePDTensorsToFile(const std::string &path,
  fin.close();
 }

+void SerializeShapeRangeInfo(
+    const std::string &path,
+    const paddle::inference::proto::ShapeRangeInfos &info) {
+  int out_fd = open(path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
+  google::protobuf::io::FileOutputStream *os =
+      new google::protobuf::io::FileOutputStream(out_fd);
+  google::protobuf::TextFormat::Print(info, os);
+  delete os;
+  close(out_fd);
+}
+
+void SerializeShapeRangeInfo(
+    const std::string &path,
+    const std::map<std::string, std::vector<int32_t>> &min_shape,
+    const std::map<std::string, std::vector<int32_t>> &max_shape,
+    const std::map<std::string, std::vector<int32_t>> &opt_shape) {
+  paddle::inference::proto::ShapeRangeInfos shape_range_infos;
+  for (auto it : min_shape) {
+    auto *s = shape_range_infos.add_shape_range_info();
+    s->set_name(it.first);
+    for (size_t i = 0; i < it.second.size(); ++i) {
+      s->add_min_shape(it.second[i]);
+      s->add_max_shape(max_shape.at(it.first)[i]);
+      s->add_opt_shape(opt_shape.at(it.first)[i]);
+    }
+  }
+
+  inference::SerializeShapeRangeInfo(path, shape_range_infos);
+}
+void DeserializeShapeRangeInfo(
+    const std::string &path, paddle::inference::proto::ShapeRangeInfos *info) {
+  int fd = open(path.c_str(), O_RDONLY);
+  google::protobuf::io::FileInputStream *is =
+      new google::protobuf::io::FileInputStream(fd);
+  google::protobuf::TextFormat::Parse(is, info);
+  delete is;
+  close(fd);
+}
+
+void DeserializeShapeRangeInfo(
+    const std::string &path,
+    std::map<std::string, std::vector<int32_t>> *min_shape,
+    std::map<std::string, std::vector<int32_t>> *max_shape,
+    std::map<std::string, std::vector<int32_t>> *opt_shape) {
+  paddle::inference::proto::ShapeRangeInfos shape_range_infos;
+  DeserializeShapeRangeInfo(path, &shape_range_infos);
+  for (int i = 0; i < shape_range_infos.shape_range_info_size(); ++i) {
+    auto info = shape_range_infos.shape_range_info(i);
+    auto name = info.name();
+    if (min_shape->count(name) || max_shape->count(name) ||
+        opt_shape->count(name)) {
+      continue;
+    } else {
+      std::vector<int32_t> tmp(info.min_shape_size());
+      for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.min_shape(k);
+      min_shape->insert(std::make_pair(name, tmp));
+
+      tmp.resize(info.max_shape_size());
+      for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.max_shape(k);
+      max_shape->insert(std::make_pair(name, tmp));
+
+      tmp.resize(info.opt_shape_size());
+      for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.opt_shape(k);
+      opt_shape->insert(std::make_pair(name, tmp));
+    }
+  }
+}
+
+void UpdateShapeRangeInfo(
+    const std::string &path,
+    const std::map<std::string, std::vector<int32_t>> &min_shape,
+    const std::map<std::string, std::vector<int32_t>> &max_shape,
+    const std::map<std::string, std::vector<int32_t>> &opt_shape,
+    const std::vector<std::string> &names) {
+  paddle::inference::proto::ShapeRangeInfos shape_range_infos;
+  DeserializeShapeRangeInfo(path, &shape_range_infos);
+
+  for (int i = 0; i < shape_range_infos.shape_range_info_size(); ++i) {
+    auto *info = shape_range_infos.mutable_shape_range_info(i);
+    for (const auto &name : names) {
+      if (info->name() == name) {
+        info->clear_min_shape();
+        info->clear_max_shape();
+        info->clear_opt_shape();
+        for (size_t j = 0; j < min_shape.at(name).size(); ++j)
+          info->add_min_shape(min_shape.at(name)[j]);
+        for (size_t j = 0; j < max_shape.at(name).size(); ++j)
+          info->add_max_shape(max_shape.at(name)[j]);
+        for (size_t j = 0; j < opt_shape.at(name).size(); ++j)
+          info->add_opt_shape(opt_shape.at(name)[j]);
+        break;
+      }
+    }
+  }
+  inference::SerializeShapeRangeInfo(path, shape_range_infos);
+}
+
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/utils/io_utils.h
+++ b/paddle/fluid/inference/utils/io_utils.h
@@ -19,6 +19,7 @@

 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/api/paddle_api.h"
+#include "paddle/fluid/inference/utils/shape_range_info.pb.h"

 namespace paddle {
 struct PaddleTensor;
@@ -41,5 +42,28 @@ void SerializePDTensorsToFile(const std::string& path,
                              const std::vector<PaddleTensor>& tensors);
 void DeserializePDTensorsToFile(const std::string& path,
                                std::vector<PaddleTensor>* tensors);
+
+void SerializeShapeRangeInfo(
+    const std::string& path,
+    const paddle::inference::proto::ShapeRangeInfos& info);
+void SerializeShapeRangeInfo(
+    const std::string& path,
+    const std::map<std::string, std::vector<int32_t>>& min_shape,
+    const std::map<std::string, std::vector<int32_t>>& max_shape,
+    const std::map<std::string, std::vector<int32_t>>& opt_shape);
+void DeserializeShapeRangeInfo(const std::string& path,
+                               paddle::inference::proto::ShapeRangeInfos* info);
+void DeserializeShapeRangeInfo(
+    const std::string& path,
+    std::map<std::string, std::vector<int32_t>>* min_shape,
+    std::map<std::string, std::vector<int32_t>>* max_shape,
+    std::map<std::string, std::vector<int32_t>>* opt_shape);
+
+void UpdateShapeRangeInfo(
+    const std::string& path,
+    const std::map<std::string, std::vector<int32_t>>& min_shape,
+    const std::map<std::string, std::vector<int32_t>>& max_shape,
+    const std::map<std::string, std::vector<int32_t>>& opt_shape,
+    const std::vector<std::string>& names);
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/utils/io_utils_tester.cc
+++ b/paddle/fluid/inference/utils/io_utils_tester.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/inference/utils/io_utils.h"
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <utility>
 #include "paddle/fluid/inference/api/helper.h"

 namespace paddle {
@@ -93,3 +94,28 @@ TEST(infer_io_utils, tensors) {
        paddle::inference::pd_tensor_equal(tensors_in[i], tensors_out[i]));
  }
 }
+
+TEST(shape_info_io, read_and_write) {
+  const std::string path = "test_shape_info_io";
+  std::map<std::string, std::vector<int32_t>> min_shape, max_shape, opt_shape;
+  min_shape.insert(
+      std::make_pair("test1", std::vector<int32_t>{1, 3, 112, 112}));
+  max_shape.insert(
+      std::make_pair("test1", std::vector<int32_t>{1, 3, 224, 224}));
+  opt_shape.insert(
+      std::make_pair("test1", std::vector<int32_t>{1, 3, 224, 224}));
+  paddle::inference::SerializeShapeRangeInfo(path, min_shape, max_shape,
+                                             opt_shape);
+  min_shape.clear();
+  max_shape.clear();
+  opt_shape.clear();
+  opt_shape.insert(
+      std::make_pair("test2", std::vector<int32_t>{1, 3, 224, 224}));
+  paddle::inference::DeserializeShapeRangeInfo(path, &min_shape, &max_shape,
+                                               &opt_shape);
+
+  min_shape.insert(std::make_pair("test1", std::vector<int32_t>{1, 3, 56, 56}));
+  std::vector<std::string> names{"test1"};
+  paddle::inference::UpdateShapeRangeInfo(path, min_shape, max_shape, opt_shape,
+                                          names);
+}
--- a/paddle/fluid/inference/utils/shape_range_info.proto
+++ b/paddle/fluid/inference/utils/shape_range_info.proto
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto2";
+package paddle.inference.proto;
+
+// To support trt dynamic shape, record the runtime shape
+// information of all tmp tensors in the Compution graph.
+message ShapeRangeInfos {
+  message ShapeRangeInfo {
+    required string name = 1;
+    repeated int32 min_shape = 2;
+    repeated int32 max_shape = 3;
+    repeated int32 opt_shape = 4;
+  }
+
+  repeated ShapeRangeInfo shape_range_info = 1;
+}
--- a/paddle/fluid/operators/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/operators/tensorrt/CMakeLists.txt
-op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
+op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter infer_io_utils analysis_helper)
 file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(tensorrt_engine);\n")
 nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
  DEPS tensorrt_engine_op

--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -14,6 +14,7 @@

 #pragma once

+#include "paddle/fluid/framework/scope.h"
 #ifdef PADDLE_WITH_CUDA

 #include <memory>
@@ -30,6 +31,7 @@
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
+#include "paddle/fluid/inference/utils/io_utils.h"

 namespace paddle {
 namespace inference {
@@ -77,18 +79,19 @@ static void RuntimeStaticShapeCheck(std::vector<int64_t> runtime_input_shape,
 }

 static void RuntimeDynamicShapeCheck(
-    const std::string &x, const std::vector<int64_t> &runtime_input_shape,
-    const std::vector<int> &min_input_shape,
-    const std::vector<int> &max_input_shape) {
-  PADDLE_ENFORCE_EQ(runtime_input_shape.size(), min_input_shape.size(),
-                    platform::errors::InvalidArgument(
-                        "TRT engine runtime input dims size(%d) inconsistent "
-                        "with the dynamic shape size(%d)",
-                        runtime_input_shape.size(), min_input_shape.size()));
+    const std::string &x, const std::vector<int32_t> &runtime_input_shape,
+    const std::vector<int32_t> &min_input_shape,
+    const std::vector<int32_t> &max_input_shape) {
+  // PADDLE_ENFORCE_EQ(
+  //     runtime_input_shape.size(), min_input_shape.size(),
+  //     platform::errors::InvalidArgument(
+  //         "TRT engine runtime input %s dims size(%d) inconsistent "
+  //         "with the dynamic shape size(%d)",
+  //         x, runtime_input_shape.size(), min_input_shape.size()));
  auto is_input_shape_valid = [&](
-      const std::vector<int64_t> &runtime_input_shape,
-      const std::vector<int> &min_input_shape,
-      const std::vector<int> &max_input_shape) -> bool {
+      const std::vector<int32_t> &runtime_input_shape,
+      const std::vector<int32_t> &min_input_shape,
+      const std::vector<int32_t> &max_input_shape) -> bool {
    for (size_t i = 0; i < runtime_input_shape.size(); i++) {
      if (runtime_input_shape[i] <= max_input_shape[i] &&
          runtime_input_shape[i] >= min_input_shape[i]) {
@@ -128,6 +131,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
 private:
  std::vector<std::string> input_names_;
  std::unordered_set<std::string> param_names_;
+  std::vector<std::string> runtime_input_names_;
  mutable TensorRTEngine *trt_engine_{nullptr};
  int max_batch_size_;
  int workspace_size_;
@@ -141,7 +145,14 @@ class TensorRTEngineOp : public framework::OperatorBase {
  bool calibration_mode_;
  int predictor_id_;
  int device_id_;
+  bool allow_build_at_runtime_{false};
+  std::string shape_range_info_path_;
+  std::string model_opt_cache_dir_;
+  bool use_static_engine_;
  AnalysisConfig::Precision precision_mode_;
+  std::map<std::string, std::vector<int>> min_input_shape_{};
+  std::map<std::string, std::vector<int>> max_input_shape_{};
+  std::map<std::string, std::vector<int>> opt_input_shape_{};

 public:
  TensorRTEngineOp(const std::string &type,
@@ -160,11 +171,57 @@ class TensorRTEngineOp : public framework::OperatorBase {
    engine_key_ = Attr<std::string>("engine_key");
    calibration_engine_key_ = Attr<std::string>("calibration_engine_key");
    predictor_id_ = Attr<int>("predictor_id");
+    shape_range_info_path_ = Attr<std::string>("shape_range_info_path");
+    allow_build_at_runtime_ = Attr<bool>("allow_build_at_runtime");
+    use_static_engine_ = Attr<bool>("use_static_engine");
+    if (use_static_engine_) {
+      model_opt_cache_dir_ = Attr<std::string>("model_opt_cache_dir");
+    }
+
+    if (HasAttr("dynamic_shape_names") && HasAttr("min_input_shape") &&
+        HasAttr("max_input_shape") && HasAttr("opt_input_shape")) {
+      std::vector<std::string> dynamic_shape_names;
+      std::vector<std::vector<int>> min_input_shapes;
+      std::vector<std::vector<int>> max_input_shapes;
+      std::vector<std::vector<int>> opt_input_shapes;
+      std::vector<int> dynamic_shape_lens;
+      dynamic_shape_names =
+          Attr<std::vector<std::string>>("dynamic_shape_names");
+      std::vector<int> min_shapes = Attr<std::vector<int>>("min_input_shape");
+      std::vector<int> max_shapes = Attr<std::vector<int>>("max_input_shape");
+      std::vector<int> opt_shapes = Attr<std::vector<int>>("opt_input_shape");
+      dynamic_shape_lens = Attr<std::vector<int>>("dynamic_shape_lens");
+      int idx = 0;
+      for (size_t i = 0; i < dynamic_shape_lens.size(); ++i) {
+        std::vector<int> tmp1, tmp2, tmp3;
+        for (int j = 0; j < dynamic_shape_lens[i]; ++j) {
+          tmp1.push_back(min_shapes[idx]);
+          tmp2.push_back(max_shapes[idx]);
+          tmp3.push_back(opt_shapes[idx++]);
+        }
+        min_input_shapes.emplace_back(tmp1);
+        max_input_shapes.emplace_back(tmp2);
+        opt_input_shapes.emplace_back(tmp3);
+      }
+
+      for (size_t i = 0; i < dynamic_shape_names.size(); ++i) {
+        min_input_shape_.insert(
+            std::make_pair(dynamic_shape_names[i], min_input_shapes[i]));
+        max_input_shape_.insert(
+            std::make_pair(dynamic_shape_names[i], max_input_shapes[i]));
+        opt_input_shape_.insert(
+            std::make_pair(dynamic_shape_names[i], opt_input_shapes[i]));
+      }
+    }

    auto params = Attr<std::vector<std::string>>("parameters");
    for (const auto &param : params) {
      param_names_.insert(param);
    }
+    for (auto &x : input_names_) {
+      if (param_names_.count(x)) continue;
+      runtime_input_names_.emplace_back(x);
+    }
    // calibration_mode is ture represents we need to
    // generate the calibration table data.
    calibration_mode_ =
@@ -210,6 +267,78 @@ class TensorRTEngineOp : public framework::OperatorBase {
      return;
    }
    auto *trt_engine = GetEngine(scope, dev_place);
+    if (trt_engine->with_dynamic_shape()) {
+      // get runtime input shapes.
+      std::map<std::string, std::vector<int32_t>> runtime_input_shape;
+      for (auto name : runtime_input_names_) {
+        auto &t = inference::analysis::GetFromScope<framework::LoDTensor>(scope,
+                                                                          name);
+        VLOG(4) << "trt engine runtime input name(" << name << "), dims("
+                << t.dims() << ")";
+        auto t_shape = framework::vectorize<int32_t>(t.dims());
+        runtime_input_shape.insert(std::make_pair(name, t_shape));
+      }
+
+      if (!allow_build_at_runtime_) {
+        std::map<std::string, std::vector<int>> min_input_shape =
+            trt_engine->min_input_shape();
+        std::map<std::string, std::vector<int>> max_input_shape =
+            trt_engine->max_input_shape();
+        for (auto &x : runtime_input_names_) {
+          PADDLE_ENFORCE_EQ(
+              min_input_shape.count(x), true,
+              platform::errors::InvalidArgument(
+                  "Input %s not found in TRT engine min_input_shape.", x));
+          PADDLE_ENFORCE_EQ(
+              max_input_shape.count(x), true,
+              platform::errors::InvalidArgument(
+                  "Input %s not found in TRT engine max_input_shape.", x));
+          RuntimeDynamicShapeCheck(x, runtime_input_shape[x],
+                                   min_input_shape[x], max_input_shape[x]);
+        }
+      } else {
+        // compare runtime_input_shape and trt_engine dynamic shapes.
+        std::vector<std::string> shape_changed_name;
+        bool is_adjusted = trt_engine->AdjustDynamicShapeRange(
+            runtime_input_shape, &shape_changed_name);
+        if (is_adjusted) {
+          LOG(INFO) << "Adjust dynamic shape range, rebuild trt engine!";
+          trt_engine->ResetContext();
+          trt_engine->ClearTensorMap();
+          auto *anc = scope.parent();
+          while (anc && anc->parent()) {
+            anc = anc->parent();
+          }
+          if (anc == nullptr) {
+            anc = &scope;
+          }
+          PrepareTRTEngine(*anc, trt_engine);
+
+          // update shape_range_info_pbtxt
+          if (!shape_range_info_path_.empty()) {
+            inference::UpdateShapeRangeInfo(
+                shape_range_info_path_, trt_engine->min_input_shape(),
+                trt_engine->max_input_shape(), trt_engine->optim_input_shape(),
+                shape_changed_name);
+          }
+
+          if (use_static_engine_) {
+            nvinfer1::IHostMemory *serialized_engine_data =
+                trt_engine->Serialize();
+            std::string trt_engine_serialized_data =
+                std::string((const char *)serialized_engine_data->data(),
+                            serialized_engine_data->size());
+            inference::analysis::SaveTrtEngineSerializedDataToFile(
+                inference::analysis::GetTrtEngineSerializedPath(
+                    model_opt_cache_dir_, engine_key_),
+                trt_engine_serialized_data);
+            LOG(INFO) << "Save TRT Optimized Info to "
+                      << inference::analysis::GetTrtEngineSerializedPath(
+                             model_opt_cache_dir_, engine_key_);
+          }
+        }
+      }
+    }
    RunTrt(scope, dev_place, trt_engine);
  }

@@ -273,7 +402,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
        reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();

    PADDLE_ENFORCE_EQ(
-        input_names_.empty(), false,
+        runtime_input_names_.empty(), false,
        platform::errors::PreconditionNotMet(
            "TensorRT engine needs at least one input, but no input is found. "
            "Please check if you set the input correctly."));
@@ -283,16 +412,12 @@ class TensorRTEngineOp : public framework::OperatorBase {

    int num_inputs = 0;

-    for (const auto &x : Inputs("Xs")) {
-      if (param_names_.count(x)) continue;
-      num_inputs += 1;
-    }
+    num_inputs += runtime_input_names_.size();
    const int num_bindings = num_inputs + Outputs("Ys").size();
    std::vector<void *> buffers(num_bindings);

    // Bind input tensor to TRT.
-    for (const auto &x : Inputs("Xs")) {
-      if (param_names_.count(x)) continue;
+    for (const auto &x : runtime_input_names_) {
      // convert input and copy to TRT engine's buffer
      auto &t =
          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
@@ -341,22 +466,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
        }
      } else {
 #if IS_TRT_VERSION_GE(6000)
-        std::map<std::string, std::vector<int>> min_input_shape =
-            engine->min_input_shape();
-        std::map<std::string, std::vector<int>> max_input_shape =
-            engine->max_input_shape();
-        PADDLE_ENFORCE_EQ(
-            min_input_shape.count(x), true,
-            platform::errors::InvalidArgument(
-                "Input %s not found in TRT engine min_input_shape.", x));
-        PADDLE_ENFORCE_EQ(
-            max_input_shape.count(x), true,
-            platform::errors::InvalidArgument(
-                "Input %s not found in TRT engine max_input_shape.", x));
-        auto x_min_input_shape = min_input_shape[x];
-        auto x_max_input_shape = max_input_shape[x];
-        RuntimeDynamicShapeCheck(x, t_shape, x_min_input_shape,
-                                 x_max_input_shape);
        auto *trt_context = engine->context();
        trt_context->setBindingDimensions(
            bind_index, inference::tensorrt::Vec2TRT_Dims(t_shape, x, true));
@@ -460,7 +569,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
          inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
              .Create(engine_key_ + std::to_string(predictor_id_),
                      max_batch_size_, workspace_size_, precision_mode_,
-                      calibrator_.get(), device_id_);
+                      calibrator_.get(), device_id_, min_input_shape_,
+                      max_input_shape_, opt_input_shape_);
      PrepareTRTEngine(scope, trt_engine_);
    }
    return trt_engine_;

--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -56,7 +56,7 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,

 using inference::analysis::SetAttr;

-TEST(TensorRTEngineOp, manual) {
+void DynamicShapeTest(bool allow_build_at_runtime) {
  framework::ProgramDesc program;
  auto* block_ = program.Proto()->add_blocks();
  block_->set_idx(0);
@@ -116,6 +116,15 @@ TEST(TensorRTEngineOp, manual) {
  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
  int device_id = 0;
  engine_op_desc.SetAttr("gpu_id", device_id);
+  engine_op_desc.SetAttr("shape_range_info_path", std::string(""));
+  engine_op_desc.SetAttr("model_opt_cache_dir", std::string(""));
+  engine_op_desc.SetAttr("allow_build_at_runtime", allow_build_at_runtime);
+  engine_op_desc.SetAttr("use_static_engine", true);
+  engine_op_desc.SetAttr("dynamic_shape_names", std::vector<std::string>{"x"});
+  engine_op_desc.SetAttr("dynamic_shape_lens", std::vector<int>{4});
+  engine_op_desc.SetAttr("min_input_shape", std::vector<int>{1, 4, 1, 1});
+  engine_op_desc.SetAttr("max_input_shape", std::vector<int>{2, 4, 1, 1});
+  engine_op_desc.SetAttr("opt_input_shape", std::vector<int>{2, 4, 1, 1});

  LOG(INFO) << "create engine op";
  auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
@@ -125,7 +134,10 @@ TEST(TensorRTEngineOp, manual) {
  platform::CUDAPlace place;
  platform::CUDADeviceContext ctx(place);
  // Prepare variables.
-  CreateCUDATensor(&scope, "x", std::vector<int64_t>({2, 4}));
+  if (allow_build_at_runtime)
+    CreateCUDATensor(&scope, "x", std::vector<int64_t>({3, 4, 1, 1}));
+  else
+    CreateCUDATensor(&scope, "x", std::vector<int64_t>({2, 4, 1, 1}));
  CreateCUDATensor(&scope, "y", std::vector<int64_t>({4, 6}));
  CreateCUDATensor(&scope, "z", std::vector<int64_t>({2, 6}));

@@ -137,6 +149,11 @@ TEST(TensorRTEngineOp, manual) {
  engine_op->Run(scope, place);
 }

+TEST(TensorRTEngineOp, manual) {
+  DynamicShapeTest(false);
+  DynamicShapeTest(true);
+}
+
 void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
  framework::ProgramDesc program;
  framework::Scope scope;
@@ -220,6 +237,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
  int device_id = 0;
  engine_op_desc.SetAttr("gpu_id", device_id);
+  engine_op_desc.SetAttr("shape_range_info_path", std::string(""));
+  engine_op_desc.SetAttr("model_opt_cache_dir", std::string(""));
+  engine_op_desc.SetAttr("allow_build_at_runtime", false);
+  engine_op_desc.SetAttr("use_static_engine", false);

  auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);

@@ -228,7 +249,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
 }

 // Test with a larger FC layer.
-TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
+// TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }

 }  // namespace operators
 }  // namespace paddle

--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -538,7 +538,8 @@ void BindAnalysisConfig(py::module *m) {
      .def("switch_ir_optim", &AnalysisConfig::SwitchIrOptim,
           py::arg("x") = true)
      .def("ir_optim", &AnalysisConfig::ir_optim)
-      .def("enable_memory_optim", &AnalysisConfig::EnableMemoryOptim)
+      .def("enable_memory_optim", &AnalysisConfig::EnableMemoryOptim,
+           py::arg("x") = true)
      .def("enable_profile", &AnalysisConfig::EnableProfile)
      .def("disable_glog_info", &AnalysisConfig::DisableGlogInfo)
      .def("glog_info_disabled", &AnalysisConfig::glog_info_disabled)
@@ -569,6 +570,16 @@ void BindAnalysisConfig(py::module *m) {
           &AnalysisConfig::tensorrt_dynamic_shape_enabled)
      .def("enable_tensorrt_oss", &AnalysisConfig::EnableTensorRtOSS)
      .def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled)
+      .def("collect_shape_range_info", &AnalysisConfig::CollectShapeRangeInfo)
+      .def("shape_range_info_path", &AnalysisConfig::shape_range_info_path)
+      .def("shape_range_info_collected",
+           &AnalysisConfig::shape_range_info_collected)
+      .def("enable_tuned_tensorrt_dynamic_shape",
+           &AnalysisConfig::EnableTunedTensorRtDynamicShape)
+      .def("tuned_tensorrt_dynamic_shape",
+           &AnalysisConfig::tuned_tensorrt_dynamic_shape)
+      .def("trt_allow_build_at_runtime",
+           &AnalysisConfig::trt_allow_build_at_runtime)
      .def("exp_disable_tensorrt_ops", &AnalysisConfig::Exp_DisableTensorRtOPs)
      .def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA,
           py::arg("dla_core") = 0)

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tuned_dynamic_shape.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tuned_dynamic_shape.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+paddle.enable_static()
+import paddle.fluid as fluid
+from paddle.inference import Config, Predictor, create_predictor
+
+
+class TRTTunedDynamicShapeTest(unittest.TestCase):
+    def get_model(self):
+        place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 6, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=1,
+                padding=0,
+                bias_attr=False,
+                act=None)
+        exe.run(startup_program)
+        serialized_program = paddle.static.serialize_program(
+            data, conv_out, program=main_program)
+        serialized_params = paddle.static.serialize_persistables(
+            data, conv_out, executor=exe, program=main_program)
+        return serialized_program, serialized_params
+
+    def get_config(self, model, params, tuned=False):
+        config = Config()
+        config.set_model_buffer(model, len(model), params, len(params))
+        config.enable_use_gpu(100, 0)
+        config.set_optim_cache_dir('tuned_test')
+        if tuned:
+            config.collect_shape_range_info('shape_range.pbtxt')
+        else:
+            config.enable_tensorrt_engine(
+                workspace_size=1024,
+                max_batch_size=1,
+                min_subgraph_size=0,
+                precision_mode=paddle.inference.PrecisionType.Float32,
+                use_static=True,
+                use_calib_mode=False)
+            config.enable_tuned_tensorrt_dynamic_shape('shape_range.pbtxt',
+                                                       True)
+
+        return config
+
+    def predictor_run(self, config, in_data):
+        predictor = create_predictor(config)
+        in_names = predictor.get_input_names()
+        in_handle = predictor.get_input_handle(in_names[0])
+        in_handle.copy_from_cpu(in_data)
+        predictor.run()
+
+    def test_tuned_dynamic_shape_run(self):
+        program, params = self.get_model()
+
+        config = self.get_config(program, params, tuned=True)
+        self.predictor_run(config, np.ones((1, 6, 64, 64)).astype(np.float32))
+
+        config2 = self.get_config(program, params, tuned=False)
+        self.predictor_run(config2, np.ones((1, 6, 32, 32)).astype(np.float32))
+
+
+if __name__ == '__main__':
+    unittest.main()