From 7c96efedee58e14221f29fe210e44916db865aa9 Mon Sep 17 00:00:00 2001
From: Wilber <jiweibo@baidu.com>
Date: Tue, 14 Sep 2021 10:23:30 +0800
Subject: [PATCH] [Inference] Add tuned trt_dynamic_shape mode. (#34806)

---
 paddle/fluid/inference/analysis/argument.h    |   6 +
 .../inference/analysis/ir_pass_manager.cc     |  18 +-
 .../analysis/ir_passes/CMakeLists.txt         |   2 +-
 .../ir_passes/tensorrt_subgraph_pass.cc       |  40 ++--
 .../ir_params_sync_among_devices_pass.cc      |   9 +-
 paddle/fluid/inference/api/CMakeLists.txt     |   2 +-
 paddle/fluid/inference/api/analysis_config.cc |  49 ++++-
 .../fluid/inference/api/analysis_predictor.cc |  90 +++++++++
 .../fluid/inference/api/analysis_predictor.h  |  10 +
 .../api/analysis_predictor_tester.cc          |  56 +++++-
 .../inference/api/paddle_analysis_config.h    |  57 +++++-
 paddle/fluid/inference/tensorrt/engine.cc     |   4 +
 paddle/fluid/inference/tensorrt/engine.h      |  61 ++++++
 paddle/fluid/inference/tensorrt/helper.h      |  10 +
 .../tests/api/trt_dynamic_shape_test.cc       |  66 +++++++
 .../inference/tests/api/trt_mobilenet_test.cc |  18 ++
 paddle/fluid/inference/utils/CMakeLists.txt   |   4 +-
 paddle/fluid/inference/utils/io_utils.cc      | 105 ++++++++++
 paddle/fluid/inference/utils/io_utils.h       |  24 +++
 .../fluid/inference/utils/io_utils_tester.cc  |  26 +++
 .../inference/utils/shape_range_info.proto    |  29 +++
 .../fluid/operators/tensorrt/CMakeLists.txt   |   2 +-
 .../operators/tensorrt/tensorrt_engine_op.h   | 180 ++++++++++++++----
 .../tensorrt/tensorrt_engine_op_test.cc       |  27 ++-
 paddle/fluid/pybind/inference_api.cc          |  13 +-
 .../inference/test_trt_tuned_dynamic_shape.py |  88 +++++++++
 26 files changed, 929 insertions(+), 67 deletions(-)
 create mode 100644 paddle/fluid/inference/utils/shape_range_info.proto
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_tuned_dynamic_shape.py
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 255c6ca75df..b24005cb6d9 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -212,6 +212,12 @@ struct Argument {
                       bool);
   DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool);
   DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool);
+  DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path,
+                      TensorRtShapeRangeInfoPath, std::string);
+  DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, TensorRtTunedDynamicShape,
+                      bool);
+  DECL_ARGUMENT_FIELD(tensorrt_allow_build_at_runtime,
+                      TensorRtAllowBuildAtRuntime, bool);
 
   DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
   DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 4bb08dc96b1..8eb7e8d1388 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -146,6 +146,14 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
       pass->Set("use_static_engine", new bool(use_static_engine));
       pass->Set("model_from_memory", new bool(argument->model_from_memory()));
+
+      // tuned trt dynamic_shape
+      pass->Set("trt_shape_range_info_path",
+                new std::string(argument->tensorrt_shape_range_info_path()));
+      pass->Set("trt_tuned_dynamic_shape",
+                new bool(argument->tensorrt_tuned_dynamic_shape()));
+      pass->Set("trt_allow_build_at_runtime",
+                new bool(argument->tensorrt_allow_build_at_runtime()));
       pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
                                        argument->max_input_shape()));
       pass->Set("min_input_shape", new std::map<std::string, std::vector<int>>(
@@ -153,17 +161,17 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("optim_input_shape",
                 new std::map<std::string, std::vector<int>>(
                     argument->optim_input_shape()));
-      bool with_dynamic_shape = argument->max_input_shape().size() > 0 &&
-                                argument->min_input_shape().size() > 0 &&
-                                argument->optim_input_shape().size() > 0;
+      bool with_dynamic_shape = (argument->max_input_shape().size() > 0 &&
+                                 argument->min_input_shape().size() > 0 &&
+                                 argument->optim_input_shape().size() > 0) ||
+                                argument->tensorrt_tuned_dynamic_shape();
       pass->Set("with_dynamic_shape", new bool(with_dynamic_shape));
       pass->Set("trt_disabled_ops", new std::vector<std::string>(
                                         argument->tensorrt_disabled_ops()));
       pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla()));
       pass->Set("trt_dla_core", new int(argument->tensorrt_dla_core()));
       // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will
-      // not
-      // run fp16.
+      // not run fp16.
       pass->Set("disable_trt_plugin_fp16",
                 new bool(argument->disable_trt_plugin_fp16()));
     } else if (pass_name == "dlnne_subgraph_pass") {
diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
index 330f7a99847..7faef7d391f 100644
--- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
@@ -1,7 +1,7 @@
 cc_library(subgraph_util SRCS subgraph_util.cc DEPS subgraph_detector)
 
 if (WITH_GPU AND TENSORRT_FOUND)
-  cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_util tensorrt_op_teller)
+  cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_util tensorrt_op_teller infer_io_utils)
 
   set(analysis_deps ${analysis_deps}
           subgraph_util tensorrt_subgraph_pass
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index f57f07883dc..a21118e23aa 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -22,6 +22,7 @@
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/op_teller.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
 
 namespace paddle {
 namespace inference {
@@ -197,6 +198,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   auto opt_input_shape =
       Get<std::map<std::string, std::vector<int>>>("optim_input_shape");
 
+  auto allow_build_at_runtime = Get<bool>("trt_allow_build_at_runtime");
+  auto shape_range_info_path = Get<std::string>("trt_shape_range_info_path");
+  auto trt_tuned_dynamic_shape = Get<bool>("trt_tuned_dynamic_shape");
+  int max_batch_size = Get<int>("max_batch_size");
+  if (trt_tuned_dynamic_shape) {
+    VLOG(1) << "trt dynamic_shape deserialize from " << shape_range_info_path;
+    inference::DeserializeShapeRangeInfo(shape_range_info_path,
+                                         &min_input_shape, &max_input_shape,
+                                         &opt_input_shape);
+  }
+
   // The following procedure is used to rename all the intermediate
   // variables and the output variables of the subgraph.
   // Why we do this?
@@ -242,12 +254,14 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
 
   op_desc->SetBlockAttr("sub_block", new_block);
   op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString());
-  op_desc->SetAttr("max_batch_size", Get<int>("max_batch_size"));
+  op_desc->SetAttr("max_batch_size", max_batch_size);
   op_desc->SetAttr("workspace_size", Get<int>("workspace_size"));
   op_desc->SetAttr("gpu_id", Get<int>("gpu_device_id"));
   op_desc->SetAttr("output_name_mapping", output_mapping);
   op_desc->SetAttr("origin_output_dims", renamed_output_dims);
   op_desc->SetAttr("parameters", params);
+  op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime);
+  op_desc->SetAttr("shape_range_info_path", shape_range_info_path);
 
   // we record all inputs' shapes in attr to check if they are consistent
   // with the real inputs' shapes retrieved from scope when trt runs.
@@ -259,6 +273,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   }
 
   auto use_static_engine = Get<bool>("use_static_engine");
+  op_desc->SetAttr("use_static_engine", use_static_engine);
+  if (use_static_engine)
+    op_desc->SetAttr("model_opt_cache_dir",
+                     Get<std::string>("model_opt_cache_dir"));
+
   // TODO(NHZlX)
   // There are models with the same structure but the different parameters,
   // when running in the 'use_serialize' mode, there is a bug.
@@ -266,12 +285,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   // So we use seperate engine keys in serialization and calibration.
   auto engine_key = GenerateEngineKey(
       input_names_with_id, output_names_with_id, std::to_string(0),
-      std::to_string(Get<int>("max_batch_size")),
+      std::to_string(max_batch_size),
       std::to_string(static_cast<int>(precision_mode)), false);
-  auto calibration_engine_key = GenerateEngineKey(
-      input_names_with_id, output_names_with_id, std::to_string(0),
-      std::to_string(Get<int>("max_batch_size")),
-      std::to_string(static_cast<int>(precision_mode)), true);
+  auto calibration_engine_key =
+      GenerateEngineKey(input_names_with_id, output_names_with_id,
+                        std::to_string(0), std::to_string(max_batch_size),
+                        std::to_string(static_cast<int>(precision_mode)), true);
   auto predictor_id = Get<int>("predictor_id");
 
   // Get "" when there is no cached calibration table data.
@@ -345,11 +364,10 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   bool disable_trt_plugin_fp16 = Get<bool>("disable_trt_plugin_fp16");
   tensorrt::TensorRTEngine *trt_engine =
       inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
-          .Create(engine_key + std::to_string(predictor_id),
-                  Get<int>("max_batch_size"), Get<int>("workspace_size"),
-                  precision_mode, calibrator.get(), Get<int>("gpu_device_id"),
-                  min_input_shape, max_input_shape, opt_input_shape,
-                  disable_trt_plugin_fp16);
+          .Create(engine_key + std::to_string(predictor_id), max_batch_size,
+                  Get<int>("workspace_size"), precision_mode, calibrator.get(),
+                  Get<int>("gpu_device_id"), min_input_shape, max_input_shape,
+                  opt_input_shape, disable_trt_plugin_fp16);
   trt_engine->SetUseOSS(Get<bool>("use_oss"));
   trt_engine->SetUseDLA(Get<bool>("trt_use_dla"));
   trt_engine->SetDLACore(Get<int>("trt_dla_core"));
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index f127478b5f2..9993bb37d51 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -55,10 +55,17 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // We get all the vars from local_scope instead of the ProgramDesc.
   // Because there exists the case that new parameter variables are not added to
   // the program in the analysis pass.
+  bool reserve_cpu_weights = false;
+  if (argument->tensorrt_allow_build_at_runtime_valid() &&
+      argument->tensorrt_allow_build_at_runtime()) {
+    reserve_cpu_weights = true;
+  }
   for (auto &var_name : all_vars) {
     if (std::count(repetitive_params.begin(), repetitive_params.end(),
                    var_name)) {
-      scope->EraseVars({var_name});
+      if (!reserve_cpu_weights) {
+        scope->EraseVars({var_name});
+      }
       continue;
     }
     auto *var = scope->FindLocalVar(var_name);
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 888caa3947a..bbec3eab1ca 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -49,7 +49,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
 endif()
 
 cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} 
-          zero_copy_tensor ir_pass_manager op_compatible_info)
+          zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils)
 
 cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api)
 
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index bf719949134..f9c7be9cd4c 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -158,6 +158,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(trt_use_static_engine_);
   CP_MEMBER(trt_use_calib_mode_);
   CP_MEMBER(trt_use_oss_);
+  CP_MEMBER(trt_tuned_dynamic_shape_);
+  CP_MEMBER(trt_allow_build_at_runtime_);
+  CP_MEMBER(collect_shape_range_info_);
+  CP_MEMBER(shape_range_info_path_);
   // Dlnne related
   CP_MEMBER(use_dlnne_);
   CP_MEMBER(dlnne_min_subgraph_size_);
@@ -653,8 +657,8 @@ float AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
 #endif
 }
 
-void AnalysisConfig::EnableMemoryOptim() {
-  enable_memory_optim_ = true;
+void AnalysisConfig::EnableMemoryOptim(bool x) {
+  enable_memory_optim_ = x;
   Update();
 }
 
@@ -783,6 +787,9 @@ std::string AnalysisConfig::Summary() {
       // dynamic_shape
       os.InsertRow({"tensorrt_enable_dynamic_shape",
                     min_input_shape_.empty() ? "false" : "true"});
+      os.InsertRow({"tensorrt_tuned_dynamic_shape", trt_tuned_dynamic_shape_
+                                                        ? shape_range_info_path_
+                                                        : "false"});
 
       os.InsertRow({"tensorrt_use_oss", trt_use_oss_ ? "true" : "false"});
       os.InsertRow({"tensorrt_use_dla", trt_use_dla_ ? "true" : "false"});
@@ -812,8 +819,46 @@ std::string AnalysisConfig::Summary() {
   os.InsertRow({"memory_optim", enable_memory_optim_ ? "true" : "false"});
   os.InsertRow({"enable_profile", with_profile_ ? "true" : "false"});
   os.InsertRow({"enable_log", with_glog_info_ ? "true" : "false"});
+  os.InsertRow({"collect_shape_range_info",
+                collect_shape_range_info_ ? shape_range_info_path_ : "false"});
 
   return os.PrintTable();
 }
 
+void AnalysisConfig::CollectShapeRangeInfo(
+    const std::string &shape_range_info_path) {
+  LOG(INFO) << "In CollectShapeInfo mode, we will disable optimizations and "
+               "collect the shape information of "
+            << "all intermediate tensors in the compute graph and calculate "
+               "the min_shape, max_shape and opt_shape.";
+  collect_shape_range_info_ = true;
+  PADDLE_ENFORCE_EQ(shape_range_info_path.empty(), false,
+                    platform::errors::InvalidArgument(
+                        "The shape_range_info_path should not be empty, please "
+                        "re-check the argument."));
+  shape_range_info_path_ = shape_range_info_path;
+}
+
+const std::string &AnalysisConfig::shape_range_info_path() {
+  return shape_range_info_path_;
+}
+
+bool AnalysisConfig::shape_range_info_collected() {
+  return collect_shape_range_info_;
+}
+
+void AnalysisConfig::EnableTunedTensorRtDynamicShape(
+    const std::string &shape_range_info_path, bool allow_build_at_runtime) {
+  shape_range_info_path_ = shape_range_info_path;
+  trt_allow_build_at_runtime_ = allow_build_at_runtime;
+  trt_tuned_dynamic_shape_ = true;
+}
+
+bool AnalysisConfig::tuned_tensorrt_dynamic_shape() {
+  return trt_tuned_dynamic_shape_;
+}
+
+bool AnalysisConfig::trt_allow_build_at_runtime() {
+  return trt_allow_build_at_runtime_;
+}
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 25f4cfea0ba..f8491e2abf7 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/api/analysis_predictor.h"
+
 #include <glog/logging.h>
+
 #include <algorithm>
 #include <fstream>
 #include <memory>
@@ -21,6 +23,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/extension/include/ext_op_meta_info.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
@@ -34,6 +37,7 @@
 #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
@@ -570,6 +574,11 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetMaxInputShape(config_.max_input_shape_);
     argument_.SetOptimInputShape(config_.optim_input_shape_);
     argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_);
+    argument_.SetTensorRtShapeRangeInfoPath(config_.shape_range_info_path());
+    argument_.SetTensorRtTunedDynamicShape(
+        config_.tuned_tensorrt_dynamic_shape());
+    argument_.SetTensorRtAllowBuildAtRuntime(
+        config_.trt_allow_build_at_runtime());
   }
 
   if (config_.dlnne_enabled()) {
@@ -915,6 +924,11 @@ bool AnalysisPredictor::ZeroCopyRun() {
 #endif
 
   executor_->Run();
+
+  if (config_.shape_range_info_collected()) {
+    CollectShapeRangeInfo();
+  }
+
   // Fix TensorArray reuse not cleaned bug.
   tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
   tensor_array_batch_cleaner_.ResetTensorArray();
@@ -934,6 +948,78 @@ bool AnalysisPredictor::ZeroCopyRun() {
   return true;
 }
 
+void AnalysisPredictor::CollectShapeRangeInfo() {
+  // if use gpu, sync first.
+  if (config_.use_gpu()) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    paddle::platform::DeviceContextPool &pool =
+        paddle::platform::DeviceContextPool::Instance();
+    auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_);
+    auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
+        pool.Get(gpu_place));
+#ifdef PADDLE_WITH_HIP
+    hipStreamSynchronize(dev_ctx->stream());
+#else
+    cudaStreamSynchronize(dev_ctx->stream());
+#endif
+#endif
+  }
+
+  std::vector<std::string> var_names = sub_scope_->LocalVarNames();
+  for (const auto &name : var_names) {
+    auto *var = sub_scope_->GetVar(name);
+    if (!var->IsType<framework::LoDTensor>()) {
+      continue;
+    }
+    framework::DDim dim = var->Get<framework::LoDTensor>().dims();
+    std::vector<int32_t> shape(dim.size());
+    for (size_t i = 0; i < shape.size(); ++i) shape[i] = dim[i];
+    shape_info_[name].emplace_back(shape);
+  }
+}
+
+void AnalysisPredictor::StatisticShapeRangeInfo() {
+  std::map<std::string, std::vector<int32_t>> min_shapes;
+  std::map<std::string, std::vector<int32_t>> max_shapes;
+  std::map<std::string, std::vector<int32_t>> opt_shapes;
+  for (auto it : shape_info_) {
+    auto name = it.first;
+    auto shapes = it.second;
+
+    std::vector<int32_t> min_shape(shapes[0].begin(), shapes[0].end());
+    std::vector<int32_t> max_shape(shapes[0].begin(), shapes[0].end());
+    std::vector<int32_t> opt_shape(shapes[0].begin(), shapes[0].end());
+
+    auto ShapeMaxFreq = [](const std::map<int32_t, int32_t> &m) -> int32_t {
+      std::vector<std::pair<int32_t, int32_t>> counter;
+      for (auto &it : m) counter.push_back(it);
+      std::sort(
+          counter.begin(), counter.end(),
+          [](std::pair<int32_t, int32_t> &a, std::pair<int32_t, int32_t> &b) {
+            return a.second > b.second;
+          });
+      return counter[0].first;
+    };
+
+    for (size_t d = 0; d < shapes[0].size(); ++d) {
+      std::map<int32_t, int32_t> counter;
+      for (size_t i = 0; i < shapes.size(); ++i) {
+        counter[shapes[i][d]] += 1;
+        if (shapes[i][d] < min_shape[d]) min_shape[d] = shapes[i][d];
+        if (shapes[i][d] > max_shape[d]) max_shape[d] = shapes[i][d];
+      }
+      opt_shape[d] = ShapeMaxFreq(counter);
+    }
+
+    min_shapes[name] = min_shape;
+    max_shapes[name] = max_shape;
+    opt_shapes[name] = opt_shape;
+  }
+
+  inference::SerializeShapeRangeInfo(config_.shape_range_info_path(),
+                                     min_shapes, max_shapes, opt_shapes);
+}
+
 bool AnalysisPredictor::LoadProgramDesc() {
   // Initialize the inference program
   std::string filename;
@@ -1140,6 +1226,10 @@ AnalysisPredictor::~AnalysisPredictor() {
   }
 #endif
 
+  if (config_.shape_range_info_collected()) {
+    StatisticShapeRangeInfo();
+  }
+
   memory::Release(place_);
 }
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 656db31d473..9c360517575 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -91,6 +91,10 @@ class AnalysisPredictor : public PaddlePredictor {
   /// \param[in] AnalysisConfig config
   ///
   explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {
+    if (config_.shape_range_info_collected()) {
+      config_.SwitchIrOptim(false);
+      config_.EnableMemoryOptim(false);
+    }
     predictor_id_ = inference::GetUniqueId();
   }
   ///
@@ -377,6 +381,10 @@ class AnalysisPredictor : public PaddlePredictor {
   FRIEND_TEST(AnalysisPredictor, with_gpu);
 #endif
 
+ private:
+  void StatisticShapeRangeInfo();
+  void CollectShapeRangeInfo();
+
  private:
   AnalysisConfig config_;
   Argument argument_;
@@ -419,6 +427,8 @@ class AnalysisPredictor : public PaddlePredictor {
  private:
   // Some status here that help to determine the status inside the predictor.
   bool status_is_cloned_{false};
+
+  std::map<std::string, std::vector<std::vector<int32_t>>> shape_info_;
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index c14614d6882..87af94a88d4 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -19,8 +19,10 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
 DEFINE_string(dirname, "", "dirname to tests.");
@@ -32,6 +34,8 @@ TEST(AnalysisPredictor, analysis_off) {
   config.SetModel(FLAGS_dirname);
   config.SwitchIrOptim(false);
   LOG(INFO) << config.Summary();
+  LOG(INFO) << "Shape Info collected: " << config.shape_range_info_collected()
+            << ", path: " << config.shape_range_info_path();
 
   auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
   auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
@@ -86,10 +90,6 @@ TEST(AnalysisPredictor, analysis_on) {
   std::vector<PaddleTensor> outputs;
   ASSERT_TRUE(predictor->Run(inputs, &outputs));
 
-  for (auto& output : outputs) {
-    LOG(INFO) << inference::DescribeTensor(output);
-  }
-
   // compare with NativePredictor
   auto naive_predictor =
       CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
@@ -139,6 +139,54 @@ TEST(AnalysisPredictor, ZeroCopy) {
   predictor->TryShrinkMemory();
 }
 
+TEST(AnalysisPredictor, CollectShapeRangeInfo) {
+  AnalysisConfig config;
+  config.SetModel(FLAGS_dirname);
+  config.SwitchUseFeedFetchOps(false);
+  config.EnableUseGpu(100, 0);
+  config.CollectShapeRangeInfo(FLAGS_dirname + "/shape_range.pbtxt");
+  LOG(INFO) << config.Summary();
+  AnalysisConfig config2(config);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config2);
+
+  auto w0 = predictor->GetInputTensor("firstw");
+  auto w1 = predictor->GetInputTensor("secondw");
+  auto w2 = predictor->GetInputTensor("thirdw");
+  auto w3 = predictor->GetInputTensor("forthw");
+
+  w0->Reshape({4, 1});
+  w1->Reshape({4, 1});
+  w2->Reshape({4, 1});
+  w3->Reshape({4, 1});
+
+  auto* w0_data = w0->mutable_data<int64_t>(PaddlePlace::kCPU);
+  auto* w1_data = w1->mutable_data<int64_t>(PaddlePlace::kCPU);
+  auto* w2_data = w2->mutable_data<int64_t>(PaddlePlace::kCPU);
+  auto* w3_data = w3->mutable_data<int64_t>(PaddlePlace::kCPU);
+
+  for (int i = 0; i < 4; i++) {
+    w0_data[i] = i;
+    w1_data[i] = i;
+    w2_data[i] = i;
+    w3_data[i] = i;
+  }
+
+  predictor->ZeroCopyRun();
+
+  auto out = predictor->GetOutputTensor("fc_1.tmp_2");
+  PaddlePlace place;
+  int size = 0;
+  out->data<float>(&place, &size);
+  LOG(INFO) << "output size: " << size / sizeof(float);
+  // TODO(wilber): check for windows
+  // std::map<std::string, std::vector<int32_t>> min_shape;
+  // std::map<std::string, std::vector<int32_t>> max_shape;
+  // std::map<std::string, std::vector<int32_t>> opt_shape;
+  // inference::DeserializeShapeRangeInfo(FLAGS_dirname + "/shape_range.pbtxt",
+  //                                     &min_shape, &max_shape, &opt_shape);
+  // ASSERT_EQ(min_shape.size(), 14u);
+}
+
 TEST(AnalysisPredictor, Clone) {
   AnalysisConfig config;
   config.SetModel(FLAGS_dirname);
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 7a619c447c0..dbdd0983b53 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -380,6 +380,50 @@ struct PD_INFER_DECL AnalysisConfig {
   bool tensorrt_dynamic_shape_enabled() const {
     return !min_input_shape_.empty();
   }
+  ///
+  /// \brief Enable tuned tensorrt dynamic shape.
+  ///
+  /// \param shape_range_info_path the path to shape_info file got in
+  /// CollectShapeInfo
+  /// mode.
+  /// \param allow_build_at_runtime allow build trt engine at runtime.
+  ///
+  void EnableTunedTensorRtDynamicShape(const std::string& shape_range_info_path,
+                                       bool allow_build_at_runtime = true);
+
+  ///
+  /// \brief A boolean state telling whether to use tuned tensorrt dynamic
+  /// shape.
+  ///
+  bool tuned_tensorrt_dynamic_shape();
+
+  ///
+  /// \brief A boolean state telling whether to allow building trt engine at
+  /// runtime.
+  ///
+  bool trt_allow_build_at_runtime();
+
+  ///
+  /// \brief Collect shape info of all tensors in compute graph.
+  ///
+  /// \param shape_range_info_path the path to save shape info.
+  ///
+  void CollectShapeRangeInfo(const std::string& shape_range_info_path);
+
+  ///
+  /// \brief the shape info path in CollectShapeInfo mode.
+  ///
+  /// \return the shape info path.
+  ///
+  const std::string& shape_range_info_path();
+
+  ///
+  /// \brief A boolean state telling whether to collect shape info.
+  ///
+  /// \return bool Whether to collect shape info.
+  ///
+  bool shape_range_info_collected();
+
   ///
   /// \brief Prevent ops running in Paddle-TRT
   /// NOTE: just experimental, not an official stable API, easy to be broken.
@@ -573,7 +617,9 @@ struct PD_INFER_DECL AnalysisConfig {
   /// \brief Turn on memory optimize
   /// NOTE still in development.
   ///
-  void EnableMemoryOptim();
+  /// \param x Whether to enable memory optimize.
+  ///
+  void EnableMemoryOptim(bool x = true);
   ///
   /// \brief A boolean state telling whether the memory optimization is
   /// activated.
@@ -693,6 +739,15 @@ struct PD_INFER_DECL AnalysisConfig {
   std::map<std::string, std::vector<int>> optim_input_shape_{};
   std::vector<std::string> trt_disabled_ops_{};
   bool disable_trt_plugin_fp16_{false};
+  bool trt_allow_build_at_runtime_{false};
+  // tune to get dynamic_shape info.
+  bool trt_tuned_dynamic_shape_{false};
+
+  // In CollectShapeInfo mode, we will collect the shape information of
+  // all intermediate tensors in the compute graph and calculate the
+  // min_shape, max_shape and opt_shape and save in shape_range_info_path_;
+  bool collect_shape_range_info_{false};
+  std::string shape_range_info_path_;
 
   // dlnne related.
   bool use_dlnne_{false};
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index dbaaf2bdc7c..517af24f4d8 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -196,6 +196,10 @@ void TensorRTEngine::FreezeNetwork() {
 #if IS_TRT_VERSION_GE(6000)
     LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode.";
     for (auto &input : min_input_shape_) {
+      VLOG(4) << "TRT dynamic_shape set " << input.first
+              << " min: " << Vec2Str(input.second)
+              << ", max: " << Vec2Str(max_input_shape_[input.first])
+              << ", opt: " << Vec2Str(optim_input_shape_[input.first]);
       optim_profile_->setDimensions(
           input.first.c_str(), nvinfer1::OptProfileSelector::kMIN,
           Vec2TRT_Dims(input.second, input.first, true));
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 721af98ce9b..29324f29006 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -32,6 +32,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/utils/any.h"
 
 namespace paddle {
@@ -226,6 +227,7 @@ class TensorRTEngine {
                      const std::string& name);
   // Set the itensor_map_[name] as the network's output, and set its name.
   void DeclareOutput(const std::string& name);
+  void ClearTensorMap() { itensor_map_.clear(); }
 
   void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
   // Get an ITensor called name.
@@ -244,6 +246,16 @@ class TensorRTEngine {
     }
     return infer_context_[tid].get();
   }
+  void ResetContext() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    const std::thread::id tid = std::this_thread::get_id();
+    PADDLE_ENFORCE_NOT_NULL(
+        infer_engine_,
+        platform::errors::InvalidArgument(
+            "You should build engine first and then set the context."));
+    infer_context_[tid].reset(nullptr);
+    infer_context_.erase(tid);
+  }
 
   nvinfer1::IHostMemory* Serialize() {
     PADDLE_ENFORCE_NOT_NULL(
@@ -364,6 +376,55 @@ class TensorRTEngine {
   ShapeMapType min_input_shape() { return min_input_shape_; }
   ShapeMapType max_input_shape() { return max_input_shape_; }
   ShapeMapType optim_input_shape() { return optim_input_shape_; }
+
+  bool AdjustDynamicShapeRange(const ShapeMapType& runtime_input_shape,
+                               std::vector<std::string>* changed) {
+    bool ret = false;
+    changed->clear();
+    for (const auto& it : runtime_input_shape) {
+      auto name = it.first;
+      auto input_shape = it.second;
+      PADDLE_ENFORCE_EQ(
+          min_input_shape_.count(name), true,
+          platform::errors::InvalidArgument(
+              "TRT dynamic_shape min_input_shape %s not found.", name));
+      PADDLE_ENFORCE_EQ(min_input_shape_[name].size(), input_shape.size(),
+                        platform::errors::InvalidArgument(
+                            "TRT dynamic_shape min_input_shape %s size not "
+                            "equal, the min_input_shape[%s].size()=%d"
+                            ", but the runtime_input_shape[%s].size()=%d.",
+                            name, name, min_input_shape_[name].size(), name,
+                            input_shape.size()));
+      auto bak_min_shape = min_input_shape_[name];
+      auto bak_max_shape = max_input_shape_[name];
+      bool min_change = false;
+      bool max_change = false;
+      for (size_t d = 0; d < input_shape.size(); ++d) {
+        if (input_shape[d] < min_input_shape_[name][d]) {
+          ret = true;
+          min_change = true;
+          min_input_shape_[name][d] = input_shape[d];
+        }
+        if (input_shape[d] > max_input_shape_[name][d]) {
+          ret = true;
+          max_change = true;
+          max_input_shape_[name][d] = input_shape[d];
+        }
+      }
+
+      if (min_change)
+        LOG(INFO) << "refactor shape range: " << name << ", min_shape from "
+                  << Vec2Str(bak_min_shape) << " to "
+                  << Vec2Str(min_input_shape_[name]);
+      if (max_change)
+        LOG(INFO) << "refactor shape range: " << name << ", max_shape from "
+                  << Vec2Str(bak_max_shape) << " to "
+                  << Vec2Str(max_input_shape_[name]);
+      if (min_change || max_change) changed->push_back(name);
+    }
+    return ret;
+  }
+
   bool use_oss() { return use_oss_; }
   bool with_ernie() { return with_ernie_; }
   bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; }
diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
index f0d585e1b40..16595b8a032 100644
--- a/paddle/fluid/inference/tensorrt/helper.h
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -154,6 +154,16 @@ inline void PrintITensorShape(nvinfer1::ITensor* X) {
   std::cout << "]\n";
 }
 
+template <typename T>
+inline std::string Vec2Str(const std::vector<T>& vec) {
+  std::ostringstream os;
+  os << "(";
+  for (size_t i = 0; i < vec.size() - 1; ++i) {
+    os << vec[i] << ",";
+  }
+  os << vec[vec.size() - 1] << ")";
+  return os.str();
+}
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
index 55ee2082e69..4f6742b88b2 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
@@ -143,6 +143,70 @@ void TestDynamic2() {
   }
 }
 
+void TestTunedDynamic() {
+  std::string model_dir =
+      FLAGS_infer_model + "/complex_model_dynamic/complex_model_dynamic2";
+  AnalysisConfig config_tuned;
+  const std::string shape_range = "shape_range.pbtxt";
+  config_tuned.EnableUseGpu(100, 0);
+  config_tuned.SetModel(model_dir + "/model", model_dir + "/params");
+  config_tuned.SwitchUseFeedFetchOps(false);
+  config_tuned.CollectShapeRangeInfo(shape_range);
+
+  int batch_size = 1;
+  auto predictor_tuned = CreatePaddlePredictor(config_tuned);
+
+  auto check_func = [batch_size](PaddlePredictor *predictor) {
+    int channels = 3;
+    int height = 5;
+    int width = 5;
+    int input_num = channels * height * width * 1;
+
+    float *input = new float[input_num];
+    memset(input, 0, input_num * sizeof(float));
+    auto input_names = predictor->GetInputNames();
+    auto input_t = predictor->GetInputTensor(input_names[0]);
+    input_t->Reshape({batch_size, channels, height, width});
+    input_t->copy_from_cpu(input);
+
+    auto input_t1 = predictor->GetInputTensor(input_names[1]);
+    input_t1->Reshape({batch_size, 2, 1, 1});
+    std::vector<float> first;
+    for (int i = 0; i < batch_size * 2; i++) first.push_back(1.0);
+    input_t1->copy_from_cpu(first.data());
+
+    auto input_t2 = predictor->GetInputTensor(input_names[2]);
+    input_t2->Reshape({batch_size, 2, 1, 1});
+    input_t2->copy_from_cpu(first.data());
+
+    ASSERT_TRUE(predictor->ZeroCopyRun());
+
+    std::vector<float> out_data;
+    auto output_names = predictor->GetOutputNames();
+    auto output_t = predictor->GetOutputTensor(output_names[0]);
+    std::vector<int> output_shape = output_t->shape();
+    int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                  std::multiplies<int>());
+    out_data.resize(out_num);
+    output_t->copy_to_cpu(out_data.data());
+  };
+  check_func(predictor_tuned.get());
+
+  // check tuned_dynamic_shape
+  AnalysisConfig config;
+  config.EnableUseGpu(100, 0);
+  std::string cache_dir = "tuned_cache";
+  config.SetOptimCacheDir(cache_dir);
+  delete_cache_files(cache_dir);
+  config.SetModel(model_dir + "/model", model_dir + "/params");
+  config.SwitchUseFeedFetchOps(false);
+  config.EnableTunedTensorRtDynamicShape(shape_range, true);
+  config.EnableTensorRtEngine(1 << 30, batch_size, 0,
+                              AnalysisConfig::Precision::kFloat32, true, false);
+  auto test_predictor = CreatePaddlePredictor(config);
+  check_func(test_predictor.get());
+}
+
 TEST(AnalysisPredictor, trt_dynamic) { TestDynamic(true); }
 TEST(AnalysisPredictor, trt_static) { TestDynamic(false); }
 TEST(AnalysisPredictor, trt_memory_serialize) {
@@ -153,5 +217,7 @@ TEST(AnalysisPredictor, trt_memory_serialize) {
 }
 TEST(AnalysisPredictor, trt_dynamic2) { TestDynamic2(); }
 
+TEST(AnalysisPredictor, trt_tuned_dynamic) { TestTunedDynamic(); }
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
index d5d60cc08ab..a87bf7b085b 100644
--- a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
@@ -47,6 +47,24 @@ TEST(AnalysisPredictor, use_gpu) {
   }
 }
 
+TEST(AnalysisPredictor, collect_shape_range) {
+  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
+  AnalysisConfig config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(model_dir);
+  config.CollectShapeRangeInfo("shape_range.pbtxt");
+
+  std::vector<std::vector<PaddleTensor>> inputs_all;
+  auto predictor = CreatePaddlePredictor(config);
+  SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
+
+  std::vector<PaddleTensor> outputs;
+  for (auto &input : inputs_all) {
+    ASSERT_TRUE(predictor->Run(input, &outputs));
+    predictor->ClearIntermediateTensor();
+  }
+}
+
 }  // namespace inference
 }  // namespace paddle
 
diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt
index 0a034c0de47..9a495194a8a 100644
--- a/paddle/fluid/inference/utils/CMakeLists.txt
+++ b/paddle/fluid/inference/utils/CMakeLists.txt
@@ -1,6 +1,8 @@
 cc_library(benchmark SRCS benchmark.cc DEPS enforce)
 cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
-cc_library(infer_io_utils SRCS io_utils.cc DEPS paddle_inference_api lod_tensor)
+cc_library(infer_io_utils SRCS io_utils.cc DEPS paddle_inference_api lod_tensor shape_range_info_proto)
 cc_test(infer_io_utils_tester SRCS io_utils_tester.cc DEPS infer_io_utils)
 cc_library(table_printer SRCS table_printer.cc)
 cc_test(test_table_printer SRCS table_printer_tester.cc DEPS table_printer)
+
+proto_library(shape_range_info_proto SRCS shape_range_info.proto)
diff --git a/paddle/fluid/inference/utils/io_utils.cc b/paddle/fluid/inference/utils/io_utils.cc
index d01d40181c4..3691285ba3a 100644
--- a/paddle/fluid/inference/utils/io_utils.cc
+++ b/paddle/fluid/inference/utils/io_utils.cc
@@ -13,7 +13,15 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/utils/io_utils.h"
+
+#include <fcntl.h>
+
+#include <utility>
+
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/text_format.h"
 #include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/utils/shape_range_info.pb.h"
 
 namespace paddle {
 namespace inference {
@@ -157,5 +165,102 @@ void DeserializePDTensorsToFile(const std::string &path,
   fin.close();
 }
 
+void SerializeShapeRangeInfo(
+    const std::string &path,
+    const paddle::inference::proto::ShapeRangeInfos &info) {
+  int out_fd = open(path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
+  google::protobuf::io::FileOutputStream *os =
+      new google::protobuf::io::FileOutputStream(out_fd);
+  google::protobuf::TextFormat::Print(info, os);
+  delete os;
+  close(out_fd);
+}
+
+void SerializeShapeRangeInfo(
+    const std::string &path,
+    const std::map<std::string, std::vector<int32_t>> &min_shape,
+    const std::map<std::string, std::vector<int32_t>> &max_shape,
+    const std::map<std::string, std::vector<int32_t>> &opt_shape) {
+  paddle::inference::proto::ShapeRangeInfos shape_range_infos;
+  for (auto it : min_shape) {
+    auto *s = shape_range_infos.add_shape_range_info();
+    s->set_name(it.first);
+    for (size_t i = 0; i < it.second.size(); ++i) {
+      s->add_min_shape(it.second[i]);
+      s->add_max_shape(max_shape.at(it.first)[i]);
+      s->add_opt_shape(opt_shape.at(it.first)[i]);
+    }
+  }
+
+  inference::SerializeShapeRangeInfo(path, shape_range_infos);
+}
+void DeserializeShapeRangeInfo(
+    const std::string &path, paddle::inference::proto::ShapeRangeInfos *info) {
+  int fd = open(path.c_str(), O_RDONLY);
+  google::protobuf::io::FileInputStream *is =
+      new google::protobuf::io::FileInputStream(fd);
+  google::protobuf::TextFormat::Parse(is, info);
+  delete is;
+  close(fd);
+}
+
+void DeserializeShapeRangeInfo(
+    const std::string &path,
+    std::map<std::string, std::vector<int32_t>> *min_shape,
+    std::map<std::string, std::vector<int32_t>> *max_shape,
+    std::map<std::string, std::vector<int32_t>> *opt_shape) {
+  paddle::inference::proto::ShapeRangeInfos shape_range_infos;
+  DeserializeShapeRangeInfo(path, &shape_range_infos);
+  for (int i = 0; i < shape_range_infos.shape_range_info_size(); ++i) {
+    auto info = shape_range_infos.shape_range_info(i);
+    auto name = info.name();
+    if (min_shape->count(name) || max_shape->count(name) ||
+        opt_shape->count(name)) {
+      continue;
+    } else {
+      std::vector<int32_t> tmp(info.min_shape_size());
+      for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.min_shape(k);
+      min_shape->insert(std::make_pair(name, tmp));
+
+      tmp.resize(info.max_shape_size());
+      for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.max_shape(k);
+      max_shape->insert(std::make_pair(name, tmp));
+
+      tmp.resize(info.opt_shape_size());
+      for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.opt_shape(k);
+      opt_shape->insert(std::make_pair(name, tmp));
+    }
+  }
+}
+
+void UpdateShapeRangeInfo(
+    const std::string &path,
+    const std::map<std::string, std::vector<int32_t>> &min_shape,
+    const std::map<std::string, std::vector<int32_t>> &max_shape,
+    const std::map<std::string, std::vector<int32_t>> &opt_shape,
+    const std::vector<std::string> &names) {
+  paddle::inference::proto::ShapeRangeInfos shape_range_infos;
+  DeserializeShapeRangeInfo(path, &shape_range_infos);
+
+  for (int i = 0; i < shape_range_infos.shape_range_info_size(); ++i) {
+    auto *info = shape_range_infos.mutable_shape_range_info(i);
+    for (const auto &name : names) {
+      if (info->name() == name) {
+        info->clear_min_shape();
+        info->clear_max_shape();
+        info->clear_opt_shape();
+        for (size_t j = 0; j < min_shape.at(name).size(); ++j)
+          info->add_min_shape(min_shape.at(name)[j]);
+        for (size_t j = 0; j < max_shape.at(name).size(); ++j)
+          info->add_max_shape(max_shape.at(name)[j]);
+        for (size_t j = 0; j < opt_shape.at(name).size(); ++j)
+          info->add_opt_shape(opt_shape.at(name)[j]);
+        break;
+      }
+    }
+  }
+  inference::SerializeShapeRangeInfo(path, shape_range_infos);
+}
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/utils/io_utils.h b/paddle/fluid/inference/utils/io_utils.h
index de2c7b26d33..682bbdef05e 100644
--- a/paddle/fluid/inference/utils/io_utils.h
+++ b/paddle/fluid/inference/utils/io_utils.h
@@ -19,6 +19,7 @@
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/api/paddle_api.h"
+#include "paddle/fluid/inference/utils/shape_range_info.pb.h"
 
 namespace paddle {
 struct PaddleTensor;
@@ -41,5 +42,28 @@ void SerializePDTensorsToFile(const std::string& path,
                               const std::vector<PaddleTensor>& tensors);
 void DeserializePDTensorsToFile(const std::string& path,
                                 std::vector<PaddleTensor>* tensors);
+
+void SerializeShapeRangeInfo(
+    const std::string& path,
+    const paddle::inference::proto::ShapeRangeInfos& info);
+void SerializeShapeRangeInfo(
+    const std::string& path,
+    const std::map<std::string, std::vector<int32_t>>& min_shape,
+    const std::map<std::string, std::vector<int32_t>>& max_shape,
+    const std::map<std::string, std::vector<int32_t>>& opt_shape);
+void DeserializeShapeRangeInfo(const std::string& path,
+                               paddle::inference::proto::ShapeRangeInfos* info);
+void DeserializeShapeRangeInfo(
+    const std::string& path,
+    std::map<std::string, std::vector<int32_t>>* min_shape,
+    std::map<std::string, std::vector<int32_t>>* max_shape,
+    std::map<std::string, std::vector<int32_t>>* opt_shape);
+
+void UpdateShapeRangeInfo(
+    const std::string& path,
+    const std::map<std::string, std::vector<int32_t>>& min_shape,
+    const std::map<std::string, std::vector<int32_t>>& max_shape,
+    const std::map<std::string, std::vector<int32_t>>& opt_shape,
+    const std::vector<std::string>& names);
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/utils/io_utils_tester.cc b/paddle/fluid/inference/utils/io_utils_tester.cc
index 3ed6de38ad3..766afed4e50 100644
--- a/paddle/fluid/inference/utils/io_utils_tester.cc
+++ b/paddle/fluid/inference/utils/io_utils_tester.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/inference/utils/io_utils.h"
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <utility>
 #include "paddle/fluid/inference/api/helper.h"
 
 namespace paddle {
@@ -93,3 +94,28 @@ TEST(infer_io_utils, tensors) {
         paddle::inference::pd_tensor_equal(tensors_in[i], tensors_out[i]));
   }
 }
+
+TEST(shape_info_io, read_and_write) {
+  const std::string path = "test_shape_info_io";
+  std::map<std::string, std::vector<int32_t>> min_shape, max_shape, opt_shape;
+  min_shape.insert(
+      std::make_pair("test1", std::vector<int32_t>{1, 3, 112, 112}));
+  max_shape.insert(
+      std::make_pair("test1", std::vector<int32_t>{1, 3, 224, 224}));
+  opt_shape.insert(
+      std::make_pair("test1", std::vector<int32_t>{1, 3, 224, 224}));
+  paddle::inference::SerializeShapeRangeInfo(path, min_shape, max_shape,
+                                             opt_shape);
+  min_shape.clear();
+  max_shape.clear();
+  opt_shape.clear();
+  opt_shape.insert(
+      std::make_pair("test2", std::vector<int32_t>{1, 3, 224, 224}));
+  paddle::inference::DeserializeShapeRangeInfo(path, &min_shape, &max_shape,
+                                               &opt_shape);
+
+  min_shape.insert(std::make_pair("test1", std::vector<int32_t>{1, 3, 56, 56}));
+  std::vector<std::string> names{"test1"};
+  paddle::inference::UpdateShapeRangeInfo(path, min_shape, max_shape, opt_shape,
+                                          names);
+}
diff --git a/paddle/fluid/inference/utils/shape_range_info.proto b/paddle/fluid/inference/utils/shape_range_info.proto
new file mode 100644
index 00000000000..fcb2d635b52
--- /dev/null
+++ b/paddle/fluid/inference/utils/shape_range_info.proto
@@ -0,0 +1,29 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto2";
+package paddle.inference.proto;
+
+// To support trt dynamic shape, record the runtime shape
+// information of all tmp tensors in the Compution graph.
+message ShapeRangeInfos {
+  message ShapeRangeInfo {
+    required string name = 1;
+    repeated int32 min_shape = 2;
+    repeated int32 max_shape = 3;
+    repeated int32 opt_shape = 4;
+  }
+
+  repeated ShapeRangeInfo shape_range_info = 1;
+}
diff --git a/paddle/fluid/operators/tensorrt/CMakeLists.txt b/paddle/fluid/operators/tensorrt/CMakeLists.txt
index 6b551d13f1d..0ab66f2fdce 100644
--- a/paddle/fluid/operators/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/operators/tensorrt/CMakeLists.txt
@@ -1,4 +1,4 @@
-op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
+op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter infer_io_utils analysis_helper)
 file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(tensorrt_engine);\n")
 nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
   DEPS tensorrt_engine_op
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 3b27f525b55..46da8e61516 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include "paddle/fluid/framework/scope.h"
 #ifdef PADDLE_WITH_CUDA
 
 #include <memory>
@@ -30,6 +31,7 @@
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
 
 namespace paddle {
 namespace inference {
@@ -77,18 +79,19 @@ static void RuntimeStaticShapeCheck(std::vector<int64_t> runtime_input_shape,
 }
 
 static void RuntimeDynamicShapeCheck(
-    const std::string &x, const std::vector<int64_t> &runtime_input_shape,
-    const std::vector<int> &min_input_shape,
-    const std::vector<int> &max_input_shape) {
-  PADDLE_ENFORCE_EQ(runtime_input_shape.size(), min_input_shape.size(),
-                    platform::errors::InvalidArgument(
-                        "TRT engine runtime input dims size(%d) inconsistent "
-                        "with the dynamic shape size(%d)",
-                        runtime_input_shape.size(), min_input_shape.size()));
+    const std::string &x, const std::vector<int32_t> &runtime_input_shape,
+    const std::vector<int32_t> &min_input_shape,
+    const std::vector<int32_t> &max_input_shape) {
+  // PADDLE_ENFORCE_EQ(
+  //     runtime_input_shape.size(), min_input_shape.size(),
+  //     platform::errors::InvalidArgument(
+  //         "TRT engine runtime input %s dims size(%d) inconsistent "
+  //         "with the dynamic shape size(%d)",
+  //         x, runtime_input_shape.size(), min_input_shape.size()));
   auto is_input_shape_valid = [&](
-      const std::vector<int64_t> &runtime_input_shape,
-      const std::vector<int> &min_input_shape,
-      const std::vector<int> &max_input_shape) -> bool {
+      const std::vector<int32_t> &runtime_input_shape,
+      const std::vector<int32_t> &min_input_shape,
+      const std::vector<int32_t> &max_input_shape) -> bool {
     for (size_t i = 0; i < runtime_input_shape.size(); i++) {
       if (runtime_input_shape[i] <= max_input_shape[i] &&
           runtime_input_shape[i] >= min_input_shape[i]) {
@@ -128,6 +131,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
  private:
   std::vector<std::string> input_names_;
   std::unordered_set<std::string> param_names_;
+  std::vector<std::string> runtime_input_names_;
   mutable TensorRTEngine *trt_engine_{nullptr};
   int max_batch_size_;
   int workspace_size_;
@@ -141,7 +145,14 @@ class TensorRTEngineOp : public framework::OperatorBase {
   bool calibration_mode_;
   int predictor_id_;
   int device_id_;
+  bool allow_build_at_runtime_{false};
+  std::string shape_range_info_path_;
+  std::string model_opt_cache_dir_;
+  bool use_static_engine_;
   AnalysisConfig::Precision precision_mode_;
+  std::map<std::string, std::vector<int>> min_input_shape_{};
+  std::map<std::string, std::vector<int>> max_input_shape_{};
+  std::map<std::string, std::vector<int>> opt_input_shape_{};
 
  public:
   TensorRTEngineOp(const std::string &type,
@@ -160,11 +171,57 @@ class TensorRTEngineOp : public framework::OperatorBase {
     engine_key_ = Attr<std::string>("engine_key");
     calibration_engine_key_ = Attr<std::string>("calibration_engine_key");
     predictor_id_ = Attr<int>("predictor_id");
+    shape_range_info_path_ = Attr<std::string>("shape_range_info_path");
+    allow_build_at_runtime_ = Attr<bool>("allow_build_at_runtime");
+    use_static_engine_ = Attr<bool>("use_static_engine");
+    if (use_static_engine_) {
+      model_opt_cache_dir_ = Attr<std::string>("model_opt_cache_dir");
+    }
+
+    if (HasAttr("dynamic_shape_names") && HasAttr("min_input_shape") &&
+        HasAttr("max_input_shape") && HasAttr("opt_input_shape")) {
+      std::vector<std::string> dynamic_shape_names;
+      std::vector<std::vector<int>> min_input_shapes;
+      std::vector<std::vector<int>> max_input_shapes;
+      std::vector<std::vector<int>> opt_input_shapes;
+      std::vector<int> dynamic_shape_lens;
+      dynamic_shape_names =
+          Attr<std::vector<std::string>>("dynamic_shape_names");
+      std::vector<int> min_shapes = Attr<std::vector<int>>("min_input_shape");
+      std::vector<int> max_shapes = Attr<std::vector<int>>("max_input_shape");
+      std::vector<int> opt_shapes = Attr<std::vector<int>>("opt_input_shape");
+      dynamic_shape_lens = Attr<std::vector<int>>("dynamic_shape_lens");
+      int idx = 0;
+      for (size_t i = 0; i < dynamic_shape_lens.size(); ++i) {
+        std::vector<int> tmp1, tmp2, tmp3;
+        for (int j = 0; j < dynamic_shape_lens[i]; ++j) {
+          tmp1.push_back(min_shapes[idx]);
+          tmp2.push_back(max_shapes[idx]);
+          tmp3.push_back(opt_shapes[idx++]);
+        }
+        min_input_shapes.emplace_back(tmp1);
+        max_input_shapes.emplace_back(tmp2);
+        opt_input_shapes.emplace_back(tmp3);
+      }
+
+      for (size_t i = 0; i < dynamic_shape_names.size(); ++i) {
+        min_input_shape_.insert(
+            std::make_pair(dynamic_shape_names[i], min_input_shapes[i]));
+        max_input_shape_.insert(
+            std::make_pair(dynamic_shape_names[i], max_input_shapes[i]));
+        opt_input_shape_.insert(
+            std::make_pair(dynamic_shape_names[i], opt_input_shapes[i]));
+      }
+    }
 
     auto params = Attr<std::vector<std::string>>("parameters");
     for (const auto &param : params) {
       param_names_.insert(param);
     }
+    for (auto &x : input_names_) {
+      if (param_names_.count(x)) continue;
+      runtime_input_names_.emplace_back(x);
+    }
     // calibration_mode is ture represents we need to
     // generate the calibration table data.
     calibration_mode_ =
@@ -210,6 +267,78 @@ class TensorRTEngineOp : public framework::OperatorBase {
       return;
     }
     auto *trt_engine = GetEngine(scope, dev_place);
+    if (trt_engine->with_dynamic_shape()) {
+      // get runtime input shapes.
+      std::map<std::string, std::vector<int32_t>> runtime_input_shape;
+      for (auto name : runtime_input_names_) {
+        auto &t = inference::analysis::GetFromScope<framework::LoDTensor>(scope,
+                                                                          name);
+        VLOG(4) << "trt engine runtime input name(" << name << "), dims("
+                << t.dims() << ")";
+        auto t_shape = framework::vectorize<int32_t>(t.dims());
+        runtime_input_shape.insert(std::make_pair(name, t_shape));
+      }
+
+      if (!allow_build_at_runtime_) {
+        std::map<std::string, std::vector<int>> min_input_shape =
+            trt_engine->min_input_shape();
+        std::map<std::string, std::vector<int>> max_input_shape =
+            trt_engine->max_input_shape();
+        for (auto &x : runtime_input_names_) {
+          PADDLE_ENFORCE_EQ(
+              min_input_shape.count(x), true,
+              platform::errors::InvalidArgument(
+                  "Input %s not found in TRT engine min_input_shape.", x));
+          PADDLE_ENFORCE_EQ(
+              max_input_shape.count(x), true,
+              platform::errors::InvalidArgument(
+                  "Input %s not found in TRT engine max_input_shape.", x));
+          RuntimeDynamicShapeCheck(x, runtime_input_shape[x],
+                                   min_input_shape[x], max_input_shape[x]);
+        }
+      } else {
+        // compare runtime_input_shape and trt_engine dynamic shapes.
+        std::vector<std::string> shape_changed_name;
+        bool is_adjusted = trt_engine->AdjustDynamicShapeRange(
+            runtime_input_shape, &shape_changed_name);
+        if (is_adjusted) {
+          LOG(INFO) << "Adjust dynamic shape range, rebuild trt engine!";
+          trt_engine->ResetContext();
+          trt_engine->ClearTensorMap();
+          auto *anc = scope.parent();
+          while (anc && anc->parent()) {
+            anc = anc->parent();
+          }
+          if (anc == nullptr) {
+            anc = &scope;
+          }
+          PrepareTRTEngine(*anc, trt_engine);
+
+          // update shape_range_info_pbtxt
+          if (!shape_range_info_path_.empty()) {
+            inference::UpdateShapeRangeInfo(
+                shape_range_info_path_, trt_engine->min_input_shape(),
+                trt_engine->max_input_shape(), trt_engine->optim_input_shape(),
+                shape_changed_name);
+          }
+
+          if (use_static_engine_) {
+            nvinfer1::IHostMemory *serialized_engine_data =
+                trt_engine->Serialize();
+            std::string trt_engine_serialized_data =
+                std::string((const char *)serialized_engine_data->data(),
+                            serialized_engine_data->size());
+            inference::analysis::SaveTrtEngineSerializedDataToFile(
+                inference::analysis::GetTrtEngineSerializedPath(
+                    model_opt_cache_dir_, engine_key_),
+                trt_engine_serialized_data);
+            LOG(INFO) << "Save TRT Optimized Info to "
+                      << inference::analysis::GetTrtEngineSerializedPath(
+                             model_opt_cache_dir_, engine_key_);
+          }
+        }
+      }
+    }
     RunTrt(scope, dev_place, trt_engine);
   }
 
@@ -273,7 +402,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
         reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
 
     PADDLE_ENFORCE_EQ(
-        input_names_.empty(), false,
+        runtime_input_names_.empty(), false,
         platform::errors::PreconditionNotMet(
             "TensorRT engine needs at least one input, but no input is found. "
             "Please check if you set the input correctly."));
@@ -283,16 +412,12 @@ class TensorRTEngineOp : public framework::OperatorBase {
 
     int num_inputs = 0;
 
-    for (const auto &x : Inputs("Xs")) {
-      if (param_names_.count(x)) continue;
-      num_inputs += 1;
-    }
+    num_inputs += runtime_input_names_.size();
     const int num_bindings = num_inputs + Outputs("Ys").size();
     std::vector<void *> buffers(num_bindings);
 
     // Bind input tensor to TRT.
-    for (const auto &x : Inputs("Xs")) {
-      if (param_names_.count(x)) continue;
+    for (const auto &x : runtime_input_names_) {
       // convert input and copy to TRT engine's buffer
       auto &t =
           inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
@@ -341,22 +466,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
         }
       } else {
 #if IS_TRT_VERSION_GE(6000)
-        std::map<std::string, std::vector<int>> min_input_shape =
-            engine->min_input_shape();
-        std::map<std::string, std::vector<int>> max_input_shape =
-            engine->max_input_shape();
-        PADDLE_ENFORCE_EQ(
-            min_input_shape.count(x), true,
-            platform::errors::InvalidArgument(
-                "Input %s not found in TRT engine min_input_shape.", x));
-        PADDLE_ENFORCE_EQ(
-            max_input_shape.count(x), true,
-            platform::errors::InvalidArgument(
-                "Input %s not found in TRT engine max_input_shape.", x));
-        auto x_min_input_shape = min_input_shape[x];
-        auto x_max_input_shape = max_input_shape[x];
-        RuntimeDynamicShapeCheck(x, t_shape, x_min_input_shape,
-                                 x_max_input_shape);
         auto *trt_context = engine->context();
         trt_context->setBindingDimensions(
             bind_index, inference::tensorrt::Vec2TRT_Dims(t_shape, x, true));
@@ -460,7 +569,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
           inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
               .Create(engine_key_ + std::to_string(predictor_id_),
                       max_batch_size_, workspace_size_, precision_mode_,
-                      calibrator_.get(), device_id_);
+                      calibrator_.get(), device_id_, min_input_shape_,
+                      max_input_shape_, opt_input_shape_);
       PrepareTRTEngine(scope, trt_engine_);
     }
     return trt_engine_;
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index 4e88d79dfe4..d2d04a4fa50 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -56,7 +56,7 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
 
 using inference::analysis::SetAttr;
 
-TEST(TensorRTEngineOp, manual) {
+void DynamicShapeTest(bool allow_build_at_runtime) {
   framework::ProgramDesc program;
   auto* block_ = program.Proto()->add_blocks();
   block_->set_idx(0);
@@ -116,6 +116,15 @@ TEST(TensorRTEngineOp, manual) {
   engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
   int device_id = 0;
   engine_op_desc.SetAttr("gpu_id", device_id);
+  engine_op_desc.SetAttr("shape_range_info_path", std::string(""));
+  engine_op_desc.SetAttr("model_opt_cache_dir", std::string(""));
+  engine_op_desc.SetAttr("allow_build_at_runtime", allow_build_at_runtime);
+  engine_op_desc.SetAttr("use_static_engine", true);
+  engine_op_desc.SetAttr("dynamic_shape_names", std::vector<std::string>{"x"});
+  engine_op_desc.SetAttr("dynamic_shape_lens", std::vector<int>{4});
+  engine_op_desc.SetAttr("min_input_shape", std::vector<int>{1, 4, 1, 1});
+  engine_op_desc.SetAttr("max_input_shape", std::vector<int>{2, 4, 1, 1});
+  engine_op_desc.SetAttr("opt_input_shape", std::vector<int>{2, 4, 1, 1});
 
   LOG(INFO) << "create engine op";
   auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
@@ -125,7 +134,10 @@ TEST(TensorRTEngineOp, manual) {
   platform::CUDAPlace place;
   platform::CUDADeviceContext ctx(place);
   // Prepare variables.
-  CreateCUDATensor(&scope, "x", std::vector<int64_t>({2, 4}));
+  if (allow_build_at_runtime)
+    CreateCUDATensor(&scope, "x", std::vector<int64_t>({3, 4, 1, 1}));
+  else
+    CreateCUDATensor(&scope, "x", std::vector<int64_t>({2, 4, 1, 1}));
   CreateCUDATensor(&scope, "y", std::vector<int64_t>({4, 6}));
   CreateCUDATensor(&scope, "z", std::vector<int64_t>({2, 6}));
 
@@ -137,6 +149,11 @@ TEST(TensorRTEngineOp, manual) {
   engine_op->Run(scope, place);
 }
 
+TEST(TensorRTEngineOp, manual) {
+  DynamicShapeTest(false);
+  DynamicShapeTest(true);
+}
+
 void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   framework::ProgramDesc program;
   framework::Scope scope;
@@ -220,6 +237,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
   int device_id = 0;
   engine_op_desc.SetAttr("gpu_id", device_id);
+  engine_op_desc.SetAttr("shape_range_info_path", std::string(""));
+  engine_op_desc.SetAttr("model_opt_cache_dir", std::string(""));
+  engine_op_desc.SetAttr("allow_build_at_runtime", false);
+  engine_op_desc.SetAttr("use_static_engine", false);
 
   auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
 
@@ -228,7 +249,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
 }
 
 // Test with a larger FC layer.
-TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
+// TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index e1678a65c0b..87986aebe04 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -538,7 +538,8 @@ void BindAnalysisConfig(py::module *m) {
       .def("switch_ir_optim", &AnalysisConfig::SwitchIrOptim,
            py::arg("x") = true)
       .def("ir_optim", &AnalysisConfig::ir_optim)
-      .def("enable_memory_optim", &AnalysisConfig::EnableMemoryOptim)
+      .def("enable_memory_optim", &AnalysisConfig::EnableMemoryOptim,
+           py::arg("x") = true)
       .def("enable_profile", &AnalysisConfig::EnableProfile)
       .def("disable_glog_info", &AnalysisConfig::DisableGlogInfo)
       .def("glog_info_disabled", &AnalysisConfig::glog_info_disabled)
@@ -569,6 +570,16 @@ void BindAnalysisConfig(py::module *m) {
            &AnalysisConfig::tensorrt_dynamic_shape_enabled)
       .def("enable_tensorrt_oss", &AnalysisConfig::EnableTensorRtOSS)
       .def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled)
+      .def("collect_shape_range_info", &AnalysisConfig::CollectShapeRangeInfo)
+      .def("shape_range_info_path", &AnalysisConfig::shape_range_info_path)
+      .def("shape_range_info_collected",
+           &AnalysisConfig::shape_range_info_collected)
+      .def("enable_tuned_tensorrt_dynamic_shape",
+           &AnalysisConfig::EnableTunedTensorRtDynamicShape)
+      .def("tuned_tensorrt_dynamic_shape",
+           &AnalysisConfig::tuned_tensorrt_dynamic_shape)
+      .def("trt_allow_build_at_runtime",
+           &AnalysisConfig::trt_allow_build_at_runtime)
       .def("exp_disable_tensorrt_ops", &AnalysisConfig::Exp_DisableTensorRtOPs)
       .def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA,
            py::arg("dla_core") = 0)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tuned_dynamic_shape.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tuned_dynamic_shape.py
new file mode 100644
index 00000000000..4a5090fa498
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tuned_dynamic_shape.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+paddle.enable_static()
+import paddle.fluid as fluid
+from paddle.inference import Config, Predictor, create_predictor
+
+
+class TRTTunedDynamicShapeTest(unittest.TestCase):
+    def get_model(self):
+        place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 6, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=1,
+                padding=0,
+                bias_attr=False,
+                act=None)
+        exe.run(startup_program)
+        serialized_program = paddle.static.serialize_program(
+            data, conv_out, program=main_program)
+        serialized_params = paddle.static.serialize_persistables(
+            data, conv_out, executor=exe, program=main_program)
+        return serialized_program, serialized_params
+
+    def get_config(self, model, params, tuned=False):
+        config = Config()
+        config.set_model_buffer(model, len(model), params, len(params))
+        config.enable_use_gpu(100, 0)
+        config.set_optim_cache_dir('tuned_test')
+        if tuned:
+            config.collect_shape_range_info('shape_range.pbtxt')
+        else:
+            config.enable_tensorrt_engine(
+                workspace_size=1024,
+                max_batch_size=1,
+                min_subgraph_size=0,
+                precision_mode=paddle.inference.PrecisionType.Float32,
+                use_static=True,
+                use_calib_mode=False)
+            config.enable_tuned_tensorrt_dynamic_shape('shape_range.pbtxt',
+                                                       True)
+
+        return config
+
+    def predictor_run(self, config, in_data):
+        predictor = create_predictor(config)
+        in_names = predictor.get_input_names()
+        in_handle = predictor.get_input_handle(in_names[0])
+        in_handle.copy_from_cpu(in_data)
+        predictor.run()
+
+    def test_tuned_dynamic_shape_run(self):
+        program, params = self.get_model()
+
+        config = self.get_config(program, params, tuned=True)
+        self.predictor_run(config, np.ones((1, 6, 64, 64)).astype(np.float32))
+
+        config2 = self.get_config(program, params, tuned=False)
+        self.predictor_run(config2, np.ones((1, 6, 32, 32)).astype(np.float32))
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab