[Paddle-TRT] Support shape tensors as inputs of TRT subgraphs (#46482)

f2a778c9 · zhoutianzi666 · GitHub · 5303b66b · f2a778c9 · f2a778c9
12 changed files
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -77,6 +77,15 @@ void IRPassManager::CreatePasses(Argument *argument,
    pass->Set("optim_input_shape",
              new std::map<std::string, std::vector<int>>(
                  argument->optim_input_shape()));
+    // Currently, shape tensor values are not set explicitly by the user;
+    // they are collected through the CollectShapeRangeInfo API.
+    pass->Set("max_shape_tensor",
+              new std::map<std::string, std::vector<int>>());
+    pass->Set("min_shape_tensor",
+              new std::map<std::string, std::vector<int>>());
+    pass->Set("optim_shape_tensor",
+              new std::map<std::string, std::vector<int>>());
+
    // tuned trt dynamic_shape
    pass->Set("trt_tuned_dynamic_shape",
              new bool(argument->tensorrt_tuned_dynamic_shape()));

--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -317,6 +317,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
  auto opt_input_shape =
      Get<std::map<std::string, std::vector<int>>>("optim_input_shape");

+  auto min_shape_tensor =
+      Get<std::map<std::string, std::vector<int>>>("min_shape_tensor");
+  auto max_shape_tensor =
+      Get<std::map<std::string, std::vector<int>>>("max_shape_tensor");
+  auto opt_shape_tensor =
+      Get<std::map<std::string, std::vector<int>>>("optim_shape_tensor");
+
  auto allow_build_at_runtime = Get<bool>("trt_allow_build_at_runtime");
  auto shape_range_info_path = Get<std::string>("trt_shape_range_info_path");
  auto trt_tuned_dynamic_shape = Get<bool>("trt_tuned_dynamic_shape");
@@ -326,7 +333,10 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
    inference::DeserializeShapeRangeInfo(shape_range_info_path,
                                         &min_input_shape,
                                         &max_input_shape,
-                                         &opt_input_shape);
+                                         &opt_input_shape,
+                                         &min_shape_tensor,
+                                         &max_shape_tensor,
+                                         &opt_shape_tensor);
  }

  // The following procedure is used to rename all the intermediate
@@ -511,6 +521,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
                  min_input_shape,
                  max_input_shape,
                  opt_input_shape,
+                  min_shape_tensor,
+                  max_shape_tensor,
+                  opt_shape_tensor,
                  disable_trt_plugin_fp16,
                  static_cast<phi::DataType>(Get<int>("model_precision")));
  trt_engine->SetUseOSS(Get<bool>("use_varseqlen"));

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1747,10 +1747,39 @@ void AnalysisPredictor::CollectShapeRangeInfo() {
    if (!var->IsType<phi::DenseTensor>()) {
      continue;
    }
-    framework::DDim dim = var->Get<phi::DenseTensor>().dims();
+    auto tensor = var->Get<phi::DenseTensor>();
+    framework::DDim dim = tensor.dims();
    std::vector<int32_t> shape(dim.size());
    for (size_t i = 0; i < shape.size(); ++i) shape[i] = dim[i];
    shape_info_[name].emplace_back(shape);
+
+    // Collect the value range of shape tensors for Paddle-TRT's use.
+    // Note: shape tensors are identified heuristically, assuming that every
+    // shape tensor in the model is INT32 and holds between 1 and 7 elements.
+    // This simple heuristic may also flag tensors that are not really shape
+    // tensors, but recording their values as well does not matter.
+    auto is_shape_tensor = tensor.numel() <= 7 && tensor.numel() >= 1;
+    if (tensor.dtype() == paddle::experimental::DataType::INT32 &&
+        is_shape_tensor) {
+      std::vector<int> int32_host(tensor.numel());
+      if (tensor.place() == platform::CPUPlace()) {
+        paddle::memory::Copy(platform::CPUPlace(),
+                             int32_host.data(),
+                             platform::CPUPlace(),
+                             tensor.data<int>(),
+                             tensor.numel() * sizeof(int));
+      } else if (tensor.place() == platform::CUDAPlace()) {
+#if defined(PADDLE_WITH_CUDA)
+        paddle::memory::Copy(platform::CPUPlace(),
+                             int32_host.data(),
+                             platform::CUDAPlace(),
+                             tensor.data<int>(),
+                             tensor.numel() * sizeof(int),
+                             nullptr);
+#endif
+      }
+      shape_tensor_value_[name].emplace_back(int32_host);
+    }
  }
 }

@@ -1758,7 +1787,16 @@ void AnalysisPredictor::StatisticShapeRangeInfo() {
  std::map<std::string, std::vector<int32_t>> min_shapes;
  std::map<std::string, std::vector<int32_t>> max_shapes;
  std::map<std::string, std::vector<int32_t>> opt_shapes;
-  for (auto it : shape_info_) {
+  std::map<std::string, std::vector<int32_t>> min_values;
+  std::map<std::string, std::vector<int32_t>> max_values;
+  std::map<std::string, std::vector<int32_t>> opt_values;
+
+  auto extract_min_max_opt =
+      [](std::map<std::string, std::vector<int32_t>> &min_data,
+         decltype(min_data) max_data,
+         decltype(min_data) opt_data,
+         decltype(shape_info_) shape_data) {
+        for (auto it : shape_data) {
          auto name = it.first;
          auto shapes = it.second;

@@ -1766,13 +1804,14 @@ void AnalysisPredictor::StatisticShapeRangeInfo() {
          std::vector<int32_t> max_shape(shapes[0].begin(), shapes[0].end());
          std::vector<int32_t> opt_shape(shapes[0].begin(), shapes[0].end());

-    auto ShapeMaxFreq = [](const std::map<int32_t, int32_t> &m) -> int32_t {
+          auto ShapeMaxFreq =
+              [](const std::map<int32_t, int32_t> &m) -> int32_t {
            std::vector<std::pair<int32_t, int32_t>> counter;
            for (auto &it : m) counter.push_back(it);
-      std::sort(
-          counter.begin(),
+            std::sort(counter.begin(),
                      counter.end(),
-          [](std::pair<int32_t, int32_t> &a, std::pair<int32_t, int32_t> &b) {
+                      [](std::pair<int32_t, int32_t> &a,
+                         std::pair<int32_t, int32_t> &b) {
                        return a.second > b.second;
                      });
            return counter[0].first;
@@ -1788,13 +1827,21 @@ void AnalysisPredictor::StatisticShapeRangeInfo() {
            opt_shape[d] = ShapeMaxFreq(counter);
          }

-    min_shapes[name] = min_shape;
-    max_shapes[name] = max_shape;
-    opt_shapes[name] = opt_shape;
+          min_data[name] = min_shape;
+          max_data[name] = max_shape;
+          opt_data[name] = opt_shape;
        }
-
-  inference::SerializeShapeRangeInfo(
-      config_.shape_range_info_path(), min_shapes, max_shapes, opt_shapes);
+      };
+  extract_min_max_opt(min_shapes, max_shapes, opt_shapes, shape_info_);
+  extract_min_max_opt(min_values, max_values, opt_values, shape_tensor_value_);
+
+  inference::SerializeShapeRangeInfo(config_.shape_range_info_path(),
+                                     min_shapes,
+                                     max_shapes,
+                                     opt_shapes,
+                                     min_values,
+                                     max_values,
+                                     opt_values);
 }

 bool AnalysisPredictor::LoadProgramDesc() {

--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -514,6 +514,7 @@ class AnalysisPredictor : public PaddlePredictor {
  bool status_is_cloned_{false};

  std::map<std::string, std::vector<std::vector<int32_t>>> shape_info_;
+  std::map<std::string, std::vector<std::vector<int32_t>>> shape_tensor_value_;
  static int clone_num_;

  bool private_context_{false};

--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -275,6 +275,35 @@ void TensorRTEngine::FreezeNetwork() {
            nvinfer1::OptProfileSelector::kOPT,
            Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true));
      }
+
+      for (int input_id = 0; input_id < network()->getNbInputs(); input_id++) {
+        auto input_name = network()->getInput(input_id)->getName();
+        if (!itensor_map_.count(input_name)) continue;
+        if (!GetITensor(input_name)->isShapeTensor()) continue;
+        PADDLE_ENFORCE_EQ(min_shape_tensor_.count(input_name) &&
+                              max_shape_tensor_.count(input_name) &&
+                              optim_shape_tensor_.count(input_name),
+                          true,
+                          platform::errors::InvalidArgument(
+                              "Fail to find min/max/optim shape value for TRT "
+                              "network's shape tensor input named %s.",
+                              input_name));
+        auto min_vec = min_shape_tensor_.at(input_name);
+        optim_profiles_[i]->setShapeValues(input_name,
+                                           nvinfer1::OptProfileSelector::kMIN,
+                                           min_vec.data(),
+                                           min_vec.size());
+        optim_profiles_[i]->setShapeValues(input_name,
+                                           nvinfer1::OptProfileSelector::kMAX,
+                                           max_shape_tensor_[input_name].data(),
+                                           min_vec.size());
+        optim_profiles_[i]->setShapeValues(
+            input_name,
+            nvinfer1::OptProfileSelector::kOPT,
+            optim_shape_tensor_[input_name].data(),
+            min_vec.size());
+      }
+
      infer_builder_config_->addOptimizationProfile(optim_profiles_[i]);
    }
    if (WithFp16() && disable_trt_plugin_fp16()) {

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -217,6 +217,9 @@ class TensorRTEngine {
      const ShapeMapType min_input_shape = {},
      const ShapeMapType max_input_shape = {},
      const ShapeMapType optim_input_shape = {},
+      const ShapeMapType min_shape_tensor = {},
+      const ShapeMapType max_shape_tensor = {},
+      const ShapeMapType optim_shape_tensor = {},
      bool disable_trt_plugin_fp16 = false,
      phi::DataType model_precision = phi::DataType::FLOAT32,
      nvinfer1::ILogger& logger = NaiveLogger::Global())
@@ -228,6 +231,9 @@ class TensorRTEngine {
        min_input_shape_(min_input_shape),
        max_input_shape_(max_input_shape),
        optim_input_shape_(optim_input_shape),
+        min_shape_tensor_(min_shape_tensor),
+        max_shape_tensor_(max_shape_tensor),
+        optim_shape_tensor_(optim_shape_tensor),
        disable_trt_plugin_fp16_(disable_trt_plugin_fp16),
        model_precision_(model_precision),
        logger_(logger) {
@@ -443,6 +449,9 @@ class TensorRTEngine {
  ShapeMapType min_input_shape() { return min_input_shape_; }
  ShapeMapType max_input_shape() { return max_input_shape_; }
  ShapeMapType optim_input_shape() { return optim_input_shape_; }
+  ShapeMapType min_shape_tensor() { return min_shape_tensor_; }
+  ShapeMapType max_shape_tensor() { return max_shape_tensor_; }
+  ShapeMapType optim_shape_tensor() { return optim_shape_tensor_; }

  bool AdjustDynamicShapeRange(const ShapeMapType& runtime_input_shape,
                               std::vector<std::string>* changed) {
@@ -641,6 +650,9 @@ class TensorRTEngine {
  ShapeMapType min_input_shape_;
  ShapeMapType max_input_shape_;
  ShapeMapType optim_input_shape_;
+  ShapeMapType min_shape_tensor_;
+  ShapeMapType max_shape_tensor_;
+  ShapeMapType optim_shape_tensor_;
  bool disable_trt_plugin_fp16_{false};
  phi::DataType model_precision_{phi::DataType::FLOAT32};
  bool use_varseqlen_{false};
@@ -741,6 +753,9 @@ class TRTEngineManager {
      const std::map<std::string, std::vector<int>> min_input_shape = {},
      const std::map<std::string, std::vector<int>> max_input_shape = {},
      const std::map<std::string, std::vector<int>> optim_input_shape = {},
+      const std::map<std::string, std::vector<int>> min_shape_tensor = {},
+      const std::map<std::string, std::vector<int>> max_shape_tensor = {},
+      const std::map<std::string, std::vector<int>> optim_shape_tensor = {},
      bool disable_trt_plugin_fp16 = false,
      phi::DataType model_precision = phi::DataType::FLOAT32,
      nvinfer1::ILogger& logger = NaiveLogger::Global()) {
@@ -752,6 +767,9 @@ class TRTEngineManager {
                                 min_input_shape,
                                 max_input_shape,
                                 optim_input_shape,
+                                 min_shape_tensor,
+                                 max_shape_tensor,
+                                 optim_shape_tensor,
                                 disable_trt_plugin_fp16,
                                 model_precision,
                                 logger);

--- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc
@@ -31,6 +31,137 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {

+// Fixture: a TensorRTEngine built with min/max/optim shapes for "input" and
+// min/max/optim values for the shape-tensor input "shape".
+class TensorRTDynamicShapeValueEngineTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    ctx_ = new phi::GPUContext(platform::CUDAPlace(0));
+    ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
+                           .get());
+    ctx_->SetHostAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetAllocator(paddle::platform::CPUPlace())
+            .get());
+    ctx_->SetZeroAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetZeroAllocator(platform::CUDAPlace(0))
+            .get());
+    ctx_->SetPinnedAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetAllocator(paddle::platform::CUDAPinnedPlace())
+            .get());
+    ctx_->PartialInitWithAllocator();
+
+    std::map<std::string, std::vector<int>> min_input_shape = {
+        {"input", {1, 32}}};
+    std::map<std::string, std::vector<int>> max_input_shape = {
+        {"input", {18, 32}}};
+    std::map<std::string, std::vector<int>> optim_input_shape = {
+        {"input", {18, 32}}};
+    std::map<std::string, std::vector<int>> min_input_value = {
+        {"shape", {1, 8, 4}}};
+    std::map<std::string, std::vector<int>> max_input_value = {
+        {"shape", {18, 8, 4}}};
+    std::map<std::string, std::vector<int>> optim_input_value = {
+        {"shape", {18, 8, 4}}};
+    engine_ = new TensorRTEngine(16,
+                                 1 << 10,
+                                 AnalysisConfig::Precision::kFloat32,
+                                 nullptr,
+                                 0,
+                                 min_input_shape,
+                                 max_input_shape,
+                                 optim_input_shape,
+                                 min_input_value,
+                                 max_input_value,
+                                 optim_input_value,
+                                 false,
+                                 phi::DataType::FLOAT32,
+                                 NaiveLogger::Global());
+    engine_->InitNetwork();
+  }
+
+  void TearDown() override {
+    delete engine_;  // no-op if SetUp() failed before creating the engine
+    engine_ = nullptr;
+    // NOTE(review): ctx_ is not released here - confirm this is intentional.
+  }
+
+  void PrepareInputOutput(const std::vector<float> &input,
+                          std::vector<int> output_shape) {
+    paddle::framework::TensorFromVector(input, *ctx_, &input_);
+    output_.Resize(phi::make_ddim(output_shape));
+  }
+  void PrepareShapeInput(const std::vector<int> &input) {
+    paddle::framework::TensorFromVector(input, *ctx_, &shape_);
+  }
+  void GetOutput(std::vector<float> *output) {
+    paddle::framework::TensorToVector(output_, *ctx_, output);
+  }
+
+  framework::LoDTensor input_;
+  framework::LoDTensor shape_;
+  framework::LoDTensor output_;
+  TensorRTEngine *engine_{nullptr};
+  phi::GPUContext *ctx_{nullptr};
+};
+
+TEST_F(TensorRTDynamicShapeValueEngineTest, test_trt_dynamic_shape_value) {
+  // Reshapes "input" ([-1, 32]) via a shuffle layer whose second input is the
+  // INT32 tensor "shape"; runs with {8, 8, 4} and checks values and dims.
+  std::vector<void *> buffers(3);
+  std::cout << "with_dynamic_shape: " << engine_->with_dynamic_shape()
+            << std::endl;
+  auto *x = engine_->DeclareInput(
+      "input", nvinfer1::DataType::kFLOAT, nvinfer1::Dims2{-1, 32});
+  nvinfer1::Dims shape_dim;
+  shape_dim.nbDims = 1;
+  shape_dim.d[0] = 3;
+  auto *shape =
+      engine_->DeclareInput("shape", nvinfer1::DataType::kINT32, shape_dim);
+  auto *layer = engine_->network()->addShuffle(*x);
+  // Check the layer before it is dereferenced by setInput() below.
+  PADDLE_ENFORCE_NOT_NULL(
+      layer,
+      platform::errors::InvalidArgument("TRT shuffle layer building failed."));
+  layer->setInput(1, *shape);
+  engine_->DeclareOutput(layer, 0, "y");
+  engine_->FreezeNetwork();
+  ASSERT_EQ(engine_->engine()->getNbBindings(), 3);
+
+  std::vector<float> x_v(8 * 32);
+  for (int i = 0; i < 8 * 32; i++) {
+    x_v[i] = i % (8 * 32);
+  }
+  std::vector<int> shape_v = {8, 8, 4};
+  PrepareInputOutput(x_v, {8, 8, 4});
+  PrepareShapeInput(shape_v);
+  engine_->context()->setBindingDimensions(0, nvinfer1::Dims2{8, 32});
+  engine_->context()->setBindingDimensions(1, shape_dim);
+  engine_->context()->setInputShapeBinding(1, shape_v.data());
+
+  auto *x_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *shape_gpu_data = shape_.mutable_data<int>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+  buffers[0] = reinterpret_cast<void *>(x_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(shape_gpu_data);
+  buffers[2] = reinterpret_cast<void *>(y_gpu_data);
+
+  engine_->Execute(-1, &buffers, ctx_->stream());
+  // Fail the test if the stream reports an error instead of ignoring it.
+  ASSERT_EQ(cudaStreamSynchronize(ctx_->stream()), cudaSuccess);
+  std::vector<float> y_cpu;
+  GetOutput(&y_cpu);
+  ASSERT_EQ(y_cpu[0], 0);
+  ASSERT_EQ(y_cpu[1], 1);
+  auto dims = engine_->context()->getBindingDimensions(2);
+  ASSERT_EQ(dims.nbDims, 3);
+  ASSERT_EQ(dims.d[0], 8);
+  ASSERT_EQ(dims.d[1], 8);
+  ASSERT_EQ(dims.d[2], 4);
+}
+
 class TensorRTDynamicEngineTest : public ::testing::Test {
 protected:
  void SetUp() override {
@@ -67,6 +198,9 @@ class TensorRTDynamicEngineTest : public ::testing::Test {
                                 min_input_shape,
                                 max_input_shape,
                                 optim_input_shape,
+                                 std::map<std::string, std::vector<int>>(),
+                                 std::map<std::string, std::vector<int>>(),
+                                 std::map<std::string, std::vector<int>>(),
                                 false,
                                 phi::DataType::FLOAT32,
                                 NaiveLogger::Global());
@@ -241,6 +375,9 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test {
                                 min_input_shape,
                                 max_input_shape,
                                 optim_input_shape,
+                                 std::map<std::string, std::vector<int>>(),
+                                 std::map<std::string, std::vector<int>>(),
+                                 std::map<std::string, std::vector<int>>(),
                                 false,
                                 phi::DataType::FLOAT32,
                                 NaiveLogger::Global());

--- a/paddle/fluid/inference/utils/io_utils.cc
+++ b/paddle/fluid/inference/utils/io_utils.cc
@@ -182,7 +182,10 @@ void SerializeShapeRangeInfo(
    const std::string &path,
    const std::map<std::string, std::vector<int32_t>> &min_shape,
    const std::map<std::string, std::vector<int32_t>> &max_shape,
-    const std::map<std::string, std::vector<int32_t>> &opt_shape) {
+    const std::map<std::string, std::vector<int32_t>> &opt_shape,
+    const std::map<std::string, std::vector<int32_t>> &min_value,
+    const std::map<std::string, std::vector<int32_t>> &max_value,
+    const std::map<std::string, std::vector<int32_t>> &opt_value) {
  paddle::inference::proto::ShapeRangeInfos shape_range_infos;
  for (auto it : min_shape) {
    auto *s = shape_range_infos.add_shape_range_info();
@@ -192,10 +195,18 @@ void SerializeShapeRangeInfo(
      s->add_max_shape(max_shape.at(it.first)[i]);
      s->add_opt_shape(opt_shape.at(it.first)[i]);
    }
+    // If it.first is a shape tensor, we should collect values from it.
+    if (min_value.count(it.first)) {
+      for (size_t i = 0; i < min_value.at(it.first).size(); ++i) {
+        s->add_min_value(min_value.at(it.first)[i]);
+        s->add_max_value(max_value.at(it.first)[i]);
+        s->add_opt_value(opt_value.at(it.first)[i]);
+      }
+    }
  }
-
  inference::SerializeShapeRangeInfo(path, shape_range_infos);
 }
+
 void DeserializeShapeRangeInfo(
    const std::string &path, paddle::inference::proto::ShapeRangeInfos *info) {
  int fd = open(path.c_str(), O_RDONLY);
@@ -213,7 +224,10 @@ void DeserializeShapeRangeInfo(
    const std::string &path,
    std::map<std::string, std::vector<int32_t>> *min_shape,
    std::map<std::string, std::vector<int32_t>> *max_shape,
-    std::map<std::string, std::vector<int32_t>> *opt_shape) {
+    std::map<std::string, std::vector<int32_t>> *opt_shape,
+    std::map<std::string, std::vector<int32_t>> *min_value,
+    std::map<std::string, std::vector<int32_t>> *max_value,
+    std::map<std::string, std::vector<int32_t>> *opt_value) {
  paddle::inference::proto::ShapeRangeInfos shape_range_infos;
  DeserializeShapeRangeInfo(path, &shape_range_infos);
  for (int i = 0; i < shape_range_infos.shape_range_info_size(); ++i) {
@@ -236,6 +250,26 @@ void DeserializeShapeRangeInfo(
      opt_shape->insert(std::make_pair(name, tmp));
    }
  }
+  for (int i = 0; i < shape_range_infos.shape_range_info_size(); ++i) {
+    auto info = shape_range_infos.shape_range_info(i);
+    auto name = info.name();
+    if (min_value->count(name) || max_value->count(name) ||
+        opt_value->count(name)) {
+      continue;
+    } else {
+      std::vector<int32_t> tmp(info.min_value_size());
+      for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.min_value(k);
+      min_value->insert(std::make_pair(name, tmp));
+
+      tmp.resize(info.max_value_size());
+      for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.max_value(k);
+      max_value->insert(std::make_pair(name, tmp));
+
+      tmp.resize(info.opt_value_size());
+      for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.opt_value(k);
+      opt_value->insert(std::make_pair(name, tmp));
+    }
+  }
 }

 void UpdateShapeRangeInfo(
@@ -264,6 +298,7 @@ void UpdateShapeRangeInfo(
      }
    }
  }
+
  inference::SerializeShapeRangeInfo(path, shape_range_infos);
 }


--- a/paddle/fluid/inference/utils/io_utils.h
+++ b/paddle/fluid/inference/utils/io_utils.h
@@ -42,23 +42,22 @@ void SerializePDTensorsToFile(const std::string& path,
                              const std::vector<PaddleTensor>& tensors);
 void DeserializePDTensorsToFile(const std::string& path,
                                std::vector<PaddleTensor>* tensors);
-
-void SerializeShapeRangeInfo(
-    const std::string& path,
-    const paddle::inference::proto::ShapeRangeInfos& info);
 void SerializeShapeRangeInfo(
    const std::string& path,
    const std::map<std::string, std::vector<int32_t>>& min_shape,
    const std::map<std::string, std::vector<int32_t>>& max_shape,
-    const std::map<std::string, std::vector<int32_t>>& opt_shape);
-void DeserializeShapeRangeInfo(const std::string& path,
-                               paddle::inference::proto::ShapeRangeInfos* info);
+    const std::map<std::string, std::vector<int32_t>>& opt_shape,
+    const std::map<std::string, std::vector<int32_t>>& min_value,
+    const std::map<std::string, std::vector<int32_t>>& max_value,
+    const std::map<std::string, std::vector<int32_t>>& opt_value);
 void DeserializeShapeRangeInfo(
    const std::string& path,
    std::map<std::string, std::vector<int32_t>>* min_shape,
    std::map<std::string, std::vector<int32_t>>* max_shape,
-    std::map<std::string, std::vector<int32_t>>* opt_shape);
-
+    std::map<std::string, std::vector<int32_t>>* opt_shape,
+    std::map<std::string, std::vector<int32_t>>* min_value,
+    std::map<std::string, std::vector<int32_t>>* max_value,
+    std::map<std::string, std::vector<int32_t>>* opt_value);
 void UpdateShapeRangeInfo(
    const std::string& path,
    const std::map<std::string, std::vector<int32_t>>& min_shape,

--- a/paddle/fluid/inference/utils/io_utils_tester.cc
+++ b/paddle/fluid/inference/utils/io_utils_tester.cc
@@ -100,28 +100,48 @@ TEST(infer_io_utils, tensors) {
 TEST(shape_info_io, read_and_write) {
  const std::string path = "test_shape_info_io";
  std::map<std::string, std::vector<int32_t>> min_shape, max_shape, opt_shape;
+  std::map<std::string, std::vector<int32_t>> min_value, max_value, opt_value;
  min_shape.insert(
      std::make_pair("test1", std::vector<int32_t>{1, 3, 112, 112}));
  max_shape.insert(
      std::make_pair("test1", std::vector<int32_t>{1, 3, 224, 224}));
  opt_shape.insert(
      std::make_pair("test1", std::vector<int32_t>{1, 3, 224, 224}));
+  min_value.insert(
+      std::make_pair("test1", std::vector<int32_t>{1, 3, 112, 112}));
+  max_value.insert(
+      std::make_pair("test1", std::vector<int32_t>{1, 3, 224, 224}));
+  opt_value.insert(
+      std::make_pair("test1", std::vector<int32_t>{1, 3, 224, 224}));
  paddle::inference::SerializeShapeRangeInfo(
-      path, min_shape, max_shape, opt_shape);
+      path, min_shape, max_shape, opt_shape, min_value, max_value, opt_value);
  min_shape.clear();
  max_shape.clear();
  opt_shape.clear();
+  min_value.clear();
+  max_value.clear();
+  opt_value.clear();
  opt_shape.insert(
      std::make_pair("test2", std::vector<int32_t>{1, 3, 224, 224}));
-  paddle::inference::DeserializeShapeRangeInfo(
-      path, &min_shape, &max_shape, &opt_shape);
+  paddle::inference::DeserializeShapeRangeInfo(path,
+                                               &min_shape,
+                                               &max_shape,
+                                               &opt_shape,
+                                               &min_value,
+                                               &max_value,
+                                               &opt_value);

  min_shape.insert(std::make_pair("test1", std::vector<int32_t>{1, 3, 56, 56}));
  std::vector<std::string> names{"test1"};
  paddle::inference::UpdateShapeRangeInfo(
      path, min_shape, max_shape, opt_shape, names);

-  ASSERT_THROW(paddle::inference::DeserializeShapeRangeInfo(
-                   "no_exists_file", &min_shape, &max_shape, &opt_shape);
+  ASSERT_THROW(paddle::inference::DeserializeShapeRangeInfo("no_exists_file",
+                                                            &min_shape,
+                                                            &max_shape,
+                                                            &opt_shape,
+                                                            &min_value,
+                                                            &max_value,
+                                                            &opt_value);
               , paddle::platform::EnforceNotMet);
 }
--- a/paddle/fluid/inference/utils/shape_range_info.proto
+++ b/paddle/fluid/inference/utils/shape_range_info.proto
@@ -23,6 +23,9 @@ message ShapeRangeInfos {
    repeated int32 min_shape = 2;
    repeated int32 max_shape = 3;
    repeated int32 opt_shape = 4;
+    repeated int32 min_value = 5;
+    repeated int32 max_value = 6;
+    repeated int32 opt_value = 7;
  }

  repeated ShapeRangeInfo shape_range_info = 1;

--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -554,6 +554,18 @@ class TensorRTEngineOp : public framework::OperatorBase {
 #if IS_TRT_VERSION_GE(6000)
        trt_context->setBindingDimensions(
            bind_index, inference::tensorrt::Vec2TRT_Dims(t_shape, x, true));
+        // If this x is a shape tensor, we need to call setInputShapeBinding
+        if (engine->engine()->isShapeBinding(bind_index) &&
+            engine->engine()->bindingIsInput(bind_index)) {
+          std::vector<int> shape_v(t.numel());
+          paddle::memory::Copy(platform::CPUPlace(),
+                               shape_v.data(),
+                               platform::CUDAPlace(),
+                               t.data<int32_t>(),
+                               t.numel() * sizeof(int),
+                               nullptr);
+          trt_context->setInputShapeBinding(bind_index, shape_v.data());
+        }
 #endif
      }
      runtime_batch = t_shape[0];