Unverified commit 1bf48365
Authored by Wilber on Nov 11, 2020; committed via GitHub on Nov 11, 2020.
Parent: 26d292b1

[Inference] Add TryShrinkMemory interface. (#28409)

Showing 7 changed files with 93 additions and 4 deletions (+93 -4):
paddle/fluid/inference/api/analysis_predictor.cc          +13  -2
paddle/fluid/inference/api/analysis_predictor.h           +11  -0
paddle/fluid/inference/api/analysis_predictor_tester.cc   +44  -2
paddle/fluid/inference/api/api_tester.cc                   +1  -0
paddle/fluid/inference/api/paddle_api.h                   +11  -0
paddle/fluid/inference/api/paddle_inference_api.h         +11  -0
paddle/fluid/pybind/inference_api.cc                       +2  -0
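As an overview before the per-file diffs: TryShrinkMemory() clears intermediate tensors and asks the allocator to release unoccupied memory-pool chunks, returning the number of bytes released. A minimal C++ usage sketch, modeled on the Predictor.Run test added in this commit (the model directory path is a placeholder, not from the source):

#include <cstdint>
#include <iostream>

#include "paddle_inference_api.h"  // paddle_infer::Config / Predictor

int main() {
  paddle_infer::Config config;
  config.SetModel("./my_model_dir");  // placeholder model directory

  auto predictor = paddle_infer::CreatePredictor(config);

  // ... reshape input handles, copy input data in, then run inference ...
  predictor->Run();

  // After a run, return unoccupied memory-pool chunks to the system.
  // The count may undercount, since not all memory is pool-managed.
  uint64_t released = predictor->TryShrinkMemory();
  std::cout << "bytes released: " << released << std::endl;
  return 0;
}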
paddle/fluid/inference/api/analysis_predictor.cc

@@ -175,7 +175,10 @@ bool AnalysisPredictor::PrepareScope(
     status_is_cloned_ = true;
   } else {
     paddle::framework::InitDevices(false);
-    scope_.reset(new paddle::framework::Scope());
+    scope_.reset(new paddle::framework::Scope(), [&](framework::Scope *scope) {
+      delete scope;
+      memory::Release(place_);
+    });
     status_is_cloned_ = false;
   }
   sub_scope_ = &scope_->NewScope();

@@ -591,7 +594,6 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
       gflags.push_back("--allocator_strategy=thread_local");
       process_level_allocator_enabled = false;
     } else {
-      gflags.push_back("--allocator_strategy=naive_best_fit");
       process_level_allocator_enabled = true;
     }

@@ -890,6 +892,11 @@ bool AnalysisPredictor::LoadParameters() {
   return true;
 }

+uint64_t AnalysisPredictor::TryShrinkMemory() {
+  ClearIntermediateTensor();
+  return paddle::memory::Release(place_);
+}
+
 void AnalysisPredictor::ClearIntermediateTensor() {
   PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
                           platform::errors::PreconditionNotMet(

@@ -985,6 +992,8 @@ AnalysisPredictor::~AnalysisPredictor() {
     mkldnn_quantizer_ = nullptr;
   }
 #endif
+
+  memory::Release(place_);
 }

 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {

@@ -1142,6 +1151,8 @@ void Predictor::ClearIntermediateTensor() {
   predictor_->ClearIntermediateTensor();
 }

+uint64_t Predictor::TryShrinkMemory() { return predictor_->TryShrinkMemory(); }
+
 int GetNumBytesOfDataType(DataType dtype) {
   switch (dtype) {
     case DataType::FLOAT32:
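In the PrepareScope hunk above, the commit swaps a plain scope_.reset for the two-argument std::shared_ptr::reset overload, so dropping the last reference to the scope also returns the pooled device memory for the place. A self-contained sketch of that idiom, using generic stand-in types rather than Paddle's (Scope and ReleasePool are illustrative names):

#include <iostream>
#include <memory>

// Stand-ins for framework::Scope and memory::Release(place_).
struct Scope {};
void ReleasePool() { std::cout << "memory pool released\n"; }

int main() {
  std::shared_ptr<Scope> scope;
  // Two-argument reset: the custom deleter runs when the last shared
  // owner goes away, freeing the scope and then shrinking the
  // allocator's pool in one place.
  scope.reset(new Scope, [](Scope *s) {
    delete s;
    ReleasePool();
  });
  scope.reset();  // last owner released -> prints "memory pool released"
  return 0;
}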
paddle/fluid/inference/api/analysis_predictor.h

@@ -193,6 +193,17 @@ class AnalysisPredictor : public PaddlePredictor {
   ///
   void ClearIntermediateTensor();

+  ///
+  /// \brief Release all tmp tensor to compress the size of the memory pool.
+  /// The memory pool is considered to be composed of a list of chunks, if
+  /// the chunk is not occupied, it can be released.
+  ///
+  /// \return Number of bytes released. It may be smaller than the actual
+  /// released memory, because part of the memory is not managed by the
+  /// MemoryPool.
+  ///
+  uint64_t TryShrinkMemory() override;
+
   ///
   /// \brief Get the argument used by predictor
   ///
paddle/fluid/inference/api/analysis_predictor_tester.cc

@@ -135,6 +135,7 @@ TEST(AnalysisPredictor, ZeroCopy) {
   auto *out_data = out->data<float>(&place, &size);
   LOG(INFO) << "output size: " << size / sizeof(float);
   LOG(INFO) << "output_data: " << out_data;
+  predictor->TryShrinkMemory();
 }

 TEST(AnalysisPredictor, Clone) {

@@ -253,8 +254,7 @@ class MkldnnQuantizerTest : public testing::Test {
  public:
  MkldnnQuantizerTest() {
    AnalysisConfig config(FLAGS_dirname);
-    predictor.reset(new AnalysisPredictor(config));
+    predictor = std::move(CreatePaddlePredictor(config));
    auto* predictor_p = static_cast<AnalysisPredictor*>(predictor.get());
    auto qconfig = new MkldnnQuantizerConfig();

@@ -507,3 +507,45 @@ TEST(AnalysisPredictor, bf16_pass_strategy) {
 }

 }  // namespace paddle
+
+namespace paddle_infer {
+
+TEST(Predictor, Run) {
+  Config config;
+  config.SetModel(FLAGS_dirname);
+
+  auto predictor = CreatePredictor(config);
+
+  auto w0 = predictor->GetInputHandle("firstw");
+  auto w1 = predictor->GetInputHandle("secondw");
+  auto w2 = predictor->GetInputHandle("thirdw");
+  auto w3 = predictor->GetInputHandle("forthw");
+
+  w0->Reshape({4, 1});
+  w1->Reshape({4, 1});
+  w2->Reshape({4, 1});
+  w3->Reshape({4, 1});
+
+  auto* w0_data = w0->mutable_data<int64_t>(PlaceType::kCPU);
+  auto* w1_data = w1->mutable_data<int64_t>(PlaceType::kCPU);
+  auto* w2_data = w2->mutable_data<int64_t>(PlaceType::kCPU);
+  auto* w3_data = w3->mutable_data<int64_t>(PlaceType::kCPU);
+
+  for (int i = 0; i < 4; i++) {
+    w0_data[i] = i;
+    w1_data[i] = i;
+    w2_data[i] = i;
+    w3_data[i] = i;
+  }
+
+  predictor->Run();
+
+  auto out = predictor->GetOutputHandle("fc_1.tmp_2");
+  PlaceType place;
+  int size = 0;
+  out->data<float>(&place, &size);
+  LOG(INFO) << "output size: " << size / sizeof(float);
+  predictor->TryShrinkMemory();
+}
+
+}  // namespace paddle_infer
paddle/fluid/inference/api/api_tester.cc

@@ -60,6 +60,7 @@ TEST(paddle_inference_api, demo) {
   auto predictor = CreatePaddlePredictor(config);
   std::vector<PaddleTensor> outputs;
   predictor->Run({}, &outputs);
+  predictor->TryShrinkMemory();
 }

 TEST(paddle_inference_api, get_version) {
paddle/fluid/inference/api/paddle_api.h

@@ -319,6 +319,17 @@ class PD_INFER_DECL PaddlePredictor {
   ///
   virtual void ClearIntermediateTensor() {}

+  ///
+  /// \brief Release all tmp tensor to compress the size of the memory pool.
+  /// The memory pool is considered to be composed of a list of chunks, if
+  /// the chunk is not occupied, it can be released.
+  ///
+  /// \return Number of bytes released. It may be smaller than the actual
+  /// released memory, because part of the memory is not managed by the
+  /// MemoryPool.
+  ///
+  virtual uint64_t TryShrinkMemory() { return 0; }
+
   /// \brief Clone an existing predictor
   /// When using clone, the same network will be created,
   /// and the parameters between them are shared.
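One detail worth noting in this hunk: the base-class method is virtual with an inline body returning 0, so predictor implementations written before this commit still compile and simply report zero bytes released. A tiny illustrative sketch (simplified stand-in types, not the real headers):

#include <cstdint>

// Simplified stand-in for the PaddlePredictor base class.
struct PredictorBase {
  virtual ~PredictorBase() = default;
  // Default: nothing pool-managed to shrink, so report 0 bytes.
  virtual uint64_t TryShrinkMemory() { return 0; }
};

// A pre-existing subclass that never overrides the new method
// still compiles, and callers get a well-defined result.
struct LegacyPredictor : PredictorBase {};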
paddle/fluid/inference/api/paddle_inference_api.h

@@ -224,6 +224,17 @@ class PD_INFER_DECL Predictor {
   /// \brief Clear the intermediate tensors of the predictor
   void ClearIntermediateTensor();

+  ///
+  /// \brief Release all tmp tensor to compress the size of the memory pool.
+  /// The memory pool is considered to be composed of a list of chunks, if
+  /// the chunk is not occupied, it can be released.
+  ///
+  /// \return Number of bytes released. It may be smaller than the actual
+  /// released memory, because part of the memory is not managed by the
+  /// MemoryPool.
+  ///
+  uint64_t TryShrinkMemory();
+
  private:
   std::unique_ptr<paddle::PaddlePredictor> predictor_;
 };
paddle/fluid/pybind/inference_api.cc

@@ -566,6 +566,7 @@ void BindAnalysisPredictor(py::module *m) {
       .def("zero_copy_run", &AnalysisPredictor::ZeroCopyRun)
       .def("clear_intermediate_tensor",
            &AnalysisPredictor::ClearIntermediateTensor)
+      .def("try_shrink_memory", &AnalysisPredictor::TryShrinkMemory)
       .def("create_feed_fetch_var", &AnalysisPredictor::CreateFeedFetchVar)
       .def("prepare_feed_fetch", &AnalysisPredictor::PrepareFeedFetch)
       .def("prepare_argument", &AnalysisPredictor::PrepareArgument)

@@ -593,6 +594,7 @@ void BindPaddleInferPredictor(py::module *m) {
       .def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle)
       .def("run", &paddle_infer::Predictor::Run)
       .def("clone", &paddle_infer::Predictor::Clone)
+      .def("try_shrink_memory", &paddle_infer::Predictor::TryShrinkMemory)
       .def("clear_intermediate_tensor",
            &paddle_infer::Predictor::ClearIntermediateTensor);
 }
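With try_shrink_memory registered on both bound classes, the interface is reachable from Python as well. A hedged sketch against the paddle.inference package of the Paddle 2.x line (module layout and helper names are assumptions from that release line, not from this diff; the model paths are placeholders):

import numpy as np
from paddle.inference import Config, create_predictor  # assumed 2.x layout

# Placeholder model files, not from the source.
config = Config("model_dir/__model__", "model_dir/__params__")
predictor = create_predictor(config)

# Feed a toy int64 input, mirroring the C++ test above.
name = predictor.get_input_names()[0]
handle = predictor.get_input_handle(name)
handle.reshape([4, 1])
handle.copy_from_cpu(np.arange(4, dtype="int64").reshape(4, 1))

predictor.run()

# Release unoccupied memory-pool chunks after the run.
predictor.try_shrink_memory()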