diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index ccfb6dfa17ab435ce030f028968ed59705f133fb..20bea8e568e467d5839888a302b32cf6132a4fac 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -175,7 +175,10 @@ bool AnalysisPredictor::PrepareScope(
     status_is_cloned_ = true;
   } else {
     paddle::framework::InitDevices(false);
-    scope_.reset(new paddle::framework::Scope());
+    scope_.reset(new paddle::framework::Scope(), [&](framework::Scope *scope) {
+      delete scope;
+      memory::Release(place_);
+    });
     status_is_cloned_ = false;
   }
   sub_scope_ = &scope_->NewScope();
@@ -591,7 +594,6 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     gflags.push_back("--allocator_strategy=thread_local");
     process_level_allocator_enabled = false;
   } else {
-    gflags.push_back("--allocator_strategy=naive_best_fit");
     process_level_allocator_enabled = true;
   }
 
@@ -890,6 +892,11 @@ bool AnalysisPredictor::LoadParameters() {
   return true;
 }
 
+uint64_t AnalysisPredictor::TryShrinkMemory() {
+  ClearIntermediateTensor();
+  return paddle::memory::Release(place_);
+}
+
 void AnalysisPredictor::ClearIntermediateTensor() {
   PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
                           platform::errors::PreconditionNotMet(
@@ -985,6 +992,8 @@ AnalysisPredictor::~AnalysisPredictor() {
     mkldnn_quantizer_ = nullptr;
   }
 #endif
+
+  memory::Release(place_);
 }
 
 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
@@ -1142,6 +1151,8 @@ void Predictor::ClearIntermediateTensor() {
   predictor_->ClearIntermediateTensor();
 }
 
+uint64_t Predictor::TryShrinkMemory() { return predictor_->TryShrinkMemory(); }
+
 int GetNumBytesOfDataType(DataType dtype) {
   switch (dtype) {
     case DataType::FLOAT32:
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 269f2fd80bb47d1d2a57a4d469a0574e7aae856a..35b52fa56d63aa8f4f9cad5bd07d0722e6abb57f 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -193,6 +193,17 @@ class AnalysisPredictor : public PaddlePredictor {
   ///
   void ClearIntermediateTensor();
 
+  ///
+  /// \brief Release all temporary tensors to compress the size of the
+  /// memory pool. The memory pool is considered to be composed of a list
+  /// of chunks; a chunk that is not occupied can be released.
+  ///
+  /// \return Number of bytes released. It may be smaller than the memory
+  /// actually freed, because part of the memory is not managed by the
+  /// MemoryPool.
+  ///
+  uint64_t TryShrinkMemory() override;
+
   ///
   /// \brief Get the argument used by predictor
   ///
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 5766919f08e68832886b88b867bc48afa288a955..67c9b441e261936b6ede9fa76e825bc853b8df5a 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -135,6 +135,7 @@ TEST(AnalysisPredictor, ZeroCopy) {
   auto* out_data = out->data<float>(&place, &size);
   LOG(INFO) << "output size: " << size / sizeof(float);
   LOG(INFO) << "output_data: " << out_data;
+  predictor->TryShrinkMemory();
 }
 
 TEST(AnalysisPredictor, Clone) {
@@ -253,8 +254,7 @@ class MkldnnQuantizerTest : public testing::Test {
  public:
  MkldnnQuantizerTest() {
    AnalysisConfig config(FLAGS_dirname);
-
-    predictor.reset(new AnalysisPredictor(config));
+    predictor = std::move(CreatePaddlePredictor(config));
     auto* predictor_p = static_cast<AnalysisPredictor *>(predictor.get());
     auto qconfig = new MkldnnQuantizerConfig();
 
@@ -507,3 +507,45 @@ TEST(AnalysisPredictor, bf16_pass_strategy) {
 }
 
 }  // namespace paddle
+
+namespace paddle_infer {
+
+TEST(Predictor, Run) {
+  Config config;
+  config.SetModel(FLAGS_dirname);
+
+  auto predictor = CreatePredictor(config);
+
+  auto w0 = predictor->GetInputHandle("firstw");
+  auto w1 = predictor->GetInputHandle("secondw");
+  auto w2 = predictor->GetInputHandle("thirdw");
+  auto w3 = predictor->GetInputHandle("forthw");
+
+  w0->Reshape({4, 1});
+  w1->Reshape({4, 1});
+  w2->Reshape({4, 1});
+  w3->Reshape({4, 1});
+
+  auto* w0_data = w0->mutable_data<int64_t>(PlaceType::kCPU);
+  auto* w1_data = w1->mutable_data<int64_t>(PlaceType::kCPU);
+  auto* w2_data = w2->mutable_data<int64_t>(PlaceType::kCPU);
+  auto* w3_data = w3->mutable_data<int64_t>(PlaceType::kCPU);
+
+  for (int i = 0; i < 4; i++) {
+    w0_data[i] = i;
+    w1_data[i] = i;
+    w2_data[i] = i;
+    w3_data[i] = i;
+  }
+
+  predictor->Run();
+
+  auto out = predictor->GetOutputHandle("fc_1.tmp_2");
+  PlaceType place;
+  int size = 0;
+  out->data<float>(&place, &size);
+  LOG(INFO) << "output size: " << size / sizeof(float);
+  predictor->TryShrinkMemory();
+}
+
+}  // namespace paddle_infer
diff --git a/paddle/fluid/inference/api/api_tester.cc b/paddle/fluid/inference/api/api_tester.cc
index 988ffc47292b58fba6f9021d5326e218314f454a..0c717f0fae03cedcf19ca5d2d93f07f7e96a8085 100644
--- a/paddle/fluid/inference/api/api_tester.cc
+++ b/paddle/fluid/inference/api/api_tester.cc
@@ -60,6 +60,7 @@ TEST(paddle_inference_api, demo) {
   auto predictor = CreatePaddlePredictor(config);
   std::vector<PaddleTensor> outputs;
   predictor->Run({}, &outputs);
+  predictor->TryShrinkMemory();
 }
 
 TEST(paddle_inference_api, get_version) {
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index 064f63542683a0d95985382385b182d794da0068..9fd198fb5a4736b968f16f0f0bda6b3bc808b090 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -319,6 +319,17 @@ class PD_INFER_DECL PaddlePredictor {
   ///
   virtual void ClearIntermediateTensor() {}
 
+  ///
+  /// \brief Release all temporary tensors to compress the size of the
+  /// memory pool. The memory pool is considered to be composed of a list
+  /// of chunks; a chunk that is not occupied can be released.
+  ///
+  /// \return Number of bytes released. It may be smaller than the memory
+  /// actually freed, because part of the memory is not managed by the
+  /// MemoryPool.
+  ///
+  virtual uint64_t TryShrinkMemory() { return 0; }
+
   /// \brief Clone an existing predictor
   /// When using clone, the same network will be created,
   /// and the parameters between them are shared.
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 5dc4430fde4715fe11c19ce8adc7397f77391fc3..2e1e3b822d164d995be95d3dad6a7752371b7636 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -224,6 +224,17 @@ class PD_INFER_DECL Predictor {
   /// \brief Clear the intermediate tensors of the predictor
   void ClearIntermediateTensor();
 
+  ///
+  /// \brief Release all temporary tensors to compress the size of the
+  /// memory pool. The memory pool is considered to be composed of a list
+  /// of chunks; a chunk that is not occupied can be released.
+  ///
+  /// \return Number of bytes released. It may be smaller than the memory
+  /// actually freed, because part of the memory is not managed by the
+  /// MemoryPool.
+  ///
+  uint64_t TryShrinkMemory();
+
  private:
   std::unique_ptr<paddle::PaddlePredictor> predictor_;
 };
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index a0cb096193fcd55372003c4e94b2e946d5eb6e97..7f3fe410464ede4c58396d6212c7ee4446047ebc 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -566,6 +566,7 @@ void BindAnalysisPredictor(py::module *m) {
       .def("zero_copy_run", &AnalysisPredictor::ZeroCopyRun)
       .def("clear_intermediate_tensor",
           &AnalysisPredictor::ClearIntermediateTensor)
+      .def("try_shrink_memory", &AnalysisPredictor::TryShrinkMemory)
      .def("create_feed_fetch_var", &AnalysisPredictor::CreateFeedFetchVar)
      .def("prepare_feed_fetch", &AnalysisPredictor::PrepareFeedFetch)
      .def("prepare_argument", &AnalysisPredictor::PrepareArgument)
@@ -593,6 +594,7 @@ void BindPaddleInferPredictor(py::module *m) {
       .def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle)
       .def("run", &paddle_infer::Predictor::Run)
       .def("clone", &paddle_infer::Predictor::Clone)
+      .def("try_shrink_memory", &paddle_infer::Predictor::TryShrinkMemory)
       .def("clear_intermediate_tensor",
            &paddle_infer::Predictor::ClearIntermediateTensor);
 }
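For context only, not part of the patch: a minimal sketch of how the new `TryShrinkMemory()` entry point could be driven through the `paddle_infer` C++ API. The model directory `./my_model` is a placeholder, and the input-feeding step is elided; only the shrink call itself comes from this change.

```cpp
// Hypothetical standalone usage sketch; "./my_model" is a placeholder path.
#include <cstdint>
#include <iostream>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("./my_model");  // placeholder model directory

  auto predictor = paddle_infer::CreatePredictor(config);

  // ... feed inputs via GetInputHandle()/mutable_data() and call Run(),
  // as in TEST(Predictor, Run) above ...

  // Return the unoccupied chunks of the memory pool to the system. The
  // reported value is a lower bound, since some allocations bypass the pool.
  uint64_t released = predictor->TryShrinkMemory();
  std::cout << "released " << released << " bytes" << std::endl;
  return 0;
}
```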