From 8226d44bee2ea0ac9c67af7f0e6dac90a87c47c3 Mon Sep 17 00:00:00 2001
From: Yan Chunwei
Date: Fri, 30 Nov 2018 13:03:44 +0800
Subject: [PATCH] fix container not clear bug for inference (#14674)

---
 .../fluid/inference/analysis/analysis_pass.h  |  2 --
 .../fluid/inference/api/analysis_predictor.cc | 12 ++++++----
 paddle/fluid/inference/api/api_impl.cc        |  6 ++---
 .../api/details/reset_tensor_array.cc         | 23 +++++++++++++++++++
 .../api/details/reset_tensor_array.h          | 17 ++++++++++++++
 5 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h
index 299f235a74a..d5a972fab3b 100644
--- a/paddle/fluid/inference/analysis/analysis_pass.h
+++ b/paddle/fluid/inference/analysis/analysis_pass.h
@@ -46,8 +46,6 @@ class AnalysisPass {
  protected:
   // User should implement these.
   virtual void RunImpl(Argument* argument) = 0;
-
-  Argument* argument_{nullptr};
 };
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 1862f61f0f4..391330a7c0f 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -190,9 +190,13 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
   }
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
 
-  // Fix TensorArray reuse not cleaned bug.
-  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
-  tensor_array_batch_cleaner_.ResetTensorArray();
+  // All the containers in the scope are held across batches in inference,
+  // but the operators assume each container is reset after every batch.
+  // As a bugfix, collect all the container variables and reset them to a
+  // bool; the next time, the operator will call MutableData and construct
+  // a new container, so each batch starts with an empty one.
+  tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_);
+  tensor_array_batch_cleaner_.ResetNoTensorVars();
   return true;
 }
 
@@ -417,7 +421,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
 bool AnalysisPredictor::ZeroCopyRun() {
   executor_->Run();
   // Fix TensorArray reuse not cleaned bug.
-  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
   tensor_array_batch_cleaner_.ResetTensorArray();
   return true;
 }
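The comment in the hunk above compresses the whole trick into a few lines, so a short illustration may help. Below is a rough sketch, not part of the patch, of what one collected variable goes through across two batches; the helper name and the variable name are hypothetical, while FindVar, Clear, and GetMutable are existing Scope/Variable calls (the patch comment refers to the re-creation step as MutableData):

#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"

// Sketch only: one container variable across two batches.
void DemoContainerReset(paddle::framework::Scope *scope) {  // hypothetical helper
  // After batch N, ResetNoTensorVars() drops the variable's holder, which
  // discards the stale container contents.
  paddle::framework::Variable *var = scope->FindVar("beam_state");  // hypothetical name
  var->Clear();

  // In batch N+1, the operator lazily re-creates the container, so it
  // starts out empty again.
  auto *array = var->GetMutable<paddle::framework::LoDTensorArray>();
  (void)array;  // the operator would now append this batch's items
}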
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 74369e88669..4c5b412a2c1 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -154,9 +154,9 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
   }
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
 
-  // Fix TensorArray reuse not cleaned bug.
-  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
-  tensor_array_batch_cleaner_.ResetTensorArray();
+  // Reset other vector-like containers that are not cleaned after each batch.
+  tensor_array_batch_cleaner_.CollectNoTensorVars(scope_.get());
+  tensor_array_batch_cleaner_.ResetNoTensorVars();
   return true;
 }
 
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc
index 4ae6c6dc9f4..569a487328e 100644
--- a/paddle/fluid/inference/api/details/reset_tensor_array.cc
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc
@@ -46,5 +46,28 @@ void TensorArrayBatchCleaner::ResetTensorArray() {
   }
 }
 
+void TensorArrayBatchCleaner::CollectNoTensorVars(framework::Scope *scope) {
+  if (no_tensor_flag_) {
+    for (auto &var_name : scope->LocalVarNames()) {
+      auto *var = scope->FindVar(var_name);
+      if (!var->IsInitialized()) continue;
+      if (!valid_types_.count(var->Type())) {
+        no_tensor_vars_.insert(var);
+      }
+    }
+
+    for (auto *kid : scope->kids()) {
+      CollectNoTensorVars(kid);
+    }
+    no_tensor_flag_ = false;  // Only collect once.
+  }
+}
+
+void TensorArrayBatchCleaner::ResetNoTensorVars() {
+  for (auto *var : no_tensor_vars_) {
+    var->Clear();
+  }
+}
+
 }  // namespace details
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h
index a39449ff0e6..6a5ea64de66 100644
--- a/paddle/fluid/inference/api/details/reset_tensor_array.h
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.h
@@ -14,9 +14,11 @@
 
 #pragma once
 
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/variable.h"
 
 namespace paddle {
 namespace details {
@@ -24,13 +26,28 @@ namespace details {
 // Clean the TensorArray each batch to make the behavior the same with the
 // training phase.
 struct TensorArrayBatchCleaner {
+  TensorArrayBatchCleaner() {
+    valid_types_.insert(typeid(framework::Tensor));
+    valid_types_.insert(typeid(framework::LoDTensor));
+  }
+  // Collect the variables that are not Tensor or LoDTensor, and reset them
+  // to a bool (a trick), because some of them are containers and some
+  // operators keep inserting new items without clearing them first, so the
+  // memory grows larger and larger in inference services deployed online.
+  void CollectNoTensorVars(framework::Scope *scope);
+  void ResetNoTensorVars();
+
   // Fix the tensor array not clear in the inference scenarios.
   void CollectTensorArrays(framework::Scope *scope);
   void ResetTensorArray();
 
  private:
   bool flag_{true};
+  bool no_tensor_flag_{true};
   std::vector<framework::LoDTensorArray *> arrays_;
+
+  std::unordered_set<std::type_index> valid_types_;
+  std::unordered_set<framework::Variable *> no_tensor_vars_;
 };
 
 }  // namespace details
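For completeness, here is a hedged sketch of how a predictor is expected to drive the new cleaner on every batch, mirroring the Run() changes above; RunOneBatch and its signature are illustrative only, not part of the Paddle API:

#include "paddle/fluid/inference/api/details/reset_tensor_array.h"

// Illustrative only: mirrors the calls AnalysisPredictor::Run() makes after
// executing each batch.
bool RunOneBatch(paddle::framework::Scope *scope,
                 paddle::details::TensorArrayBatchCleaner *cleaner) {
  // ... execute the inference program against `scope` ...

  // The first call records every initialized variable in `scope` (and its
  // kid scopes) whose type is not Tensor/LoDTensor; no_tensor_flag_ makes
  // this a one-time scan. The second call clears the recorded variables so
  // the next batch starts with empty containers.
  cleaner->CollectNoTensorVars(scope);
  cleaner->ResetNoTensorVars();
  return true;
}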