From 5c68e79d78372b73ad9b74fe1b32259da577355c Mon Sep 17 00:00:00 2001
From: lidanqing
Date: Wed, 16 Jun 2021 10:31:23 +0800
Subject: [PATCH] [cherry pick] Fix issue #33021 setCacheCapacity could not
 limit memory consumption (#33571)

* [oneDNN] First fix to #33021 (#33174)

* - First fix to #33021

* [oneDNN] Second fix to #33021 (#33471)

* use older download_data function

Co-authored-by: Jacek Czaja
---
 .../fluid/inference/api/analysis_predictor.cc |  12 +-
 .../fluid/inference/tests/api/CMakeLists.txt  |   9 +-
 ...nalyzer_detect_functional_mkldnn_tester.cc | 166 ++++++++++++++++++
 paddle/fluid/platform/device_context.cc       |  31 +++-
 paddle/fluid/platform/device_context.h        |  15 +-
 5 files changed, 212 insertions(+), 21 deletions(-)
 create mode 100644 paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 42793595e1..215174c12c 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -343,8 +343,6 @@ void AnalysisPredictor::MkldnnPreSet(
     platform::MKLDNNDeviceContext::tls().set_cur_mkldnn_session_id(
         platform::MKLDNNDeviceContextThreadLocals::
             kMKLDNNSessionID_CacheClearing);
-    platform::MKLDNNDeviceContext::tls().set_cur_input_shape_cache_capacity(
-        config_.mkldnn_cache_capacity_);
     // Set current_input_shape for caching dynamic shape.
     std::stringstream ss;
     for (size_t i = 0; i < inputs_shape.size(); ++i) {
@@ -355,6 +353,9 @@ void AnalysisPredictor::MkldnnPreSet(
     VLOG(2) << "Set input shape=" << ss.str();
     platform::MKLDNNDeviceContext::tls().set_cur_input_shape_str(ss.str());
   }
+  platform::MKLDNNDeviceContext::tls().set_cur_input_shape_cache_capacity(
+      config_.mkldnn_cache_capacity_);
+
 #endif
 }
 
@@ -370,10 +371,9 @@ void AnalysisPredictor::MkldnnPostReset() {
       CHECK_LE(shape_blob_size,
                static_cast<size_t>(config_.mkldnn_cache_capacity_));
     }
-    paddle::platform::MKLDNNDeviceContext::tls().set_cur_mkldnn_session_id(
-        platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default);
-    platform::MKLDNNDeviceContext::tls().set_cur_input_shape_cache_capacity(0);
-    platform::MKLDNNDeviceContext::tls().set_cur_input_shape_str("");
+    // We cannot reset to the default cache settings here,
+    // as CopyToCPU may still be called afterwards and it uses oneDNN
+    // primitives, so the cache would keep growing
   }
 #endif
 }
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index f74cd671d6..0df442d332 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -285,11 +285,10 @@ inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_te
 # densebox
 set(DENSEBOX_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/densebox")
 download_data(${DENSEBOX_INSTALL_DIR} "densebox.tar.gz")
-#inference_analysis_test(test_analyzer_detect SRCS analyzer_detect_tester.cc
-#        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-#        ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt
-#        --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt)
-#set_property(TEST test_analyzer_detect PROPERTY ENVIRONMENT GLOG_vmodule=analysis_predictor=2)
+inference_analysis_test(test_analyzer_detect_functional_mkldnn SRCS analyzer_detect_functional_mkldnn_tester.cc
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt
+        --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt)
 
 # mobilenet with transpose op
 set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc
new file mode 100644
index 0000000000..384bef8a4b
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc
@@ -0,0 +1,166 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <fstream>
+#include <iostream>
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
+
+DEFINE_string(infer_shape, "", "data shape file");
+DEFINE_int32(sample, 20, "number of sample");
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+struct Record {
+  std::vector<float> data;
+  std::vector<int> shape;
+};
+
+Record ProcessALine(const std::string &line, const std::string &shape_line) {
+  VLOG(3) << "process a line";
+
+  Record record;
+  std::vector<std::string> data_strs;
+  split(line, ' ', &data_strs);
+  for (auto &d : data_strs) {
+    record.data.push_back(std::stof(d));
+  }
+
+  std::vector<std::string> shape_strs;
+  split(shape_line, ' ', &shape_strs);
+  for (auto &s : shape_strs) {
+    record.shape.push_back(std::stoi(s));
+  }
+  return record;
+}
+
+void SetConfig(AnalysisConfig *cfg) {
+  cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
+  cfg->DisableGpu();
+  // cfg->SwitchIrDebug();  // Enable to have graphs dumped
+  cfg->SwitchSpecifyInputNames(false);
+  cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads);
+}
+
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
+              const std::string &line, const std::string &shape_line) {
+  auto record = ProcessALine(line, shape_line);
+
+  PaddleTensor input;
+  input.shape = record.shape;
+  input.dtype = PaddleDType::FLOAT32;
+  size_t input_size = record.data.size() * sizeof(float);
+  input.data.Resize(input_size);
+  memcpy(input.data.data(), record.data.data(), input_size);
+  std::vector<PaddleTensor> input_slots;
+  input_slots.assign({input});
+  (*inputs).emplace_back(input_slots);
+}
+
+#ifdef PADDLE_WITH_MKLDNN
+int GetNumCachedObjects(void) {
+  auto &pool = platform::DeviceContextPool::Instance();
+  platform::CPUPlace place;
+  auto onednn_dev_ctx =
+      dynamic_cast<platform::MKLDNNDeviceContext *>(pool.Get(place));
+  return onednn_dev_ctx->GetCachedObjectsNumber();
+}
+
+void validate_cache_onednn(int cache_capacity = 1) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  cfg.EnableMKLDNN();
+  cfg.SetMkldnnCacheCapacity(cache_capacity);
+
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  std::vector<std::vector<PaddleTensor>> ref_outputs;
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+
+  std::ifstream file(FLAGS_infer_data);
+  std::ifstream infer_file(FLAGS_infer_shape);
+  std::vector<std::string> lines;
+  std::vector<std::string> shape_lines;
+
+  // Let's work with 4 samples
+  auto num_samples = 4;
+  ref_outputs.resize(num_samples);
+  lines.resize(num_samples);
+  shape_lines.resize(num_samples);
+
+  // Let's remember the number of cached objects before
+  // execution and after every single execution
+  std::vector<int> cache_filling;
+  cache_filling.push_back(GetNumCachedObjects());
+
+  // compute predictions sequentially
+  for (int i = 0; i < num_samples; ++i) {
+    std::getline(file, lines[i]);
+    std::getline(infer_file, shape_lines[i]);
+    SetInput(&input_slots_all, lines[i], shape_lines[i]);
+    predictor->Run(input_slots_all[i], &ref_outputs[i], FLAGS_batch_size);
+    // record number of cached objects
+    cache_filling.push_back(GetNumCachedObjects());
+  }
+
+  file.close();
+  infer_file.close();
+
+  // Pick first output tensor from model
+  // as internally reorders may be called
+  // so it will impact cache size
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputTensor(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  size_t out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                   std::multiplies<int>());
+  std::vector<float> out_data;
+  out_data.resize(out_num);
+  output_t->CopyToCpu(out_data.data());
+
+  // Release predictor (relevant cache should be emptied)
+  predictor.reset(nullptr);
+  cache_filling.push_back(GetNumCachedObjects());
+
+  // Compare results
+  // First and last value should be equal e.g. before using cache (empty) and
+  // after releasing executor
+  PADDLE_ENFORCE_EQ(
+      cache_filling[0], cache_filling[cache_filling.size() - 1],
+      platform::errors::Fatal("Cache size before execution and after "
+                              "releasing Executor do not match"));
+
+  // Iterate to check that the cache is not increasing
+  // after exceeding the cache capacity
+  if (cache_capacity != 0) {
+    for (int i = cache_capacity + 1; i < num_samples + 1; ++i) {
+      PADDLE_ENFORCE_EQ(
+          cache_filling[cache_capacity], cache_filling[i],
+          platform::errors::Fatal("Cache capacity should not increase "
+                                  "after full capacity is used"));
+    }
+  }
+}
+
+TEST(Analyzer_detect, validate_cache_onednn) {
+  validate_cache_onednn(2 /*cache_capacity */);
+}
+#endif
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 9a47ac4546..fcb60b27b1 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -537,7 +537,7 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
 
 MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
     : CPUDeviceContext(place), p_blobmap_() {
   p_blobmap_.reset(new BlobMap());
-  p_exec_items_.reset(new ExecMap());
+  p_exec_items_.reset(new ExecShape());
   p_mutex_.reset(new std::mutex());
 }
@@ -618,10 +618,15 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
     if (ptr == nullptr) {
       p_blobmap_->clear();
     } else {
-      for (auto& v : (*p_exec_items_)[ptr]) {
-        (v.first)->erase(v.second);
+      // Iterate through all shapes and release
+      // for each shape and active executor all entries
+      // of this executor
+      for (auto& s : *p_exec_items_) {
+        for (auto& v : (*s.second)[ptr]) {
+          (v.first)->erase(v.second);
+        }
+        s.second->erase(ptr);
       }
-      p_exec_items_->erase(ptr);
     }
   } else {
     VLOG(3) << "Prevented Clearing DNNL cache.";
@@ -629,11 +634,24 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
   }
 }
 
+void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const {
+  p_exec_items_->erase(p_exec_items_->begin());
+}
+
 void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t<KeyBlob> pblob,
                                                 KeyBlob::iterator it) const {
+  // Take current input shape from TLS
   // Take current executor addess from TLS
   // and for this executor's items add the one defined with arguments
-  (*p_exec_items_)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it));
+  auto key_it = p_exec_items_
+                    ->insert(std::make_pair(tls().cur_input_shape_str,
+                                            std::make_shared<ExecMap>()))
+                    .first;
+  (*key_it->second)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it));
+
+  VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size()
+          << " curr exec size: "
+          << (*key_it->second)[tls().get_curr_exec()].size() << "\n";
 }
 
 void MKLDNNDeviceContext::BlockNextCacheClearing() {
@@ -690,6 +708,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
       VLOG(2) << "sid=" << sid
               << ", remove all blobs of shape: " << sBlob->begin()->first;
       sBlob->erase(sBlob->begin()->first);
+      RemoveShapeEntriesWithExecutor();
     }
     pBlob = std::make_shared<KeyBlob>();
     (*sBlob)[tls().cur_input_shape_str] = pBlob;
@@ -713,7 +732,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
   return;
 }
 
-unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) {
+unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const {
   unsigned int num_entries = 0;
   for (auto const& l3 : *p_blobmap_) {
     for (auto const& l2 : *(l3.second)) {
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index a0baf5e811..43c56eecad 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -728,8 +728,14 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   using ShapeBlob = umap_key_string_t<KeyBlob>;
   using BlobMap = umap_value_smart_t<int, ShapeBlob>;
 
-  using ExecMap = std::unordered_map<
-      void*, std::vector<std::pair<BlobPtr_t<KeyBlob>, KeyBlob::iterator>>>;
+  // Auxiliary two-level structure (shape, executor) to make it easier to
+  // clear cache objects related to a specific executor
+
+  using ExecKey = void*;
+  using ExecMapCacheIterPair = std::pair<BlobPtr_t<KeyBlob>, KeyBlob::iterator>;
+  using ExecMap =
+      std::unordered_map<ExecKey, std::vector<ExecMapCacheIterPair>>;
+  using ExecShape = std::unordered_map<std::string, std::shared_ptr<ExecMap>>;
 
   explicit MKLDNNDeviceContext(CPUPlace place);
 
@@ -738,6 +744,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
 
   // Register object to currently used executor's map
   void LinkEntryWithExecutor(BlobPtr_t<KeyBlob>, KeyBlob::iterator) const;
+  void RemoveShapeEntriesWithExecutor(void) const;
 
   // Remove all entries from the blob map
   void ResetBlobMap(void* ptr);
@@ -752,7 +759,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   void SetBlob(const std::string& name, std::shared_ptr<void> data) const;
 
   // Calculate number of oneDNN objects cached
-  unsigned int GetCachedObjectsNumber(void);
+  unsigned int GetCachedObjectsNumber(void) const;
 
   // Find a saved blob. Return nullptr if not found
   std::shared_ptr<void> GetBlob(const std::string& name) const;
@@ -765,7 +772,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   std::shared_ptr<BlobMap> p_blobmap_;
   // Map key is pointer of executor and value is a data(iterator in map) needed
   // to erase
-  std::shared_ptr<ExecMap> p_exec_items_;
+  std::shared_ptr<ExecShape> p_exec_items_;
   std::shared_ptr<std::mutex> p_mutex_;
   bool block_next_cache_clearing_ = false;
 };
-- 
GitLab
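Appendix (not part of the patch): a minimal usage sketch of the cache capacity setting that this fix makes effective, assuming the public C++ inference API from paddle_inference_api.h. The model paths are placeholders and the capacity value is arbitrary; the calls mirror the ones exercised by the new tester above.

// Sketch only: placeholder paths, arbitrary capacity; mirrors the tester's API calls.
#include <memory>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig cfg;
  cfg.SetModel("/path/to/model", "/path/to/params");  // placeholder paths
  cfg.DisableGpu();
  cfg.EnableMKLDNN();
  // Keep oneDNN primitives cached for at most 10 distinct input shapes;
  // with this fix the cache no longer grows past that bound.
  cfg.SetMkldnnCacheCapacity(10);

  auto predictor = paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(cfg);

  std::vector<paddle::PaddleTensor> inputs, outputs;
  // ... fill `inputs` with one PaddleTensor per feed and run once per sample ...
  predictor->Run(inputs, &outputs);
  return 0;
}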