[cherry pick] Fix issue #33021 setCacheCapacity could not limit memory consumption (#33571)

* [oneDNN] First fix to #33021 (#33174) * - First fix to #33021 * [oneDNN] Second fix to #33021 (#33471) * use older download_data function Co-authored-by: N Jacek Czaja <jacek.czaja@intel.com>

[cherry pick] Fix issue #33021 setCacheCapacity could not limit memory consumption (#33571)
* [oneDNN] First fix to #33021 (#33174) * - First fix to #33021 * [oneDNN] Second fix to #33021 (#33471) * use older download_data function Co-authored-by: N Jacek Czaja <jacek.czaja@intel.com>
5c68e79d · lidanqing · GitHub · e5bd7eb8 · 5c68e79d · 5c68e79d
5 changed file
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -343,8 +343,6 @@ void AnalysisPredictor::MkldnnPreSet(
    platform::MKLDNNDeviceContext::tls().set_cur_mkldnn_session_id(
        platform::MKLDNNDeviceContextThreadLocals::
            kMKLDNNSessionID_CacheClearing);
-    platform::MKLDNNDeviceContext::tls().set_cur_input_shape_cache_capacity(
-        config_.mkldnn_cache_capacity_);
    // Set current_input_shape for caching dynamic shape.
    std::stringstream ss;
    for (size_t i = 0; i < inputs_shape.size(); ++i) {
@@ -355,6 +353,9 @@ void AnalysisPredictor::MkldnnPreSet(
    VLOG(2) << "Set input shape=" << ss.str();
    platform::MKLDNNDeviceContext::tls().set_cur_input_shape_str(ss.str());
  }
+  platform::MKLDNNDeviceContext::tls().set_cur_input_shape_cache_capacity(
+      config_.mkldnn_cache_capacity_);
+
 #endif
 }

@@ -370,10 +371,9 @@ void AnalysisPredictor::MkldnnPostReset() {
      CHECK_LE(shape_blob_size,
               static_cast<size_t>(config_.mkldnn_cache_capacity_));
    }
-    paddle::platform::MKLDNNDeviceContext::tls().set_cur_mkldnn_session_id(
-        platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default);
-    platform::MKLDNNDeviceContext::tls().set_cur_input_shape_cache_capacity(0);
-    platform::MKLDNNDeviceContext::tls().set_cur_input_shape_str("");
+    // We cannot reset to the default cache settings here:
+    // CopyToCpu may still be called afterwards and it uses
+    // oneDNN primitives, so the cache would grow
  }
 #endif
 }

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -285,11 +285,10 @@ inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_te
 # densebox
 set(DENSEBOX_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/densebox")
 download_data(${DENSEBOX_INSTALL_DIR} "densebox.tar.gz")
-#inference_analysis_test(test_analyzer_detect SRCS analyzer_detect_tester.cc 
-#  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-#  ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt 
-#       --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt)
-#set_property(TEST test_analyzer_detect PROPERTY ENVIRONMENT GLOG_vmodule=analysis_predictor=2)
+inference_analysis_test(test_analyzer_detect_functional_mkldnn SRCS analyzer_detect_functional_mkldnn_tester.cc 
+  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+  ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt 
+       --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt)

 # mobilenet with transpose op
 set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")

--- a/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <fstream>
+#include <iostream>
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
+
+DEFINE_string(infer_shape, "", "data shape file");
+DEFINE_int32(sample, 20, "number of sample");
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+struct Record {
+  std::vector<float> data;  // values parsed from one line of the data file
+  std::vector<int32_t> shape;  // dimensions parsed from the matching shape line
+};
+
+// Builds a Record from one sample: `line` carries the space-separated float
+// values and `shape_line` the space-separated integer dimensions.
+Record ProcessALine(const std::string &line, const std::string &shape_line) {
+  VLOG(3) << "process a line";
+
+  Record parsed;
+
+  // Tokenize and convert the tensor values.
+  std::vector<std::string> value_tokens;
+  split(line, ' ', &value_tokens);
+  for (const auto &token : value_tokens) {
+    parsed.data.push_back(std::stof(token));
+  }
+
+  // Tokenize and convert the tensor dimensions.
+  std::vector<std::string> dim_tokens;
+  split(shape_line, ' ', &dim_tokens);
+  for (const auto &token : dim_tokens) {
+    parsed.shape.push_back(std::stoi(token));
+  }
+
+  return parsed;
+}
+
+// Points `cfg` at the model/params files under FLAGS_infer_model, disables
+// the GPU and applies the CPU math library thread count from
+// FLAGS_cpu_num_threads.
+void SetConfig(AnalysisConfig *cfg) {
+  const std::string model_path = FLAGS_infer_model + "/model";
+  const std::string params_path = FLAGS_infer_model + "/params";
+  cfg->SetModel(model_path, params_path);
+  cfg->DisableGpu();
+  // cfg->SwitchIrDebug(); // Enable to have graphs dumped
+  cfg->SwitchSpecifyInputNames(false);
+  cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads);
+}
+
+// Parses one sample from `line`/`shape_line`, wraps it in a FLOAT32
+// PaddleTensor and appends it to `inputs` as a single-slot input.
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
+              const std::string &line, const std::string &shape_line) {
+  const Record sample = ProcessALine(line, shape_line);
+  const size_t num_bytes = sample.data.size() * sizeof(float);
+
+  PaddleTensor tensor;
+  tensor.shape = sample.shape;
+  tensor.dtype = PaddleDType::FLOAT32;
+  // Size the tensor buffer first, then copy the raw float payload into it.
+  tensor.data.Resize(num_bytes);
+  memcpy(tensor.data.data(), sample.data.data(), num_bytes);
+
+  std::vector<PaddleTensor> slots;
+  slots.assign({tensor});
+  inputs->emplace_back(slots);
+}
+
+#ifdef PADDLE_WITH_MKLDNN
+// Returns the number of oneDNN objects currently cached by the
+// MKLDNNDeviceContext that DeviceContextPool::Instance() holds for CPUPlace.
+int GetNumCachedObjects(void) {
+  auto &pool = platform::DeviceContextPool::Instance();
+  platform::CPUPlace place;
+  auto onednn_dev_ctx =
+      dynamic_cast<platform::MKLDNNDeviceContext *>(pool.Get(place));
+  // dynamic_cast yields nullptr when the CPU context is not an
+  // MKLDNNDeviceContext; report that explicitly instead of dereferencing
+  // a null pointer.
+  PADDLE_ENFORCE_NOT_NULL(
+      onednn_dev_ctx,
+      platform::errors::Fatal("CPU device context is not an "
+                              "MKLDNNDeviceContext"));
+  // GetCachedObjectsNumber() returns unsigned int; make the narrowing
+  // to the int used by the test explicit.
+  return static_cast<int>(onednn_dev_ctx->GetCachedObjectsNumber());
+}
+
+// Runs 4 samples through an MKLDNN-enabled predictor, recording the number
+// of cached oneDNN objects before the first run, after every run and after
+// the predictor is released. It then enforces that:
+//   * the count after releasing the predictor equals the initial count;
+//   * for a positive `cache_capacity`, the count recorded after
+//     `cache_capacity` runs is not exceeded by any later run.
+// Input samples come from FLAGS_infer_data / FLAGS_infer_shape.
+void validate_cache_onednn(int cache_capacity = 1) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  cfg.EnableMKLDNN();
+  cfg.SetMkldnnCacheCapacity(cache_capacity);
+
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  std::vector<std::vector<PaddleTensor>> ref_outputs;
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+
+  std::ifstream file(FLAGS_infer_data);
+  std::ifstream infer_file(FLAGS_infer_shape);
+  // Fail early with a clear message instead of feeding empty lines to the
+  // parser when one of the input files is missing.
+  PADDLE_ENFORCE_EQ(file.is_open(), true,
+                    platform::errors::NotFound("Cannot open data file %s",
+                                               FLAGS_infer_data));
+  PADDLE_ENFORCE_EQ(infer_file.is_open(), true,
+                    platform::errors::NotFound("Cannot open shape file %s",
+                                               FLAGS_infer_shape));
+  std::vector<std::string> lines;
+  std::vector<std::string> shape_lines;
+
+  // Let's work with 4 samples
+  const int num_samples = 4;
+  ref_outputs.resize(num_samples);
+  lines.resize(num_samples);
+  shape_lines.resize(num_samples);
+
+  // Let's remember number of cached objects before
+  // execution and after every single execution
+  std::vector<int> cache_filling;
+  cache_filling.push_back(GetNumCachedObjects());
+
+  // compute sequentially prediction
+  for (int i = 0; i < num_samples; ++i) {
+    // Both files must provide a line for every sample.
+    PADDLE_ENFORCE_EQ(
+        static_cast<bool>(std::getline(file, lines[i])), true,
+        platform::errors::InvalidArgument(
+            "Data file has fewer lines than the requested samples"));
+    PADDLE_ENFORCE_EQ(
+        static_cast<bool>(std::getline(infer_file, shape_lines[i])), true,
+        platform::errors::InvalidArgument(
+            "Shape file has fewer lines than the requested samples"));
+    SetInput(&input_slots_all, lines[i], shape_lines[i]);
+    predictor->Run(input_slots_all[i], &ref_outputs[i], FLAGS_batch_size);
+    // record number of cached objects
+    cache_filling.push_back(GetNumCachedObjects());
+  }
+
+  file.close();
+  infer_file.close();
+
+  // Fetch the first output tensor of the model and copy it to the CPU:
+  // reorders may be called internally during the copy, which
+  // would impact the cache size
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputTensor(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  size_t out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                   std::multiplies<int>());
+  std::vector<float> out_data;
+  out_data.resize(out_num);
+  output_t->CopyToCpu(out_data.data());
+
+  // Release predictor (relevant cache should be emptied)
+  predictor.reset(nullptr);
+  cache_filling.push_back(GetNumCachedObjects());
+
+  // Compare results
+  // First and last value should be equal e.g. before using cache (empty) and
+  // after releasing executor
+  PADDLE_ENFORCE_EQ(
+      cache_filling[0], cache_filling[cache_filling.size() - 1],
+      platform::errors::Fatal("Cache size before execution and after "
+                              "releasing Executor do not match"));
+
+  // Check that the cache stops growing once the capacity has been
+  // reached: cache_filling[k] is the count recorded after k runs, so every
+  // later run must match cache_filling[cache_capacity].
+  // Only positive capacities are checked; a negative value would
+  // index cache_filling out of bounds.
+  if (cache_capacity > 0) {
+    for (int i = cache_capacity + 1; i < num_samples + 1; ++i) {
+      PADDLE_ENFORCE_EQ(
+          cache_filling[cache_capacity], cache_filling[i],
+          platform::errors::Fatal("Cache capacity should not increase "
+                                  "after full capacity is used"));
+    }
+  }
+}
+
+TEST(Analyzer_detect, validate_cache_onednn) {
+  validate_cache_onednn(2 /* cache_capacity */);  // below the 4 samples run
+}
+#endif
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -537,7 +537,7 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
 MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
    : CPUDeviceContext(place), p_blobmap_() {
  p_blobmap_.reset(new BlobMap());
-  p_exec_items_.reset(new ExecMap());
+  p_exec_items_.reset(new ExecShape());
  p_mutex_.reset(new std::mutex());
 }

@@ -618,10 +618,15 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
    if (ptr == nullptr) {
      p_blobmap_->clear();
    } else {
-      for (auto& v : (*p_exec_items_)[ptr]) {
-        (v.first)->erase(v.second);
+      // Iterate through all shapes and, for each shape, release
+      // all cache entries registered by this executor, then
+      // drop the executor's record for that shape
+      for (auto& s : *p_exec_items_) {
+        for (auto& v : (*s.second)[ptr]) {
+          (v.first)->erase(v.second);
+        }
+        s.second->erase(ptr);
      }
-      p_exec_items_->erase(ptr);
    }
  } else {
    VLOG(3) << "Prevented Clearing DNNL cache.";
@@ -629,11 +634,24 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
  }
 }

+// Drops the per-executor bookkeeping of one shape: the entry found at
+// p_exec_items_->begin(). Does nothing when no shape is registered.
+// NOTE(review): SetBlob evicts sBlob->begin() from the blob map right before
+// calling this. p_exec_items_ is an unordered map, so the shape removed here
+// is presumably not guaranteed to be the one SetBlob evicted - TODO confirm
+// and consider erasing by shape key instead.
+void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const {
+  // erase(begin()) on an empty map would erase end(), which is undefined
+  // behaviour.
+  if (p_exec_items_->empty()) {
+    return;
+  }
+  p_exec_items_->erase(p_exec_items_->begin());
+}
+
 void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t<KeyBlob> pblob,
                                                KeyBlob::iterator it) const {
+  // Take current input shape from TLS
  // Take current executor addess from TLS
  // and for this executor's items add the one defined with arguments
-  (*p_exec_items_)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it));
+  auto key_it = p_exec_items_
+                    ->insert(std::make_pair(tls().cur_input_shape_str,
+                                            std::make_shared<ExecMap>()))
+                    .first;
+  (*key_it->second)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it));
+
+  VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size()
+          << " curr exec size: "
+          << (*key_it->second)[tls().get_curr_exec()].size() << "\n";
 }

 void MKLDNNDeviceContext::BlockNextCacheClearing() {
@@ -690,6 +708,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
      VLOG(2) << "sid=" << sid
              << ", remove all blobs of shape: " << sBlob->begin()->first;
      sBlob->erase(sBlob->begin()->first);
+      RemoveShapeEntriesWithExecutor();
    }
    pBlob = std::make_shared<KeyBlob>();
    (*sBlob)[tls().cur_input_shape_str] = pBlob;
@@ -713,7 +732,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
  return;
 }

-unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) {
+unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const {
  unsigned int num_entries = 0;
  for (auto const& l3 : *p_blobmap_) {
    for (auto const& l2 : *(l3.second)) {

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -728,8 +728,14 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
  using ShapeBlob = umap_key_string_t<KeyBlob>;
  using BlobMap = umap_value_smart_t<int, ShapeBlob>;

-  using ExecMap = std::unordered_map<
-      void*, std::vector<std::pair<BlobPtr_t<KeyBlob>, KeyBlob::iterator>>>;
+  // Auxiliary two-level structure (shape, executor) that makes it easier to
+  // clear cache objects related to a specific executor
+
+  using ExecKey = void*;
+  using ExecMapCacheIterPair = std::pair<BlobPtr_t<KeyBlob>, KeyBlob::iterator>;
+  using ExecMap =
+      std::unordered_map<ExecKey, std::vector<ExecMapCacheIterPair>>;
+  using ExecShape = std::unordered_map<std::string, std::shared_ptr<ExecMap>>;

  explicit MKLDNNDeviceContext(CPUPlace place);

@@ -738,6 +744,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {

  // Register object to currently used executor's map
  void LinkEntryWithExecutor(BlobPtr_t<KeyBlob>, KeyBlob::iterator) const;
+  void RemoveShapeEntriesWithExecutor(void) const;

  // Remove all entries from the blob map
  void ResetBlobMap(void* ptr);
@@ -752,7 +759,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
  void SetBlob(const std::string& name, std::shared_ptr<void> data) const;

  // Calculate number of oneDNN objects cached
-  unsigned int GetCachedObjectsNumber(void);
+  unsigned int GetCachedObjectsNumber(void) const;

  // Find a saved blob. Return nullptr if not found
  std::shared_ptr<void> GetBlob(const std::string& name) const;
@@ -765,7 +772,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
  std::shared_ptr<BlobMap> p_blobmap_;
  // Map key is pointer of executor and value is a data(iterator in map) needed
  // to erase
-  std::shared_ptr<ExecMap> p_exec_items_;
+  std::shared_ptr<ExecShape> p_exec_items_;
  std::shared_ptr<std::mutex> p_mutex_;
  bool block_next_cache_clearing_ = false;
 };