BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit 1382cd22 (unverified)
Authored Jun 09, 2021 by Jacek Czaja; committed by GitHub on Jun 09, 2021.
[oneDNN] First fix to #33021 (#33174)
* - First fix to #33021
Parent: 32ef95d7

Showing 4 changed files with 193 additions and 15 deletions (+193, −15).
paddle/fluid/inference/tests/api/CMakeLists.txt (+4 −5)
paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc (+153 −0)
paddle/fluid/platform/device_context.cc (+25 −6)
paddle/fluid/platform/device_context.h (+11 −4)
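In outline, the patch replaces the flat executor → entries map (ExecMap) with a two-level shape → executor map (ExecShape). ResetBlobMap can then erase one executor's entries across every shape, and SetBlob can drop a shape's executor records when that shape is evicted (RemoveShapeEntriesWithExecutor). The standalone sketch below models that structure; the toy types (CachedEntry, the link lambda) are illustrative assumptions, not Paddle code:

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

// Simplified model of the patch's bookkeeping: for every input shape,
// remember which cache entries each executor (identified by its address)
// has created, so one executor's entries can be released selectively.
using ExecKey = const void*;      // executor address
using CachedEntry = std::string;  // stand-in for a cached oneDNN object
using ExecMap = std::unordered_map<ExecKey, std::vector<CachedEntry>>;
using ExecShape = std::unordered_map<std::string, std::shared_ptr<ExecMap>>;

int main() {
  ExecShape exec_items;
  int executor1 = 0, executor2 = 0;  // dummies; only their addresses matter

  auto link = [&](const std::string& shape, ExecKey exec, CachedEntry e) {
    // insert-or-fetch: keeps the existing per-shape map if one is present
    auto it = exec_items.insert({shape, std::make_shared<ExecMap>()}).first;
    (*it->second)[exec].push_back(e);
  };
  link("1x3x224x224", &executor1, "conv_fwd_pd");
  link("1x3x224x224", &executor2, "conv_fwd_pd");
  link("2x3x224x224", &executor1, "pool_fwd_pd");

  // Releasing executor1: walk every shape and erase only its entries,
  // leaving executor2's cached objects untouched.
  for (auto& s : exec_items) s.second->erase(&executor1);

  std::cout << (*exec_items["1x3x224x224"]).size() << "\n";  // prints 1
  return 0;
}

With this layout, erasing one shape bucket drops that shape's executor records, and erasing one key from every bucket releases a single executor, which is exactly the pair of operations the diffs below add.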
paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -325,11 +325,10 @@ inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_te
 # densebox
 set(DENSEBOX_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/densebox")
 download_data_without_verify(${DENSEBOX_INSTALL_DIR} "densebox.tar.gz")
-#inference_analysis_test(test_analyzer_detect SRCS analyzer_detect_tester.cc
-#       EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-#       ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt
-#       --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt)
-#set_property(TEST test_analyzer_detect PROPERTY ENVIRONMENT GLOG_vmodule=analysis_predictor=2)
+inference_analysis_test(test_analyzer_detect_functional_mkldnn SRCS analyzer_detect_functional_mkldnn_tester.cc
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt
+        --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt)
 
 # mobilenet with transpose op
 set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc
(new file, mode 0 → 100644)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"

DEFINE_string(infer_shape, "", "data shape file");
DEFINE_int32(sample, 20, "number of sample");

namespace paddle {
namespace inference {
namespace analysis {

struct Record {
  std::vector<float> data;
  std::vector<int32_t> shape;
};

Record ProcessALine(const std::string& line, const std::string& shape_line) {
  VLOG(3) << "process a line";

  Record record;
  std::vector<std::string> data_strs;
  split(line, ' ', &data_strs);
  for (auto& d : data_strs) {
    record.data.push_back(std::stof(d));
  }

  std::vector<std::string> shape_strs;
  split(shape_line, ' ', &shape_strs);
  for (auto& s : shape_strs) {
    record.shape.push_back(std::stoi(s));
  }

  return record;
}

void SetConfig(AnalysisConfig* cfg) {
  cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
  cfg->DisableGpu();
  // cfg->SwitchIrDebug();  // Enable to have graphs dumped
  cfg->SwitchSpecifyInputNames(false);
  cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads);
}

void SetInput(std::vector<std::vector<PaddleTensor>>* inputs,
              const std::string& line, const std::string& shape_line) {
  auto record = ProcessALine(line, shape_line);

  PaddleTensor input;
  input.shape = record.shape;
  input.dtype = PaddleDType::FLOAT32;
  size_t input_size = record.data.size() * sizeof(float);
  input.data.Resize(input_size);
  memcpy(input.data.data(), record.data.data(), input_size);
  std::vector<PaddleTensor> input_slots;
  input_slots.assign({input});
  (*inputs).emplace_back(input_slots);
}

#ifdef PADDLE_WITH_MKLDNN
int GetNumCachedObjects(void) {
  auto& pool = platform::DeviceContextPool::Instance();
  platform::CPUPlace place;
  auto onednn_dev_ctx =
      dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
  return onednn_dev_ctx->GetCachedObjectsNumber();
}

void validate_cache_onednn(int cache_capacity = 1) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  cfg.EnableMKLDNN();
  cfg.SetMkldnnCacheCapacity(cache_capacity);

  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
  std::vector<std::vector<PaddleTensor>> ref_outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;

  std::ifstream file(FLAGS_infer_data);
  std::ifstream infer_file(FLAGS_infer_shape);
  std::vector<std::string> lines;
  std::vector<std::string> shape_lines;

  // Let's work with 4 samples
  auto num_samples = 4;
  ref_outputs.resize(num_samples);
  lines.resize(num_samples);
  shape_lines.resize(num_samples);

  // Let's remember number of cached objects before
  // execution and after every single execution
  std::vector<int> cache_filling;
  cache_filling.push_back(GetNumCachedObjects());

  // compute sequentially prediction
  for (int i = 0; i < num_samples; ++i) {
    std::getline(file, lines[i]);
    std::getline(infer_file, shape_lines[i]);
    SetInput(&input_slots_all, lines[i], shape_lines[i]);
    predictor->Run(input_slots_all[i], &ref_outputs[i], FLAGS_batch_size);
    // record number of cached objects
    cache_filling.push_back(GetNumCachedObjects());
  }

  file.close();
  infer_file.close();

  predictor.reset(nullptr);
  cache_filling.push_back(GetNumCachedObjects());

  // Compare results
  // First and last value should be equal e.g. before using cache (empty) and
  // after releasing executor
  PADDLE_ENFORCE_EQ(
      cache_filling[0], cache_filling[cache_filling.size() - 1],
      platform::errors::Fatal("Cache size before execution and after "
                              "releasing Executor do not match"));

  // Iterate to check if cache is not increasing
  // over exceeding cache capacity
  if (cache_capacity != 0) {
    for (int i = cache_capacity + 1; i < num_samples + 1; ++i) {
      PADDLE_ENFORCE_EQ(
          cache_filling[cache_capacity], cache_filling[i],
          platform::errors::Fatal("Cache capacity should not increase "
                                  "after full capacity is used"));
    }
  }
}

TEST(Analyzer_detect, validate_cache_onednn) {
  validate_cache_onednn(2 /*cache_capacity */);
}
#endif

}  // namespace analysis
}  // namespace inference
}  // namespace paddle
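To make the test's two invariants concrete, here is a self-contained check against a hypothetical cache_filling trace (the sizes 40 and 80 are invented for illustration; only the shape of the sequence matters): 0 before any Run(), growth until the capacity is filled, a plateau afterwards, and a return to the initial size once the predictor is released.

#include <cassert>
#include <vector>

int main() {
  // Hypothetical cache sizes: index 0 before any Run(), indices 1..4 after
  // each sample, and the last entry after predictor.reset(nullptr).
  std::vector<int> cache_filling = {0, 40, 80, 80, 80, 0};
  const int cache_capacity = 2;
  const int num_samples = 4;

  // Invariant 1: empty before execution and again after the executor
  // is released.
  assert(cache_filling.front() == cache_filling.back());

  // Invariant 2: once capacity is reached, later samples must not grow it.
  for (int i = cache_capacity + 1; i < num_samples + 1; ++i) {
    assert(cache_filling[cache_capacity] == cache_filling[i]);
  }
  return 0;
}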
paddle/fluid/platform/device_context.cc
@@ -563,7 +563,7 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
 MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
     : CPUDeviceContext(place), p_blobmap_() {
   p_blobmap_.reset(new BlobMap());
-  p_exec_items_.reset(new ExecMap());
+  p_exec_items_.reset(new ExecShape());
   p_mutex_.reset(new std::mutex());
 }
@@ -644,10 +644,15 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
     if (ptr == nullptr) {
       p_blobmap_->clear();
     } else {
-      for (auto& v : (*p_exec_items_)[ptr]) {
-        (v.first)->erase(v.second);
-      }
-      p_exec_items_->erase(ptr);
+      // Iterate through all shapes and release
+      // for each shape and active executor all entries
+      // of this executor
+      for (auto& s : *p_exec_items_) {
+        for (auto& v : (*s.second)[ptr]) {
+          (v.first)->erase(v.second);
+        }
+        s.second->erase(ptr);
+      }
     }
   } else {
     VLOG(3) << "Prevented Clearing DNNL cache.";
@@ -655,11 +660,24 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
   }
 }
 
+void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const {
+  p_exec_items_->erase(p_exec_items_->begin());
+}
+
 void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t<KeyBlob> pblob,
                                                 KeyBlob::iterator it) const {
+  // Take current input shape from TLS
   // Take current executor address from TLS
   // and for this executor's items add the one defined with arguments
-  (*p_exec_items_)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it));
+  auto key_it = p_exec_items_
+                    ->insert(std::make_pair(tls().cur_input_shape_str,
+                                            std::make_shared<ExecMap>()))
+                    .first;
+  (*key_it->second)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it));
+
+  VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size()
+          << " curr exec size: "
+          << (*key_it->second)[tls().get_curr_exec()].size() << "\n";
 }
 
 void MKLDNNDeviceContext::BlockNextCacheClearing() {
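A side note on the idiom in LinkEntryWithExecutor above: std::unordered_map::insert returns a pair<iterator, bool> and never overwrites an existing key, so .first yields the per-shape ExecMap whether or not this shape was seen before. A minimal standalone illustration (toy key and value types, not Paddle's):

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::unordered_map<std::string, std::shared_ptr<std::vector<int>>> per_shape;

  // insert() does not overwrite an existing key; .first is an iterator to
  // the (new or pre-existing) entry for that key.
  auto it = per_shape.insert({"2x3x224x224",
                              std::make_shared<std::vector<int>>()}).first;
  it->second->push_back(1);

  // Same key again: the map keeps the first vector and hands it back.
  auto it2 = per_shape.insert({"2x3x224x224",
                               std::make_shared<std::vector<int>>()}).first;
  it2->second->push_back(2);

  std::cout << it->second->size() << "\n";  // prints 2
  return 0;
}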
@@ -716,6 +734,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
       VLOG(2) << "sid=" << sid
               << ", remove all blobs of shape: " << sBlob->begin()->first;
       sBlob->erase(sBlob->begin()->first);
+      RemoveShapeEntriesWithExecutor();
     }
     pBlob = std::make_shared<KeyBlob>();
     (*sBlob)[tls().cur_input_shape_str] = pBlob;
@@ -739,7 +758,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
   return;
 }
 
-unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) {
+unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const {
   unsigned int num_entries = 0;
   for (auto const& l3 : *p_blobmap_) {
     for (auto const& l2 : *(l3.second)) {
paddle/fluid/platform/device_context.h
@@ -749,8 +749,14 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   using ShapeBlob = umap_key_string_t<KeyBlob>;
   using BlobMap = umap_value_smart_t<int, ShapeBlob>;
 
-  using ExecMap = std::unordered_map<
-      void*, std::vector<std::pair<BlobPtr_t<KeyBlob>, KeyBlob::iterator>>>;
+  // Auxiliary two-level structure (shape, executor) to easier control
+  // clearing cache objects related to specific executor
+  using ExecKey = void*;
+  using ExecMapCacheIterPair = std::pair<BlobPtr_t<KeyBlob>, KeyBlob::iterator>;
+  using ExecMap =
+      std::unordered_map<ExecKey, std::vector<ExecMapCacheIterPair>>;
+  using ExecShape = std::unordered_map<std::string, std::shared_ptr<ExecMap>>;
 
   explicit MKLDNNDeviceContext(CPUPlace place);
@@ -759,6 +765,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   // Register object to currently used executor's map
   void LinkEntryWithExecutor(BlobPtr_t<KeyBlob>, KeyBlob::iterator) const;
+  void RemoveShapeEntriesWithExecutor(void) const;
 
   // Remove all entries from the blob map
   void ResetBlobMap(void* ptr);
@@ -773,7 +780,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   void SetBlob(const std::string& name, std::shared_ptr<void> data) const;
 
   // Calculate number of oneDNN objects cached
-  unsigned int GetCachedObjectsNumber(void);
+  unsigned int GetCachedObjectsNumber(void) const;
 
   // Find a saved blob. Return nullptr if not found
   std::shared_ptr<void> GetBlob(const std::string& name) const;
@@ -786,7 +793,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   std::shared_ptr<BlobMap> p_blobmap_;
 
   // Map key is pointer of executor and value is a data (iterator in map)
   // needed to erase
-  std::shared_ptr<ExecMap> p_exec_items_;
+  std::shared_ptr<ExecShape> p_exec_items_;
   std::shared_ptr<std::mutex> p_mutex_;
   bool block_next_cache_clearing_ = false;
 };