Unverified commit 076f8331, authored by Tao Luo, committed by GitHub

add config.SetMkldnnCacheCapacity api for mkldnn cache clear strategy (#18580)

* add config.SetMkldnnCacheCapacity api for mkldnn cache clear strategy

test=develop

* enhance MkldnnPostReset

test=develop

* add comments for mkldnn_cache_capacity field

test=develop
Parent: a20b2b43
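The new setting plugs into the existing `AnalysisConfig` flow. A minimal usage sketch, assuming an in-tree include path and a placeholder model directory (the capacity value 10 is likewise just an example, not part of this commit):

```cpp
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./model_dir");  // placeholder path, not part of this commit
  config.EnableMKLDNN();
  // Keep cached MKLDNN primitives for at most 10 input shapes; the default
  // of 0 leaves the cache-clearing strategy disabled.
  config.SetMkldnnCacheCapacity(10);

  auto predictor = paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(config);
  // Fill inputs and call predictor->Run(...) as usual.
  return 0;
}
```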
@@ -148,6 +148,8 @@ struct Argument {
   // Pass a set of op types to enable its mkldnn kernel
   DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
                       std::unordered_set<std::string>);
+  // The cache capacity of different input shapes for mkldnn.
+  DECL_ARGUMENT_FIELD(mkldnn_cache_capacity, MkldnnCacheCapacity, int);
 #ifdef PADDLE_WITH_MKLDNN
   // A set of op types to enable their quantized kernels
...
@@ -115,6 +115,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   // MKLDNN related.
   CP_MEMBER(use_mkldnn_);
   CP_MEMBER(mkldnn_enabled_op_types_);
+  CP_MEMBER(mkldnn_cache_capacity_);
   // Quantization related.
   CP_MEMBER(use_mkldnn_quantizer_);
   CP_MEMBER(mkldnn_quantizer_config_);
@@ -162,6 +163,15 @@ void AnalysisConfig::EnableMKLDNN() {
   Update();
 }
+void AnalysisConfig::SetMkldnnCacheCapacity(int capacity) {
+#ifdef PADDLE_WITH_MKLDNN
+  mkldnn_cache_capacity_ = capacity;
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to set the MKLDNN cache capacity";
+  mkldnn_cache_capacity_ = 0;
+#endif
+}
 void AnalysisConfig::EnableMkldnnQuantizer() {
 #ifdef PADDLE_WITH_MKLDNN
   if (!mkldnn_quantizer_config_)
@@ -343,6 +353,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << use_ngraph_;
   ss << use_mkldnn_;
+  ss << mkldnn_cache_capacity_;
   for (auto &item : mkldnn_enabled_op_types_) ss << item;
   ss << ";";
...
@@ -185,10 +185,49 @@ bool AnalysisPredictor::PrepareExecutor() {
   return true;
 }
+void AnalysisPredictor::MkldnnPreSet(const std::vector<PaddleTensor> &inputs) {
+#ifdef PADDLE_WITH_MKLDNN
+  VLOG(2) << "AnalysisPredictor::Run get_cur_mkldnn_session_id="
+          << platform::get_cur_mkldnn_session_id();
+  // In cache clearing mode.
+  if (config_.mkldnn_cache_capacity_ > 0) {
+    VLOG(2) << "In mkldnn cache clear mode.";
+    platform::set_cur_mkldnn_session_id(
+        platform::kMKLDNNSessionID_CacheClearing);
+    platform::set_cur_input_shape_cache_capacity(
+        config_.mkldnn_cache_capacity_);
+    // Set current_input_shape for caching dynamic shape.
+    std::stringstream ss;
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      for (size_t j = 0; j < inputs[i].shape.size(); ++j) {
+        ss << inputs[i].shape[j] << "-";
+      }
+    }
+    VLOG(2) << "Set input shape=" << ss.str();
+    platform::set_cur_input_shape_str(ss.str());
+  }
+#endif
+}
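The cache key built by `MkldnnPreSet` is simply every dimension of every input, concatenated with `-` separators, so two batches with different shapes map to different cache entries. A standalone sketch of the same key construction (the helper name and example shape are ours, not part of the commit):

```cpp
#include <sstream>
#include <string>
#include <vector>

// Mirrors the key built in MkldnnPreSet: all dims of all inputs,
// each followed by '-'.
std::string ShapeKey(const std::vector<std::vector<int>> &shapes) {
  std::stringstream ss;
  for (const auto &shape : shapes) {
    for (int d : shape) ss << d << "-";
  }
  return ss.str();
}

// ShapeKey({{1, 3, 224, 224}}) == "1-3-224-224-"
```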
+void AnalysisPredictor::MkldnnPostReset() {
+#ifdef PADDLE_WITH_MKLDNN
+  // In cache clearing mode.
+  if (config_.mkldnn_cache_capacity_ > 0) {
+    paddle::platform::set_cur_mkldnn_session_id(
+        platform::kMKLDNNSessionID_Default);
+    platform::set_cur_input_shape_cache_capacity(0);
+    platform::set_cur_input_shape_str("");
+  }
+#endif
+}
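Note that `Run()` must call `MkldnnPostReset()` on every exit path for the thread-local session state to be restored. A scope guard is one way to make that pairing automatic; a hedged sketch of that alternative (not what this commit does, and it assumes access to the two private helpers, e.g. via a nested class or friendship):

```cpp
// Hypothetical RAII guard pairing MkldnnPreSet with MkldnnPostReset, so
// the session id, capacity, and shape string are reset even if Run()
// returns early.
class MkldnnRunGuard {
 public:
  MkldnnRunGuard(AnalysisPredictor *predictor,
                 const std::vector<PaddleTensor> &inputs)
      : predictor_(predictor) {
    predictor_->MkldnnPreSet(inputs);
  }
  ~MkldnnRunGuard() { predictor_->MkldnnPostReset(); }

 private:
  AnalysisPredictor *predictor_;
};
```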
 bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                             std::vector<PaddleTensor> *output_data,
                             int batch_size) {
   paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
+#ifdef PADDLE_WITH_MKLDNN
+  if (config_.use_mkldnn_) MkldnnPreSet(inputs);
+#endif
   VLOG(3) << "Predictor::predict";
   inference::Timer timer;
   timer.tic();
@@ -230,7 +269,9 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
   // recover the cpu_math_library_num_threads to 1, in order to avoid thread
   // conflict when integrating it into deployment service.
   paddle::platform::SetNumThreads(1);
+#ifdef PADDLE_WITH_MKLDNN
+  if (config_.use_mkldnn_) MkldnnPostReset();
+#endif
   return true;
 }
@@ -595,7 +636,6 @@ bool AnalysisPredictor::ZeroCopyRun() {
   // recover the cpu_math_library_num_threads to 1, in order to avoid thread
   // conflict when integrating it into deployment service.
   paddle::platform::SetNumThreads(1);
   return true;
 }
...
@@ -109,6 +109,11 @@ class AnalysisPredictor : public PaddlePredictor {
   template <typename T>
   void GetFetchOne(const framework::LoDTensor &fetchs,
                    PaddleTensor *output_data);
+  // PreSet and PostReset for Mkldnn multi-thread and dynamic shape input.
+  // Used in AnalysisPredictor::Run(); AnalysisPredictor::ZeroCopyRun() is
+  // not supported yet.
+  void MkldnnPreSet(const std::vector<PaddleTensor> &inputs);
+  void MkldnnPostReset();
 #if PADDLE_WITH_TENSORRT
   // When we use Paddle-TRT INT8 engine, we need to generate calibration table
...
@@ -184,6 +184,10 @@ struct AnalysisConfig {
   /** Turn on MKLDNN.
    */
   void EnableMKLDNN();
+  /** Set the cache capacity of different input shapes for MKLDNN.
+   *  Default 0 means not caching any shape.
+   */
+  void SetMkldnnCacheCapacity(int capacity);
   /** A boolean state telling whether to use the MKLDNN.
    */
   bool mkldnn_enabled() const { return use_mkldnn_; }
@@ -316,8 +320,11 @@ struct AnalysisConfig {
   std::vector<std::string> anakin_passes_filter_;
   std::vector<std::string> anakin_ops_filter_;
+  // mkldnn related.
+  int mkldnn_cache_capacity_{0};
   bool use_mkldnn_quantizer_{false};
   std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_;
   // If the config is already used on a predictor, it becomes invalid.
   // Any config can only be used with one predictor.
   // Variables held by config can take up a lot of memory in some cases.
...
@@ -173,12 +173,47 @@ TEST(Analyzer_MM_DNN, compare_determine) {
 }
 #ifdef PADDLE_WITH_MKLDNN
-void TestMkldnnCacheClear(int mkldnn_input_shape_cache_capacity) {
+void TestMkldnnCacheClear(int mkldnn_input_shape_cache_capacity,
+                          std::vector<std::vector<PaddleTensor>> *outputs) {
   AnalysisConfig config;
   SetConfig(&config);
   config.EnableMKLDNN();
-  // TODO(luotao): explicit following settings will be deprecated after enhance
-  // config.EnableMKLDNN() interface.
+  config.SetMkldnnCacheCapacity(mkldnn_input_shape_cache_capacity);
+  std::vector<PaddleTensor> input;
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  int sample_num = 10;
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  outputs->resize(sample_num);
+  for (int i = 0; i < sample_num; i++) {
+    PrepareInputs(&input, &data, FLAGS_batch_size);
+    predictor->Run(input, &(*outputs)[i], 1);
+  }
+}
+
+TEST(Analyzer_MM_DNN, mkldnn_cache_clear) {
+  std::vector<std::vector<PaddleTensor>> outputs, cache_outputs;
+  // 0 means do not use cache clear strategy.
+  TestMkldnnCacheClear(0, &outputs);
+  // 4 means use cache clear strategy, and the
+  // mkldnn_input_shape_cache_capacity is 4.
+  TestMkldnnCacheClear(4, &cache_outputs);
+  // compare the result.
+  for (size_t i = 0; i < outputs.size(); i++) {
+    CompareResult(outputs[i], cache_outputs[i]);
+  }
+}
+void TestMkldnnShapeBlobSize(int mkldnn_input_shape_cache_capacity) {
+  AnalysisConfig config;
+  SetConfig(&config);
+  config.EnableMKLDNN();
+  config.SwitchUseFeedFetchOps(false);
+  // Since AnalysisPredictor::Run() resets cur_mkldnn_session_id to the
+  // default before it finishes, we use AnalysisPredictor::ZeroCopyRun()
+  // here to check the mkldnn_shape_blob_size.
   if (mkldnn_input_shape_cache_capacity > 0) {
     platform::set_cur_mkldnn_session_id(
         platform::kMKLDNNSessionID_CacheClearing);
@@ -186,7 +221,7 @@ void TestMkldnnCacheClear(int mkldnn_input_shape_cache_capacity) {
         mkldnn_input_shape_cache_capacity);
   }
-  std::vector<PaddleTensor> input, output;
+  std::vector<PaddleTensor> input;
   auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
   int sample_num = 10;
@@ -195,8 +230,12 @@ void TestMkldnnCacheClear(int mkldnn_input_shape_cache_capacity) {
   auto &pool = platform::DeviceContextPool::Instance();
   auto *dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext *>(
       pool.Get(platform::CPUPlace()));
+  // clear before test
+  dev_ctx->ResetBlobMap();
   for (int i = 0; i < sample_num; i++) {
     PrepareInputs(&input, &data, FLAGS_batch_size);
+    ConvertPaddleTensorToZeroCopyTensor(predictor.get(), input);
     if (mkldnn_input_shape_cache_capacity > 0) {
       std::stringstream ss;
       for (size_t i = 0; i < input.size(); i++) {
@@ -204,11 +243,9 @@ void TestMkldnnCacheClear(int mkldnn_input_shape_cache_capacity) {
           ss << input[i].shape[j] << "-";
         }
       }
-      // TODO(luotao): explicit following settings will be deprecated after
-      // enhance config.EnableMKLDNN() interface.
       platform::set_cur_input_shape_str(ss.str());
     }
-    predictor->Run(input, &output, 1);
+    predictor->ZeroCopyRun();
   }
   if (mkldnn_input_shape_cache_capacity > 0) {
     PADDLE_ENFORCE_EQ(dev_ctx->GetShapeBlobSize(),
@@ -216,15 +253,14 @@ void TestMkldnnCacheClear(int mkldnn_input_shape_cache_capacity) {
   } else {
     PADDLE_ENFORCE_EQ(dev_ctx->GetShapeBlobSize(), 1UL);
   }
-  dev_ctx->ResetBlobMap();
 }
-TEST(Analyzer_MM_DNN, mkldnn_cache_clear) {
+TEST(Analyzer_MM_DNN, mkldnn_shape_blob_size) {
   // 0 means do not use cache clear strategy.
-  TestMkldnnCacheClear(0);
+  TestMkldnnShapeBlobSize(0);
   // 4 means use cache clear strategy, and the
   // mkldnn_input_shape_cache_capacity is 4.
-  TestMkldnnCacheClear(4);
+  TestMkldnnShapeBlobSize(4);
 }
 #endif
...
@@ -462,7 +462,8 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
   if (key_it == sBlob->end()) {
     // In cache clearing mode, cur_input_shape_cache_capacity defines
     // max pblob capacity
-    if ((sid == kMKLDNNSessionID_CacheClearing) &&
+    if ((static_cast<size_t>(sid) == kMKLDNNSessionID_CacheClearing) &&
+        sBlob->size() &&
         (sBlob->size() >=
          static_cast<size_t>(cur_input_shape_cache_capacity))) {
       VLOG(2) << "sid=" << sid
...
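Taken together, the added `sBlob->size() &&` guard keeps the capacity check from firing on an empty map, and the `static_cast<size_t>` silences a signed/unsigned comparison. A self-contained sketch of the bounded-cache insert this hunk implements (container types and the choice of evicting the map's first entry are illustrative; the actual eviction choice is outside the visible diff):

```cpp
#include <map>
#include <memory>
#include <string>

// Illustrative stand-in for the per-session shape-blob cache: one entry
// per input-shape string, each holding that shape's cached primitives.
using ShapeBlobMap = std::map<std::string, std::shared_ptr<int>>;

void SetBlobBounded(ShapeBlobMap *shape_blobs, const std::string &shape_key,
                    std::shared_ptr<int> blob, size_t capacity) {
  if (shape_blobs->find(shape_key) == shape_blobs->end()) {
    // A new shape arrives while the cache is at capacity: evict one cached
    // shape first. The !empty() check mirrors the added `sBlob->size() &&`
    // guard, which prevents evicting from an empty map when capacity is 0.
    if (!shape_blobs->empty() && shape_blobs->size() >= capacity) {
      shape_blobs->erase(shape_blobs->begin());  // simplest possible policy
    }
  }
  (*shape_blobs)[shape_key] = std::move(blob);
}
```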