From 17ac6e258006a9959e4a2a98a96a181de712d917 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com>
Date: Wed, 29 Apr 2020 14:24:50 +0800
Subject: [PATCH] update the analysis predictor for multi-stream support,
 test=develop (#24046)

* update the analysis predictor, test=develop

* update the unit test, test=develop

* no priority set before the interface determined, test=develop

* interface name generalization, test=develop
---
 paddle/fluid/inference/api/analysis_config.cc |   6 +
 .../fluid/inference/api/analysis_predictor.cc | 104 +++++++++++++-----
 paddle/fluid/inference/api/demo_ci/utils.h    |   1 +
 .../inference/api/paddle_analysis_config.h    |  17 +++
 .../tests/api/lite_mul_model_test.cc          |  52 +++++++--
 .../fluid/inference/tests/api/tester_helper.h |  18 +++
 paddle/fluid/platform/init.cc                 |   5 +-
 paddle/fluid/platform/init.h                  |   2 +-
 8 files changed, 166 insertions(+), 39 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 47b45f8e4f4..23d964c798e 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -149,6 +149,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 
   CP_MEMBER(serialized_info_cache_);
 
+  CP_MEMBER(thread_local_stream_);
+
   if (use_gpu_) {
     pass_builder_.reset(new GpuPassStrategy(
         *static_cast<GpuPassStrategy *>(other.pass_builder())));
@@ -389,6 +391,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
 
   ss << use_lite_;
 
+  ss << thread_local_stream_;
+
   return ss.str();
 }
 
@@ -480,4 +484,6 @@ void AnalysisConfig::PartiallyRelease() {
   params_file_.shrink_to_fit();
 }
 
+void AnalysisConfig::EnableGpuMultiStream() { thread_local_stream_ = true; }
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index d21f0292d9b..1bcbe6acf02 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -37,13 +37,15 @@
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#ifdef PADDLE_WITH_MKLML
-#include "paddle/fluid/platform/dynload/mklml.h"
-#endif
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 
+#ifdef PADDLE_WITH_MKLML
+#include "paddle/fluid/platform/dynload/mklml.h"
+#endif
+
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/inference/api/mkldnn_quantizer.h"
 #endif
@@ -212,9 +214,18 @@ bool AnalysisPredictor::PrepareProgram(
   return true;
 }
 bool AnalysisPredictor::CreateExecutor() {
-  if (config_.use_gpu_) {
+  if (config_.use_gpu()) {
     status_use_gpu_ = true;
-    place_ = paddle::platform::CUDAPlace(config_.device_id_);
+    place_ = paddle::platform::CUDAPlace(config_.gpu_device_id());
+#ifdef PADDLE_WITH_CUDA
+    if (config_.thread_local_stream_enabled()) {
+      auto *ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(place_));
+      VLOG(3) << "The prediction process will be completed using a separate "
+                 "normal-priority stream on each thread.";
+      ctx->ResetThreadContext(platform::stream::Priority::kNormal);
+    }
+#endif
   } else {
     place_ = paddle::platform::CPUPlace();
   }
@@ -503,30 +514,69 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   VLOG(3) << "create AnalysisConfig";
   PADDLE_ENFORCE(config.is_valid(),
                  "Note: Each config can only be used for one predictor.");
+
   if (config.use_gpu()) {
-    // 1. GPU memory
-    PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f);
-    PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
-                      config.gpu_device_id());
-    std::vector<std::string> flags;
-
-    float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
-    if (fraction_of_gpu_memory > 0.95f) {
-      LOG(ERROR)
-          << "Allocate too much memory for the GPU memory pool, assigned "
-          << config.memory_pool_init_size_mb() << " MB";
-      LOG(ERROR)
-          << "Try to shink the value by setting AnalysisConfig::EnableGpu(...)";
-    }
+    static std::once_flag gflags_initialized;
+    static bool process_level_allocator_enabled;
+
+    std::call_once(gflags_initialized, [&]() {
+      std::vector<std::string> gflags;
+      PADDLE_ENFORCE_GE(
+          config.memory_pool_init_size_mb(), 0.f,
+          platform::errors::InvalidArgument(
+              "The size of memory pool should be greater than 0."));
+      PADDLE_ENFORCE_GE(
+          config.gpu_device_id(), 0,
+          platform::errors::InvalidArgument(
+              "Invalid device id (%d). The device id should be greater than 0.",
+              config.gpu_device_id()));
+      gflags.push_back("dummy");
+
+      float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
+      if (fraction_of_gpu_memory > 0.95f) {
+        LOG(ERROR)
+            << "Allocate too much memory for the GPU memory pool, assigned "
+            << config.memory_pool_init_size_mb() << " MB";
+        LOG(ERROR) << "Try to shrink the value by setting "
+                      "AnalysisConfig::EnableGpu(...)";
+      }
+
+      if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
+        std::string flag = "--fraction_of_gpu_memory_to_use=" +
+                           std::to_string(fraction_of_gpu_memory);
+        VLOG(3) << "set flag: " << flag;
+        gflags.push_back(flag);
+        gflags.push_back("--cudnn_deterministic=True");
+      }
 
-    if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
-      flags.push_back("dummy");
-      std::string flag = "--fraction_of_gpu_memory_to_use=" +
-                         std::to_string(fraction_of_gpu_memory);
-      flags.push_back(flag);
-      flags.push_back("--cudnn_deterministic=True");
-      VLOG(3) << "set flag: " << flag;
-      framework::InitGflags(flags);
+      if (config.thread_local_stream_enabled()) {
+        gflags.push_back("--allocator_strategy=thread_local");
+        process_level_allocator_enabled = false;
+      } else {
+        gflags.push_back("--allocator_strategy=naive_best_fit");
+        process_level_allocator_enabled = true;
+      }
+
+      if (framework::InitGflags(gflags)) {
+        VLOG(3) << "The following gpu analysis configurations only take effect "
+                   "for the first predictor: ";
+        for (size_t i = 1; i < gflags.size(); ++i) {
+          VLOG(3) << gflags[i];
+        }
+      } else {
+        LOG(WARNING) << "The one-time configuration of analysis predictor "
+                        "failed. This may be because the native predictor was "
+                        "called first and its configuration already took effect.";
+      }
+    });
+
+    if (config.thread_local_stream_enabled() &&
+        process_level_allocator_enabled) {
+      LOG(FATAL) << "When binding threads and streams, the use of "
+                    "process-level allocators leads to undefined results due "
+                    "to asynchronous memory operations. The thread and stream "
+                    "binding configuration of all predictors should be the "
+                    "same in a single process.";
     }
   }
 
diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h
index 1505a898c5b..5ac00fd294f 100644
--- a/paddle/fluid/inference/api/demo_ci/utils.h
+++ b/paddle/fluid/inference/api/demo_ci/utils.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+#include
 #include
 #include
 #include
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 2002d1f76ab..a66f71e2a89 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -396,6 +396,14 @@ struct AnalysisConfig {
   ///
   void EnableMkldnnQuantizer();
 
+  ///
+  /// \brief A boolean state telling whether the thread local CUDA stream is
+  /// enabled.
+  ///
+  /// \return bool Whether the thread local CUDA stream is enabled.
+  ///
+  bool thread_local_stream_enabled() const { return thread_local_stream_; }
+
   ///
   /// \brief A boolean state telling whether the MKLDNN quantization is enabled.
   ///
@@ -486,6 +494,13 @@ struct AnalysisConfig {
   ///
   ///
   PassStrategy* pass_builder() const;
+
+  ///
+  /// \brief Enable the GPU multi-computing stream feature.
+  /// NOTE: The current behavior of this interface is to bind the computation
+  /// stream to the thread, and this behavior may be changed in the future.
+  ///
+  void EnableGpuMultiStream();
   void PartiallyRelease();
 
 protected:
@@ -563,6 +578,8 @@ struct AnalysisConfig {
   std::vector<std::string> lite_ops_filter_;
   Precision lite_precision_mode_;
 
+  bool thread_local_stream_{false};
+
   // mkldnn related.
   int mkldnn_cache_capacity_{0};
   bool use_mkldnn_quantizer_{false};
diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
index a50fbfd43ea..35d20ce34ee 100644
--- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
@@ -16,21 +16,26 @@ limitations under the License. */
 #include
 #include
 #include
+#include <mutex>   // NOLINT
+#include <thread>  // NOLINT
 
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
 namespace inference {
 
-TEST(AnalysisPredictor, use_gpu) {
-  std::string model_dir = FLAGS_infer_model + "/" + "mul_model";
-  AnalysisConfig config;
-  config.EnableUseGpu(100, 0);
-  config.SetModel(model_dir);
-  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
+int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
+  static std::mutex mutex;
+  std::unique_ptr<PaddlePredictor> predictor;
+  {
+    std::unique_lock<std::mutex> lock(mutex);
+    predictor = std::move(CreatePaddlePredictor(config));
+  }
+  if (barrier) {
+    barrier->Wait();
+  }
 
   std::vector<PaddleTensor> inputs;
-  auto predictor = CreatePaddlePredictor(config);
   std::vector<float> input({1});
 
   PaddleTensor in;
@@ -40,19 +45,46 @@
   inputs.emplace_back(in);
 
   std::vector<PaddleTensor> outputs;
-  ASSERT_TRUE(predictor->Run(inputs, &outputs));
-
+  predictor->Run(inputs, &outputs);
   const std::vector<float> truth_values = {
       -0.00621776, -0.00620937, 0.00990623,  -0.0039817, -0.00074315,
       0.61229795,  -0.00491806, -0.00068755, 0.18409646, 0.30090684};
-
   const size_t expected_size = 1;
   EXPECT_EQ(outputs.size(), expected_size);
   float* data_o = static_cast<float*>(outputs[0].data.data());
   for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) {
     EXPECT_LT(std::abs(data_o[j] - truth_values[j]), 10e-6);
   }
+  return 0;
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(AnalysisPredictor, thread_local_stream) {
+  const size_t thread_num = 5;
+  std::vector<std::thread> threads(thread_num);
+  Barrier barrier(thread_num);
+  for (size_t i = 0; i < threads.size(); ++i) {
+    threads[i] = std::thread([&barrier, i]() {
+      AnalysisConfig config;
+      config.EnableUseGpu(100, 0);
+      config.SetModel(FLAGS_infer_model + "/" + "mul_model");
+      config.EnableGpuMultiStream();
+      test_main(config, &barrier);
+    });
+  }
+  for (auto& th : threads) {
+    th.join();
+  }
+}
+
+TEST(AnalysisPredictor, lite_engine) {
+  AnalysisConfig config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(FLAGS_infer_model + "/" + "mul_model");
+  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
+  test_main(config);
 }
+#endif
 
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index c8e5c826a13..4e6ffafeff8 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -105,6 +105,24 @@ void CheckError(float data_ref, float data) {
   }
 }
 
+class Barrier {
+ public:
+  explicit Barrier(std::size_t count) : _count(count) {}
+  void Wait() {
+    std::unique_lock<std::mutex> lock(_mutex);
+    if (--_count) {
+      _cv.wait(lock, [this] { return _count == 0; });
+    } else {
+      _cv.notify_all();
+    }
+  }
+
+ private:
+  std::mutex _mutex;
+  std::condition_variable _cv;
+  std::size_t _count;
+};
+
 // Compare result between two PaddleTensor
 void CompareResult(const std::vector<PaddleTensor> &outputs,
                    const std::vector<PaddleTensor> &ref_outputs) {
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index c6c84f8b9f0..4113379f5b4 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -50,7 +50,8 @@ std::once_flag glog_init_flag;
 std::once_flag p2p_init_flag;
 std::once_flag glog_warning_once_flag;
 
-void InitGflags(std::vector<std::string> argv) {
+bool InitGflags(std::vector<std::string> argv) {
+  bool succeeded = false;
   std::call_once(gflags_init_flag, [&]() {
     FLAGS_logtostderr = true;
     // NOTE(zhiqiu): dummy is needed, since the function
@@ -71,7 +72,9 @@ void InitGflags(std::vector<std::string> argv) {
             << ", Init commandline: " << line;
     google::ParseCommandLineFlags(&argc, &arr, true);
     VLOG(1) << "After Parse: argc is " << argc;
+    succeeded = true;
   });
+  return succeeded;
 }
 
 void InitP2P(std::vector<int> devices) {
diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h
index d189f0022bf..09aef2743e8 100644
--- a/paddle/fluid/platform/init.h
+++ b/paddle/fluid/platform/init.h
@@ -22,7 +22,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-void InitGflags(std::vector<std::string> argv);
+bool InitGflags(std::vector<std::string> argv);
 
 void InitGLOG(const std::string &prog_name);
 
-- 
GitLab
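
Editor's note: the sketch below is a minimal, hypothetical usage example of the EnableGpuMultiStream() interface added by this patch, modeled on the multi-threaded test above. It is not part of the patch itself; the model directory, thread count, input shape, and the in-tree header path are placeholders.

// Hypothetical sketch: each thread builds its own AnalysisConfig, enables the
// thread-local CUDA stream, and creates and runs its own predictor.
#include <string>
#include <thread>
#include <vector>

#include "paddle/fluid/inference/api/paddle_inference_api.h"  // assumed header path

void RunOneThread(const std::string& model_dir) {
  paddle::AnalysisConfig config;
  config.EnableUseGpu(100 /* memory_pool_init_size_mb */, 0 /* device_id */);
  config.SetModel(model_dir);
  // Bind this predictor's computation to a thread-local CUDA stream so that
  // several threads can run inference concurrently without sharing a stream.
  config.EnableGpuMultiStream();

  auto predictor = paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(config);

  // Placeholder input: a single float fed into a 1x1 tensor.
  std::vector<float> input_data{1.f};
  paddle::PaddleTensor in;
  in.shape = {1, 1};
  in.data = paddle::PaddleBuf(input_data.data(), input_data.size() * sizeof(float));
  in.dtype = paddle::PaddleDType::FLOAT32;

  std::vector<paddle::PaddleTensor> inputs{in}, outputs;
  predictor->Run(inputs, &outputs);
}

int main() {
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back(RunOneThread, "./mul_model");  // placeholder path
  }
  for (auto& t : workers) t.join();
  return 0;
}

Note that, per the predictor changes above, the binding choice also selects the allocator strategy once per process, so every predictor in a single process should use the same EnableGpuMultiStream() setting; mixing bound and unbound predictors is rejected with a fatal error.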