Unverified commit 17ac6e25, authored by 石晓伟, committed by GitHub

update the analysis predictor for multi-stream support, test=develop (#24046)

* update the analysis predictor, test=develop

* update the unit test, test=develop

* no priority set before the inferface determined, test=develop

* interface name generalization, test=develop
Parent 69f2f285
@@ -149,6 +149,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(serialized_info_cache_);
 
+  CP_MEMBER(thread_local_stream_);
+
   if (use_gpu_) {
     pass_builder_.reset(new GpuPassStrategy(
         *static_cast<GpuPassStrategy *>(other.pass_builder())));
@@ -389,6 +391,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << use_lite_;
 
+  ss << thread_local_stream_;
+
   return ss.str();
 }
@@ -480,4 +484,6 @@ void AnalysisConfig::PartiallyRelease() {
   params_file_.shrink_to_fit();
 }
 
+void AnalysisConfig::EnableGpuMultiStream() { thread_local_stream_ = true; }
+
 }  // namespace paddle
@@ -37,13 +37,15 @@
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#ifdef PADDLE_WITH_MKLML
-#include "paddle/fluid/platform/dynload/mklml.h"
-#endif
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
+#ifdef PADDLE_WITH_MKLML
+#include "paddle/fluid/platform/dynload/mklml.h"
+#endif
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/inference/api/mkldnn_quantizer.h"
 #endif
@@ -212,9 +214,18 @@ bool AnalysisPredictor::PrepareProgram(
   return true;
 }
 
 bool AnalysisPredictor::CreateExecutor() {
-  if (config_.use_gpu_) {
+  if (config_.use_gpu()) {
     status_use_gpu_ = true;
-    place_ = paddle::platform::CUDAPlace(config_.device_id_);
+    place_ = paddle::platform::CUDAPlace(config_.gpu_device_id());
+#ifdef PADDLE_WITH_CUDA
+    if (config_.thread_local_stream_enabled()) {
+      auto *ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(place_));
+      VLOG(3) << "The prediction process will be completed using a separate "
+                 "normal-priority stream on each thread.";
+      ctx->ResetThreadContext(platform::stream::Priority::kNormal);
+    }
+#endif
   } else {
     place_ = paddle::platform::CPUPlace();
   }
@@ -503,30 +514,69 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   VLOG(3) << "create AnalysisConfig";
   PADDLE_ENFORCE(config.is_valid(),
                  "Note: Each config can only be used for one predictor.");
   if (config.use_gpu()) {
-    // 1. GPU memory
-    PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f);
-    PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
-                      config.gpu_device_id());
-    std::vector<std::string> flags;
-
-    float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
-    if (fraction_of_gpu_memory > 0.95f) {
-      LOG(ERROR)
-          << "Allocate too much memory for the GPU memory pool, assigned "
-          << config.memory_pool_init_size_mb() << " MB";
-      LOG(ERROR)
-          << "Try to shink the value by setting AnalysisConfig::EnableGpu(...)";
-    }
-
-    if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
-      flags.push_back("dummy");
-      std::string flag = "--fraction_of_gpu_memory_to_use=" +
-                         std::to_string(fraction_of_gpu_memory);
-      flags.push_back(flag);
-      flags.push_back("--cudnn_deterministic=True");
-      VLOG(3) << "set flag: " << flag;
-      framework::InitGflags(flags);
-    }
+    static std::once_flag gflags_initialized;
+    static bool process_level_allocator_enabled;
+
+    std::call_once(gflags_initialized, [&]() {
+      std::vector<std::string> gflags;
+      PADDLE_ENFORCE_GE(
+          config.memory_pool_init_size_mb(), 0.f,
+          platform::errors::InvalidArgument(
+              "The size of memory pool should be greater than 0."));
+      PADDLE_ENFORCE_GE(
+          config.gpu_device_id(), 0,
+          platform::errors::InvalidArgument(
+              "Invalid device id (%d). The device id should be greater than 0.",
+              config.gpu_device_id()));
+      gflags.push_back("dummy");
+
+      float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
+      if (fraction_of_gpu_memory > 0.95f) {
+        LOG(ERROR)
+            << "Allocate too much memory for the GPU memory pool, assigned "
+            << config.memory_pool_init_size_mb() << " MB";
+        LOG(ERROR) << "Try to shink the value by setting "
+                      "AnalysisConfig::EnableGpu(...)";
+      }
+
+      if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
+        std::string flag = "--fraction_of_gpu_memory_to_use=" +
+                           std::to_string(fraction_of_gpu_memory);
+        VLOG(3) << "set flag: " << flag;
+        gflags.push_back(flag);
+        gflags.push_back("--cudnn_deterministic=True");
+      }
+
+      if (config.thread_local_stream_enabled()) {
+        gflags.push_back("--allocator_strategy=thread_local");
+        process_level_allocator_enabled = false;
+      } else {
+        gflags.push_back("--allocator_strategy=naive_best_fit");
+        process_level_allocator_enabled = true;
+      }
+
+      if (framework::InitGflags(gflags)) {
+        VLOG(3) << "The following gpu analysis configurations only take effect "
+                   "for the first predictor: ";
+        for (size_t i = 1; i < gflags.size(); ++i) {
+          VLOG(3) << gflags[i];
+        }
+      } else {
+        LOG(WARNING) << "The one-time configuration of analysis predictor "
+                        "failed, which may be due to native predictor called "
+                        "first and its configurations taken effect.";
+      }
+    });
+
+    if (config.thread_local_stream_enabled() &&
+        process_level_allocator_enabled) {
+      LOG(FATAL) << " When binding threads and streams, the use of "
+                    "process-level allocators will result in undefined result "
+                    "errors due to memory asynchronous operations."
+                    "The thread and stream binding configuration of all "
+                    "predictors should be the same in a single process.";
+    }
   }
......
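Because the GPU gflags above (allocator strategy included) are parsed only once, by the first predictor created in the process, every AnalysisConfig in that process has to make the same choice about EnableGpuMultiStream(); in particular, if the first predictor keeps the process-level allocator, a later predictor that enables multi-stream hits the LOG(FATAL) consistency check. A minimal sketch of the intended, consistent configuration (header name and model path are assumptions):

#include <string>
#include "paddle_inference_api.h"  // assumed public header exposing AnalysisConfig

// Every predictor in this process opts into thread-bound streams, so the
// thread_local allocator strategy fixed by the first predictor stays valid
// for all of them.
paddle::AnalysisConfig MakeMultiStreamConfig(const std::string& model_dir) {
  paddle::AnalysisConfig config;
  config.EnableUseGpu(100, 0);    // 100 MB initial pool on GPU 0
  config.SetModel(model_dir);     // model_dir is a placeholder path
  config.EnableGpuMultiStream();  // must match every other predictor's setting
  return config;
}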
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+#include <math.h>
 #include <algorithm>
 #include <fstream>
 #include <iostream>
......
@@ -396,6 +396,14 @@ struct AnalysisConfig {
   ///
   void EnableMkldnnQuantizer();
 
+  ///
+  /// \brief A boolean state telling whether the thread local CUDA stream is
+  /// enabled.
+  ///
+  /// \return bool Whether the thread local CUDA stream is enabled.
+  ///
+  bool thread_local_stream_enabled() const { return thread_local_stream_; }
+
   ///
   /// \brief A boolean state telling whether the MKLDNN quantization is enabled.
   ///
@@ -486,6 +494,13 @@ struct AnalysisConfig {
   ///
   ///
   PassStrategy* pass_builder() const;
 
+  ///
+  /// \brief Enable the GPU multi-computing stream feature.
+  /// NOTE: The current behavior of this interface is to bind the computation
+  /// stream to the thread, and this behavior may be changed in the future.
+  ///
+  void EnableGpuMultiStream();
+
   void PartiallyRelease();
 
  protected:
@@ -563,6 +578,8 @@ struct AnalysisConfig {
   std::vector<std::string> lite_ops_filter_;
   Precision lite_precision_mode_;
 
+  bool thread_local_stream_{false};
+
   // mkldnn related.
   int mkldnn_cache_capacity_{0};
   bool use_mkldnn_quantizer_{false};
......
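A short sketch of how the new pair of members is meant to be used from application code; the header name is an assumption, and the threaded usage pattern itself is shown in the test added below:

#include <cassert>
#include "paddle_inference_api.h"  // assumed public header exposing AnalysisConfig

void ConfigureMultiStream(paddle::AnalysisConfig* config) {
  config->EnableUseGpu(100, 0);    // 100 MB initial pool on GPU 0
  config->EnableGpuMultiStream();  // bind the computation stream to the calling thread
  assert(config->thread_local_stream_enabled());  // the getter mirrors the switch
}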
@@ -16,21 +16,26 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <cmath>
+#include <mutex>   // NOLINT
+#include <thread>  // NOLINT
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
 namespace inference {
 
-TEST(AnalysisPredictor, use_gpu) {
-  std::string model_dir = FLAGS_infer_model + "/" + "mul_model";
-  AnalysisConfig config;
-  config.EnableUseGpu(100, 0);
-  config.SetModel(model_dir);
-  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
+int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
+  static std::mutex mutex;
+  std::unique_ptr<PaddlePredictor> predictor;
+  {
+    std::unique_lock<std::mutex> lock(mutex);
+    predictor = std::move(CreatePaddlePredictor(config));
+  }
+  if (barrier) {
+    barrier->Wait();
+  }
 
   std::vector<PaddleTensor> inputs;
-  auto predictor = CreatePaddlePredictor(config);
   std::vector<float> input({1});
   PaddleTensor in;
@@ -40,19 +45,46 @@ TEST(AnalysisPredictor, use_gpu) {
   inputs.emplace_back(in);
 
   std::vector<PaddleTensor> outputs;
-  ASSERT_TRUE(predictor->Run(inputs, &outputs));
+  predictor->Run(inputs, &outputs);
 
   const std::vector<float> truth_values = {
       -0.00621776, -0.00620937, 0.00990623,  -0.0039817, -0.00074315,
       0.61229795,  -0.00491806, -0.00068755, 0.18409646, 0.30090684};
 
   const size_t expected_size = 1;
   EXPECT_EQ(outputs.size(), expected_size);
   float* data_o = static_cast<float*>(outputs[0].data.data());
   for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) {
     EXPECT_LT(std::abs(data_o[j] - truth_values[j]), 10e-6);
   }
+  return 0;
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(AnalysisPredictor, thread_local_stream) {
+  const size_t thread_num = 5;
+  std::vector<std::thread> threads(thread_num);
+  Barrier barrier(thread_num);
+  for (size_t i = 0; i < threads.size(); ++i) {
+    threads[i] = std::thread([&barrier, i]() {
+      AnalysisConfig config;
+      config.EnableUseGpu(100, 0);
+      config.SetModel(FLAGS_infer_model + "/" + "mul_model");
+      config.EnableGpuMultiStream();
+      test_main(config, &barrier);
+    });
+  }
+  for (auto& th : threads) {
+    th.join();
+  }
+}
+
+TEST(AnalysisPredictor, lite_engine) {
+  AnalysisConfig config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(FLAGS_infer_model + "/" + "mul_model");
+  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
+  test_main(config);
 }
+#endif
 
 }  // namespace inference
 }  // namespace paddle
@@ -105,6 +105,24 @@ void CheckError(float data_ref, float data) {
   }
 }
 
+class Barrier {
+ public:
+  explicit Barrier(std::size_t count) : _count(count) {}
+  void Wait() {
+    std::unique_lock<std::mutex> lock(_mutex);
+    if (--_count) {
+      _cv.wait(lock, [this] { return _count == 0; });
+    } else {
+      _cv.notify_all();
+    }
+  }
+
+ private:
+  std::mutex _mutex;
+  std::condition_variable _cv;
+  std::size_t _count;
+};
+
 // Compare result between two PaddleTensor
 void CompareResult(const std::vector<PaddleTensor> &outputs,
                    const std::vector<PaddleTensor> &ref_outputs) {
......
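The Barrier added above is a minimal single-use count-down latch: each thread that calls Wait() decrements the count under the lock, the last arrival notifies everyone, and the rest block until the count reaches zero. A usage sketch (thread count chosen arbitrarily):

#include <cstddef>
#include <thread>
#include <vector>

void RunWorkersInLockstep() {
  constexpr std::size_t kThreads = 4;  // arbitrary
  Barrier barrier(kThreads);
  std::vector<std::thread> workers;
  for (std::size_t i = 0; i < kThreads; ++i) {
    workers.emplace_back([&barrier] {
      // ... per-thread setup, e.g. creating a predictor ...
      barrier.Wait();  // nobody proceeds until all threads have arrived
      // ... work that relies on every thread having finished its setup ...
    });
  }
  for (auto& worker : workers) worker.join();
}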
@@ -50,7 +50,8 @@ std::once_flag glog_init_flag;
 std::once_flag p2p_init_flag;
 std::once_flag glog_warning_once_flag;
 
-void InitGflags(std::vector<std::string> argv) {
+bool InitGflags(std::vector<std::string> argv) {
+  bool successed = false;
   std::call_once(gflags_init_flag, [&]() {
     FLAGS_logtostderr = true;
     // NOTE(zhiqiu): dummy is needed, since the function
@@ -71,7 +72,9 @@ void InitGflags(std::vector<std::string> argv) {
             << ", Init commandline: " << line;
     google::ParseCommandLineFlags(&argc, &arr, true);
     VLOG(1) << "After Parse: argc is " << argc;
+    successed = true;
   });
+  return successed;
 }
 
 void InitP2P(std::vector<int> devices) {
......
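With this change, InitGflags reports whether the call actually performed the one-time flag parsing; later calls find gflags_init_flag already set and return false, which is what the analysis predictor above uses to warn that its GPU configuration did not take effect. A sketch of the contract (the include path is an assumption):

#include <string>
#include <vector>
#include "paddle/fluid/platform/init.h"  // assumed header declaring InitGflags

void InitGflagsTwice() {
  std::vector<std::string> gflags = {"dummy", "--allocator_strategy=thread_local"};
  bool first = paddle::framework::InitGflags(gflags);   // true: this call parsed the flags
  bool second = paddle::framework::InitGflags(gflags);  // false: the call_once body already ran
  (void)first;
  (void)second;
}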
@@ -22,7 +22,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-void InitGflags(std::vector<std::string> argv);
+bool InitGflags(std::vector<std::string> argv);
 
 void InitGLOG(const std::string &prog_name);
......