From 17ac6e258006a9959e4a2a98a96a181de712d917 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com>
Date: Wed, 29 Apr 2020 14:24:50 +0800
Subject: [PATCH] update the analysis predictor for multi-stream support,
 test=develop (#24046)

* update the analysis predictor, test=develop

* update the unit test, test=develop

* no priority set before the interface determined, test=develop

* interface name generalization, test=develop
---
 paddle/fluid/inference/api/analysis_config.cc |   6 +
 .../fluid/inference/api/analysis_predictor.cc | 104 +++++++++++++-----
 paddle/fluid/inference/api/demo_ci/utils.h    |   1 +
 .../inference/api/paddle_analysis_config.h    |  17 +++
 .../tests/api/lite_mul_model_test.cc          |  52 +++++++--
 .../fluid/inference/tests/api/tester_helper.h |  18 +++
 paddle/fluid/platform/init.cc                 |   5 +-
 paddle/fluid/platform/init.h                  |   2 +-
 8 files changed, 166 insertions(+), 39 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 47b45f8e4f4..23d964c798e 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -149,6 +149,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 
   CP_MEMBER(serialized_info_cache_);
 
+  CP_MEMBER(thread_local_stream_);
+
   if (use_gpu_) {
     pass_builder_.reset(new GpuPassStrategy(
         *static_cast<GpuPassStrategy *>(other.pass_builder())));
@@ -389,6 +391,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
 
   ss << use_lite_;
 
+  ss << thread_local_stream_;
+
   return ss.str();
 }
 
@@ -480,4 +484,6 @@ void AnalysisConfig::PartiallyRelease() {
   params_file_.shrink_to_fit();
 }
 
+void AnalysisConfig::EnableGpuMultiStream() { thread_local_stream_ = true; }
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index d21f0292d9b..1bcbe6acf02 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -37,13 +37,15 @@
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#ifdef PADDLE_WITH_MKLML
-#include "paddle/fluid/platform/dynload/mklml.h"
-#endif
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 
+#ifdef PADDLE_WITH_MKLML
+#include "paddle/fluid/platform/dynload/mklml.h"
+#endif
+
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/inference/api/mkldnn_quantizer.h"
 #endif
@@ -212,9 +214,18 @@ bool AnalysisPredictor::PrepareProgram(
   return true;
 }
 bool AnalysisPredictor::CreateExecutor() {
-  if (config_.use_gpu_) {
+  if (config_.use_gpu()) {
     status_use_gpu_ = true;
-    place_ = paddle::platform::CUDAPlace(config_.device_id_);
+    place_ = paddle::platform::CUDAPlace(config_.gpu_device_id());
+#ifdef PADDLE_WITH_CUDA
+    if (config_.thread_local_stream_enabled()) {
+      auto *ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(place_));
+      VLOG(3) << "The prediction process will be completed using a separate "
+                 "normal-priority stream on each thread.";
+      ctx->ResetThreadContext(platform::stream::Priority::kNormal);
+    }
+#endif
   } else {
     place_ = paddle::platform::CPUPlace();
   }
@@ -503,30 +514,69 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   VLOG(3) << "create AnalysisConfig";
   PADDLE_ENFORCE(config.is_valid(),
                  "Note: Each config can only be used for one predictor.");
+
   if (config.use_gpu()) {
-    // 1. GPU memory
-    PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f);
-    PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
-                      config.gpu_device_id());
-    std::vector<std::string> flags;
-
-    float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
-    if (fraction_of_gpu_memory > 0.95f) {
-      LOG(ERROR)
-          << "Allocate too much memory for the GPU memory pool, assigned "
-          << config.memory_pool_init_size_mb() << " MB";
-      LOG(ERROR)
-          << "Try to shink the value by setting AnalysisConfig::EnableGpu(...)";
-    }
+    static std::once_flag gflags_initialized;
+    static bool process_level_allocator_enabled;
+
+    std::call_once(gflags_initialized, [&]() {
+      std::vector<std::string> gflags;
+      PADDLE_ENFORCE_GE(
+          config.memory_pool_init_size_mb(), 0.f,
+          platform::errors::InvalidArgument(
+              "The size of memory pool should be greater than 0."));
+      PADDLE_ENFORCE_GE(
+          config.gpu_device_id(), 0,
+          platform::errors::InvalidArgument(
+              "Invalid device id (%d). The device id should be greater than 0.",
+              config.gpu_device_id()));
+      gflags.push_back("dummy");
+
+      float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
+      if (fraction_of_gpu_memory > 0.95f) {
+        LOG(ERROR)
+            << "Allocate too much memory for the GPU memory pool, assigned "
+            << config.memory_pool_init_size_mb() << " MB";
+        LOG(ERROR) << "Try to shrink the value by setting "
+                      "AnalysisConfig::EnableGpu(...)";
+      }
+
+      if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
+        std::string flag = "--fraction_of_gpu_memory_to_use=" +
+                           std::to_string(fraction_of_gpu_memory);
+        VLOG(3) << "set flag: " << flag;
+        gflags.push_back(flag);
+        gflags.push_back("--cudnn_deterministic=True");
+      }
 
-    if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
-      flags.push_back("dummy");
-      std::string flag = "--fraction_of_gpu_memory_to_use=" +
-                         std::to_string(fraction_of_gpu_memory);
-      flags.push_back(flag);
-      flags.push_back("--cudnn_deterministic=True");
-      VLOG(3) << "set flag: " << flag;
-      framework::InitGflags(flags);
+      if (config.thread_local_stream_enabled()) {
+        gflags.push_back("--allocator_strategy=thread_local");
+        process_level_allocator_enabled = false;
+      } else {
+        gflags.push_back("--allocator_strategy=naive_best_fit");
+        process_level_allocator_enabled = true;
+      }
+
+      if (framework::InitGflags(gflags)) {
+        VLOG(3) << "The following gpu analysis configurations only take effect "
+                   "for the first predictor: ";
+        for (size_t i = 1; i < gflags.size(); ++i) {
+          VLOG(3) << gflags[i];
+        }
+      } else {
+        LOG(WARNING) << "The one-time configuration of analysis predictor "
+                        "failed. This may be because the native predictor was "
+                        "called first and its configuration already took effect.";
+      }
+    });
+
+    if (config.thread_local_stream_enabled() &&
+        process_level_allocator_enabled) {
+      LOG(FATAL) << "When binding threads and streams, the use of "
+                    "process-level allocators leads to undefined results due "
+                    "to asynchronous memory operations. The thread and stream "
+                    "binding configuration of all predictors should be the "
+                    "same in a single process.";
     }
   }
 
diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h
index 1505a898c5b..5ac00fd294f 100644
--- a/paddle/fluid/inference/api/demo_ci/utils.h
+++ b/paddle/fluid/inference/api/demo_ci/utils.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+#include
 #include
 #include
 #include
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 2002d1f76ab..a66f71e2a89 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -396,6 +396,14 @@ struct AnalysisConfig {
   ///
   void EnableMkldnnQuantizer();
 
+  ///
+  /// \brief A boolean state telling whether the thread local CUDA stream is
+  /// enabled.
+  ///
+  /// \return bool Whether the thread local CUDA stream is enabled.
+  ///
+  bool thread_local_stream_enabled() const { return thread_local_stream_; }
+
   ///
   /// \brief A boolean state telling whether the MKLDNN quantization is enabled.
   ///
@@ -486,6 +494,13 @@ struct AnalysisConfig {
   ///
   ///
   PassStrategy* pass_builder() const;
+
+  ///
+  /// \brief Enable the GPU multi-computing stream feature.
+  /// NOTE: The current behavior of this interface is to bind the computation
+  /// stream to the thread, and this behavior may be changed in the future.
+  ///
+  void EnableGpuMultiStream();
   void PartiallyRelease();
 
 protected:
@@ -563,6 +578,8 @@ struct AnalysisConfig {
   std::vector<std::string> lite_ops_filter_;
   Precision lite_precision_mode_;
 
+  bool thread_local_stream_{false};
+
   // mkldnn related.
   int mkldnn_cache_capacity_{0};
   bool use_mkldnn_quantizer_{false};
diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
index a50fbfd43ea..35d20ce34ee 100644
--- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
@@ -16,21 +16,26 @@ limitations under the License. */
 #include
 #include
 #include
+#include <mutex>   // NOLINT
+#include <thread>  // NOLINT
 
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
 namespace inference {
 
-TEST(AnalysisPredictor, use_gpu) {
-  std::string model_dir = FLAGS_infer_model + "/" + "mul_model";
-  AnalysisConfig config;
-  config.EnableUseGpu(100, 0);
-  config.SetModel(model_dir);
-  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
+int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
+  static std::mutex mutex;
+  std::unique_ptr<PaddlePredictor> predictor;
+  {
+    std::unique_lock<std::mutex> lock(mutex);
+    predictor = std::move(CreatePaddlePredictor(config));
+  }
+  if (barrier) {
+    barrier->Wait();
+  }
 
   std::vector<PaddleTensor> inputs;
-  auto predictor = CreatePaddlePredictor(config);
   std::vector<float> input({1});
 
   PaddleTensor in;
@@ -40,19 +45,46 @@
   inputs.emplace_back(in);
 
   std::vector<PaddleTensor> outputs;
-  ASSERT_TRUE(predictor->Run(inputs, &outputs));
-
+  predictor->Run(inputs, &outputs);
   const std::vector<float> truth_values = {
       -0.00621776, -0.00620937, 0.00990623,  -0.0039817, -0.00074315,
       0.61229795,  -0.00491806, -0.00068755, 0.18409646, 0.30090684};
-
   const size_t expected_size = 1;
   EXPECT_EQ(outputs.size(), expected_size);
   float* data_o = static_cast<float*>(outputs[0].data.data());
   for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) {
     EXPECT_LT(std::abs(data_o[j] - truth_values[j]), 10e-6);
   }
+  return 0;
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(AnalysisPredictor, thread_local_stream) {
+  const size_t thread_num = 5;
+  std::vector<std::thread> threads(thread_num);
+  Barrier barrier(thread_num);
+  for (size_t i = 0; i < threads.size(); ++i) {
+    threads[i] = std::thread([&barrier, i]() {
+      AnalysisConfig config;
+      config.EnableUseGpu(100, 0);
+      config.SetModel(FLAGS_infer_model + "/" + "mul_model");
+      config.EnableGpuMultiStream();
+      test_main(config, &barrier);
+    });
+  }
+  for (auto& th : threads) {
+    th.join();
+  }
+}
+
+TEST(AnalysisPredictor, lite_engine) {
+  AnalysisConfig config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(FLAGS_infer_model + "/" + "mul_model");
+  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
+  test_main(config);
 }
+#endif
 
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index c8e5c826a13..4e6ffafeff8 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -105,6 +105,24 @@ void CheckError(float data_ref, float data) {
   }
 }
 
+class Barrier {
+ public:
+  explicit Barrier(std::size_t count) : _count(count) {}
+  void Wait() {
+    std::unique_lock<std::mutex> lock(_mutex);
+    if (--_count) {
+      _cv.wait(lock, [this] { return _count == 0; });
+    } else {
+      _cv.notify_all();
+    }
+  }
+
+ private:
+  std::mutex _mutex;
+  std::condition_variable _cv;
+  std::size_t _count;
+};
+
 // Compare result between two PaddleTensor
 void CompareResult(const std::vector<PaddleTensor> &outputs,
                    const std::vector<PaddleTensor> &ref_outputs) {
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index c6c84f8b9f0..4113379f5b4 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -50,7 +50,8 @@ std::once_flag glog_init_flag;
 std::once_flag p2p_init_flag;
 std::once_flag glog_warning_once_flag;
 
-void InitGflags(std::vector<std::string> argv) {
+bool InitGflags(std::vector<std::string> argv) {
+  bool succeeded = false;
   std::call_once(gflags_init_flag, [&]() {
     FLAGS_logtostderr = true;
     // NOTE(zhiqiu): dummy is needed, since the function
@@ -71,7 +72,9 @@ void InitGflags(std::vector<std::string> argv) {
             << ", Init commandline: " << line;
     google::ParseCommandLineFlags(&argc, &arr, true);
     VLOG(1) << "After Parse: argc is " << argc;
+    succeeded = true;
   });
+  return succeeded;
 }
 
 void InitP2P(std::vector<int> devices) {
diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h
index d189f0022bf..09aef2743e8 100644
--- a/paddle/fluid/platform/init.h
+++ b/paddle/fluid/platform/init.h
@@ -22,7 +22,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-void InitGflags(std::vector<std::string> argv);
+bool InitGflags(std::vector<std::string> argv);
 
 void InitGLOG(const std::string &prog_name);
 
-- 
GitLab
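
Editor's note: the sketch below is a minimal, hypothetical usage example of the EnableGpuMultiStream() interface added by this patch, modeled on the multi-threaded test above. It is not part of the patch itself; the model directory, thread count, input shape, and the in-tree header path are placeholders.

// Hypothetical sketch: each thread builds its own AnalysisConfig, enables the
// thread-local CUDA stream, and creates and runs its own predictor.
#include <string>
#include <thread>
#include <vector>

#include "paddle/fluid/inference/api/paddle_inference_api.h"  // assumed header path

void RunOneThread(const std::string& model_dir) {
  paddle::AnalysisConfig config;
  config.EnableUseGpu(100 /* memory_pool_init_size_mb */, 0 /* device_id */);
  config.SetModel(model_dir);
  // Bind this predictor's computation to a thread-local CUDA stream so that
  // several threads can run inference concurrently without sharing a stream.
  config.EnableGpuMultiStream();

  auto predictor = paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(config);

  // Placeholder input: a single float fed into a 1x1 tensor.
  std::vector<float> input_data{1.f};
  paddle::PaddleTensor in;
  in.shape = {1, 1};
  in.data = paddle::PaddleBuf(input_data.data(), input_data.size() * sizeof(float));
  in.dtype = paddle::PaddleDType::FLOAT32;

  std::vector<paddle::PaddleTensor> inputs{in}, outputs;
  predictor->Run(inputs, &outputs);
}

int main() {
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back(RunOneThread, "./mul_model");  // placeholder path
  }
  for (auto& t : workers) t.join();
  return 0;
}

Note that, per the predictor changes above, the binding choice also selects the allocator strategy once per process, so every predictor in a single process should use the same EnableGpuMultiStream() setting; mixing bound and unbound predictors is rejected with a fatal error.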