Unverified commit 17ac6e25, authored by 石晓伟, committed by GitHub

update the analysis predictor for multi-stream support, test=develop (#24046)

* update the analysis predictor, test=develop

* update the unit test, test=develop

* no priority set before the inferface determined, test=develop

* interface name generalization, test=develop
Parent 69f2f285
@@ -149,6 +149,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(serialized_info_cache_);
 
+  CP_MEMBER(thread_local_stream_);
+
   if (use_gpu_) {
     pass_builder_.reset(new GpuPassStrategy(
         *static_cast<GpuPassStrategy *>(other.pass_builder())));
@@ -389,6 +391,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << use_lite_;
 
+  ss << thread_local_stream_;
+
   return ss.str();
 }
@@ -480,4 +484,6 @@ void AnalysisConfig::PartiallyRelease() {
   params_file_.shrink_to_fit();
 }
 
+void AnalysisConfig::EnableGpuMultiStream() { thread_local_stream_ = true; }
+
 }  // namespace paddle
@@ -37,13 +37,15 @@
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#ifdef PADDLE_WITH_MKLML
-#include "paddle/fluid/platform/dynload/mklml.h"
-#endif
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
+#ifdef PADDLE_WITH_MKLML
+#include "paddle/fluid/platform/dynload/mklml.h"
+#endif
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/inference/api/mkldnn_quantizer.h"
 #endif
@@ -212,9 +214,18 @@ bool AnalysisPredictor::PrepareProgram(
   return true;
 }
 
 bool AnalysisPredictor::CreateExecutor() {
-  if (config_.use_gpu_) {
+  if (config_.use_gpu()) {
     status_use_gpu_ = true;
-    place_ = paddle::platform::CUDAPlace(config_.device_id_);
+    place_ = paddle::platform::CUDAPlace(config_.gpu_device_id());
+#ifdef PADDLE_WITH_CUDA
+    if (config_.thread_local_stream_enabled()) {
+      auto *ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(place_));
+      VLOG(3) << "The prediction process will be completed using a separate "
+                 "normal-priority stream on each thread.";
+      ctx->ResetThreadContext(platform::stream::Priority::kNormal);
+    }
+#endif
   } else {
     place_ = paddle::platform::CPUPlace();
   }
@@ -503,30 +514,69 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   VLOG(3) << "create AnalysisConfig";
   PADDLE_ENFORCE(config.is_valid(),
                  "Note: Each config can only be used for one predictor.");
   if (config.use_gpu()) {
-    // 1. GPU memory
-    PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f);
-    PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
-                      config.gpu_device_id());
-    std::vector<std::string> flags;
-
-    float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
-    if (fraction_of_gpu_memory > 0.95f) {
-      LOG(ERROR)
-          << "Allocate too much memory for the GPU memory pool, assigned "
-          << config.memory_pool_init_size_mb() << " MB";
-      LOG(ERROR)
-          << "Try to shink the value by setting AnalysisConfig::EnableGpu(...)";
-    }
-
-    if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
-      flags.push_back("dummy");
-      std::string flag = "--fraction_of_gpu_memory_to_use=" +
-                         std::to_string(fraction_of_gpu_memory);
-      flags.push_back(flag);
-      flags.push_back("--cudnn_deterministic=True");
-      VLOG(3) << "set flag: " << flag;
-      framework::InitGflags(flags);
-    }
+    static std::once_flag gflags_initialized;
+    static bool process_level_allocator_enabled;
+
+    std::call_once(gflags_initialized, [&]() {
+      std::vector<std::string> gflags;
+      PADDLE_ENFORCE_GE(
+          config.memory_pool_init_size_mb(), 0.f,
+          platform::errors::InvalidArgument(
+              "The size of memory pool should be greater than 0."));
+      PADDLE_ENFORCE_GE(
+          config.gpu_device_id(), 0,
+          platform::errors::InvalidArgument(
+              "Invalid device id (%d). The device id should be greater than 0.",
+              config.gpu_device_id()));
+      gflags.push_back("dummy");
+
+      float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
+      if (fraction_of_gpu_memory > 0.95f) {
+        LOG(ERROR)
+            << "Allocate too much memory for the GPU memory pool, assigned "
+            << config.memory_pool_init_size_mb() << " MB";
+        LOG(ERROR) << "Try to shink the value by setting "
+                      "AnalysisConfig::EnableGpu(...)";
+      }
+
+      if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
+        std::string flag = "--fraction_of_gpu_memory_to_use=" +
+                           std::to_string(fraction_of_gpu_memory);
+        VLOG(3) << "set flag: " << flag;
+        gflags.push_back(flag);
+        gflags.push_back("--cudnn_deterministic=True");
+      }
+
+      if (config.thread_local_stream_enabled()) {
+        gflags.push_back("--allocator_strategy=thread_local");
+        process_level_allocator_enabled = false;
+      } else {
+        gflags.push_back("--allocator_strategy=naive_best_fit");
+        process_level_allocator_enabled = true;
+      }
+
+      if (framework::InitGflags(gflags)) {
+        VLOG(3) << "The following gpu analysis configurations only take effect "
+                   "for the first predictor: ";
+        for (size_t i = 1; i < gflags.size(); ++i) {
+          VLOG(3) << gflags[i];
+        }
+      } else {
+        LOG(WARNING) << "The one-time configuration of analysis predictor "
+                        "failed, which may be due to native predictor called "
+                        "first and its configurations taken effect.";
+      }
+    });
+
+    if (config.thread_local_stream_enabled() &&
+        process_level_allocator_enabled) {
+      LOG(FATAL) << " When binding threads and streams, the use of "
+                    "process-level allocators will result in undefined result "
+                    "errors due to memory asynchronous operations."
+                    "The thread and stream binding configuration of all "
+                    "predictors should be the same in a single process.";
+    }
   }
......
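Because the GPU gflags above (allocator strategy included) are parsed only once, by the first predictor created in the process, every AnalysisConfig in that process has to make the same choice about EnableGpuMultiStream(); in particular, if the first predictor keeps the process-level allocator, a later predictor that enables multi-stream hits the LOG(FATAL) consistency check. A minimal sketch of the intended, consistent configuration (header name and model path are assumptions):

#include <string>
#include "paddle_inference_api.h"  // assumed public header exposing AnalysisConfig

// Every predictor in this process opts into thread-bound streams, so the
// thread_local allocator strategy fixed by the first predictor stays valid
// for all of them.
paddle::AnalysisConfig MakeMultiStreamConfig(const std::string& model_dir) {
  paddle::AnalysisConfig config;
  config.EnableUseGpu(100, 0);    // 100 MB initial pool on GPU 0
  config.SetModel(model_dir);     // model_dir is a placeholder path
  config.EnableGpuMultiStream();  // must match every other predictor's setting
  return config;
}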
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+#include <math.h>
 #include <algorithm>
 #include <fstream>
 #include <iostream>
......
@@ -396,6 +396,14 @@ struct AnalysisConfig {
   ///
   void EnableMkldnnQuantizer();
 
+  ///
+  /// \brief A boolean state telling whether the thread local CUDA stream is
+  /// enabled.
+  ///
+  /// \return bool Whether the thread local CUDA stream is enabled.
+  ///
+  bool thread_local_stream_enabled() const { return thread_local_stream_; }
+
   ///
   /// \brief A boolean state telling whether the MKLDNN quantization is enabled.
   ///
@@ -486,6 +494,13 @@ struct AnalysisConfig {
   ///
   ///
   PassStrategy* pass_builder() const;
 
+  ///
+  /// \brief Enable the GPU multi-computing stream feature.
+  /// NOTE: The current behavior of this interface is to bind the computation
+  /// stream to the thread, and this behavior may be changed in the future.
+  ///
+  void EnableGpuMultiStream();
+
   void PartiallyRelease();
 
  protected:
@@ -563,6 +578,8 @@ struct AnalysisConfig {
   std::vector<std::string> lite_ops_filter_;
   Precision lite_precision_mode_;
 
+  bool thread_local_stream_{false};
+
   // mkldnn related.
   int mkldnn_cache_capacity_{0};
   bool use_mkldnn_quantizer_{false};
......
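A short sketch of how the new pair of members is meant to be used from application code; the header name is an assumption, and the threaded usage pattern itself is shown in the test added below:

#include <cassert>
#include "paddle_inference_api.h"  // assumed public header exposing AnalysisConfig

void ConfigureMultiStream(paddle::AnalysisConfig* config) {
  config->EnableUseGpu(100, 0);    // 100 MB initial pool on GPU 0
  config->EnableGpuMultiStream();  // bind the computation stream to the calling thread
  assert(config->thread_local_stream_enabled());  // the getter mirrors the switch
}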
@@ -16,21 +16,26 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <cmath>
+#include <mutex>   // NOLINT
+#include <thread>  // NOLINT
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
 namespace inference {
 
-TEST(AnalysisPredictor, use_gpu) {
-  std::string model_dir = FLAGS_infer_model + "/" + "mul_model";
-  AnalysisConfig config;
-  config.EnableUseGpu(100, 0);
-  config.SetModel(model_dir);
-  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
+int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
+  static std::mutex mutex;
+  std::unique_ptr<PaddlePredictor> predictor;
+  {
+    std::unique_lock<std::mutex> lock(mutex);
+    predictor = std::move(CreatePaddlePredictor(config));
+  }
+  if (barrier) {
+    barrier->Wait();
+  }
 
   std::vector<PaddleTensor> inputs;
-  auto predictor = CreatePaddlePredictor(config);
   std::vector<float> input({1});
   PaddleTensor in;
@@ -40,19 +45,46 @@ TEST(AnalysisPredictor, use_gpu) {
   inputs.emplace_back(in);
 
   std::vector<PaddleTensor> outputs;
-  ASSERT_TRUE(predictor->Run(inputs, &outputs));
+  predictor->Run(inputs, &outputs);
 
   const std::vector<float> truth_values = {
       -0.00621776, -0.00620937, 0.00990623,  -0.0039817, -0.00074315,
       0.61229795,  -0.00491806, -0.00068755, 0.18409646, 0.30090684};
 
   const size_t expected_size = 1;
   EXPECT_EQ(outputs.size(), expected_size);
   float* data_o = static_cast<float*>(outputs[0].data.data());
   for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) {
     EXPECT_LT(std::abs(data_o[j] - truth_values[j]), 10e-6);
   }
+  return 0;
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(AnalysisPredictor, thread_local_stream) {
+  const size_t thread_num = 5;
+  std::vector<std::thread> threads(thread_num);
+  Barrier barrier(thread_num);
+  for (size_t i = 0; i < threads.size(); ++i) {
+    threads[i] = std::thread([&barrier, i]() {
+      AnalysisConfig config;
+      config.EnableUseGpu(100, 0);
+      config.SetModel(FLAGS_infer_model + "/" + "mul_model");
+      config.EnableGpuMultiStream();
+      test_main(config, &barrier);
+    });
+  }
+  for (auto& th : threads) {
+    th.join();
+  }
+}
+
+TEST(AnalysisPredictor, lite_engine) {
+  AnalysisConfig config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(FLAGS_infer_model + "/" + "mul_model");
+  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
+  test_main(config);
 }
+#endif
 
 }  // namespace inference
 }  // namespace paddle
@@ -105,6 +105,24 @@ void CheckError(float data_ref, float data) {
   }
 }
 
+class Barrier {
+ public:
+  explicit Barrier(std::size_t count) : _count(count) {}
+  void Wait() {
+    std::unique_lock<std::mutex> lock(_mutex);
+    if (--_count) {
+      _cv.wait(lock, [this] { return _count == 0; });
+    } else {
+      _cv.notify_all();
+    }
+  }
+
+ private:
+  std::mutex _mutex;
+  std::condition_variable _cv;
+  std::size_t _count;
+};
+
 // Compare result between two PaddleTensor
 void CompareResult(const std::vector<PaddleTensor> &outputs,
                    const std::vector<PaddleTensor> &ref_outputs) {
......
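The Barrier added above is a minimal single-use count-down latch: each thread that calls Wait() decrements the count under the lock, the last arrival notifies everyone, and the rest block until the count reaches zero. A usage sketch (thread count chosen arbitrarily):

#include <cstddef>
#include <thread>
#include <vector>

void RunWorkersInLockstep() {
  constexpr std::size_t kThreads = 4;  // arbitrary
  Barrier barrier(kThreads);
  std::vector<std::thread> workers;
  for (std::size_t i = 0; i < kThreads; ++i) {
    workers.emplace_back([&barrier] {
      // ... per-thread setup, e.g. creating a predictor ...
      barrier.Wait();  // nobody proceeds until all threads have arrived
      // ... work that relies on every thread having finished its setup ...
    });
  }
  for (auto& worker : workers) worker.join();
}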
@@ -50,7 +50,8 @@ std::once_flag glog_init_flag;
 std::once_flag p2p_init_flag;
 std::once_flag glog_warning_once_flag;
 
-void InitGflags(std::vector<std::string> argv) {
+bool InitGflags(std::vector<std::string> argv) {
+  bool successed = false;
   std::call_once(gflags_init_flag, [&]() {
     FLAGS_logtostderr = true;
     // NOTE(zhiqiu): dummy is needed, since the function
@@ -71,7 +72,9 @@ void InitGflags(std::vector<std::string> argv) {
             << ", Init commandline: " << line;
     google::ParseCommandLineFlags(&argc, &arr, true);
     VLOG(1) << "After Parse: argc is " << argc;
+    successed = true;
   });
+  return successed;
 }
 
 void InitP2P(std::vector<int> devices) {
......
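With this change, InitGflags reports whether the call actually performed the one-time flag parsing; later calls find gflags_init_flag already set and return false, which is what the analysis predictor above uses to warn that its GPU configuration did not take effect. A sketch of the contract (the include path is an assumption):

#include <string>
#include <vector>
#include "paddle/fluid/platform/init.h"  // assumed header declaring InitGflags

void InitGflagsTwice() {
  std::vector<std::string> gflags = {"dummy", "--allocator_strategy=thread_local"};
  bool first = paddle::framework::InitGflags(gflags);   // true: this call parsed the flags
  bool second = paddle::framework::InitGflags(gflags);  // false: the call_once body already ran
  (void)first;
  (void)second;
}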
@@ -22,7 +22,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-void InitGflags(std::vector<std::string> argv);
+bool InitGflags(std::vector<std::string> argv);
 
 void InitGLOG(const std::string &prog_name);
......