Unverified commit 17ac6e25, authored by 石晓伟, committed by GitHub

update the analysis predictor for multi-stream support, test=develop (#24046)

* update the analysis predictor, test=develop

* update the unit test, test=develop

* no priority set before the interface is determined, test=develop

* interface name generalization, test=develop
Parent 69f2f285
......@@ -149,6 +149,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(serialized_info_cache_);
CP_MEMBER(thread_local_stream_);
if (use_gpu_) {
pass_builder_.reset(new GpuPassStrategy(
*static_cast<GpuPassStrategy *>(other.pass_builder())));
......@@ -389,6 +391,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << use_lite_;
ss << thread_local_stream_;
return ss.str();
}
......@@ -480,4 +484,6 @@ void AnalysisConfig::PartiallyRelease() {
params_file_.shrink_to_fit();
}
void AnalysisConfig::EnableGpuMultiStream() { thread_local_stream_ = true; }
} // namespace paddle
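For context, a minimal caller-side sketch of the new option (not part of this commit; the model path and function name are hypothetical), mirroring the unit test added further below: each thread builds its own AnalysisConfig, turns on the per-thread stream binding, and creates its own predictor.

// Illustrative sketch, not part of this commit.
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void CreateMultiStreamPredictor() {  // hypothetical helper name
  paddle::AnalysisConfig config;
  config.EnableUseGpu(100 /* memory pool in MB */, 0 /* device id */);
  config.SetModel("./mul_model");  // hypothetical model path
  config.EnableGpuMultiStream();   // bind a dedicated CUDA stream to this thread
  auto predictor = paddle::CreatePaddlePredictor(config);
}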
......@@ -37,13 +37,15 @@
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
#endif
......@@ -212,9 +214,18 @@ bool AnalysisPredictor::PrepareProgram(
return true;
}
bool AnalysisPredictor::CreateExecutor() {
if (config_.use_gpu_) {
if (config_.use_gpu()) {
status_use_gpu_ = true;
place_ = paddle::platform::CUDAPlace(config_.device_id_);
place_ = paddle::platform::CUDAPlace(config_.gpu_device_id());
#ifdef PADDLE_WITH_CUDA
if (config_.thread_local_stream_enabled()) {
auto *ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place_));
VLOG(3) << "The prediction process will be completed using a separate "
"normal-priority stream on each thread.";
ctx->ResetThreadContext(platform::stream::Priority::kNormal);
}
#endif
} else {
place_ = paddle::platform::CPUPlace();
}
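The ResetThreadContext call above is Paddle-internal; as a rough illustration of the per-thread stream idea it implements, the following standalone sketch uses only the plain CUDA runtime API (thread count and priority are arbitrary, and error checking is omitted).

// Illustrative sketch, not part of this commit: each thread owns its own
// normal-priority stream, so work launched from different threads does not
// serialize on one shared stream.
#include <cuda_runtime.h>
#include <thread>
#include <vector>

void worker() {
  cudaStream_t stream;
  cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, /*priority=*/0);
  // ... launch kernels / async copies on `stream` ...
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}

int main() {
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i) threads.emplace_back(worker);
  for (auto &t : threads) t.join();
  return 0;
}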
......@@ -503,30 +514,69 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
VLOG(3) << "create AnalysisConfig";
PADDLE_ENFORCE(config.is_valid(),
"Note: Each config can only be used for one predictor.");
if (config.use_gpu()) {
// 1. GPU memory
PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f);
PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
config.gpu_device_id());
std::vector<std::string> flags;
static std::once_flag gflags_initialized;
static bool process_level_allocator_enabled;
std::call_once(gflags_initialized, [&]() {
std::vector<std::string> gflags;
PADDLE_ENFORCE_GE(
config.memory_pool_init_size_mb(), 0.f,
platform::errors::InvalidArgument(
"The size of the memory pool should be greater than or equal to 0."));
PADDLE_ENFORCE_GE(
config.gpu_device_id(), 0,
platform::errors::InvalidArgument(
"Invalid device id (%d). The device id should be non-negative.",
config.gpu_device_id()));
gflags.push_back("dummy");
float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
if (fraction_of_gpu_memory > 0.95f) {
LOG(ERROR)
<< "Allocate too much memory for the GPU memory pool, assigned "
<< config.memory_pool_init_size_mb() << " MB";
LOG(ERROR)
<< "Try to shrink the value by setting AnalysisConfig::EnableGpu(...)";
LOG(ERROR) << "Try to shrink the value by setting "
"AnalysisConfig::EnableGpu(...)";
}
if (fraction_of_gpu_memory >= 0.0f && fraction_of_gpu_memory <= 0.95f) {
flags.push_back("dummy");
std::string flag = "--fraction_of_gpu_memory_to_use=" +
std::to_string(fraction_of_gpu_memory);
flags.push_back(flag);
flags.push_back("--cudnn_deterministic=True");
VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags);
gflags.push_back(flag);
gflags.push_back("--cudnn_deterministic=True");
}
if (config.thread_local_stream_enabled()) {
gflags.push_back("--allocator_strategy=thread_local");
process_level_allocator_enabled = false;
} else {
gflags.push_back("--allocator_strategy=naive_best_fit");
process_level_allocator_enabled = true;
}
if (framework::InitGflags(gflags)) {
VLOG(3) << "The following gpu analysis configurations only take effect "
"for the first predictor: ";
for (size_t i = 1; i < gflags.size(); ++i) {
VLOG(3) << gflags[i];
}
} else {
LOG(WARNING) << "The one-time configuration of the analysis predictor "
"failed. This may be because the native predictor was "
"called first and its configuration has already taken "
"effect.";
}
});
if (config.thread_local_stream_enabled() &&
process_level_allocator_enabled) {
LOG(FATAL) << "When binding threads and streams, using a "
"process-level allocator will result in undefined behavior "
"due to asynchronous memory operations. "
"The thread and stream binding configuration of all "
"predictors should be the same in a single process.";
}
}
......
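To make the constraint at the end of the previous hunk concrete, here is a hedged sketch (statement fragment; model paths are hypothetical, not part of this commit) of the configuration mix that trips the LOG(FATAL): the first predictor created in the process fixes --allocator_strategy through the one-time gflags initialization, so a later predictor cannot switch to thread-bound streams.

// Illustrative sketch, not part of this commit.
paddle::AnalysisConfig plain_config;
plain_config.EnableUseGpu(100, 0);
plain_config.SetModel("./model_a");  // hypothetical path
// First predictor: no multi-stream, so the process-level (naive_best_fit)
// allocator strategy is selected once for the whole process.
auto p1 = paddle::CreatePaddlePredictor(plain_config);

paddle::AnalysisConfig stream_config;
stream_config.EnableUseGpu(100, 0);
stream_config.SetModel("./model_b");  // hypothetical path
stream_config.EnableGpuMultiStream();
// Second predictor asks for thread-bound streams while the process-level
// allocator is already in effect -> hits the LOG(FATAL) above.
auto p2 = paddle::CreatePaddlePredictor(stream_config);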
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <math.h>
#include <algorithm>
#include <fstream>
#include <iostream>
......
......@@ -396,6 +396,14 @@ struct AnalysisConfig {
///
void EnableMkldnnQuantizer();
///
/// \brief A boolean state telling whether the thread local CUDA stream is
/// enabled.
///
/// \return bool Whether the thread local CUDA stream is enabled.
///
bool thread_local_stream_enabled() const { return thread_local_stream_; }
///
/// \brief A boolean state telling whether the MKLDNN quantization is enabled.
///
......@@ -486,6 +494,13 @@ struct AnalysisConfig {
///
///
PassStrategy* pass_builder() const;
///
/// \brief Enable the GPU multi-stream feature.
/// NOTE: The current behavior of this interface is to bind the computation
/// stream to the thread, and this behavior may be changed in the future.
///
void EnableGpuMultiStream();
void PartiallyRelease();
protected:
......@@ -563,6 +578,8 @@ struct AnalysisConfig {
std::vector<std::string> lite_ops_filter_;
Precision lite_precision_mode_;
bool thread_local_stream_{false};
// mkldnn related.
int mkldnn_cache_capacity_{0};
bool use_mkldnn_quantizer_{false};
......
......@@ -16,21 +16,26 @@ limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cmath>
#include <mutex> // NOLINT
#include <thread> // NOLINT
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
TEST(AnalysisPredictor, use_gpu) {
std::string model_dir = FLAGS_infer_model + "/" + "mul_model";
AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.SetModel(model_dir);
config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
static std::mutex mutex;
std::unique_ptr<PaddlePredictor> predictor;
{
std::unique_lock<std::mutex> lock(mutex);
predictor = std::move(CreatePaddlePredictor(config));
}
if (barrier) {
barrier->Wait();
}
std::vector<PaddleTensor> inputs;
auto predictor = CreatePaddlePredictor(config);
std::vector<float> input({1});
PaddleTensor in;
......@@ -40,19 +45,46 @@ TEST(AnalysisPredictor, use_gpu) {
inputs.emplace_back(in);
std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(inputs, &outputs));
predictor->Run(inputs, &outputs);
const std::vector<float> truth_values = {
-0.00621776, -0.00620937, 0.00990623, -0.0039817, -0.00074315,
0.61229795, -0.00491806, -0.00068755, 0.18409646, 0.30090684};
const size_t expected_size = 1;
EXPECT_EQ(outputs.size(), expected_size);
float* data_o = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) {
EXPECT_LT(std::abs(data_o[j] - truth_values[j]), 10e-6);
}
return 0;
}
#ifdef PADDLE_WITH_CUDA
TEST(AnalysisPredictor, thread_local_stream) {
const size_t thread_num = 5;
std::vector<std::thread> threads(thread_num);
Barrier barrier(thread_num);
for (size_t i = 0; i < threads.size(); ++i) {
threads[i] = std::thread([&barrier, i]() {
AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.SetModel(FLAGS_infer_model + "/" + "mul_model");
config.EnableGpuMultiStream();
test_main(config, &barrier);
});
}
for (auto& th : threads) {
th.join();
}
}
TEST(AnalysisPredictor, lite_engine) {
AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.SetModel(FLAGS_infer_model + "/" + "mul_model");
config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
test_main(config);
}
#endif
} // namespace inference
} // namespace paddle
......@@ -105,6 +105,24 @@ void CheckError(float data_ref, float data) {
}
}
class Barrier {
public:
explicit Barrier(std::size_t count) : _count(count) {}
void Wait() {
std::unique_lock<std::mutex> lock(_mutex);
if (--_count) {
_cv.wait(lock, [this] { return _count == 0; });
} else {
_cv.notify_all();
}
}
private:
std::mutex _mutex;
std::condition_variable _cv;
std::size_t _count;
};
// Compare result between two PaddleTensor
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &ref_outputs) {
......
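The Barrier helper added above is single-use: every thread that calls Wait() blocks until the last of the `count` threads arrives. A standalone usage sketch (thread count chosen arbitrarily, not part of this commit):

// Illustrative sketch, not part of this commit.
#include <thread>
#include <vector>

void run_with_barrier() {
  const std::size_t n = 4;
  Barrier barrier(n);  // the Barrier class defined above
  std::vector<std::thread> workers;
  for (std::size_t i = 0; i < n; ++i) {
    workers.emplace_back([&barrier] {
      // ... per-thread setup, e.g. create a predictor ...
      barrier.Wait();  // blocks until all n threads reach this point
      // ... work that should start only after every thread is ready ...
    });
  }
  for (auto &t : workers) t.join();
}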
......@@ -50,7 +50,8 @@ std::once_flag glog_init_flag;
std::once_flag p2p_init_flag;
std::once_flag glog_warning_once_flag;
void InitGflags(std::vector<std::string> argv) {
bool InitGflags(std::vector<std::string> argv) {
bool successful = false;
std::call_once(gflags_init_flag, [&]() {
FLAGS_logtostderr = true;
// NOTE(zhiqiu): dummy is needed, since the function
......@@ -71,7 +72,9 @@ void InitGflags(std::vector<std::string> argv) {
<< ", Init commandline: " << line;
google::ParseCommandLineFlags(&argc, &arr, true);
VLOG(1) << "After Parse: argc is " << argc;
successful = true;
});
return successful;
}
void InitP2P(std::vector<int> devices) {
......
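With the signature change above, a caller can tell whether its flags were the ones that actually took effect. A short caller-side fragment (flag values are examples only, not part of this commit):

// Illustrative sketch, not part of this commit: only the first InitGflags
// call in the process parses flags, so the return value reports whether
// this call's settings won.
std::vector<std::string> gflags = {"dummy", "--allocator_strategy=thread_local"};
if (paddle::framework::InitGflags(gflags)) {
  VLOG(3) << "gflags were applied by this call";
} else {
  LOG(WARNING) << "gflags were already initialized elsewhere; the settings "
                  "above were ignored";
}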
......@@ -22,7 +22,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
void InitGflags(std::vector<std::string> argv);
bool InitGflags(std::vector<std::string> argv);
void InitGLOG(const std::string &prog_name);
......