Unverified · Commit 14b43376 authored by Tao Luo, committed by GitHub

Merge pull request #16062 from luotao1/num_threads

refine SetCpuMathLibraryNumThreads
@@ -183,6 +183,9 @@ void AnalysisPredictor::SetMkldnnThreadID(int tid) {
 bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                             std::vector<PaddleTensor> *output_data,
                             int batch_size) {
+  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
+    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
+  }
   VLOG(3) << "Predictor::predict";
   inference::Timer timer;
   timer.tic();
...
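For context, the guard only calls into the math library when a non-default thread count was configured: `cpu_math_library_num_threads()` defaults to 1, so `UNLIKELY` marks the reconfiguration branch as cold and the common single-threaded path stays cheap. Below is a minimal caller-side sketch of where that setting comes from; the model path and tensor setup are placeholders, and the API names follow the Paddle 1.x C++ inference headers:

```cpp
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./model_dir");  // placeholder model path
  // Request 4 threads for the CPU math library (MKL/OpenBLAS); with this
  // patch, Run() re-applies the setting on whichever thread executes it.
  config.SetCpuMathLibraryNumThreads(4);

  auto predictor = paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(config);
  std::vector<paddle::PaddleTensor> inputs, outputs;
  // ... fill `inputs` with shaped, typed data ...
  predictor->Run(inputs, &outputs);
  return 0;
}
```

Re-applying the setting inside `Run()` matters because the thread calling `Run()` need not be the thread that created the predictor, and thread-count settings can be per-thread in some OpenMP/MKL configurations — plausibly why the patch moves the call here.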
@@ -131,6 +131,9 @@ NativePaddlePredictor::~NativePaddlePredictor() {
 bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
                                 std::vector<PaddleTensor> *output_data,
                                 int batch_size) {
+  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
+    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
+  }
   VLOG(3) << "Predictor::predict";
   Timer timer;
   timer.tic();
...
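Both predictor paths now route through `paddle::platform::SetNumThreads`. As a rough, hedged sketch of what such a helper typically dispatches to — the `USE_*` macros below are illustrative, not Paddle's actual build flags, and the real implementation in Paddle's platform layer may differ in detail:

```cpp
// Illustrative only: each CPU math backend exposes its own thread-count
// setter, and the helper forwards to whichever backend the build linked.
#if defined(USE_OPENBLAS)
#include <cblas.h>  // openblas_set_num_threads
#elif defined(USE_MKL)
#include <mkl.h>    // mkl_set_num_threads
#include <omp.h>    // omp_set_num_threads
#endif

void SetNumThreads(int num_threads) {
  if (num_threads < 1) num_threads = 1;  // clamp invalid requests
#if defined(USE_OPENBLAS)
  openblas_set_num_threads(num_threads);
#elif defined(USE_MKL)
  mkl_set_num_threads(num_threads);  // MKL's internal thread pool
  omp_set_num_threads(num_threads);  // OpenMP regions in the same library
#endif
}
```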
@@ -366,15 +366,17 @@ TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
 #define NEW_TENSOR(name__) \
   auto name__##_tensor = predictor->GetInputTensor(#name__);
-  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
+  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
   double total_time_of_threads{0};
   std::vector<std::thread> threads;
   for (int tid = 0; tid < FLAGS_num_threads; tid++) {
     threads.emplace_back([&, tid] {
-      // To ensure the thread binding correctly,
-      // please clone inside the threadpool.
-      auto predictor = base_predictor->Clone();
+      auto &predictor = predictors[tid];
       NEW_TENSOR(data_lod_attention);
       NEW_TENSOR(cell_init);
       NEW_TENSOR(data);
...
@@ -266,15 +266,17 @@ TEST(Analyzer_seq_pool1, zerocopy_profile_threads) {
   SetConfig(&config);
   config.SwitchUseFeedFetchOps(false);
-  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
+  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
   double total_time_of_threads{0};
   std::vector<std::thread> threads;
   for (int tid = 0; tid < FLAGS_num_threads; tid++) {
     threads.emplace_back([&, tid] {
-      // To ensure the thread binding correctly,
-      // please clone inside the threadpool.
-      auto predictor = base_predictor->Clone();
+      auto &predictor = predictors[tid];
       std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
       PrepareZeroCopyInputs(predictor, &inputs);
       auto output_tensor = predictor->GetOutputTensor(out_var_name);
...
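The same refactor appears in both tester hunks above (and again in `TestMultiThreadPrediction` below): instead of every worker cloning from a shared `base_predictor` inside its lambda, all clones are created serially on the main thread, and each worker only indexes its own slot. A self-contained sketch of the pattern, with a hypothetical `Worker` type standing in for `PaddlePredictor`:

```cpp
#include <memory>
#include <thread>
#include <vector>

struct Worker {
  // Stand-in for PaddlePredictor::Clone(): copy this instance.
  std::unique_ptr<Worker> Clone() const { return std::make_unique<Worker>(*this); }
  void Run() { /* thread-local work */ }
};

int main() {
  const int num_threads = 4;

  // Clone all instances serially, before any thread is spawned.
  std::vector<std::unique_ptr<Worker>> workers;
  workers.emplace_back(std::make_unique<Worker>());  // the "base" instance
  for (int tid = 1; tid < num_threads; tid++) {
    workers.emplace_back(workers.front()->Clone());  // race-free: single thread
  }

  // Each thread only touches its own pre-made instance.
  std::vector<std::thread> threads;
  for (int tid = 0; tid < num_threads; tid++) {
    threads.emplace_back([&, tid] { workers[tid]->Run(); });
  }
  for (auto &t : threads) t.join();
  return 0;
}
```

Because `workers` outlives the threads and each thread touches only `workers[tid]`, the cloning step needs no locking and every instance stays effectively thread-local.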
@@ -17,8 +17,10 @@
 #include <gtest/gtest.h>
 #include <algorithm>
+#include <memory>
 #include <string>
 #include <thread>  // NOLINT
+#include <unordered_map>
 #include <vector>
 #ifdef WITH_GPERFTOOLS
 #include <gperftools/profiler.h>
@@ -252,7 +254,11 @@ void TestMultiThreadPrediction(
   int batch_size = FLAGS_batch_size;
   int num_times = FLAGS_repeat;
   std::vector<std::thread> threads;
-  auto main_predictor = CreateTestPredictor(config, use_analysis);
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreateTestPredictor(config, use_analysis));
+  for (int tid = 1; tid < num_threads; tid++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
   size_t total_time{0};
   for (int tid = 0; tid < num_threads; ++tid) {
@@ -260,9 +266,7 @@ void TestMultiThreadPrediction(
       // Each thread should have local inputs and outputs.
      // The inputs of each thread are all the same.
       std::vector<PaddleTensor> outputs_tid;
-      // To ensure the thread binding correctly,
-      // please clone inside the threadpool.
-      auto predictor = main_predictor->Clone();
+      auto &predictor = predictors[tid];
 #ifdef PADDLE_WITH_MKLDNN
       if (use_analysis) {
         static_cast<AnalysisPredictor *>(predictor.get())
...