Unverified commit e31f6e98, authored by Tao Luo, committed by GitHub

Merge pull request #16146 from luotao1/zero_copy

 unify ZeroCopy in analysis_test
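For context, every per-model ZeroCopy test that this PR deletes was hand-rolling the same run pattern; the shared helpers added below (ConvertPaddleTensorToZeroCopyTensor, PredictionWarmUp, PredictionRun, CompareAnalysisAndZeroCopy) now carry it. A minimal, self-contained sketch of that pattern follows; the model directory, tensor names "x"/"out", and the public header path are placeholders/assumptions, not part of this PR.

// Hedged sketch of the ZeroCopy run pattern these tests exercise.
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void RunZeroCopyOnce(const std::string &model_dir) {
  paddle::AnalysisConfig config;
  config.SetModel(model_dir);
  config.DisableGpu();
  // Drop the feed/fetch ops so data moves through ZeroCopyTensor buffers.
  config.SwitchUseFeedFetchOps(false);
  auto predictor = paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(config);

  // Feed: write straight into the input tensor's CPU buffer.
  auto input = predictor->GetInputTensor("x");
  input->Reshape({1, 3});
  float *in_data = input->mutable_data<float>(paddle::PaddlePlace::kCPU);
  in_data[0] = in_data[1] = in_data[2] = 1.0f;

  predictor->ZeroCopyRun();

  // Fetch: read the output buffer in place; `size` is an element count.
  auto output = predictor->GetOutputTensor("out");
  paddle::PaddlePlace place;
  int size = 0;
  float *out_data = output->data<float>(&place, &size);
  float checksum = 0.f;
  for (int i = 0; i < size; ++i) checksum += out_data[i];
}

The helpers in this PR wrap exactly this flow behind the existing PaddleTensor-based test inputs, so individual model testers no longer need to repeat it.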
@@ -126,15 +126,20 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
 }

 template void ZeroCopyTensor::copy_from_cpu<float>(const float *data);
 template void ZeroCopyTensor::copy_from_cpu<int64_t>(const int64_t *data);
+template void ZeroCopyTensor::copy_from_cpu<int32_t>(const int32_t *data);
 template void ZeroCopyTensor::copy_to_cpu<float>(float *data);
 template void ZeroCopyTensor::copy_to_cpu<int64_t>(int64_t *data);
+template void ZeroCopyTensor::copy_to_cpu<int32_t>(int32_t *data);
 template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
                                             int *size) const;
 template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
                                                 int *size) const;
+template int32_t *ZeroCopyTensor::data<int32_t>(PaddlePlace *place,
+                                                int *size) const;
 template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
 template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);
+template int32_t *ZeroCopyTensor::mutable_data<int32_t>(PaddlePlace place);

 void *ZeroCopyTensor::FindTensor() const {
   PADDLE_ENFORCE(!name_.empty(),
......
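These explicit instantiations are what let client code call the templated copy helpers with int32_t buffers without link errors. A brief hedged sketch; the tensor, the 2x2 shape, and the values are illustrative only:

// `tensor` is a ZeroCopyTensor obtained via predictor->GetInputTensor(...),
// as in the tests below; shape and values are made up for illustration.
std::vector<int32_t> src = {1, 2, 3, 4};
tensor->Reshape({2, 2});
tensor->copy_from_cpu(src.data());  // host buffer -> tensor, now valid for int32_t

std::vector<int32_t> dst(src.size());
tensor->copy_to_cpu(dst.data());    // tensor -> host buffer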
@@ -139,9 +139,8 @@ static void TensorAssignData(PaddleTensor *tensor,
 }

 template <typename T>
-static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
-                                    const std::vector<std::vector<T>> &data) {
-  int size{0};
+static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
+                                     const std::vector<std::vector<T>> &data) {
   auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
   int c = 0;
   for (const auto &f : data) {
@@ -149,7 +148,15 @@ static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
       ptr[c++] = v;
     }
   }
-  return size;
+}
+
+template <typename T>
+static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
+                                     const PaddleBuf &data) {
+  auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
+  for (size_t i = 0; i < data.length() / sizeof(T); i++) {
+    ptr[i] = *(reinterpret_cast<T *>(data.data()) + i);
+  }
 }

 static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) {
......
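The new PaddleBuf overload mirrors the existing vector-of-vectors one but copies from a PaddleTensor's raw buffer; it is what the shared ConvertPaddleTensorToZeroCopyTensor helper further down relies on. Roughly, as a hedged sketch (`predictor` and the hypothetical MakeSomeInput() helper that fills a PaddleTensor are assumptions):

// Re-feed one PaddleTensor input through the ZeroCopy interface.
PaddleTensor pt = MakeSomeInput();  // hypothetical helper: fills name/shape/lod/data
auto zc = predictor->GetInputTensor(pt.name);
zc->Reshape(pt.shape);
zc->SetLoD({pt.lod});
// pt.data is a PaddleBuf; data.length() / sizeof(float) elements get copied in.
ZeroCopyTensorAssignData<float>(zc.get(), pt.data);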
@@ -107,6 +107,9 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->DisableGpu();
   cfg->SwitchSpecifyInputNames();
   cfg->SwitchIrOptim();
+  if (FLAGS_zero_copy) {
+    cfg->SwitchUseFeedFetchOps(false);
+  }
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
@@ -131,7 +134,7 @@ TEST(Analyzer_Pyramid_DNN, profile) {
   TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                  input_slots_all, &outputs, FLAGS_num_threads);

-  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) {
     PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
     size_t size = GetSize(outputs[0]);
     PADDLE_ENFORCE_GT(size, 0);
@@ -166,6 +169,19 @@ TEST(Analyzer_Pyramid_DNN, compare) {
       reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

+// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
+TEST(Analyzer_Pyramid_DNN, compare_zero_copy) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  std::vector<std::string> outputs_name;
+  outputs_name.emplace_back("cos_sim_2.tmp_0");
+  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
+                             input_slots_all, outputs_name);
+}
+
 // Compare Deterministic result
 TEST(Analyzer_Pyramid_DNN, compare_determine) {
   AnalysisConfig cfg;
......
@@ -207,6 +207,9 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->DisableGpu();
   cfg->SwitchSpecifyInputNames();
   cfg->SwitchIrOptim();
+  if (FLAGS_zero_copy) {
+    cfg->SwitchUseFeedFetchOps(false);
+  }
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
@@ -285,133 +288,17 @@ TEST(Analyzer_rnn1, multi_thread) {
                  input_slots_all, &outputs, 2 /* multi_thread */);
 }

-// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
-// on the complex RNN1 model.
-TEST(Analyzer_rnn1, ZeroCopy) {
-  AnalysisConfig config;
-  SetConfig(&config);
-  config.SwitchUseFeedFetchOps(false);
-
-  PaddlePlace place;
-
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-  config.SwitchUseFeedFetchOps(true);
-  auto native_predictor =
-      CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
-  config.SwitchUseFeedFetchOps(
-      true);  // the analysis predictor needs feed/fetch.
-  auto analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-
-#define NEW_TENSOR(name__) \
-  auto name__##_tensor = predictor->GetInputTensor(#name__);
-  NEW_TENSOR(data_lod_attention);
-  NEW_TENSOR(cell_init);
-  NEW_TENSOR(data);
-  NEW_TENSOR(week);
-  NEW_TENSOR(minute);
-  NEW_TENSOR(hidden_init);
-
-  // Prepare data for AnalysisPredictor
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(),
-                        data_tensor.get(), hidden_init_tensor.get(),
-                        week_tensor.get(), minute_tensor.get(), &data,
-                        FLAGS_batch_size);
-
-  // Prepare data for NativePredictor
-  std::vector<std::vector<PaddleTensor>> native_inputs;
-  SetInput(&native_inputs);
-  std::vector<PaddleTensor> native_outputs;
-  std::vector<PaddleTensor> analysis_outputs;
-
-  auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1");
-  // Run analysis predictor
-
-  int num_ops;
-  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
-  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  ASSERT_EQ(fuse_statis.at("fc_fuse"), 1);
-  ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
-  ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
-  ASSERT_EQ(num_ops,
-            13);  // After graph optimization, only 13 operators exists.
-
-  Timer timer;
-  double total_time{0};
-  for (int i = 0; i < FLAGS_repeat; i++) {
-    timer.tic();
-    predictor->ZeroCopyRun();
-    total_time += timer.toc();
-  }
-
-  LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor);
-
-  ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
-  LOG(INFO) << "native output " << DescribeTensor(native_outputs.front());
-
-  int output_size{0};  // this is the number of elements not memory size
-  auto *zero_copy_data = output_tensor->data<float>(&place, &output_size);
-  auto *native_data = static_cast<float *>(native_outputs.front().data.data());
-  for (int i = 0; i < output_size; i++) {
-    EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3);
-  }
-}
-
-TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
-  AnalysisConfig config;
-  SetConfig(&config);
-  config.SwitchUseFeedFetchOps(false);
-
-#define NEW_TENSOR(name__) \
-  auto name__##_tensor = predictor->GetInputTensor(#name__);
-
-  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
-  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
-    predictors.emplace_back(predictors.front()->Clone());
-  }
-  double total_time_of_threads{0};
-  std::vector<std::thread> threads;
-
-  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
-    threads.emplace_back([&, tid] {
-      auto &predictor = predictors[tid];
-      NEW_TENSOR(data_lod_attention);
-      NEW_TENSOR(cell_init);
-      NEW_TENSOR(data);
-      NEW_TENSOR(week);
-      NEW_TENSOR(minute);
-      NEW_TENSOR(hidden_init);
-
-      // Prepare data for AnalysisPredictor
-      DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-      Timer timer;
-      double total_time{0};
-
-      for (int i = 0; i < FLAGS_repeat; i++) {
-        PrepareZeroCopyInputs(data_lod_attention_tensor.get(),
-                              cell_init_tensor.get(), data_tensor.get(),
-                              hidden_init_tensor.get(), week_tensor.get(),
-                              minute_tensor.get(), &data, FLAGS_batch_size);
-        timer.tic();
-        predictor->ZeroCopyRun();
-        total_time += timer.toc();
-      }
-
-      total_time_of_threads += total_time;
-
-      LOG(INFO) << "thread time: " << total_time / FLAGS_repeat;
-    });
-  }
-
-  for (auto &t : threads) {
-    t.join();
-  }
-
-  LOG(INFO) << "average time: "
-            << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
-}
+// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
+TEST(Analyzer_rnn1, compare_zero_copy) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  std::vector<std::string> outputs_name;
+  outputs_name.emplace_back("final_output.tmp_1");
+  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
+                             input_slots_all, outputs_name);
+}

 }  // namespace inference
......
@@ -144,6 +144,9 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
   cfg->SwitchSpecifyInputNames();
   cfg->SwitchIrDebug();
   cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
+  if (FLAGS_zero_copy) {
+    cfg->SwitchUseFeedFetchOps(false);
+  }
   if (use_mkldnn) {
     cfg->EnableMKLDNN();
   }
@@ -184,10 +187,10 @@ TEST(Analyzer_seq_pool1, compare_determine) {
                        input_slots_all);
 }

-void analysis_fuse_statis(bool use_zerocopy) {
+// Check the fuse status
+TEST(Analyzer_seq_pool1, fuse_statis) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  cfg.SwitchUseFeedFetchOps(!use_zerocopy);
   int num_ops;
   auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
   auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
@@ -203,137 +206,17 @@ void analysis_fuse_statis(bool use_zerocopy) {
   EXPECT_EQ(num_ops, 171);
 }

-// Check the fuse status
-TEST(Analyzer_seq_pool1, fuse_statis) { analysis_fuse_statis(false); }
-
-void PrepareZeroCopyInputs(
-    const std::unique_ptr<PaddlePredictor> &predictor,
-    std::vector<std::unique_ptr<ZeroCopyTensor>> *inputs) {
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  // only feed one batch
-  const auto &one_batch = data.NextBatch();
-  inputs->clear();
-  for (size_t i = 0; i < one_batch.size(); ++i) {
-    auto &slot = one_batch[i];
-    auto tensor = predictor->GetInputTensor(slot.name + "_embed");
-    tensor->Reshape(slot.shape);
-    tensor->SetLoD({slot.lod});
-    ZeroCopyTensorAssignData<float>(tensor.get(), slot.data);
-    inputs->emplace_back(std::move(tensor));
-  }
-}
-
-// return the output values
-std::vector<float> zerocopy_profile(int repeat_times) {
-  AnalysisConfig config;
-  SetConfig(&config);
-  config.SwitchUseFeedFetchOps(false);
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-  std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
-  PrepareZeroCopyInputs(predictor, &inputs);
-  auto output_tensor = predictor->GetOutputTensor(out_var_name);
-  Timer timer;
-  LOG(INFO) << "Warm up run...";
-  timer.tic();
-  predictor->ZeroCopyRun();
-  PrintTime(FLAGS_batch_size, 1, 1, 0, timer.toc(), 1);
-  if (FLAGS_profile) {
-    paddle::platform::ResetProfiler();
-  }
-  LOG(INFO) << "Run " << repeat_times << " times...";
-  timer.tic();
-  for (int i = 0; i < repeat_times; i++) {
-    predictor->ZeroCopyRun();
-  }
-  PrintTime(FLAGS_batch_size, repeat_times, 1, 0, timer.toc() / repeat_times,
-            1);
-
-  LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor);
-  PaddlePlace place;
-  int output_size{0};
-  auto *pdata = output_tensor->data<float>(&place, &output_size);
-  std::vector<float> res(output_size);
-  for (int i = 0; i < output_size; ++i) {
-    res[i] = pdata[i];
-  }
-  return res;
-}
-
-TEST(Analyzer_seq_pool1, zerocopy_profile) { zerocopy_profile(FLAGS_repeat); }
-
-TEST(Analyzer_seq_pool1, zerocopy_profile_threads) {
-  AnalysisConfig config;
-  SetConfig(&config);
-  config.SwitchUseFeedFetchOps(false);
-
-  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
-  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
-    predictors.emplace_back(predictors.front()->Clone());
-  }
-  double total_time_of_threads{0};
-  std::vector<std::thread> threads;
-
-  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
-    threads.emplace_back([&, tid] {
-      auto &predictor = predictors[tid];
-      std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
-      PrepareZeroCopyInputs(predictor, &inputs);
-      auto output_tensor = predictor->GetOutputTensor(out_var_name);
-      Timer timer;
-      double total_time{0};
-
-      LOG(INFO) << "Warm up run...";
-      timer.tic();
-      predictor->ZeroCopyRun();
-      PrintTime(FLAGS_batch_size, 1, FLAGS_num_threads, tid, timer.toc(), 1);
-      if (FLAGS_profile) {
-        paddle::platform::ResetProfiler();
-      }
-      int repeat_times = FLAGS_repeat;
-      LOG(INFO) << "Run " << repeat_times << " times...";
-      timer.tic();
-      for (int i = 0; i < repeat_times; i++) {
-        predictor->ZeroCopyRun();
-      }
-      total_time += timer.toc();
-      total_time_of_threads += total_time;
-
-      LOG(INFO) << "thread time: " << total_time / repeat_times;
-    });
-  }
-
-  for (auto &t : threads) {
-    t.join();
-  }
-
-  LOG(INFO) << "average time: "
-            << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
-}
-
-TEST(Analyzer_seq_pool1, zerocopy_fuse_statis) { analysis_fuse_statis(true); }
-
-TEST(Analyzer_seq_pool1, zerocopy_compare_native) {
-  AnalysisConfig config;
-  SetConfig(&config);
-  config.SwitchUseFeedFetchOps(true);
-  auto predictor = CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
-  std::vector<PaddleTensor> native_outputs;
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  ASSERT_TRUE(predictor->Run(input_slots_all[0], &native_outputs));
-  EXPECT_EQ(native_outputs.size(), 1UL);
-
-  auto zerocopy_output = zerocopy_profile(1);
-  EXPECT_EQ(zerocopy_output.size() * sizeof(float),
-            native_outputs.front().data.length());
-  auto *native_data = static_cast<float *>(native_outputs.front().data.data());
-  for (size_t i = 0; i < zerocopy_output.size(); ++i) {
-    EXPECT_LT(
-        std::fabs((zerocopy_output[i] - native_data[i]) / zerocopy_output[i]),
-        1e-3);
-  }
-}
+// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
+TEST(Analyzer_seq_pool1, compare_zero_copy) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  std::vector<std::string> outputs_name;
+  outputs_name.emplace_back(out_var_name);
+  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
+                             input_slots_all, outputs_name);
+}

 }  // namespace analysis
......
@@ -50,6 +50,7 @@ DEFINE_bool(use_analysis, true,
 DEFINE_bool(record_benchmark, false,
             "Record benchmark after profiling the model");
 DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
+DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");

 DECLARE_bool(profile);
 DECLARE_int32(paddle_num_threads);
@@ -67,6 +68,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
   LOG(INFO) << analysis_config->ToNativeConfig();
 }

+// Compare result between two PaddleTensor
 void CompareResult(const std::vector<PaddleTensor> &outputs,
                    const std::vector<PaddleTensor> &ref_outputs) {
   EXPECT_GT(outputs.size(), 0UL);
@@ -108,6 +110,50 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
   }
 }

+// Compare result between a PaddleTensor and a ZeroCopyTensor
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<ZeroCopyTensor> &ref_outputs) {
+  EXPECT_GT(outputs.size(), 0UL);
+  EXPECT_EQ(outputs.size(), ref_outputs.size());
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];
+    auto &ref_out = ref_outputs[i];
+    size_t size = VecReduceToInt(out.shape);
+    EXPECT_GT(size, 0UL);
+    int ref_size = 0;  // this is the number of elements not memory size
+    PaddlePlace place;
+    switch (out.dtype) {
+      case PaddleDType::INT64: {
+        int64_t *pdata = static_cast<int64_t *>(out.data.data());
+        int64_t *pdata_ref = ref_out.data<int64_t>(&place, &ref_size);
+        EXPECT_EQ(size, ref_size);
+        for (size_t j = 0; j < size; ++j) {
+          EXPECT_EQ(pdata_ref[j], pdata[j]);
+        }
+        break;
+      }
+      case PaddleDType::FLOAT32: {
+        float *pdata = static_cast<float *>(out.data.data());
+        float *pdata_ref = ref_out.data<float>(&place, &ref_size);
+        EXPECT_EQ(size, ref_size);
+        for (size_t j = 0; j < size; ++j) {
+          CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy);
+        }
+        break;
+      }
+      case PaddleDType::INT32: {
+        int32_t *pdata = static_cast<int32_t *>(out.data.data());
+        int32_t *pdata_ref = ref_out.data<int32_t>(&place, &ref_size);
+        EXPECT_EQ(size, ref_size);
+        for (size_t j = 0; j < size; ++j) {
+          EXPECT_EQ(pdata_ref[j], pdata[j]);
+        }
+        break;
+      }
+    }
+  }
+}
+
 std::unique_ptr<PaddlePredictor> CreateTestPredictor(
     const PaddlePredictor::Config *config, bool use_analysis = true) {
   const auto *analysis_config =
@@ -205,61 +251,106 @@ void GetInputPerBatch(const std::vector<std::vector<int64_t>> &in,
   }
 }

-void TestOneThreadPrediction(
-    const PaddlePredictor::Config *config,
-    const std::vector<std::vector<PaddleTensor>> &inputs,
-    std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
-  int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;
-  auto predictor = CreateTestPredictor(config, use_analysis);
-
-  // warmup run
-  LOG(INFO) << "Warm up run...";
-  {
-    Timer warmup_timer;
-    warmup_timer.tic();
-    predictor->Run(inputs[0], outputs, batch_size);
-    PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1);
-    if (FLAGS_profile) {
-      paddle::platform::ResetProfiler();
-    }
-  }
-
-  LOG(INFO) << "Run " << num_times << " times...";
-  {
-    Timer run_timer;
-    run_timer.tic();
-#ifdef WITH_GPERFTOOLS
-    ProfilerStart("paddle_inference.prof");
-#endif
-    for (int i = 0; i < num_times; i++) {
-      for (size_t j = 0; j < inputs.size(); j++) {
-        predictor->Run(inputs[j], outputs, batch_size);
-      }
-    }
-#ifdef WITH_GPERFTOOLS
-    ProfilerStop();
-#endif
-
-    double latency = run_timer.toc() / (num_times > 1 ? num_times : 1);
-    PrintTime(batch_size, num_times, 1, 0, latency, inputs.size());
-    if (FLAGS_record_benchmark) {
-      Benchmark benchmark;
-      benchmark.SetName(FLAGS_model_name);
-      benchmark.SetBatchSize(batch_size);
-      benchmark.SetLatency(latency);
-      benchmark.PersistToFile("benchmark_record.txt");
-    }
-  }
-}
+void ConvertPaddleTensorToZeroCopyTensor(
+    PaddlePredictor *predictor, const std::vector<PaddleTensor> &inputs) {
+  for (size_t i = 0; i < inputs.size(); i++) {
+    auto input = inputs[i];
+    auto tensor = predictor->GetInputTensor(input.name);
+    tensor->Reshape(input.shape);
+    tensor->SetLoD({input.lod});
+    if (input.dtype == PaddleDType::INT64) {
+      ZeroCopyTensorAssignData<int64_t>(tensor.get(), input.data);
+    } else if (input.dtype == PaddleDType::FLOAT32) {
+      ZeroCopyTensorAssignData<float>(tensor.get(), input.data);
+    } else if (input.dtype == PaddleDType::INT32) {
+      ZeroCopyTensorAssignData<int32_t>(tensor.get(), input.data);
+    } else {
+      LOG(ERROR) << "unsupported feed type " << input.dtype;
+    }
+  }
+}
+
+void PredictionWarmUp(PaddlePredictor *predictor,
+                      const std::vector<std::vector<PaddleTensor>> &inputs,
+                      std::vector<PaddleTensor> *outputs, int num_threads,
+                      int tid) {
+  int batch_size = FLAGS_batch_size;
+  LOG(INFO) << "Running thread " << tid << ", warm up run...";
+  if (FLAGS_zero_copy) {
+    ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]);
+  }
+  Timer warmup_timer;
+  warmup_timer.tic();
+  if (!FLAGS_zero_copy) {
+    predictor->Run(inputs[0], outputs, batch_size);
+  } else {
+    predictor->ZeroCopyRun();
+  }
+  PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1);
+  if (FLAGS_profile) {
+    paddle::platform::ResetProfiler();
+  }
+}
+
+void PredictionRun(PaddlePredictor *predictor,
+                   const std::vector<std::vector<PaddleTensor>> &inputs,
+                   std::vector<PaddleTensor> *outputs, int num_threads,
+                   int tid) {
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+  LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";
+  Timer run_timer;
+  double elapsed_time = 0;
+#ifdef WITH_GPERFTOOLS
+  ProfilerStart("paddle_inference.prof");
+#endif
+  if (!FLAGS_zero_copy) {
+    run_timer.tic();
+    for (size_t i = 0; i < inputs.size(); i++) {
+      for (int j = 0; j < num_times; j++) {
+        predictor->Run(inputs[i], outputs, batch_size);
+      }
+    }
+    elapsed_time = run_timer.toc();
+  } else {
+    for (size_t i = 0; i < inputs.size(); i++) {
+      ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
+      run_timer.tic();
+      for (int j = 0; j < num_times; j++) {
+        predictor->ZeroCopyRun();
+      }
+      elapsed_time += run_timer.toc();
+    }
+  }
+#ifdef WITH_GPERFTOOLS
+  ProfilerStop();
+#endif
+
+  PrintTime(batch_size, num_times, num_threads, tid, elapsed_time / num_times,
+            inputs.size());
+  if (FLAGS_record_benchmark) {
+    Benchmark benchmark;
+    benchmark.SetName(FLAGS_model_name);
+    benchmark.SetBatchSize(batch_size);
+    benchmark.SetLatency(elapsed_time / num_times);
+    benchmark.PersistToFile("benchmark_record.txt");
+  }
+}
+
+void TestOneThreadPrediction(
+    const PaddlePredictor::Config *config,
+    const std::vector<std::vector<PaddleTensor>> &inputs,
+    std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
+  auto predictor = CreateTestPredictor(config, use_analysis);
+  PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
+  PredictionRun(predictor.get(), inputs, outputs, 1, 0);
+}

 void TestMultiThreadPrediction(
     const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs,
     std::vector<PaddleTensor> *outputs, int num_threads,
     bool use_analysis = true) {
-  int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;
   std::vector<std::thread> threads;
   std::vector<std::unique_ptr<PaddlePredictor>> predictors;
   predictors.emplace_back(CreateTestPredictor(config, use_analysis));
@@ -267,7 +358,6 @@ void TestMultiThreadPrediction(
     predictors.emplace_back(predictors.front()->Clone());
   }

-  size_t total_time{0};
   for (int tid = 0; tid < num_threads; ++tid) {
     threads.emplace_back([&, tid]() {
       // Each thread should have local inputs and outputs.
@@ -280,34 +370,8 @@ void TestMultiThreadPrediction(
             ->SetMkldnnThreadID(static_cast<int>(tid) + 1);
       }
 #endif
-
-      // warmup run
-      LOG(INFO) << "Running thread " << tid << ", warm up run...";
-      {
-        Timer warmup_timer;
-        warmup_timer.tic();
-        predictor->Run(inputs[0], outputs, batch_size);
-        PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1);
-        if (FLAGS_profile) {
-          paddle::platform::ResetProfiler();
-        }
-      }
-
-      LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";
-      {
-        Timer timer;
-        timer.tic();
-        for (int i = 0; i < num_times; i++) {
-          for (const auto &input : inputs) {
-            ASSERT_TRUE(predictor->Run(input, &outputs_tid));
-          }
-        }
-
-        auto time = timer.toc();
-        total_time += time;
-        PrintTime(batch_size, num_times, num_threads, tid, time / num_times,
-                  inputs.size());
-      }
+      PredictionWarmUp(predictor.get(), inputs, outputs, num_threads, tid);
+      PredictionRun(predictor.get(), inputs, outputs, num_threads, tid);
     });
   }
   for (int i = 0; i < num_threads; ++i) {
@@ -367,6 +431,31 @@ void CompareNativeAndAnalysis(
   CompareResult(analysis_outputs, native_outputs);
 }

+void CompareAnalysisAndZeroCopy(
+    PaddlePredictor::Config *config,
+    const std::vector<std::vector<PaddleTensor>> &inputs,
+    const std::vector<std::string> &outputs_name) {
+  int batch_size = FLAGS_batch_size;
+  // analysis
+  std::vector<PaddleTensor> analysis_outputs;
+  auto predictor = CreateTestPredictor(config, true);
+  predictor->Run(inputs[0], &analysis_outputs, batch_size);
+  // analysis + zero_copy
+  std::vector<ZeroCopyTensor> zerocopy_outputs;
+  reinterpret_cast<AnalysisConfig *>(config)->SwitchUseFeedFetchOps(false);
+  predictor = CreateTestPredictor(config, true);
+  ConvertPaddleTensorToZeroCopyTensor(predictor.get(), inputs[0]);
+  predictor->ZeroCopyRun();
+  for (size_t i = 0; i < outputs_name.size(); i++) {
+    ZeroCopyTensor zerocopy_output =
+        *predictor->GetOutputTensor(outputs_name[i]).get();
+    zerocopy_outputs.emplace_back(zerocopy_output);
+    LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(zerocopy_output);
+  }
+  // compare
+  CompareResult(analysis_outputs, zerocopy_outputs);
+}
+
 template <typename T>
 std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
   std::stringstream ss;
......
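Taken together, the helpers above turn ZeroCopy coverage into boilerplate that any analyzer tester can adopt. A hedged template for a future tester follows; the suite name Analyzer_some_model, the output name "prob", and the SetInput helper are placeholders for whatever the concrete tester already defines.

// Sketch only: wire a model tester through the unified ZeroCopy helpers.
void SetConfig(AnalysisConfig *cfg) {
  // set the model path here as the concrete tester already does
  cfg->DisableGpu();
  cfg->SwitchSpecifyInputNames();
  cfg->SwitchIrOptim();
  if (FLAGS_zero_copy) {
    cfg->SwitchUseFeedFetchOps(false);  // opt in to the ZeroCopy path
  }
}

// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
TEST(Analyzer_some_model, compare_zero_copy) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  std::vector<std::string> outputs_name;
  outputs_name.emplace_back("prob");  // fetch target of the placeholder model
  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
                             input_slots_all, outputs_name);
}

At run time the same binary then covers both paths by toggling the new --zero_copy flag alongside the existing --num_threads and --repeat flags, so no per-model ZeroCopy benchmark code needs to be kept around.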