Commit b7588751, authored by Tao Luo, committed by Yan Chunwei

Refine infer api test (#13472)

* refine analyzer_nlp_tester

* refine analyzer_rnn/vis_tester
Parent d4570f04
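
The diff below replaces the old monolithic Test*Prediction functions with small gtest cases built on shared helpers in tester_helper.h (SetConfig, SetInput, TestPrediction, GetFuseStatis, CompareNativeAndAnalysis). As a rough, hypothetical sketch of the resulting pattern — the test name Analyzer_example and its input filling are illustrative only; the helper functions and FLAGS_* come from the diff below — a refactored tester now looks roughly like this:

// Minimal, hypothetical tester following the refactored pattern.
void SetConfig(AnalysisConfig *cfg) {
  cfg->model_dir = FLAGS_infer_model;  // flags are declared in tester_helper.h
  cfg->use_gpu = false;
  cfg->device = 0;
  cfg->specify_input_name = true;
  cfg->enable_ir_optim = true;
}

void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  // Build one or more batches of PaddleTensor inputs here (model specific).
}

// Single- or multi-threaded timing run, controlled by --num_threads.
TEST(Analyzer_example, profile) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  std::vector<PaddleTensor> outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
}

// Check that the analysis engine reproduces the native engine's outputs.
TEST(Analyzer_example, compare) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareNativeAndAnalysis(cfg, input_slots_all);
}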
@@ -103,108 +103,74 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  input_slots->assign({input_tensor});
}

void SetConfig(AnalysisConfig *cfg) {
  cfg->model_dir = FLAGS_infer_model;
  cfg->use_gpu = false;
  cfg->device = 0;
  cfg->specify_input_name = true;
  cfg->enable_ir_optim = true;
}

void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
  std::vector<PaddleTensor> input_slots;
  int epoch = FLAGS_test_all_data ? data.batched_datas.size() : 1;
  LOG(INFO) << "number of samples: " << epoch;
  for (int bid = 0; bid < epoch; ++bid) {
    GetOneBatch(&input_slots, &data, FLAGS_batch_size);
    (*inputs).emplace_back(input_slots);
  }
}

// Easy for profiling independently.
TEST(Analyzer_LAC, profile) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  std::vector<PaddleTensor> outputs;

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);

  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
    // the first inference result
    const int64_t lac_ref_data[] = {
        24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25,
        44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14,
        15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
    size_t size = GetSize(outputs[0]);
    size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
    PADDLE_ENFORCE_GE(size, batch1_size);
    int64_t *pdata = static_cast<int64_t *>(outputs[0].data.data());
    for (size_t i = 0; i < batch1_size; ++i) {
      EXPECT_EQ(pdata[i], lac_ref_data[i]);
    }
  }
}

// Check the fuse status
TEST(Analyzer_LAC, fuse_statis) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  int num_ops;
  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
  EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
  EXPECT_EQ(num_ops, 11);
}

// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_LAC, compare) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareNativeAndAnalysis(cfg, input_slots_all);
}

}  // namespace analysis
......
@@ -95,97 +95,73 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
}

void SetConfig(AnalysisConfig *cfg) {
  cfg->prog_file = FLAGS_infer_model + "/__model__";
  cfg->param_file = FLAGS_infer_model + "/param";
  cfg->use_gpu = false;
  cfg->device = 0;
  cfg->specify_input_name = true;
  cfg->enable_ir_optim = true;
}

void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
  std::vector<PaddleTensor> input_slots;
  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
  for (int bid = 0; bid < epoch; ++bid) {
    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
    (*inputs).emplace_back(input_slots);
  }
}

// Easy for profiling independently.
TEST(Analyzer_Chinese_ner, profile) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  std::vector<PaddleTensor> outputs;

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);

  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
    // the first inference result
    const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
                                           48, 39, 38, 16, 25};
    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
    size_t size = GetSize(outputs[0]);
    PADDLE_ENFORCE_GT(size, 0);
    int64_t *result = static_cast<int64_t *>(outputs[0].data.data());
    for (size_t i = 0; i < std::min(11UL, size); i++) {
      EXPECT_EQ(result[i], chinese_ner_result_data[i]);
    }
  }
}

// Check the fuse status
TEST(Analyzer_Chinese_ner, fuse_statis) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  int num_ops;
  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
  EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2);
  EXPECT_EQ(num_ops, 14);
}

// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_Chinese_ner, compare) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareNativeAndAnalysis(cfg, input_slots_all);
}

}  // namespace inference
}  // namespace paddle
@@ -25,6 +25,7 @@ struct DataRecord {
  std::vector<size_t> lod1, lod2, lod3;
  std::vector<std::vector<float>> rnn_link_data, rnn_week_datas,
      rnn_minute_datas;
  size_t num_samples;  // total number of samples
  size_t batch_iter{0};
  size_t batch_size{1};
  DataRecord() = default;
@@ -97,6 +98,7 @@ struct DataRecord {
      week_data_all.push_back(std::move(week_data));
      minute_data_all.push_back(std::move(minute_data));
    }
    num_samples = num_lines;
  }
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
@@ -147,89 +149,72 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
}

void SetConfig(AnalysisConfig *cfg) {
  cfg->prog_file = FLAGS_infer_model + "/__model__";
  cfg->param_file = FLAGS_infer_model + "/param";
  cfg->use_gpu = false;
  cfg->device = 0;
  cfg->specify_input_name = true;
  cfg->enable_ir_optim = true;
  cfg->ir_passes.clear();  // Do not exclude any pass.
}

void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
  std::vector<PaddleTensor> input_slots;
  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
  for (int bid = 0; bid < epoch; ++bid) {
    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
    (*inputs).emplace_back(input_slots);
  }
}

// Easy for profiling independently.
TEST(Analyzer_rnn1, profile) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  std::vector<PaddleTensor> outputs;

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
}

// Check the fuse status
TEST(Analyzer_rnn1, fuse_statis) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  int num_ops;
  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
  EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
  EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
  EXPECT_EQ(num_ops,
            13);  // After graph optimization, only 13 operators exists.
}

// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_rnn1, compare) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareNativeAndAnalysis(cfg, input_slots_all);
}

// Test Multi-Thread.
TEST(Analyzer_rnn1, multi_thread) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  std::vector<PaddleTensor> outputs;

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */);
}

}  // namespace inference
......
@@ -12,24 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/tests/api/tester_helper.h"

namespace paddle {
namespace inference {
@@ -41,6 +24,7 @@ struct DataRecord {
  std::vector<size_t> lod;
  std::vector<std::vector<float>> rnn_link_data;
  std::vector<float> result_data;
  size_t num_samples;  // total number of samples
  size_t batch_iter{0};
  size_t batch_size{1};
  DataRecord() = default;
@@ -100,6 +84,7 @@ struct DataRecord {
        result_data.insert(result_data.end(), tmp.begin(), tmp.end());
      }
    }
    num_samples = num_lines / 2;
  }
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
@@ -118,64 +103,58 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  input_slots->assign({feed_tensor});
}

void SetConfig(AnalysisConfig *cfg) {
  cfg->prog_file = FLAGS_infer_model + "/__model__";
  cfg->param_file = FLAGS_infer_model + "/param";
  cfg->use_gpu = false;
  cfg->device = 0;
  cfg->specify_input_name = true;
  cfg->enable_ir_optim = true;
}

void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
  std::vector<PaddleTensor> input_slots;
  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
  for (int bid = 0; bid < epoch; ++bid) {
    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
    (*inputs).emplace_back(input_slots);
  }
}

// Easy for profiling independently.
TEST(Analyzer_rnn2, profile) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  std::vector<PaddleTensor> outputs;

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);

  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
    // the first inference result
    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
    PADDLE_ENFORCE_GT(outputs.size(), 0);
    size_t size = GetSize(outputs[0]);
    PADDLE_ENFORCE_GT(size, 0);
    float *result = static_cast<float *>(outputs[0].data.data());
    for (size_t i = 0; i < size; i++) {
      EXPECT_NEAR(result[i], data.result_data[i], 1e-3);
    }
  }
}

// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_rnn2, compare) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareNativeAndAnalysis(cfg, input_slots_all);
}

}  // namespace inference
}  // namespace paddle
@@ -46,54 +46,63 @@ struct DataReader {
  std::unique_ptr<std::ifstream> file;
};

void SetConfig(AnalysisConfig *cfg) {
  cfg->model_dir = FLAGS_infer_model;
  cfg->use_gpu = false;
  cfg->device = 0;
  cfg->specify_input_name = true;
  cfg->enable_ir_optim = true;
}

void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  std::vector<PaddleTensor> input_slots;
  DataReader reader(FLAGS_infer_data);
  int num_batches = 0;
  while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
    (*inputs).emplace_back(input_slots);
    ++num_batches;
    if (!FLAGS_test_all_data) return;
  }
  LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
}

// Easy for profiling independently.
TEST(Analyzer_Text_Classification, profile) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  std::vector<PaddleTensor> outputs;

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);

  if (FLAGS_num_threads == 1) {
    // Get output
    LOG(INFO) << "get outputs " << outputs.size();
    for (auto &output : outputs) {
      LOG(INFO) << "output.shape: " << to_string(output.shape);
      // no lod ?
      CHECK_EQ(output.lod.size(), 0UL);
      LOG(INFO) << "output.dtype: " << output.dtype;
      std::stringstream ss;
      for (int i = 0; i < 5; i++) {
        ss << static_cast<float *>(output.data.data())[i] << " ";
      }
      LOG(INFO) << "output.data summary: " << ss.str();
      // one batch ends
    }
  }
}

// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_Text_Classification, compare) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareNativeAndAnalysis(cfg, input_slots_all);
}

}  // namespace inference
}  // namespace paddle
@@ -49,84 +49,83 @@ Record ProcessALine(const std::string &line) {
  return record;
}

void SetConfig(AnalysisConfig *cfg) {
  cfg->param_file = FLAGS_infer_model + "/__params__";
  cfg->prog_file = FLAGS_infer_model + "/__model__";
  cfg->use_gpu = false;
  cfg->device = 0;
  cfg->enable_ir_optim = true;
  cfg->specify_input_name = true;
  // TODO(TJ): fix fusion gru
  cfg->ir_passes.push_back("fc_gru_fuse_pass");
#ifdef PADDLE_WITH_MKLDNN
  cfg->_use_mkldnn = true;
  // disable mkldnn fuse since it should have some bugs
  cfg->ir_passes.push_back("conv_relu_mkldnn_fuse_pass");
#endif
}

void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  std::string line;
  std::ifstream file(FLAGS_infer_data);
  std::getline(file, line);
  auto record = ProcessALine(line);

  PaddleTensor input;
  input.shape = record.shape;
  input.dtype = PaddleDType::FLOAT32;
  size_t input_size = record.data.size() * sizeof(float);
  input.data.Resize(input_size);
  memcpy(input.data.data(), record.data.data(), input_size);
  std::vector<PaddleTensor> input_slots;
  input_slots.assign({input});
  (*inputs).emplace_back(input_slots);
}

// Easy for profiling independently.
// ocr, mobilenet and se_resnext50
TEST(Analyzer_vis, profile) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  std::vector<PaddleTensor> outputs;

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);

  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
    const float ocr_result_data[] = {
        5.273636460856323538e-08, 3.296741795111302054e-07,
        1.873261190610264748e-08, 3.403730275408634043e-08,
        3.383312474625199684e-08};
    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
    size_t size = GetSize(outputs[0]);
    PADDLE_ENFORCE_GT(size, 0);
    float *result = static_cast<float *>(outputs[0].data.data());
    for (size_t i = 0; i < std::min(5UL, size); i++) {
      EXPECT_NEAR(result[i], ocr_result_data[i], 1e-3);
    }
  }
}

// Check the fuse status
TEST(Analyzer_vis, fuse_statis) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  int num_ops;
  GetFuseStatis(cfg, &num_ops);
}

// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_vis, compare) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareNativeAndAnalysis(cfg, input_slots_all);
}

}  // namespace analysis
}  // namespace inference
......
@@ -15,6 +15,7 @@
#pragma once
#include <gtest/gtest.h>
#include <string>
#include <thread>  // NOLINT
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
@@ -28,17 +29,18 @@
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data file");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
DEFINE_bool(use_analysis, true,
            "Running the inference program in analysis mode.");

namespace paddle {
namespace inference {

void CompareResult(const std::vector<PaddleTensor> &outputs,
                   const std::vector<PaddleTensor> &ref_outputs) {
  EXPECT_GT(outputs.size(), 0UL);
  EXPECT_EQ(outputs.size(), ref_outputs.size());
  for (size_t i = 0; i < outputs.size(); i++) {
    auto &out = outputs[i];
@@ -72,14 +74,50 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
  }
}

std::unique_ptr<PaddlePredictor> GetPrediction(AnalysisConfig config,
                                               bool use_analysis = true) {
  if (use_analysis) {
    return CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
        config);
  } else {
    return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
        config);
  }
}

size_t GetSize(const PaddleTensor &out) {
  return std::accumulate(out.shape.begin(), out.shape.end(), 1,
                         [](int a, int b) { return a * b; });
}

std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config,
                                                   int *num_ops) {
  auto predictor = GetPrediction(config);
  AnalysisPredictor *analysis_predictor =
      dynamic_cast<AnalysisPredictor *>(predictor.get());
  auto &fuse_statis = analysis_predictor->analysis_argument()
                          .Get<std::unordered_map<std::string, int>>(
                              framework::ir::kFuseStatisAttr);
  for (auto &item : fuse_statis) {
    LOG(INFO) << "fused " << item.first << " " << item.second;
  }
  int num = 0;
  for (auto &node :
       analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
    if (node->IsFunction()) {
      ++num;
    }
  }
  *num_ops = num;
  return fuse_statis;
}

void TestOneThreadPrediction(
    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
    std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
  int batch_size = FLAGS_batch_size;
  int num_times = FLAGS_repeat;
  auto predictor = GetPrediction(config, use_analysis);
  Timer timer;
  timer.tic();
  for (int i = 0; i < num_times; i++) {
@@ -93,7 +131,8 @@ void TestOneThreadPrediction(
void TestMultiThreadPrediction(
    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
    std::vector<PaddleTensor> *outputs, int num_threads,
    bool use_analysis = true) {
  int batch_size = FLAGS_batch_size;
  int num_times = FLAGS_repeat;
  std::vector<std::thread> threads;
@@ -101,9 +140,7 @@ void TestMultiThreadPrediction(
  // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
  // because AttentionLSTM's hard code nodeid will be damanged.
  for (int tid = 0; tid < num_threads; ++tid) {
    predictors.emplace_back(GetPrediction(config, use_analysis));
  }
  for (int tid = 0; tid < num_threads; ++tid) {
    threads.emplace_back([&, tid]() {
@@ -129,13 +166,25 @@ void TestMultiThreadPrediction(
void TestPrediction(AnalysisConfig config,
                    const std::vector<std::vector<PaddleTensor>> inputs,
                    std::vector<PaddleTensor> *outputs, int num_threads,
                    bool use_analysis = FLAGS_use_analysis) {
  LOG(INFO) << "use_analysis: " << use_analysis;
  if (num_threads == 1) {
    TestOneThreadPrediction(config, inputs, outputs, use_analysis);
  } else {
    TestMultiThreadPrediction(config, inputs, outputs, num_threads,
                              use_analysis);
  }
}

void CompareNativeAndAnalysis(
    AnalysisConfig config,
    const std::vector<std::vector<PaddleTensor>> inputs) {
  std::vector<PaddleTensor> native_outputs, analysis_outputs;
  TestOneThreadPrediction(config, inputs, &native_outputs, false);
  TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
  CompareResult(analysis_outputs, native_outputs);
}

}  // namespace inference
}  // namespace paddle
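
The testers are driven entirely by the gflags declared in tester_helper.h (--infer_model, --infer_data, --batch_size, --repeat, --test_all_data, --num_threads, --use_analysis). A minimal, hypothetical main() that wires those flags into a gtest binary might look like the sketch below; the actual test binaries in the repository may register their main differently, and the gflags namespace may be gflags:: rather than google:: depending on the gflags version.

#include <gflags/gflags.h>
#include <gtest/gtest.h>

int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  // Parses --infer_model, --infer_data, --batch_size, --repeat,
  // --test_all_data, --num_threads and --use_analysis declared above.
  ::google::ParseCommandLineFlags(&argc, &argv, true);
  return RUN_ALL_TESTS();
}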