diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index d7580a1cfd689c122ea1e7db5918e2cd8711718f..bb52d7e498e55c02ddc2cd6d07ccccd51ce4edc5 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -13,13 +13,10 @@ // limitations under the License. #include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h" - #include - #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/inference/api/helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 434bee4ccee1c199088d09c934fe86435ec7d095..731b89423354532f684e19305dfa87e8eb75d4b1 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -85,7 +85,7 @@ void GraphPatternDetector::operator()(Graph* graph, LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern"; int id = 0; for (auto& g : subgraphs) { - LOG(INFO) << "optimizing #" << id++ << " subgraph"; + VLOG(3) << "optimizing #" << id++ << " subgraph"; handler(g, graph); } } diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index ca834406451d53fd44887300561a6327d97cafcd..1fd884435d173800563ea37809003ed3aee16c7c 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -106,7 +106,6 @@ void Analyzer::Run(Argument* argument) { } } passes.push_back("graph_viz_pass"); - // Ugly support fluid-to-ir-pass argument->Set(kFluidToIrPassesAttr, new std::vector(passes)); for (auto& x : data_) { diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 94be6733f63488634ee1302e015a57c9a8892d84..4cf26d3c70eafd951d14c26335416ec2c71c001d 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -16,6 +16,7 @@ #include #include +#include // NOLINT #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/analysis/ut_helper.h" @@ -24,12 +25,12 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" -#include "paddle/fluid/platform/profiler.h" DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN"); DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN"); DEFINE_int32(batch_size, 10, "batch size."); DEFINE_int32(repeat, 1, "Running the inference program repeat times."); +DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); namespace paddle { namespace inference { @@ -220,39 +221,6 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -std::string DescribeTensor(const PaddleTensor &tensor) { - std::stringstream os; - os << "Tensor [" << tensor.name << "]\n"; - os << " - type: "; - switch (tensor.dtype) { - case PaddleDType::FLOAT32: - os << "float32"; - break; - case PaddleDType::INT64: - os << "int64"; - break; - default: - os << "unset"; - } - os << '\n'; - - os << " - shape: " << to_string(tensor.shape) << '\n'; - os << " - lod: "; - for (auto &l : tensor.lod) { - os << to_string(l) << "; "; - } - os << "\n"; - os << " - data: "; - - int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1, - [](int a, int b) { return a * b; }); - for (int i = 0; i < dim; i++) { - os << static_cast(tensor.data.data())[i] << " "; - } - os << '\n'; - return os.str(); -} - } // namespace const float ditu_rnn_target_data[] = { @@ -266,11 +234,29 @@ const float ditu_rnn_target_data[] = { 10.7286, 12.0595, 10.6672, 0, 0, 0, 0, 0, 93.5771, 3.84641, 0, 0, 0, 0, 0, 0, 169.426, 0, 0, 0, 0, 0, 0, 0}; +void CompareResult(const std::vector &outputs, + const std::vector &base_outputs) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + auto &base_out = base_outputs[i]; + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), + 1, [](int a, int b) { return a * b; }); + PADDLE_ENFORCE_EQ(size, size1); + PADDLE_ENFORCE_GT(size, 0); + float *data = static_cast(out.data.data()); + float *base_data = static_cast(base_out.data.data()); + for (size_t i = 0; i < size; i++) { + EXPECT_NEAR(data[i], base_data[i], 1e-3); + } + } +} // Test with a really complicate model. -void TestDituRNNPrediction(const std::string &model_path, - const std::string &data_path, int batch_size, - bool use_analysis, bool activate_ir, - int num_times = 1) { +void TestDituRNNPrediction(bool use_analysis, bool activate_ir, + int num_threads) { AnalysisConfig config; config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__"; config.param_file = FLAGS_infer_ditu_rnn_model + "/param"; @@ -281,6 +267,8 @@ void TestDituRNNPrediction(const std::string &model_path, PADDLE_ENFORCE(config.ir_mode == AnalysisConfig::IrPassMode::kExclude); // default config.ir_passes.clear(); // Do not exclude any pass. + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; auto base_predictor = CreatePaddlePredictor(config); @@ -288,40 +276,55 @@ void TestDituRNNPrediction(const std::string &model_path, CreatePaddlePredictor( config); std::vector input_slots; - DataRecord data(data_path, batch_size); + DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size); // Prepare inputs. PrepareInputs(&input_slots, &data, batch_size); std::vector outputs, base_outputs; base_predictor->Run(input_slots, &base_outputs); - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - predictor->Run(input_slots, &outputs); - } LOG(INFO) << "===========profile result==========="; - LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times - << ", latency: " << timer.toc() / num_times << "ms"; - LOG(INFO) << "====================================="; - - PADDLE_ENFORCE_GT(outputs.size(), 0); - PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); - for (size_t i = 0; i < outputs.size(); i++) { - auto &out = outputs[i]; - auto &base_out = base_outputs[i]; - size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, - [](int a, int b) { return a * b; }); - size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), - 1, [](int a, int b) { return a * b; }); - PADDLE_ENFORCE_EQ(size, size1); - PADDLE_ENFORCE_GT(size, 0); - float *data = static_cast(out.data.data()); - float *base_data = static_cast(base_out.data.data()); - for (size_t j = 0; j < size; j++) { - EXPECT_NEAR(data[j], base_data[j], 1e-3); + if (num_threads == 1) { + // Prepare inputs. + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + predictor->Run(input_slots, &outputs); + } + PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times); + CompareResult(outputs, base_outputs); + } else { + std::vector threads; + std::vector> predictors; + // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled + // because AttentionLSTM's hard code nodeid will be damanged. + for (int tid = 0; tid < num_threads; ++tid) { + predictors.emplace_back( + CreatePaddlePredictor( + config)); + } + for (int tid = 0; tid < num_threads; ++tid) { + threads.emplace_back([&, tid]() { + // Each thread should have local input_slots and outputs. + std::vector input_slots; + DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size); + PrepareInputs(&input_slots, &data, batch_size); + std::vector outputs; + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + predictors[tid]->Run(input_slots, &outputs); + } + PrintTime(batch_size, num_times, num_threads, tid, + timer.toc() / num_times); + CompareResult(outputs, base_outputs); + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); } } + LOG(INFO) << "====================================="; if (use_analysis && activate_ir) { AnalysisPredictor *analysis_predictor = @@ -350,25 +353,26 @@ void TestDituRNNPrediction(const std::string &model_path, } } -// Directly infer with the original model. -TEST(Analyzer, DituRNN_without_analysis) { - TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, - FLAGS_batch_size, false, false, FLAGS_repeat); +// Inference with analysis and IR, easy for profiling independently. +TEST(Analyzer, DituRNN) { + TestDituRNNPrediction(true, true, FLAGS_num_threads); } -// Inference with the original model with the analysis turned on, the analysis -// module will transform the program to a data flow graph. -TEST(Analyzer, DituRNN_with_analysis) { - LOG(INFO) << "ditu rnn with analysis"; - TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, - FLAGS_batch_size, true, false, FLAGS_repeat); -} - -// Inference with analysis and IR. The IR module will fuse some large kernels. -TEST(Analyzer, DituRNN_with_analysis_with_IR) { - LOG(INFO) << "ditu rnn with analysis and IR fuse"; - TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, - FLAGS_batch_size, true, true, FLAGS_repeat); +// Other unit-tests of DituRNN, test different options of use_analysis, +// activate_ir and multi-threads. +TEST(Analyzer, DituRNN_tests) { + int num_threads[2] = {1, 4}; + for (auto i : num_threads) { + // Directly infer with the original model. + TestDituRNNPrediction(false, false, i); + // Inference with the original model with the analysis turned on, the + // analysis + // module will transform the program to a data flow graph. + TestDituRNNPrediction(true, false, i); + // Inference with analysis and IR. The IR module will fuse some large + // kernels. + TestDituRNNPrediction(true, true, i); + } } } // namespace analysis diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index a8fa677202d8429c274a6e3fdfd18ef5d48620c2..79eeea88ea83ad862b5e2ac1390dae377b676685 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -35,7 +35,6 @@ bool AnalysisPredictor::Init( } else { place_ = paddle::platform::CPUPlace(); } - PADDLE_ENFORCE(!parent_scope); if (parent_scope) { scope_ = parent_scope; sub_scope_ = &(parent_scope->NewScope()); diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index ce4728ab8046f886fc40af3644d855a4f971fb71..2c2ac656e8005369bb0e9033236a431cb09caa15 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -88,5 +89,45 @@ static void TensorAssignData(PaddleTensor *tensor, } } +std::string DescribeTensor(const PaddleTensor &tensor) { + std::stringstream os; + os << "Tensor [" << tensor.name << "]\n"; + os << " - type: "; + switch (tensor.dtype) { + case PaddleDType::FLOAT32: + os << "float32"; + break; + case PaddleDType::INT64: + os << "int64"; + break; + default: + os << "unset"; + } + os << '\n'; + + os << " - shape: " << to_string(tensor.shape) << '\n'; + os << " - lod: "; + for (auto &l : tensor.lod) { + os << to_string(l) << "; "; + } + os << "\n"; + os << " - data: "; + + int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1, + [](int a, int b) { return a * b; }); + for (int i = 0; i < dim; i++) { + os << static_cast(tensor.data.data())[i] << " "; + } + os << '\n'; + return os.str(); +} + +void PrintTime(int batch_size, int repeat, int num_threads, int tid, + double latency) { + LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << repeat + << ", threads: " << num_threads << ", thread id: " << tid + << ", latency: " << latency << "ms"; +} + } // namespace inference } // namespace paddle