diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
index d2d051a69a33a38535e67227d4cc62f5b35e430c..7763a64530853a69ac5f2190ec9ab416063a9a29 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
+#include <string>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/inference/api/helper.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index 5852705b6b8d1c650faeae3dc810aac65353b459..d6414856afdf8442371838d0837b0debf94f1009 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -35,7 +35,6 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
 
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    auto* id = subgraph.at(gpd.pattern().RetrieveNode("any_node"));
     marked_nodes.insert(id);
   };
 
@@ -89,7 +88,6 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
     LINK_TO(op, hidden_n);
 #undef LINK_TO
     return op;
-
   };
 
   lstm_creator(16, 12, 14, 18, 17, 22, 21, 19);
@@ -105,14 +103,16 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
   for (auto it = node->inputs.begin(); it != node->inputs.end();) {
     if (marked_nodes.count(*it)) {
       it = const_cast<Node*>(node)->inputs.erase(it);
-    } else
+    } else {
       it++;
+    }
   }
   for (auto it = node->outputs.begin(); it != node->outputs.end();) {
     if (marked_nodes.count(*it)) {
       it = const_cast<Node*>(node)->outputs.erase(it);
-    } else
+    } else {
       it++;
+    }
   }
 }
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 945ab110b148c320b6626cadaa47d483df68419e..8c4c94b8d1c413e051916aa4944dcd6e413a1261 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -81,7 +81,7 @@ void GraphPatternDetector::operator()(Graph* graph,
   LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
   int id = 0;
   for (auto& g : subgraphs) {
-    LOG(INFO) << "optimizing #" << id++ << " subgraph";
+    VLOG(3) << "optimizing #" << id++ << " subgraph";
     handler(g, graph);
   }
 }
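Note on the fc_lstm_fuse_pass hunk above: the braces added around `it++` complete the standard erase-while-iterating idiom, where `erase()` invalidates the current iterator and returns the next valid one, so the loop must only increment in the else branch. A minimal standalone sketch of the same idiom, using plain `int` elements and `std::unordered_set` rather than Paddle's `Node` graph types:

#include <unordered_set>
#include <vector>

// Drop every element of `items` that appears in `marked`. erase() invalidates
// the iterator passed to it, so we advance through the iterator it returns
// and only increment when nothing was erased.
void RemoveMarked(std::vector<int>* items,
                  const std::unordered_set<int>& marked) {
  for (auto it = items->begin(); it != items->end();) {
    if (marked.count(*it)) {
      it = items->erase(it);
    } else {
      ++it;
    }
  }
}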
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 1a65e85dd237eb1bacd3c15b4538a9835ec4b9e0..dee99d41ac4f940373f193df937a4677adc74c6e 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -16,6 +16,7 @@
 
 #include <google/protobuf/text_format.h>
 #include <gtest/gtest.h>
+#include <thread>  // NOLINT
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
@@ -23,19 +24,17 @@
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/utils/singleton.h"
-#include "paddle/fluid/platform/profiler.h"
 
 DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
 DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
 DEFINE_int32(batch_size, 10, "batch size.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_int32(num_threads, 1, "Run the inference program in multiple threads.");
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-using namespace framework;
-
 TEST(Analyzer, analysis_without_tensorrt) {
   FLAGS_IA_enable_tensorrt_subgraph_engine = false;
   Argument argument;
@@ -219,39 +218,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   }
 }
 
-std::string DescribeTensor(const PaddleTensor &tensor) {
-  std::stringstream os;
-  os << "Tensor [" << tensor.name << "]\n";
-  os << " - type: ";
-  switch (tensor.dtype) {
-    case PaddleDType::FLOAT32:
-      os << "float32";
-      break;
-    case PaddleDType::INT64:
-      os << "int64";
-      break;
-    default:
-      os << "unset";
-  }
-  os << '\n';
-
-  os << " - shape: " << to_string(tensor.shape) << '\n';
-  os << " - lod: ";
-  for (auto &l : tensor.lod) {
-    os << to_string(l) << "; ";
-  }
-  os << "\n";
-  os << " - data: ";
-
-  int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
-                            [](int a, int b) { return a * b; });
-  for (int i = 0; i < dim; i++) {
-    os << static_cast<float *>(tensor.data.data())[i] << " ";
-  }
-  os << '\n';
-  return os.str();
-}
-
 }  // namespace
 
 const float ditu_rnn_target_data[] = {
@@ -266,58 +232,92 @@ const float ditu_rnn_target_data[] = {
     93.5771,  3.84641, 0,       0,       0,       0,       0,       0,
     169.426,  0,       0,       0,       0,       0,       0,       0};
 // Test with a really complicated model.
-void TestDituRNNPrediction(const std::string &model_path,
-                           const std::string &data_path, int batch_size,
-                           bool use_analysis, bool activate_ir,
-                           int num_times = 1) {
+void TestDituRNNPrediction(bool use_analysis_and_activate_ir = false,
+                           int num_threads = FLAGS_num_threads) {
   NativeConfig config;
   config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
   config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
   config.use_gpu = false;
   config.device = 0;
   config.specify_input_name = true;
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
 
   auto base_predictor =
       CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
   auto predictor =
       CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(config);
   std::vector<PaddleTensor> input_slots;
-  DataRecord data(data_path, batch_size);
+  DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
   // Prepare inputs.
   PrepareInputs(&input_slots, &data, batch_size);
   std::vector<PaddleTensor> outputs, base_outputs;
 
   base_predictor->Run(input_slots, &base_outputs);
 
-  Timer timer;
-  timer.tic();
-  for (int i = 0; i < num_times; i++) {
-    predictor->Run(input_slots, &outputs);
-  }
   LOG(INFO) << "===========profile result===========";
-  LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times
-            << ", latency: " << timer.toc() / num_times << "ms";
+  if (num_threads == 1) {
+    std::vector<PaddleTensor> input_slots;
+    // Prepare inputs.
+    DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
+    PrepareInputs(&input_slots, &data, batch_size);
+
+    Timer timer;
+    timer.tic();
+    for (int i = 0; i < num_times; i++) {
+      predictor->Run(input_slots, &outputs);
+    }
+    print_time(batch_size, num_times, 1, 0, timer.toc() / num_times);
+  } else {
+    std::vector<std::thread> threads;
+    std::vector<PaddleTensor> input_slots;
+    // Prepare inputs.
+    PrepareInputs(&input_slots, &data, batch_size);
+    for (int tid = 0; tid < num_threads; ++tid) {
+      threads.emplace_back([&, tid]() {
+        auto predictor_tid =
+            CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(
+                config);
+        DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
+        // Each thread writes to its own output buffer to avoid a data race.
+        std::vector<PaddleTensor> outputs;
+
+        Timer timer;
+        timer.tic();
+        for (int i = 0; i < num_times; i++) {
+          predictor_tid->Run(input_slots, &outputs);
+        }
+        print_time(batch_size, num_times, num_threads, tid,
+                   timer.toc() / num_times);
+      });
+    }
+    for (int i = 0; i < num_threads; ++i) {
+      threads[i].join();
+    }
+  }
   LOG(INFO) << "=====================================";
 
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    auto &base_out = base_outputs[i];
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
-                                   1, [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_EQ(size, size1);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    float *base_data = static_cast<float *>(base_out.data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(data[i], base_data[i], 1e-3);
+  if (num_threads == 1) {
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
+    for (size_t i = 0; i < outputs.size(); i++) {
+      auto &out = outputs[i];
+      auto &base_out = base_outputs[i];
+      size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                    [](int a, int b) { return a * b; });
+      size_t size1 =
+          std::accumulate(base_out.shape.begin(), base_out.shape.end(), 1,
+                          [](int a, int b) { return a * b; });
+      PADDLE_ENFORCE_EQ(size, size1);
+      PADDLE_ENFORCE_GT(size, 0);
+      float *data = static_cast<float *>(out.data.data());
+      float *base_data = static_cast<float *>(base_out.data.data());
+      for (size_t i = 0; i < size; i++) {
+        EXPECT_NEAR(data[i], base_data[i], 1e-3);
+      }
     }
   }
 
-  if (use_analysis && activate_ir) {
+  if (use_analysis_and_activate_ir) {
     AnalysisPredictor *analysis_predictor =
         dynamic_cast<AnalysisPredictor *>(predictor.get());
     auto &fuse_statis = analysis_predictor->analysis_argument()
@@ -334,23 +334,16 @@ void TestDituRNNPrediction(const std::string &model_path,
 
 // Directly infer with the original model.
 TEST(Analyzer, DituRNN_without_analysis) {
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, false, false, FLAGS_repeat);
-}
-
-// Inference with the original model with the analysis turned on, the analysis
-// module will transform the program to a data flow graph.
-TEST(Analyzer, DituRNN_with_analysis) {
-  LOG(INFO) << "ditu rnn with analysis";
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, true, false, FLAGS_repeat);
+  LOG(INFO) << "ditu rnn without analysis";
+  TestDituRNNPrediction(false, 1);
+  TestDituRNNPrediction(false, 4);  // multi-threaded
 }
 
 // Inference with analysis and IR. The IR module will fuse some large kernels.
 TEST(Analyzer, DituRNN_with_analysis_with_IR) {
   LOG(INFO) << "ditu rnn with analysis and IR fuse";
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, true, true, FLAGS_repeat);
+  TestDituRNNPrediction(true, 1);
+  TestDituRNNPrediction(true, 4);  // multi-threaded
 }
 
 }  // namespace analysis
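The multi-threaded branch added above follows a thread-per-predictor pattern: the shared input batch is prepared once, then every thread builds its own predictor and output buffer and times its own Run loop. A condensed sketch of that structure, with a placeholder `Predictor` type standing in for the real PaddlePredictor interface:

#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

struct Predictor {  // placeholder type, not the Paddle API
  void Run(const std::vector<float>& in, std::vector<float>* out) {}
};

void BenchmarkThreads(const std::vector<float>& inputs, int num_threads,
                      int repeat) {
  std::vector<std::thread> threads;
  for (int tid = 0; tid < num_threads; ++tid) {
    threads.emplace_back([&, tid]() {
      Predictor predictor;         // one predictor per thread
      std::vector<float> outputs;  // one output buffer per thread
      auto start = std::chrono::steady_clock::now();
      for (int i = 0; i < repeat; ++i) predictor.Run(inputs, &outputs);
      std::chrono::duration<double, std::milli> ms =
          std::chrono::steady_clock::now() - start;
      std::cout << "thread " << tid << " avg latency: " << ms.count() / repeat
                << "ms\n";
    });
  }
  for (auto& t : threads) t.join();
}

The predictors are not assumed to be thread-safe, so only the read-only inputs are shared; everything mutable is owned by a single thread.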
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 33862232bdaae817b9ca72879605386c32ed3e8b..17310de28d471745ff74e8b01345709c1ebe46a1 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include <glog/logging.h>
+#include <memory>
+#include <string>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/scope.h"
@@ -30,7 +32,6 @@ bool AnalysisPredictor::Init(
   } else {
     place_ = paddle::platform::CPUPlace();
   }
-  PADDLE_ENFORCE(!parent_scope);
   if (parent_scope) {
     scope_ = parent_scope;
     sub_scope_ = &(parent_scope->NewScope());
@@ -92,8 +93,6 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   Analyzer().Run(&argument_);
   CHECK(argument_.transformed_program_desc);
   VLOG(5) << "to prepare executor";
-  // LOG(INFO) << "transformed_program_desc " <<
-  // argument.transformed_program_desc->DebugString();
   inference_program_.reset(
       new framework::ProgramDesc(*argument_.transformed_program_desc));
   PADDLE_ENFORCE(argument_.Has(framework::ir::kParamScopeAttr));
@@ -106,7 +105,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) {
-  VLOG(3) << "create NativePredictor";
+  VLOG(3) << "create AnalysisPredictor";
   if (config.use_gpu) {
     // 1. GPU memory
     PADDLE_ENFORCE_GT(
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 8eac449a1081330877eac6d4f40c064533b0acab..6df269c572243cc82d5266606a1401aa4f218b78 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <glog/logging.h>
 #include <sys/time.h>
 #include <algorithm>
 #include <numeric>
@@ -106,5 +107,45 @@ static void TensorAssignData(PaddleTensor *tensor,
   }
 }
 
+// Defined in a header included by several translation units, so these
+// functions are marked static to avoid duplicate-symbol link errors.
+static std::string DescribeTensor(const PaddleTensor &tensor) {
+  std::stringstream os;
+  os << "Tensor [" << tensor.name << "]\n";
+  os << " - type: ";
+  switch (tensor.dtype) {
+    case PaddleDType::FLOAT32:
+      os << "float32";
+      break;
+    case PaddleDType::INT64:
+      os << "int64";
+      break;
+    default:
+      os << "unset";
+  }
+  os << '\n';
+
+  os << " - shape: " << to_string(tensor.shape) << '\n';
+  os << " - lod: ";
+  for (auto &l : tensor.lod) {
+    os << to_string(l) << "; ";
+  }
+  os << "\n";
+  os << " - data: ";
+
+  int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
+                            [](int a, int b) { return a * b; });
+  for (int i = 0; i < dim; i++) {
+    os << static_cast<float *>(tensor.data.data())[i] << " ";
+  }
+  os << '\n';
+  return os.str();
+}
+
+static void print_time(int batch_size, int repeat, int num_threads, int tid,
+                       double latency) {
+  LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << repeat
+            << ", threads: " << num_threads << ", thread id: " << tid
+            << ", latency: " << latency << "ms";
+}
+
 }  // namespace inference
 }  // namespace paddle
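The new --num_threads flag feeds the default argument of TestDituRNNPrediction; the two tests above pin the thread count to 1 and 4 explicitly, so the flag only takes effect for callers that rely on the default. An illustrative invocation (the binary name and paths are placeholders, not taken from the build system):

./analyzer_tester --infer_ditu_rnn_model=/path/to/model \
    --infer_ditu_rnn_data=/path/to/data \
    --batch_size=10 --repeat=5 --num_threads=4

Each worker then reports one line through print_time: batch size, repeat count, thread count, thread id, and average per-run latency.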