From 0fbe0a7a283376294b26738e777660694a1ba557 Mon Sep 17 00:00:00 2001
From: luotao1
Date: Fri, 31 Aug 2018 18:34:35 +0800
Subject: [PATCH] add multi-thread ut for ditu-rnn

---
 .../framework/ir/attention_lstm_fuse_pass.cc  |   2 +-
 .../fluid/framework/ir/fc_lstm_fuse_pass.cc   |   8 +-
 .../framework/ir/graph_pattern_detector.cc    |   2 +-
 .../inference/analysis/analyzer_tester.cc     | 145 +++++++++---------
 .../fluid/inference/api/analysis_predictor.cc |   7 +-
 paddle/fluid/inference/api/helper.h           |  41 +++++
 6 files changed, 119 insertions(+), 86 deletions(-)

diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
index d2d051a69..7763a6453 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
+#include <string>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/inference/api/helper.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index 5852705b6..d6414856a 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -35,7 +35,6 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
 
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-
     auto* id = subgraph.at(gpd.pattern().RetrieveNode("any_node"));
     marked_nodes.insert(id);
   };
@@ -89,7 +88,6 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
     LINK_TO(op, hidden_n);
 #undef LINK_TO
     return op;
-
   };
 
   lstm_creator(16, 12, 14, 18, 17, 22, 21, 19);
@@ -105,14 +103,16 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
     for (auto it = node->inputs.begin(); it != node->inputs.end();) {
       if (marked_nodes.count(*it)) {
         it = const_cast<Node*>(node)->inputs.erase(it);
-      } else
+      } else {
         it++;
+      }
     }
     for (auto it = node->outputs.begin(); it != node->outputs.end();) {
       if (marked_nodes.count(*it)) {
         it = const_cast<Node*>(node)->outputs.erase(it);
-      } else
+      } else {
         it++;
+      }
     }
   }
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 945ab110b..8c4c94b8d 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -81,7 +81,7 @@ void GraphPatternDetector::operator()(Graph* graph,
   LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
   int id = 0;
   for (auto& g : subgraphs) {
-    LOG(INFO) << "optimizing #" << id++ << " subgraph";
+    VLOG(3) << "optimizing #" << id++ << " subgraph";
     handler(g, graph);
   }
 }
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 1a65e85dd..dee99d41a 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -16,6 +16,7 @@
 
 #include <google/protobuf/text_format.h>
 #include <gtest/gtest.h>
+#include <thread>  // NOLINT
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
@@ -23,19 +24,17 @@
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/utils/singleton.h"
-#include "paddle/fluid/platform/profiler.h"
 
 DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
 DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
 DEFINE_int32(batch_size, 10, "batch size.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_int32(num_threads, 1, "Running the inference program in multiple threads.");
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-using namespace framework;
-
 TEST(Analyzer, analysis_without_tensorrt) {
   FLAGS_IA_enable_tensorrt_subgraph_engine = false;
   Argument argument;
@@ -219,39 +218,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   }
 }
 
-std::string DescribeTensor(const PaddleTensor &tensor) {
-  std::stringstream os;
-  os << "Tensor [" << tensor.name << "]\n";
-  os << " - type: ";
-  switch (tensor.dtype) {
-    case PaddleDType::FLOAT32:
-      os << "float32";
-      break;
-    case PaddleDType::INT64:
-      os << "int64";
-      break;
-    default:
-      os << "unset";
-  }
-  os << '\n';
-
-  os << " - shape: " << to_string(tensor.shape) << '\n';
-  os << " - lod: ";
-  for (auto &l : tensor.lod) {
-    os << to_string(l) << "; ";
-  }
-  os << "\n";
-  os << " - data: ";
-
-  int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
-                            [](int a, int b) { return a * b; });
-  for (int i = 0; i < dim; i++) {
-    os << static_cast<float *>(tensor.data.data())[i] << " ";
-  }
-  os << '\n';
-  return os.str();
-}
-
 }  // namespace
 
 const float ditu_rnn_target_data[] = {
@@ -266,58 +232,93 @@ const float ditu_rnn_target_data[] = {
                                       93.5771, 3.84641, 0,       0,       0,
                                       0,       0,       0,       169.426, 0,
                                       0,       0,       0,       0,       0,
                                       0};
 // Test with a really complicate model.
-void TestDituRNNPrediction(const std::string &model_path,
-                           const std::string &data_path, int batch_size,
-                           bool use_analysis, bool activate_ir,
-                           int num_times = 1) {
+void TestDituRNNPrediction(bool use_analysis_and_activate_ir = false,
+                           int num_threads = FLAGS_num_threads) {
   NativeConfig config;
   config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
   config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
   config.use_gpu = false;
   config.device = 0;
   config.specify_input_name = true;
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
 
   auto base_predictor =
       CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
   auto predictor =
       CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(
           config);
   std::vector<PaddleTensor> input_slots;
   DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
   // Prepare inputs.
   PrepareInputs(&input_slots, &data, batch_size);
   std::vector<PaddleTensor> outputs, base_outputs;
 
   base_predictor->Run(input_slots, &base_outputs);
 
-  Timer timer;
-  timer.tic();
-  for (int i = 0; i < num_times; i++) {
-    predictor->Run(input_slots, &outputs);
-  }
   LOG(INFO) << "===========profile result===========";
-  LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times
-            << ", latency: " << timer.toc() / num_times << "ms";
+  if (num_threads == 1) {
+    std::vector<PaddleTensor> input_slots;
+    // Prepare inputs.
+    DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
+    PrepareInputs(&input_slots, &data, batch_size);
+
+    Timer timer;
+    timer.tic();
+    for (int i = 0; i < num_times; i++) {
+      predictor->Run(input_slots, &outputs);
+    }
+    print_time(batch_size, num_times, 1, 0, timer.toc() / num_times);
+  } else {
+    std::vector<std::thread> threads;
+    std::vector<PaddleTensor> input_slots;
+    // Prepare inputs.
+    PrepareInputs(&input_slots, &data, batch_size);
+    for (int tid = 0; tid < num_threads; ++tid) {
+      threads.emplace_back([&, tid]() {
+        auto predictor_tid =
+            CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(
+                config);
+        DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
+        // Each thread keeps its own output buffer to avoid a data race.
+        std::vector<PaddleTensor> outputs;
+
+        Timer timer;
+        timer.tic();
+        for (int i = 0; i < num_times; i++) {
+          predictor_tid->Run(input_slots, &outputs);
+        }
+        print_time(batch_size, num_times, num_threads, tid,
+                   timer.toc() / num_times);
+      });
+    }
+    for (int i = 0; i < num_threads; ++i) {
+      threads[i].join();
+    }
+  }
   LOG(INFO) << "=====================================";
 
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    auto &base_out = base_outputs[i];
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
-                                   1, [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_EQ(size, size1);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    float *base_data = static_cast<float *>(base_out.data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(data[i], base_data[i], 1e-3);
+  if (num_threads == 1) {
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
+    for (size_t i = 0; i < outputs.size(); i++) {
+      auto &out = outputs[i];
+      auto &base_out = base_outputs[i];
+      size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                    [](int a, int b) { return a * b; });
+      size_t size1 =
+          std::accumulate(base_out.shape.begin(), base_out.shape.end(), 1,
+                          [](int a, int b) { return a * b; });
+      PADDLE_ENFORCE_EQ(size, size1);
+      PADDLE_ENFORCE_GT(size, 0);
+      float *data = static_cast<float *>(out.data.data());
+      float *base_data = static_cast<float *>(base_out.data.data());
+      for (size_t i = 0; i < size; i++) {
+        EXPECT_NEAR(data[i], base_data[i], 1e-3);
+      }
     }
   }
 
-  if (use_analysis && activate_ir) {
+  if (use_analysis_and_activate_ir) {
     AnalysisPredictor *analysis_predictor =
         dynamic_cast<AnalysisPredictor *>(predictor.get());
     auto &fuse_statis = analysis_predictor->analysis_argument()
@@ -334,23 +334,16 @@ void TestDituRNNPrediction(const std::string &model_path,
 
 // Directly infer with the original model.
 TEST(Analyzer, DituRNN_without_analysis) {
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, false, false, FLAGS_repeat);
-}
-
-// Inference with the original model with the analysis turned on, the analysis
-// module will transform the program to a data flow graph.
-TEST(Analyzer, DituRNN_with_analysis) {
-  LOG(INFO) << "ditu rnn with analysis";
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, true, false, FLAGS_repeat);
+  LOG(INFO) << "ditu rnn without analysis";
+  TestDituRNNPrediction(false, 1);
+  TestDituRNNPrediction(false, 4);  // multi-thread
 }
 
 // Inference with analysis and IR. The IR module will fuse some large kernels.
 TEST(Analyzer, DituRNN_with_analysis_with_IR) {
   LOG(INFO) << "ditu rnn with analysis and IR fuse";
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, true, true, FLAGS_repeat);
+  TestDituRNNPrediction(true, 1);
+  TestDituRNNPrediction(true, 4);  // multi-thread
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 33862232b..17310de28 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include <memory>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/scope.h"
@@ -30,7 +32,6 @@ bool AnalysisPredictor::Init(
   } else {
     place_ = paddle::platform::CPUPlace();
   }
-  PADDLE_ENFORCE(!parent_scope);
   if (parent_scope) {
     scope_ = parent_scope;
     sub_scope_ = &(parent_scope->NewScope());
@@ -92,8 +93,6 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   Analyzer().Run(&argument_);
   CHECK(argument_.transformed_program_desc);
   VLOG(5) << "to prepare executor";
-  // LOG(INFO) << "transformed_parogram_desc " <<
-  // argument.transformed_program_desc->DebugString();
   inference_program_.reset(
       new framework::ProgramDesc(*argument_.transformed_program_desc));
   PADDLE_ENFORCE(argument_.Has(framework::ir::kParamScopeAttr));
@@ -106,7 +105,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) {
-  VLOG(3) << "create NativePredictor";
+  VLOG(3) << "create AnalysisPredictor";
   if (config.use_gpu) {
     // 1. GPU memeroy
     PADDLE_ENFORCE_GT(
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 8eac449a1..6df269c57 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <glog/logging.h>
 #include <sys/time.h>
 #include <algorithm>
 #include <numeric>
@@ -106,5 +107,45 @@ static void TensorAssignData(PaddleTensor *tensor,
   }
 }
 
+static std::string DescribeTensor(const PaddleTensor &tensor) {
+  std::stringstream os;
+  os << "Tensor [" << tensor.name << "]\n";
+  os << " - type: ";
+  switch (tensor.dtype) {
+    case PaddleDType::FLOAT32:
+      os << "float32";
+      break;
+    case PaddleDType::INT64:
+      os << "int64";
+      break;
+    default:
+      os << "unset";
+  }
+  os << '\n';
+
+  os << " - shape: " << to_string(tensor.shape) << '\n';
+  os << " - lod: ";
+  for (auto &l : tensor.lod) {
+    os << to_string(l) << "; ";
+  }
+  os << "\n";
+  os << " - data: ";
+
+  int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
+                            [](int a, int b) { return a * b; });
+  for (int i = 0; i < dim; i++) {
+    os << static_cast<float *>(tensor.data.data())[i] << " ";
+  }
+  os << '\n';
+  return os.str();
+}
+
+static void print_time(int batch_size, int repeat, int num_threads, int tid,
+                       double latency) {
+  LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << repeat
+            << ", threads: " << num_threads << ", thread id: " << tid
+            << ", latency: " << latency << "ms";
+}
+
 }  // namespace inference
 }  // namespace paddle
-- 
GitLab
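Note on the multi-thread benchmark the patch adds: each thread must create its own predictor (a predictor instance is not assumed to be thread-safe), the prepared inputs can be shared read-only across threads, and each thread times its own run loop and reports an average latency, which is what print_time does. The standalone sketch below distills that pattern so it can be studied without a Paddle build. FakePredictor is a hypothetical stand-in for the per-thread predictor, and std::chrono replaces the Timer helper from helper.h; everything else mirrors the structure of the test above.

// Minimal sketch of the per-thread benchmarking pattern, under the
// assumptions stated above. Compiles with: g++ -std=c++11 -pthread bench.cc
#include <chrono>
#include <cstdio>
#include <memory>
#include <thread>
#include <vector>

struct FakePredictor {  // hypothetical stand-in for a real predictor
  void Run(const std::vector<float>& in, std::vector<float>* out) {
    out->assign(in.begin(), in.end());  // placeholder for real inference
  }
};

static void print_time(int batch_size, int repeat, int num_threads, int tid,
                       double latency_ms) {
  std::printf("batch_size: %d, repeat: %d, threads: %d, thread id: %d, latency: %fms\n",
              batch_size, repeat, num_threads, tid, latency_ms);
}

int main() {
  const int batch_size = 10, num_times = 100, num_threads = 4;
  const std::vector<float> input(batch_size, 1.0f);  // shared, read-only

  std::vector<std::thread> threads;
  for (int tid = 0; tid < num_threads; ++tid) {
    threads.emplace_back([&, tid] {
      // One predictor and one output buffer per thread: no shared
      // mutable state between the worker threads.
      auto predictor = std::make_shared<FakePredictor>();
      std::vector<float> output;
      auto start = std::chrono::steady_clock::now();
      for (int i = 0; i < num_times; ++i) predictor->Run(input, &output);
      std::chrono::duration<double, std::milli> elapsed =
          std::chrono::steady_clock::now() - start;
      print_time(batch_size, num_times, num_threads, tid,
                 elapsed.count() / num_times);
    });
  }
  for (auto& t : threads) t.join();
  return 0;
}

With the patch applied, the equivalent path is exercised through the analyzer test binary by supplying --infer_ditu_rnn_model and --infer_ditu_rnn_data (binary name depends on the build setup). Note that although a --num_threads flag is added as the default for TestDituRNNPrediction, the test cases above pass 1 and 4 explicitly, so those runs do not depend on the flag's value.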