Commit 0fbe0a7a authored by luotao1

add multi-thread ut for ditu-rnn

Parent d0c65bff
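The multi-thread unit test added below follows a simple pattern: every thread builds its own predictor, runs the same inputs FLAGS_repeat times, and reports a per-thread latency. A minimal, self-contained sketch of that pattern is shown here, assuming standard C++ only; FakePredictor and the hard-coded num_threads/repeat values are hypothetical stand-ins for the real CreatePaddlePredictor call and the gflags in the diff, and std::chrono replaces the Timer helper from paddle/fluid/inference/api/helper.h.

#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

// Hypothetical stand-in for a per-thread predictor; the real test creates one
// AnalysisPredictor per thread via CreatePaddlePredictor(config).
struct FakePredictor {
  void Run() { /* run inference on the shared input slots */ }
};

int main() {
  const int num_threads = 4;  // mirrors FLAGS_num_threads (assumed value)
  const int repeat = 1;       // mirrors FLAGS_repeat (assumed value)
  std::vector<std::thread> threads;
  for (int tid = 0; tid < num_threads; ++tid) {
    threads.emplace_back([=] {
      FakePredictor predictor;  // each thread owns its own predictor
      auto start = std::chrono::steady_clock::now();
      for (int i = 0; i < repeat; ++i) predictor.Run();
      std::chrono::duration<double, std::milli> elapsed =
          std::chrono::steady_clock::now() - start;
      // The real test reports this via print_time(batch_size, repeat,
      // num_threads, tid, latency).
      std::cout << "thread " << tid
                << ", latency: " << elapsed.count() / repeat << "ms\n";
    });
  }
  for (auto &t : threads) t.join();
  return 0;
}

As in the diff, result comparison against the base predictor is only meaningful in the single-thread path; the multi-thread path is intended for latency measurement.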
......@@ -13,10 +13,10 @@
// limitations under the License.
#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/helper.h"
namespace paddle {
namespace framework {
......
......@@ -35,7 +35,6 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
auto* id = subgraph.at(gpd.pattern().RetrieveNode("any_node"));
marked_nodes.insert(id);
};
......@@ -89,7 +88,6 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
LINK_TO(op, hidden_n);
#undef LINK_TO
return op;
};
lstm_creator(16, 12, 14, 18, 17, 22, 21, 19);
......@@ -105,14 +103,16 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
for (auto it = node->inputs.begin(); it != node->inputs.end();) {
if (marked_nodes.count(*it)) {
it = const_cast<Node*>(node)->inputs.erase(it);
} else
} else {
it++;
}
}
for (auto it = node->outputs.begin(); it != node->outputs.end();) {
if (marked_nodes.count(*it)) {
it = const_cast<Node*>(node)->outputs.erase(it);
} else
} else {
it++;
}
}
}
......
......@@ -81,7 +81,7 @@ void GraphPatternDetector::operator()(Graph* graph,
LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
int id = 0;
for (auto& g : subgraphs) {
LOG(INFO) << "optimizing #" << id++ << " subgraph";
VLOG(3) << "optimizing #" << id++ << " subgraph";
handler(g, graph);
}
}
......
......@@ -16,6 +16,7 @@
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
......@@ -23,19 +24,17 @@
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle {
namespace inference {
namespace analysis {
using namespace framework;
TEST(Analyzer, analysis_without_tensorrt) {
FLAGS_IA_enable_tensorrt_subgraph_engine = false;
Argument argument;
......@@ -219,39 +218,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
}
std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os;
os << "Tensor [" << tensor.name << "]\n";
os << " - type: ";
switch (tensor.dtype) {
case PaddleDType::FLOAT32:
os << "float32";
break;
case PaddleDType::INT64:
os << "int64";
break;
default:
os << "unset";
}
os << '\n';
os << " - shape: " << to_string(tensor.shape) << '\n';
os << " - lod: ";
for (auto &l : tensor.lod) {
os << to_string(l) << "; ";
}
os << "\n";
os << " - data: ";
int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
[](int a, int b) { return a * b; });
for (int i = 0; i < dim; i++) {
os << static_cast<float *>(tensor.data.data())[i] << " ";
}
os << '\n';
return os.str();
}
} // namespace
const float ditu_rnn_target_data[] = {
......@@ -266,58 +232,92 @@ const float ditu_rnn_target_data[] = {
93.5771, 3.84641, 0, 0, 0, 0, 0, 0,
169.426, 0, 0, 0, 0, 0, 0, 0};
// Test with a really complicated model.
void TestDituRNNPrediction(const std::string &model_path,
const std::string &data_path, int batch_size,
bool use_analysis, bool activate_ir,
int num_times = 1) {
void TestDituRNNPrediction(bool use_analysis_and_activate_ir = false,
int num_threads = FLAGS_num_threads) {
NativeConfig config;
config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(config);
std::vector<PaddleTensor> input_slots;
DataRecord data(data_path, batch_size);
DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
// Prepare inputs.
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs, base_outputs;
base_predictor->Run(input_slots, &base_outputs);
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictor->Run(input_slots, &outputs);
}
LOG(INFO) << "===========profile result===========";
LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times
<< ", latency: " << timer.toc() / num_times << "ms";
if (num_threads == 1) {
std::vector<PaddleTensor> input_slots;
// Prepare inputs.
DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
PrepareInputs(&input_slots, &data, batch_size);
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictor->Run(input_slots, &outputs);
}
print_time(batch_size, num_times, 1, 0, timer.toc() / num_times);
} else {
std::vector<std::thread> threads;
std::vector<PaddleTensor> input_slots;
// Prepare inputs.
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs;
for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() {
auto predictor_tid =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(
config);
DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictor_tid->Run(input_slots, &outputs);
}
print_time(batch_size, num_times, num_threads, tid,
timer.toc() / num_times);
});
}
for (int i = 0; i < num_threads; ++i) {
threads[i].join();
}
}
LOG(INFO) << "=====================================";
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &base_out = base_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
1, [](int a, int b) { return a * b; });
PADDLE_ENFORCE_EQ(size, size1);
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_data[i], 1e-3);
if (num_threads == 1) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &base_out = base_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t size1 =
std::accumulate(base_out.shape.begin(), base_out.shape.end(), 1,
[](int a, int b) { return a * b; });
PADDLE_ENFORCE_EQ(size, size1);
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_data[i], 1e-3);
}
}
}
if (use_analysis && activate_ir) {
if (use_analysis_and_activate_ir) {
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
auto &fuse_statis = analysis_predictor->analysis_argument()
......@@ -334,23 +334,16 @@ void TestDituRNNPrediction(const std::string &model_path,
// Directly infer with the original model.
TEST(Analyzer, DituRNN_without_analysis) {
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
FLAGS_batch_size, false, false, FLAGS_repeat);
}
// Inference with the original model with the analysis turned on, the analysis
// module will transform the program to a data flow graph.
TEST(Analyzer, DituRNN_with_analysis) {
LOG(INFO) << "ditu rnn with analysis";
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
FLAGS_batch_size, true, false, FLAGS_repeat);
LOG(INFO) << "ditu rnn without analysis";
TestDituRNNPrediction(false, 1);
TestDituRNNPrediction(false, 4); // multi-threads
}
// Inference with analysis and IR. The IR module will fuse some large kernels.
TEST(Analyzer, DituRNN_with_analysis_with_IR) {
LOG(INFO) << "ditu rnn with analysis and IR fuse";
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
FLAGS_batch_size, true, true, FLAGS_repeat);
TestDituRNNPrediction(true, 1);
TestDituRNNPrediction(true, 4); // multi-threads
}
} // namespace analysis
......
......@@ -14,6 +14,8 @@
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
......@@ -30,7 +32,6 @@ bool AnalysisPredictor::Init(
} else {
place_ = paddle::platform::CPUPlace();
}
PADDLE_ENFORCE(!parent_scope);
if (parent_scope) {
scope_ = parent_scope;
sub_scope_ = &(parent_scope->NewScope());
......@@ -92,8 +93,6 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
Analyzer().Run(&argument_);
CHECK(argument_.transformed_program_desc);
VLOG(5) << "to prepare executor";
// LOG(INFO) << "transformed_parogram_desc " <<
// argument.transformed_program_desc->DebugString();
inference_program_.reset(
new framework::ProgramDesc(*argument_.transformed_program_desc));
PADDLE_ENFORCE(argument_.Has(framework::ir::kParamScopeAttr));
......@@ -106,7 +105,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) {
VLOG(3) << "create NativePredictor";
VLOG(3) << "create AnalysisPredictor";
if (config.use_gpu) {
// 1. GPU memory
PADDLE_ENFORCE_GT(
......
......@@ -14,6 +14,7 @@
#pragma once
#include <glog/logging.h>
#include <sys/time.h>
#include <algorithm>
#include <sstream>
......@@ -106,5 +107,45 @@ static void TensorAssignData(PaddleTensor *tensor,
}
}
std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os;
os << "Tensor [" << tensor.name << "]\n";
os << " - type: ";
switch (tensor.dtype) {
case PaddleDType::FLOAT32:
os << "float32";
break;
case PaddleDType::INT64:
os << "int64";
break;
default:
os << "unset";
}
os << '\n';
os << " - shape: " << to_string(tensor.shape) << '\n';
os << " - lod: ";
for (auto &l : tensor.lod) {
os << to_string(l) << "; ";
}
os << "\n";
os << " - data: ";
int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
[](int a, int b) { return a * b; });
for (int i = 0; i < dim; i++) {
os << static_cast<float *>(tensor.data.data())[i] << " ";
}
os << '\n';
return os.str();
}
void print_time(int batch_size, int repeat, int num_threads, int tid,
double latency) {
LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << repeat
<< ", threads: " << num_threads << ", thread id: " << tid
<< ", latency: " << latency << "ms";
}
} // namespace inference
} // namespace paddle