add multi threads test of seqpool test (#15293)

781cd0cf · tensor-tang · Yan Chunwei · 6eada9e0 · 781cd0cf
隐藏空白更改
内联并排

Showing with 57 addition and 2 deletion

paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc ...le/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +57 -2

未找到文件。
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
@@ -214,6 +214,9 @@ void PrepareZeroCopyInputs(
  }
 }
+// diff: similarity_norm.tmp_0, // speed: fc_4.tmp_1
+static const char out_var_name[] = "reduce_sum_0.tmp_0";
 // return the output values
 std::vector<float> zerocopy_profile(int repeat_times) {
  AnalysisConfig config;
@@ -222,7 +225,7 @@ std::vector<float> zerocopy_profile(int repeat_times) {
  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
  std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
  PrepareZeroCopyInputs(predictor, &inputs);
-  auto output_tensor = predictor->GetOutputTensor("reduce_sum_0.tmp_0");
+  auto output_tensor = predictor->GetOutputTensor(out_var_name);
  Timer timer;
  LOG(INFO) << "Warm up run...";
  timer.tic();
@@ -239,7 +242,7 @@ std::vector<float> zerocopy_profile(int repeat_times) {
  PrintTime(FLAGS_batch_size, repeat_times, 1, 0, timer.toc() / repeat_times,
            1);
-  VLOG(3) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor);
+  LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor);
  PaddlePlace place;
  int output_size{0};
  auto *pdata = output_tensor->data<float>(&place, &output_size);
@@ -252,6 +255,58 @@ std::vector<float> zerocopy_profile(int repeat_times) {
 TEST(Analyzer_seq_pool1, zerocopy_profile) { zerocopy_profile(FLAGS_repeat); }
+TEST(Analyzer_seq_pool1, zerocopy_profile_threads) {
+  AnalysisConfig config;
+  SetConfig(&config);
+  config.SwitchUseFeedFetchOps(false);
+  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  double total_time_of_threads{0};
+  std::vector<std::thread> threads;
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
+    predictors.emplace_back(base_predictor->Clone());
+    // predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
+  }
+  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
+    threads.emplace_back([config, &total_time_of_threads, &predictors, tid] {
+      auto &predictor = predictors[tid];
+      std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
+      PrepareZeroCopyInputs(predictor, &inputs);
+      auto output_tensor = predictor->GetOutputTensor(out_var_name);
+      Timer timer;
+      double total_time{0};
+      LOG(INFO) << "Warm up run...";
+      timer.tic();
+      predictor->ZeroCopyRun();
+      PrintTime(FLAGS_batch_size, 1, FLAGS_num_threads, tid, timer.toc(), 1);
+      if (FLAGS_profile) {
+        paddle::platform::ResetProfiler();
+      }
+      int repeat_times = FLAGS_repeat;
+      LOG(INFO) << "Run " << repeat_times << " times...";
+      timer.tic();
+      for (int i = 0; i < repeat_times; i++) {
+        predictor->ZeroCopyRun();
+      }
+      total_time += timer.toc();
+      total_time_of_threads += total_time;
+      LOG(INFO) << "thread time: " << total_time / repeat_times;
+    });
+  }
+  for (auto &t : threads) {
+    t.join();
+  }
+  LOG(INFO) << "average time: "
+            << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
+}
 TEST(Analyzer_seq_pool1, zerocopy_fuse_statis) { analysis_fuse_statis(true); }
 TEST(Analyzer_seq_pool1, zerocopy_compare_native) {