Merge pull request #13387 from luotao1/nlp_multi_thread

add multi-thread for nlp unit-tests

Merge pull request #13387 from luotao1/nlp_multi_thread
add multi-thread for nlp unit-tests
d4a5326a · Tao Luo · GitHub · 24e61d30 · 6b9ccd97 · d4a5326a
7 changed file
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -123,10 +123,16 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
 }
 void PrintTime(int batch_size, int repeat, int num_threads, int tid,
-               double latency) {
+               double latency, int epoch = 1) {
  LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
            << ", threads: " << num_threads << ", thread id: " << tid
            << ", latency: " << latency << "ms ======";
+  if (epoch > 1) {
+    int samples = batch_size * epoch;
+    LOG(INFO) << "====== sample number: " << samples
+              << ", average latency of each sample: " << latency / samples
+              << "ms ======";
+  }
 }
 }  // namespace inference

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -51,9 +51,7 @@ inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
 # text_classification
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
 download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
-inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
+inference_analysis_test(test_analyzer_text_classification SRCS analyzer_text_classification_tester.cc
    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
    ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
-         --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
+         --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt)
-         --topn=1 # Just run top 1 batch.
-         )
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -12,21 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/platform/profiler.h"
-DEFINE_string(infer_model, "", "model path for LAC");
-DEFINE_string(infer_data, "", "data file for LAC");
-DEFINE_int32(batch_size, 1, "batch size.");
-DEFINE_int32(burning, 0, "Burning before repeat.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
 namespace paddle {
 namespace inference {
@@ -126,46 +112,37 @@ void TestLACPrediction(const std::string &model_path,
                       const std::string &data_file, const int batch_size,
                       const int repeat, bool test_all_data,
                       bool use_analysis = false) {
-  NativeConfig config;
+  AnalysisConfig cfg;
-  config.model_dir = model_path;
+  cfg.model_dir = model_path;
-  config.use_gpu = false;
+  cfg.use_gpu = false;
-  config.device = 0;
+  cfg.device = 0;
-  config.specify_input_name = true;
+  cfg.specify_input_name = true;
+  cfg.enable_ir_optim = true;
  std::vector<PaddleTensor> input_slots, outputs_slots;
  DataRecord data(data_file, batch_size);
  GetOneBatch(&input_slots, &data, batch_size);
  std::unique_ptr<PaddlePredictor> predictor;
  if (use_analysis) {
-    AnalysisConfig cfg;
-    cfg.model_dir = model_path;
-    cfg.use_gpu = false;
-    cfg.device = 0;
-    cfg.specify_input_name = true;
-    cfg.enable_ir_optim = true;
    predictor =
        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
  } else {
    predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
  }
  for (int i = 0; i < FLAGS_burning; i++) {
    predictor->Run(input_slots, &outputs_slots);
  }
  Timer timer;
-  if (test_all_data) {
+  if (FLAGS_test_all_data) {
-    double sum = 0;
+    LOG(INFO) << "test all data";
-    LOG(INFO) << "Total number of samples: " << data.datasets.size();
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
-    for (int i = 0; i < repeat; i++) {
+    for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
-      for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
+      GetOneBatch(&input_slots, &data, batch_size);
-        GetOneBatch(&input_slots, &data, batch_size);
+      input_slots_all.emplace_back(input_slots);
-        timer.tic();
-        predictor->Run(input_slots, &outputs_slots);
-        sum += timer.toc();
-      }
    }
-    PrintTime(batch_size, repeat, 1, 0, sum / repeat);
+    LOG(INFO) << "total number of samples: " << data.datasets.size();
-    LOG(INFO) << "Average latency of each sample: "
+    TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads);
-              << sum / repeat / data.datasets.size() << " ms";
    return;
  }
  timer.tic();
@@ -190,19 +167,10 @@ void TestLACPrediction(const std::string &model_path,
  if (use_analysis) {
    // run once for comparion as reference
    auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
    std::vector<PaddleTensor> ref_outputs_slots;
    ref_predictor->Run(input_slots, &ref_outputs_slots);
-    EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
+    CompareResult(ref_outputs_slots, outputs_slots);
-    auto &ref_out = ref_outputs_slots[0];
-    size_t ref_size =
-        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
-                        [](int a, int b) { return a * b; });
-    EXPECT_EQ(size, ref_size);
-    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
-    for (size_t i = 0; i < size; ++i) {
-      EXPECT_EQ(pdata_ref[i], pdata[i]);
-    }
    AnalysisPredictor *analysis_predictor =
        dynamic_cast<AnalysisPredictor *>(predictor.get());

--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -12,20 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/platform/profiler.h"
-DEFINE_string(infer_model, "", "model path");
-DEFINE_string(infer_data, "", "data path");
-DEFINE_int32(batch_size, 10, "batch size.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
 namespace paddle {
 namespace inference {
@@ -113,50 +100,35 @@ const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
                                       48, 39, 38, 16, 25};
 void TestChineseNERPrediction(bool use_analysis) {
-  NativeConfig config;
+  AnalysisConfig cfg;
-  config.prog_file = FLAGS_infer_model + "/__model__";
+  cfg.prog_file = FLAGS_infer_model + "/__model__";
-  config.param_file = FLAGS_infer_model + "/param";
+  cfg.param_file = FLAGS_infer_model + "/param";
-  config.use_gpu = false;
+  cfg.use_gpu = false;
-  config.device = 0;
+  cfg.device = 0;
-  config.specify_input_name = true;
+  cfg.specify_input_name = true;
+  cfg.enable_ir_optim = true;
  std::vector<PaddleTensor> input_slots, outputs;
  std::unique_ptr<PaddlePredictor> predictor;
  Timer timer;
  if (use_analysis) {
-    AnalysisConfig cfg;
-    cfg.prog_file = FLAGS_infer_model + "/__model__";
-    cfg.param_file = FLAGS_infer_model + "/param";
-    cfg.use_gpu = false;
-    cfg.device = 0;
-    cfg.specify_input_name = true;
-    cfg.enable_ir_optim = true;
    predictor =
        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
  } else {
    predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
  }
  if (FLAGS_test_all_data) {
    LOG(INFO) << "test all data";
-    double sum = 0;
+    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-    size_t num_samples;
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
-    for (int i = 0; i < FLAGS_repeat; i++) {
+    for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) {
-      DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+      PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-      // Just one batch, the num_samples remains the same.
+      input_slots_all.emplace_back(input_slots);
-      num_samples = data.num_samples;
-      for (size_t bid = 0; bid < num_samples / FLAGS_batch_size; ++bid) {
-        PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-        timer.tic();
-        predictor->Run(input_slots, &outputs);
-        sum += timer.toc();
-      }
    }
-    LOG(INFO) << "total number of samples: " << num_samples;
+    LOG(INFO) << "total number of samples: " << data.num_samples;
-    PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
+    TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
-    LOG(INFO) << "average latency of each sample: "
-              << sum / FLAGS_repeat / num_samples;
    return;
  }
  // Prepare inputs.
@@ -182,19 +154,10 @@ void TestChineseNERPrediction(bool use_analysis) {
  if (use_analysis) {
    // run once for comparion as reference
    auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
    std::vector<PaddleTensor> ref_outputs_slots;
    ref_predictor->Run(input_slots, &ref_outputs_slots);
-    EXPECT_EQ(ref_outputs_slots.size(), outputs.size());
+    CompareResult(ref_outputs_slots, outputs);
-    auto &ref_out = ref_outputs_slots[0];
-    size_t ref_size =
-        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
-                        [](int a, int b) { return a * b; });
-    EXPECT_EQ(size, ref_size);
-    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
-    for (size_t i = 0; i < size; ++i) {
-      EXPECT_EQ(pdata_ref[i], result[i]);
-    }
    AnalysisPredictor *analysis_predictor =
        dynamic_cast<AnalysisPredictor *>(predictor.get());

--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -12,24 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
-#include <google/protobuf/text_format.h>
-#include <gtest/gtest.h>
-#include <thread>  // NOLINT
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-DEFINE_string(infer_model, "", "model path");
-DEFINE_string(infer_data, "", "data path");
-DEFINE_int32(batch_size, 10, "batch size.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
 namespace paddle {
 namespace inference {
@@ -164,26 +147,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
 }
-void CompareResult(const std::vector<PaddleTensor> &outputs,
-                   const std::vector<PaddleTensor> &base_outputs) {
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    auto &base_out = base_outputs[i];
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
-                                   1, [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_EQ(size, size1);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    float *base_data = static_cast<float *>(base_out.data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(data[i], base_data[i], 1e-3);
-    }
-  }
-}
 // Test with a really complicate model.
 void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
  AnalysisConfig config;
@@ -198,7 +161,6 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
  config.ir_passes.clear();  // Do not exclude any pass.
  int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;
  auto base_predictor =
      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
@@ -213,45 +175,14 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
  base_predictor->Run(input_slots, &base_outputs);
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  input_slots_all.emplace_back(input_slots);
  if (num_threads == 1) {
-    // Prepare inputs.
+    TestOneThreadPrediction(config, input_slots_all, &outputs);
-    Timer timer;
-    timer.tic();
-    for (int i = 0; i < num_times; i++) {
-      predictor->Run(input_slots, &outputs);
-    }
-    PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
    CompareResult(outputs, base_outputs);
  } else {
-    std::vector<std::thread> threads;
+    // only return the output of first thread
-    std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+    TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads);
-    // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
-    // because AttentionLSTM's hard code nodeid will be damanged.
-    for (int tid = 0; tid < num_threads; ++tid) {
-      predictors.emplace_back(
-          CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-              config));
-    }
-    for (int tid = 0; tid < num_threads; ++tid) {
-      threads.emplace_back([&, tid]() {
-        // Each thread should have local input_slots and outputs.
-        std::vector<PaddleTensor> input_slots;
-        DataRecord data(FLAGS_infer_data, batch_size);
-        PrepareInputs(&input_slots, &data, batch_size);
-        std::vector<PaddleTensor> outputs;
-        Timer timer;
-        timer.tic();
-        for (int i = 0; i < num_times; i++) {
-          predictors[tid]->Run(input_slots, &outputs);
-        }
-        PrintTime(batch_size, num_times, num_threads, tid,
-                  timer.toc() / num_times);
-        CompareResult(outputs, base_outputs);
-      });
-    }
-    for (int i = 0; i < num_threads; ++i) {
-      threads[i].join();
-    }
  }
  if (use_analysis && activate_ir) {
@@ -293,8 +224,7 @@ TEST(Analyzer, RNN_tests) {
    // Directly infer with the original model.
    TestRNN1Prediction(false, false, i);
    // Inference with the original model with the analysis turned on, the
-    // analysis
+    // analysis module will transform the program to a data flow graph.
-    // module will transform the program to a data flow graph.
    TestRNN1Prediction(true, false, i);
    // Inference with analysis and IR. The IR module will fuse some large
    // kernels.

--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -12,23 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
-#include <gflags/gflags.h>
-#include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
-#include <gtest/gtest.h>
-#include <fstream>
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/inference/api/timer.h"
-DEFINE_string(infer_model, "", "Directory of the inference model.");
-DEFINE_string(infer_data, "", "Path of the dataset.");
-DEFINE_int32(batch_size, 1, "batch size.");
-DEFINE_int32(repeat, 1, "How many times to repeat run.");
-DEFINE_int32(topn, -1, "Run top n batches of data to save time");
 namespace paddle {
 namespace inference {
@@ -37,24 +21,25 @@ struct DataReader {
  explicit DataReader(const std::string &path)
      : file(new std::ifstream(path)) {}
-  bool NextBatch(PaddleTensor *tensor, int batch_size) {
+  bool NextBatch(std::vector<PaddleTensor> *input, int batch_size) {
    PADDLE_ENFORCE_EQ(batch_size, 1);
    std::string line;
-    tensor->lod.clear();
+    PaddleTensor tensor;
-    tensor->lod.emplace_back(std::vector<size_t>({0}));
+    tensor.dtype = PaddleDType::INT64;
+    tensor.lod.emplace_back(std::vector<size_t>({0}));
    std::vector<int64_t> data;
    for (int i = 0; i < batch_size; i++) {
      if (!std::getline(*file, line)) return false;
      inference::split_to_int64(line, ' ', &data);
    }
-    tensor->lod.front().push_back(data.size());
+    tensor.lod.front().push_back(data.size());
-    tensor->data.Resize(data.size() * sizeof(int64_t));
+    tensor.data.Resize(data.size() * sizeof(int64_t));
-    memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t));
+    memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t));
-    tensor->shape.clear();
+    tensor.shape.push_back(data.size());
-    tensor->shape.push_back(data.size());
+    tensor.shape.push_back(1);
-    tensor->shape.push_back(1);
+    input->assign({tensor});
    return true;
  }
@@ -68,32 +53,28 @@ void Main(int batch_size) {
  config.model_dir = FLAGS_infer_model;
  config.use_gpu = false;
  config.enable_ir_optim = true;
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
-  std::vector<PaddleTensor> input_slots(1);
-  // one batch starts
-  // data --
-  auto &input = input_slots[0];
-  input.dtype = PaddleDType::INT64;
-  inference::Timer timer;
+  std::vector<PaddleTensor> input_slots, output_slots;
-  double sum = 0;
+  DataReader reader(FLAGS_infer_data);
-  std::vector<PaddleTensor> output_slots;
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  int num_batches = 0;
+  if (FLAGS_test_all_data) {
-  for (int t = 0; t < FLAGS_repeat; t++) {
+    LOG(INFO) << "test all data";
-    DataReader reader(FLAGS_infer_data);
+    int num_batches = 0;
-    while (reader.NextBatch(&input, FLAGS_batch_size)) {
+    while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
-      if (FLAGS_topn > 0 && num_batches > FLAGS_topn) break;
+      input_slots_all.emplace_back(input_slots);
-      timer.tic();
-      CHECK(predictor->Run(input_slots, &output_slots));
-      sum += timer.toc();
      ++num_batches;
    }
+    LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
+    TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
+    return;
  }
-  PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
+  // one batch starts
+  // data --
+  reader.NextBatch(&input_slots, FLAGS_batch_size);
+  input_slots_all.emplace_back(input_slots);
+  TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
  // Get output
  LOG(INFO) << "get outputs " << output_slots.size();

--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <gtest/gtest.h>
+#include <thread>  // NOLINT
+#include <vector>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/platform/profiler.h"
+DEFINE_string(infer_model, "", "model path");
+DEFINE_string(infer_data, "", "data file");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(burning, 0, "Burning before repeat.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+namespace paddle {
+namespace inference {
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<PaddleTensor> &base_outputs) {
+  PADDLE_ENFORCE_GT(outputs.size(), 0);
+  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];
+    auto &base_out = base_outputs[i];
+    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
+                                   1, [](int a, int b) { return a * b; });
+    PADDLE_ENFORCE_EQ(size, size1);
+    PADDLE_ENFORCE_GT(size, 0);
+    float *data = static_cast<float *>(out.data.data());
+    float *base_data = static_cast<float *>(base_out.data.data());
+    for (size_t i = 0; i < size; i++) {
+      EXPECT_NEAR(data[i], base_data[i], 1e-3);
+    }
+  }
+}
+void TestOneThreadPrediction(
+    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
+    std::vector<PaddleTensor> *outputs) {
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+  auto predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
+  Timer timer;
+  timer.tic();
+  for (int i = 0; i < num_times; i++) {
+    for (size_t j = 0; j < inputs.size(); j++) {
+      predictor->Run(inputs[j], outputs);
+    }
+  }
+  PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times,
+            inputs.size());
+}
+void TestMultiThreadPrediction(
+    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
+    std::vector<PaddleTensor> *outputs, int num_threads) {
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+  std::vector<std::thread> threads;
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
+  // because AttentionLSTM's hard code nodeid will be damanged.
+  for (int tid = 0; tid < num_threads; ++tid) {
+    predictors.emplace_back(
+        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+            config));
+  }
+  for (int tid = 0; tid < num_threads; ++tid) {
+    threads.emplace_back([&, tid]() {
+      // Each thread should have local inputs and outputs.
+      // The inputs of each thread are all the same.
+      std::vector<std::vector<PaddleTensor>> inputs_tid = inputs;
+      std::vector<PaddleTensor> outputs_tid;
+      Timer timer;
+      timer.tic();
+      for (int i = 0; i < num_times; i++) {
+        for (size_t j = 0; j < inputs_tid.size(); j++) {
+          predictors[tid]->Run(inputs_tid[j], &outputs_tid);
+        }
+      }
+      PrintTime(batch_size, num_times, num_threads, tid,
+                timer.toc() / num_times, inputs_tid.size());
+    });
+  }
+  for (int i = 0; i < num_threads; ++i) {
+    threads[i].join();
+  }
+}
+void TestPrediction(AnalysisConfig config,
+                    const std::vector<std::vector<PaddleTensor>> inputs,
+                    std::vector<PaddleTensor> *outputs, int num_threads) {
+  if (num_threads == 1) {
+    TestOneThreadPrediction(config, inputs, outputs);
+  } else {
+    TestMultiThreadPrediction(config, inputs, outputs, num_threads);
+  }
+}
+}  // namespace inference
+}  // namespace paddle