unify ZeroCopy in analysis_test

fad06cb9 · luotao1 · 9be825a9 · fad06cb9 · fad06cb9 · fad06cb9
3 changed files
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -127,9 +127,8 @@ static void TensorAssignData(PaddleTensor *tensor,
 }

 template <typename T>
-static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
-                                    const std::vector<std::vector<T>> &data) {
-  int size{0};
+static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
+                                     const std::vector<std::vector<T>> &data) {
  auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
  int c = 0;
  for (const auto &f : data) {
@@ -137,7 +136,15 @@ static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
      ptr[c++] = v;
    }
  }
-  return size;
+}
+
+// Fills the CPU buffer of `tensor` from the bytes held by `data`, read as an
+// array of T. Only whole elements are copied: data.length() / sizeof(T).
+template <typename T>
+static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
+                                     const PaddleBuf &data) {
+  auto *dst = tensor->mutable_data<T>(PaddlePlace::kCPU);
+  const T *src = reinterpret_cast<T *>(data.data());
+  const size_t count = data.length() / sizeof(T);
+  for (size_t idx = 0; idx != count; ++idx) {
+    dst[idx] = src[idx];
+  }
 }

 static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) {

--- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
@@ -107,6 +107,9 @@ void SetConfig(AnalysisConfig *cfg) {
  cfg->DisableGpu();
  cfg->SwitchSpecifyInputNames();
  cfg->SwitchIrOptim();
+  if (FLAGS_zero_copy) {
+    cfg->SwitchUseFeedFetchOps(false);
+  }
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {

--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -51,6 +51,7 @@ DEFINE_bool(use_analysis, true,
 DEFINE_bool(record_benchmark, false,
            "Record benchmark after profiling the model");
 DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
+// When set, the prediction helpers feed inputs through ZeroCopy tensors and
+// call ZeroCopyRun() instead of PaddlePredictor::Run().
+DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");

 DECLARE_bool(profile);
 DECLARE_int32(paddle_num_threads);
@@ -198,61 +199,104 @@ void GetInputPerBatch(const std::vector<std::vector<int64_t>> &in,
  }
 }

-void TestOneThreadPrediction(
-    const PaddlePredictor::Config *config,
-    const std::vector<std::vector<PaddleTensor>> &inputs,
-    std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
-  int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;
-  auto predictor = CreateTestPredictor(config, use_analysis);
+// Feeds one batch of PaddleTensor inputs to `predictor` through its ZeroCopy
+// input tensors. For each input, the tensor returned by
+// GetInputTensor(input.name) is reshaped, given the input's LoD, and filled
+// with the input's data. Only INT64 and FLOAT32 inputs are copied; any other
+// dtype is reported with LOG(ERROR) and its data is left unassigned.
+void ConvertPaddleTensorToZeroCopyTensor(
+    PaddlePredictor *predictor, const std::vector<PaddleTensor> &inputs) {
+  // Bind by const reference: the input is only read here, so copying a whole
+  // PaddleTensor on every iteration is unnecessary.
+  for (const auto &input : inputs) {
+    auto tensor = predictor->GetInputTensor(input.name);
+    tensor->Reshape(input.shape);
+    tensor->SetLoD({input.lod});
+    if (input.dtype == PaddleDType::INT64) {
+      ZeroCopyTensorAssignData<int64_t>(tensor.get(), input.data);
+    } else if (input.dtype == PaddleDType::FLOAT32) {
+      ZeroCopyTensorAssignData<float>(tensor.get(), input.data);
+    } else {
+      LOG(ERROR) << "unsupported feed type " << input.dtype;
+    }
+  }
+}

-  // warmup run
-  LOG(INFO) << "Warm up run...";
-  {
-    Timer warmup_timer;
-    warmup_timer.tic();
+// Performs a single warm-up prediction on inputs[0] and prints its elapsed
+// time via PrintTime, tagged with `num_threads` and `tid`. With
+// FLAGS_zero_copy the inputs are first fed through ZeroCopy tensors and
+// ZeroCopyRun() is called; otherwise Run() fills `outputs`. When
+// FLAGS_profile is set the profiler is reset after the warm-up.
+void PredictionWarmUp(PaddlePredictor *predictor,
+                      const std::vector<std::vector<PaddleTensor>> &inputs,
+                      std::vector<PaddleTensor> *outputs, int num_threads,
+                      int tid) {
+  int batch_size = FLAGS_batch_size;
+  LOG(INFO) << "Running thread " << tid << ", warm up run...";
+  // Feed the ZeroCopy inputs before the timer starts so that only the run
+  // itself is measured.
+  if (FLAGS_zero_copy) {
+    ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]);
+  }
+  Timer warmup_timer;
+  warmup_timer.tic();
+  if (!FLAGS_zero_copy) {
    predictor->Run(inputs[0], outputs, batch_size);
-    PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1);
-    if (FLAGS_profile) {
-      paddle::platform::ResetProfiler();
-    }
+  } else {
+    predictor->ZeroCopyRun();
  }
+  PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1);
+  if (FLAGS_profile) {
+    paddle::platform::ResetProfiler();
+  }
+}

-  LOG(INFO) << "Run " << num_times << " times...";
-  {
-    Timer run_timer;
-    run_timer.tic();
+// Runs each batch in `inputs` FLAGS_repeat times on `predictor`, then prints
+// the accumulated elapsed time divided by the repeat count and, when
+// FLAGS_record_benchmark is set, persists it to benchmark_record.txt.
+// With FLAGS_zero_copy the inputs are fed through ZeroCopy tensors and
+// ZeroCopyRun() is used; the feeding step is kept outside the timed region.
+void PredictionRun(PaddlePredictor *predictor,
+                   const std::vector<std::vector<PaddleTensor>> &inputs,
+                   std::vector<PaddleTensor> *outputs, int num_threads,
+                   int tid) {
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+  LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";
+  Timer run_timer;
+  double elapsed_time = 0;
 #ifdef WITH_GPERFTOOLS
-    ProfilerStart("paddle_inference.prof");
+  ProfilerStart("paddle_inference.prof");
 #endif
-    for (int i = 0; i < num_times; i++) {
-      for (size_t j = 0; j < inputs.size(); j++) {
-        predictor->Run(inputs[j], outputs, batch_size);
+  if (!FLAGS_zero_copy) {
+    run_timer.tic();
+    for (size_t i = 0; i < inputs.size(); i++) {
+      for (int j = 0; j < num_times; j++) {
+        predictor->Run(inputs[i], outputs, batch_size);
      }
    }
+    elapsed_time = run_timer.toc();
+  } else {
+    for (size_t i = 0; i < inputs.size(); i++) {
+      ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
+      run_timer.tic();
+      for (int j = 0; j < num_times; j++) {
+        predictor->ZeroCopyRun();
+      }
+      elapsed_time += run_timer.toc();
+    }
+  }
 #ifdef WITH_GPERFTOOLS
-    ProfilerStop();
+  ProfilerStop();
 #endif

-    double latency = run_timer.toc() / (num_times > 1 ? num_times : 1);
-    PrintTime(batch_size, num_times, 1, 0, latency, inputs.size());
-    if (FLAGS_record_benchmark) {
-      Benchmark benchmark;
-      benchmark.SetName(FLAGS_model_name);
-      benchmark.SetBatchSize(batch_size);
-      benchmark.SetLatency(latency);
-      benchmark.PersistToFile("benchmark_record.txt");
-    }
+  // Guard the divisor as the replaced code did, so FLAGS_repeat <= 0 does not
+  // divide by zero.
+  double latency = elapsed_time / (num_times > 1 ? num_times : 1);
+  PrintTime(batch_size, num_times, num_threads, tid, latency, inputs.size());
+  if (FLAGS_record_benchmark) {
+    Benchmark benchmark;
+    benchmark.SetName(FLAGS_model_name);
+    benchmark.SetBatchSize(batch_size);
+    benchmark.SetLatency(latency);
+    benchmark.PersistToFile("benchmark_record.txt");
  }
 }

+// Single-threaded entry point: creates one predictor from `config`, performs
+// the warm-up run and then the timed run, reporting as thread 0 of 1.
+void TestOneThreadPrediction(
+    const PaddlePredictor::Config *config,
+    const std::vector<std::vector<PaddleTensor>> &inputs,
+    std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
+  constexpr int kNumThreads = 1;
+  constexpr int kThreadId = 0;
+  auto owned_predictor = CreateTestPredictor(config, use_analysis);
+  auto *raw_predictor = owned_predictor.get();
+  PredictionWarmUp(raw_predictor, inputs, outputs, kNumThreads, kThreadId);
+  PredictionRun(raw_predictor, inputs, outputs, kNumThreads, kThreadId);
+}
+
 void TestMultiThreadPrediction(
    const PaddlePredictor::Config *config,
    const std::vector<std::vector<PaddleTensor>> &inputs,
    std::vector<PaddleTensor> *outputs, int num_threads,
    bool use_analysis = true) {
-  int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;
  std::vector<std::thread> threads;
  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
  predictors.emplace_back(CreateTestPredictor(config, use_analysis));
@@ -260,7 +304,6 @@ void TestMultiThreadPrediction(
    predictors.emplace_back(predictors.front()->Clone());
  }

-  size_t total_time{0};
  for (int tid = 0; tid < num_threads; ++tid) {
    threads.emplace_back([&, tid]() {
      // Each thread should have local inputs and outputs.
@@ -273,34 +316,8 @@ void TestMultiThreadPrediction(
            ->SetMkldnnThreadID(static_cast<int>(tid) + 1);
      }
 #endif
-
-      // warmup run
-      LOG(INFO) << "Running thread " << tid << ", warm up run...";
-      {
-        Timer warmup_timer;
-        warmup_timer.tic();
-        predictor->Run(inputs[0], outputs, batch_size);
-        PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1);
-        if (FLAGS_profile) {
-          paddle::platform::ResetProfiler();
-        }
-      }
-
-      LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";
-      {
-        Timer timer;
-        timer.tic();
-        for (int i = 0; i < num_times; i++) {
-          for (const auto &input : inputs) {
-            ASSERT_TRUE(predictor->Run(input, &outputs_tid));
-          }
-        }
-
-        auto time = timer.toc();
-        total_time += time;
-        PrintTime(batch_size, num_times, num_threads, tid, time / num_times,
-                  inputs.size());
-      }
+      PredictionWarmUp(predictor.get(), inputs, outputs, num_threads, tid);
+      PredictionRun(predictor.get(), inputs, outputs, num_threads, tid);
    });
  }
  for (int i = 0; i < num_threads; ++i) {