Unverified commit a032f56f, authored by Yiqun Liu, committed by GitHub

Add profiling information for inference example (#8748)

* Add profiling information for inference example, recognize digits.

* Refine the profiling method.

* Correct the use of RecordEvent and simplify recognize_digits.
Parent 1e4d95c1
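Before the per-file diffs, here is a minimal sketch of the profiling pattern this commit adds to the inference tests. It only strings together the paddle::platform calls that appear in the test_helper.h changes below; the include set, the main() wrapper, and the CPU-only place are illustrative assumptions rather than part of the commit:

#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"

int main() {
  auto place = paddle::platform::CPUPlace();

  // Enable the profiler, with the state matching the target device,
  // before the code to be measured.
  paddle::platform::EnableProfiler(paddle::platform::ProfilerState::kCPU);

  {
    // Each RecordEvent scope becomes a named entry in the profiler report.
    paddle::platform::RecordEvent record_event(
        "run_inference",
        paddle::platform::DeviceContextPool::Instance().Get(place));
    // ... run the code to be profiled here ...
  }

  // Write the sorted timing report to profiler.txt and reset the profiler.
  paddle::platform::DisableProfiler(
      paddle::platform::EventSortingKey::kDefault, "profiler.txt");
  paddle::platform::ResetProfiler();
  return 0;
}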
@@ -22,14 +22,14 @@ namespace paddle {
 namespace inference {
 
 void ReadBinaryFile(const std::string& filename, std::string& contents) {
-  VLOG(3) << "loading model from " << filename;
-  std::ifstream inputfs(filename, std::ios::in | std::ios::binary);
-  inputfs.seekg(0, std::ios::end);
+  std::ifstream fin(filename, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
+  fin.seekg(0, std::ios::end);
   contents.clear();
-  contents.resize(inputfs.tellg());
-  inputfs.seekg(0, std::ios::beg);
-  inputfs.read(&contents[0], contents.size());
-  inputfs.close();
+  contents.resize(fin.tellg());
+  fin.seekg(0, std::ios::beg);
+  fin.read(&contents[0], contents.size());
+  fin.close();
 }
 
 bool IsPersistable(const framework::VarDesc* var) {
@@ -97,6 +97,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
                                              const std::string& dirname) {
   std::string model_filename = dirname + "/__model__";
   std::string program_desc_str;
+  VLOG(3) << "loading model from " << model_filename;
   ReadBinaryFile(model_filename, program_desc_str);
 
   std::unique_ptr<framework::ProgramDesc> main_program(
......
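With the PADDLE_ENFORCE added above, ReadBinaryFile now fails loudly when the file cannot be opened instead of silently filling an empty string. A hypothetical call site (the path is only an illustration) looks like:

std::string program_desc_str;
// Aborts with "Cannot open file ..." if the model file is missing.
paddle::inference::ReadBinaryFile("/path/to/model/__model__", program_desc_str);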
@@ -17,10 +17,13 @@ limitations under the License. */
 #include "paddle/fluid/inference/tests/test_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_int32(batch_size, 1, "Batch size of input data");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times");
 
 TEST(inference, image_classification) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
+                  "--batch_size=1 --repeat=1";
   }
 
   LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
@@ -29,13 +32,11 @@ TEST(inference, image_classification) {
   // 0. Call `paddle::framework::InitDevices()` initialize all the devices
   // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-  int64_t batch_size = 1;
-
   paddle::framework::LoDTensor input;
   // Use normilized image pixels as input data,
   // which should be in the range [0.0, 1.0].
   SetupTensor<float>(input,
-                     {batch_size, 3, 32, 32},
+                     {FLAGS_batch_size, 3, 32, 32},
                      static_cast<float>(0),
                      static_cast<float>(1));
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
@@ -46,7 +47,9 @@ TEST(inference, image_classification) {
   cpu_fetchs1.push_back(&output1);
 
   // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << "--- CPU Runs: ---";
+  TestInference<paddle::platform::CPUPlace>(
+      dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
   LOG(INFO) << output1.dims();
 
 #ifdef PADDLE_WITH_CUDA
@@ -55,7 +58,9 @@ TEST(inference, image_classification) {
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << "--- GPU Runs: ---";
+  TestInference<paddle::platform::CUDAPlace>(
+      dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
   LOG(INFO) << output2.dims();
 
   CheckError<float>(output1, output2);
......
@@ -17,10 +17,13 @@ limitations under the License. */
 #include "paddle/fluid/inference/tests/test_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_int32(batch_size, 1, "Batch size of input data");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times");
 
 TEST(inference, recognize_digits) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
+                  "--batch_size=1 --repeat=1";
   }
 
   LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
@@ -29,24 +32,25 @@ TEST(inference, recognize_digits) {
   // 0. Call `paddle::framework::InitDevices()` initialize all the devices
   // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-  int64_t batch_size = 1;
-
   paddle::framework::LoDTensor input;
   // Use normilized image pixels as input data,
   // which should be in the range [-1.0, 1.0].
   SetupTensor<float>(input,
-                     {batch_size, 1, 28, 28},
+                     {FLAGS_batch_size, 1, 28, 28},
                      static_cast<float>(-1),
                      static_cast<float>(1));
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
+  for (auto is_combined : {false, true}) {
     paddle::framework::LoDTensor output1;
     std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
     cpu_fetchs1.push_back(&output1);
 
     // Run inference on CPU
-    TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+    LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
+    TestInference<paddle::platform::CPUPlace>(
+        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
     LOG(INFO) << output1.dims();
 
 #ifdef PADDLE_WITH_CUDA
@@ -55,51 +59,12 @@ TEST(inference, recognize_digits) {
     cpu_fetchs2.push_back(&output2);
 
     // Run inference on CUDA GPU
-    TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+    LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
+    TestInference<paddle::platform::CUDAPlace>(
+        dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
     LOG(INFO) << output2.dims();
 
     CheckError<float>(output1, output2);
 #endif
-}
-
-TEST(inference, recognize_digits_combine) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
-  }
-
-  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-
-  paddle::framework::LoDTensor input;
-  // Use normilized image pixels as input data,
-  // which should be in the range [-1.0, 1.0].
-  SetupTensor<float>(
-      input, {1, 1, 28, 28}, static_cast<float>(-1), static_cast<float>(1));
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&input);
-
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
-
-  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace, true>(
-      dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.dims();
-
-#ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
-
-  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace, true>(
-      dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.dims();
-
-  CheckError<float>(output1, output2);
-#endif
+  }
 }
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <time.h>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/profiler.h"
 
 template <typename T>
 void SetupTensor(paddle::framework::LoDTensor& input,
@@ -87,32 +88,59 @@ void CheckError(paddle::framework::LoDTensor& output1,
   EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
 }
 
-template <typename Place, bool IsCombined = false>
+template <typename Place>
 void TestInference(const std::string& dirname,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
+                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
+                   const int repeat = 1,
+                   const bool is_combined = false) {
   // 1. Define place, executor, scope
   auto place = Place();
   auto executor = paddle::framework::Executor(place);
   auto* scope = new paddle::framework::Scope();
 
+  // Profile the performance
+  paddle::platform::ProfilerState state;
+  if (paddle::platform::is_cpu_place(place)) {
+    state = paddle::platform::ProfilerState::kCPU;
+  } else {
+#ifdef PADDLE_WITH_CUDA
+    state = paddle::platform::ProfilerState::kCUDA;
+    // The default device_id of paddle::platform::CUDAPlace is 0.
+    // Users can get the device_id using:
+    // int device_id = place.GetDeviceId();
+    paddle::platform::SetDeviceId(0);
+#endif
+  }
+
+  // Enable the profiler
+  paddle::platform::EnableProfiler(state);
+
   // 2. Initialize the inference_program and load parameters
   std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
-  if (IsCombined) {
+  {
+    paddle::platform::RecordEvent record_event(
+        "init_program",
+        paddle::platform::DeviceContextPool::Instance().Get(place));
+    if (is_combined) {
       // All parameters are saved in a single file.
      // Hard-coding the file names of program and parameters in unittest.
       // The file names should be consistent with that used in Python API
       // `fluid.io.save_inference_model`.
       std::string prog_filename = "__model_combined__";
       std::string param_filename = "__params_combined__";
-      inference_program = paddle::inference::Load(executor,
+      inference_program =
+          paddle::inference::Load(executor,
                                   *scope,
                                   dirname + "/" + prog_filename,
                                   dirname + "/" + param_filename);
     } else {
-      // Parameters are saved in separate files sited in the specified `dirname`.
+      // Parameters are saved in separate files sited in the specified
+      // `dirname`.
       inference_program = paddle::inference::Load(executor, *scope, dirname);
     }
+  }
 
   // 3. Get the feed_target_names and fetch_target_names
   const std::vector<std::string>& feed_target_names =
@@ -134,7 +162,21 @@ void TestInference(const std::string& dirname,
   }
 
   // 6. Run the inference program
+  {
+    // Run repeat times to profile the performance
+    for (int i = 0; i < repeat; ++i) {
+      paddle::platform::RecordEvent record_event(
+          "run_inference",
+          paddle::platform::DeviceContextPool::Instance().Get(place));
+
       executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    }
+  }
+
+  // Disable the profiler and print the timing information
+  paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
+                                    "profiler.txt");
+  paddle::platform::ResetProfiler();
 
   delete scope;
 }
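With the IsCombined template parameter replaced by run-time arguments, a call site now passes the repeat count and combined-parameters flag explicitly, mirroring the updated unit tests above; for example (a sketch reusing the names from those tests):

// Run CPU inference FLAGS_repeat times on a model whose parameters are
// saved in separate files (is_combined = false).
TestInference<paddle::platform::CPUPlace>(
    dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, /*is_combined=*/false);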
@@ -178,7 +178,7 @@ void EnableProfiler(ProfilerState state) {
   }
 #ifdef PADDLE_WITH_CUDA
   if (g_state == ProfilerState::kCUDA) {
-    // Generate some dummy evenets first to reduce the startup overhead.
+    // Generate some dummy events first to reduce the startup overhead.
     for (int i = 0; i < 5; i++) {
       ForEachDevice([](int d) {
         DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
......