From a032f56f7cdb12a9a62f14f6619e96a2a04b631c Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Thu, 8 Mar 2018 09:48:53 +0800 Subject: [PATCH] Add profiling information for inference example (#8748) * Add profiling information for inference example, recognize digits. * Refine the profiling method. * Correct the use of RecordEvent and simplify recognize_digits. --- paddle/fluid/inference/io.cc | 15 ++-- .../test_inference_image_classification.cc | 19 +++-- .../book/test_inference_recognize_digits.cc | 83 ++++++------------- paddle/fluid/inference/tests/test_helper.h | 76 +++++++++++++---- paddle/fluid/platform/profiler.cc | 2 +- 5 files changed, 104 insertions(+), 91 deletions(-) diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 80eb988967..52e9c0baa6 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -22,14 +22,14 @@ namespace paddle { namespace inference { void ReadBinaryFile(const std::string& filename, std::string& contents) { - VLOG(3) << "loading model from " << filename; - std::ifstream inputfs(filename, std::ios::in | std::ios::binary); - inputfs.seekg(0, std::ios::end); + std::ifstream fin(filename, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); + fin.seekg(0, std::ios::end); contents.clear(); - contents.resize(inputfs.tellg()); - inputfs.seekg(0, std::ios::beg); - inputfs.read(&contents[0], contents.size()); - inputfs.close(); + contents.resize(fin.tellg()); + fin.seekg(0, std::ios::beg); + fin.read(&contents[0], contents.size()); + fin.close(); } bool IsPersistable(const framework::VarDesc* var) { @@ -97,6 +97,7 @@ std::unique_ptr Load(framework::Executor& executor, const std::string& dirname) { std::string model_filename = dirname + "/__model__"; std::string program_desc_str; + VLOG(3) << "loading model from " << model_filename; ReadBinaryFile(model_filename, program_desc_str); std::unique_ptr main_program( diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index d6fc51301b..e9a27171f1 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -17,10 +17,13 @@ limitations under the License. */ #include "paddle/fluid/inference/tests/test_helper.h" DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_int32(batch_size, 1, "Batch size of input data"); +DEFINE_int32(repeat, 1, "Running the inference program repeat times"); TEST(inference, image_classification) { - if (FLAGS_dirname.empty()) { - LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model " + "--batch_size=1 --repeat=1"; } LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; @@ -29,13 +32,11 @@ TEST(inference, image_classification) { // 0. Call `paddle::framework::InitDevices()` initialize all the devices // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - int64_t batch_size = 1; - paddle::framework::LoDTensor input; // Use normilized image pixels as input data, // which should be in the range [0.0, 1.0]. SetupTensor(input, - {batch_size, 3, 32, 32}, + {FLAGS_batch_size, 3, 32, 32}, static_cast(0), static_cast(1)); std::vector cpu_feeds; @@ -46,7 +47,9 @@ TEST(inference, image_classification) { cpu_fetchs1.push_back(&output1); // Run inference on CPU - TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << "--- CPU Runs: ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat); LOG(INFO) << output1.dims(); #ifdef PADDLE_WITH_CUDA @@ -55,7 +58,9 @@ TEST(inference, image_classification) { cpu_fetchs2.push_back(&output2); // Run inference on CUDA GPU - TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << "--- GPU Runs: ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat); LOG(INFO) << output2.dims(); CheckError(output1, output2); diff --git a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc index 99bee94cb8..1fb0f9e777 100644 --- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc +++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc @@ -17,10 +17,13 @@ limitations under the License. */ #include "paddle/fluid/inference/tests/test_helper.h" DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_int32(batch_size, 1, "Batch size of input data"); +DEFINE_int32(repeat, 1, "Running the inference program repeat times"); TEST(inference, recognize_digits) { - if (FLAGS_dirname.empty()) { - LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model " + "--batch_size=1 --repeat=1"; } LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; @@ -29,77 +32,39 @@ TEST(inference, recognize_digits) { // 0. Call `paddle::framework::InitDevices()` initialize all the devices // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - int64_t batch_size = 1; - paddle::framework::LoDTensor input; // Use normilized image pixels as input data, // which should be in the range [-1.0, 1.0]. SetupTensor(input, - {batch_size, 1, 28, 28}, + {FLAGS_batch_size, 1, 28, 28}, static_cast(-1), static_cast(1)); std::vector cpu_feeds; cpu_feeds.push_back(&input); - paddle::framework::LoDTensor output1; - std::vector cpu_fetchs1; - cpu_fetchs1.push_back(&output1); + for (auto is_combined : {false, true}) { + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); - // Run inference on CPU - TestInference(dirname, cpu_feeds, cpu_fetchs1); - LOG(INFO) << output1.dims(); + // Run inference on CPU + LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined); + LOG(INFO) << output1.dims(); #ifdef PADDLE_WITH_CUDA - paddle::framework::LoDTensor output2; - std::vector cpu_fetchs2; - cpu_fetchs2.push_back(&output2); + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); - // Run inference on CUDA GPU - TestInference(dirname, cpu_feeds, cpu_fetchs2); - LOG(INFO) << output2.dims(); + // Run inference on CUDA GPU + LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined); + LOG(INFO) << output2.dims(); - CheckError(output1, output2); + CheckError(output1, output2); #endif -} - -TEST(inference, recognize_digits_combine) { - if (FLAGS_dirname.empty()) { - LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; } - - LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; - std::string dirname = FLAGS_dirname; - - // 0. Call `paddle::framework::InitDevices()` initialize all the devices - // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - - paddle::framework::LoDTensor input; - // Use normilized image pixels as input data, - // which should be in the range [-1.0, 1.0]. - SetupTensor( - input, {1, 1, 28, 28}, static_cast(-1), static_cast(1)); - std::vector cpu_feeds; - cpu_feeds.push_back(&input); - - paddle::framework::LoDTensor output1; - std::vector cpu_fetchs1; - cpu_fetchs1.push_back(&output1); - - // Run inference on CPU - TestInference( - dirname, cpu_feeds, cpu_fetchs1); - LOG(INFO) << output1.dims(); - -#ifdef PADDLE_WITH_CUDA - paddle::framework::LoDTensor output2; - std::vector cpu_fetchs2; - cpu_fetchs2.push_back(&output2); - - // Run inference on CUDA GPU - TestInference( - dirname, cpu_feeds, cpu_fetchs2); - LOG(INFO) << output2.dims(); - - CheckError(output1, output2); -#endif } diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 49518e50d8..d0688445fe 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/profiler.h" template void SetupTensor(paddle::framework::LoDTensor& input, @@ -87,31 +88,58 @@ void CheckError(paddle::framework::LoDTensor& output1, EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; } -template +template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, - std::vector& cpu_fetchs) { + std::vector& cpu_fetchs, + const int repeat = 1, + const bool is_combined = false) { // 1. Define place, executor, scope auto place = Place(); auto executor = paddle::framework::Executor(place); auto* scope = new paddle::framework::Scope(); + // Profile the performance + paddle::platform::ProfilerState state; + if (paddle::platform::is_cpu_place(place)) { + state = paddle::platform::ProfilerState::kCPU; + } else { +#ifdef PADDLE_WITH_CUDA + state = paddle::platform::ProfilerState::kCUDA; + // The default device_id of paddle::platform::CUDAPlace is 0. + // Users can get the device_id using: + // int device_id = place.GetDeviceId(); + paddle::platform::SetDeviceId(0); +#endif + } + + // Enable the profiler + paddle::platform::EnableProfiler(state); + // 2. Initialize the inference_program and load parameters std::unique_ptr inference_program; - if (IsCombined) { - // All parameters are saved in a single file. - // Hard-coding the file names of program and parameters in unittest. - // The file names should be consistent with that used in Python API - // `fluid.io.save_inference_model`. - std::string prog_filename = "__model_combined__"; - std::string param_filename = "__params_combined__"; - inference_program = paddle::inference::Load(executor, - *scope, - dirname + "/" + prog_filename, - dirname + "/" + param_filename); - } else { - // Parameters are saved in separate files sited in the specified `dirname`. - inference_program = paddle::inference::Load(executor, *scope, dirname); + { + paddle::platform::RecordEvent record_event( + "init_program", + paddle::platform::DeviceContextPool::Instance().Get(place)); + + if (is_combined) { + // All parameters are saved in a single file. + // Hard-coding the file names of program and parameters in unittest. + // The file names should be consistent with that used in Python API + // `fluid.io.save_inference_model`. + std::string prog_filename = "__model_combined__"; + std::string param_filename = "__params_combined__"; + inference_program = + paddle::inference::Load(executor, + *scope, + dirname + "/" + prog_filename, + dirname + "/" + param_filename); + } else { + // Parameters are saved in separate files sited in the specified + // `dirname`. + inference_program = paddle::inference::Load(executor, *scope, dirname); + } } // 3. Get the feed_target_names and fetch_target_names @@ -134,7 +162,21 @@ void TestInference(const std::string& dirname, } // 6. Run the inference program - executor.Run(*inference_program, scope, feed_targets, fetch_targets); + { + // Run repeat times to profile the performance + for (int i = 0; i < repeat; ++i) { + paddle::platform::RecordEvent record_event( + "run_inference", + paddle::platform::DeviceContextPool::Instance().Get(place)); + + executor.Run(*inference_program, scope, feed_targets, fetch_targets); + } + } + + // Disable the profiler and print the timing information + paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, + "profiler.txt"); + paddle::platform::ResetProfiler(); delete scope; } diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 094f9224f7..28ef3e04b1 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -178,7 +178,7 @@ void EnableProfiler(ProfilerState state) { } #ifdef PADDLE_WITH_CUDA if (g_state == ProfilerState::kCUDA) { - // Generate some dummy evenets first to reduce the startup overhead. + // Generate some dummy events first to reduce the startup overhead. for (int i = 0; i < 5; i++) { ForEachDevice([](int d) { DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d)); -- GitLab