Merge pull request #7043 from kuke/profiler_tool

Add the parsing part for the profiling tool

Merge pull request #7043 from kuke/profiler_tool
Add the parsing part for the profiling tool
5e90f5e1 · Yibing Liu · GitHub · c62383e9 · 6416eee7 · 5e90f5e1
Showing with 238 addition and 21 deletion

paddle/platform/profiler.cc paddle/platform/profiler.cc +170 -14

paddle/platform/profiler.h paddle/platform/profiler.h +30 -2

paddle/platform/profiler_test.cc paddle/platform/profiler_test.cc +38 -5

未找到文件。
--- a/paddle/platform/profiler.cc
+++ b/paddle/platform/profiler.cc
@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/platform/profiler.h"
+#include <iomanip>
+#include <map>
+#include "glog/logging.h"
 namespace paddle {
 namespace platform {
 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
+// To record which timer the profiler used, CUDA or CPU.
+static std::string g_profiler_place = "";
 // The thread local event list only can be accessed by the specific thread
 // The thread index of each thread
 static thread_local int32_t g_thread_id;
@@ -43,10 +48,7 @@ inline uint64_t GetTimeInNsec() {
 Event::Event(EventKind kind, std::string name, uint32_t thread_id,
             DeviceContext* dev_ctx)
-    : kind_(kind),
+    : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
-      name_(std::move(name)),
-      thread_id_(thread_id),
-      has_cuda_(false) {
 #ifdef PADDLE_WITH_CUDA
  auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
  if (cuda_dev_ctx) {
@@ -72,11 +74,11 @@ std::string Event::kind() const {
  PADDLE_THROW("Unknown EventKind.");
 }
-double Event::CpuElapsedUs(const Event& e) const {
+double Event::CpuElapsedMs(const Event& e) const {
-  return (e.cpu_ns_ - cpu_ns_) / (1000.0);
+  return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
 }
-double Event::CudaElapsedUs(const Event& e) const {
+double Event::CudaElapsedMs(const Event& e) const {
 #ifdef PADDLE_WITH_CUDA
  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
  PADDLE_ENFORCE(e.device() == device());
@@ -84,7 +86,7 @@ double Event::CudaElapsedUs(const Event& e) const {
  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
  float ms;
  PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
-  return ms * 1000.0;
+  return ms;
 #else
  PADDLE_THROW("CUDA is not enabled");
 #endif
@@ -113,21 +115,27 @@ inline EventList& GetEventList() {
 }
 void Mark(const std::string& name, DeviceContext* dev_ctx) {
-  GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id,
+  GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
-                        dev_ctx);
+}
+void PushEvent(const std::string& name, DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
+}
+void PopEvent(const std::string& name, DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
 }
 RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
  if (g_state == ProfilerState::kDisabled) return;
  dev_ctx_ = dev_ctx;
-  GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id,
+  name_ = name;
-                        dev_ctx_);
+  PushEvent(name_, dev_ctx_);
 }
 RecordEvent::~RecordEvent() {
  if (g_state == ProfilerState::kDisabled) return;
-  GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id,
+  PopEvent(name_, dev_ctx_);
-                        dev_ctx_);
 }
 void EnableProfiler(ProfilerState state) {
@@ -138,6 +146,7 @@ void EnableProfiler(ProfilerState state) {
                 "The profiling state should be disabled when calling ",
                 "EnableProfiler.");
  g_state = state;
+  g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU";
 #ifdef PADDLE_WITH_CUDA
  if (g_state == ProfilerState::kCUDA) {
    // Generate some dummy evenets first to reduce the startup overhead.
@@ -169,5 +178,152 @@ std::vector<std::vector<Event>> DisableProfiler() {
  return result;
 }
+void ParseEvents(std::vector<std::vector<Event>>& events,
+                 EventSortingKey sorted_by) {
+  if (g_profiler_place == "") return;
+  std::string sorted_domain;
+  std::function<bool(EventItem&, EventItem&)> sorted_func;
+  switch (sorted_by) {
+    case EventSortingKey::kCalls:
+      sorted_domain = "number of calls";
+      sorted_func = [](EventItem& a, EventItem& b) {
+        return a.calls > b.calls;
+      };
+      break;
+    case EventSortingKey::kTotal:
+      sorted_domain = "total time";
+      sorted_func = [](EventItem& a, EventItem& b) {
+        return a.total_time > b.total_time;
+      };
+      break;
+    case EventSortingKey::kMin:
+      sorted_domain = "minimum time";
+      sorted_func = [](EventItem& a, EventItem& b) {
+        return a.min_time > b.min_time;
+      };
+      break;
+    case EventSortingKey::kMax:
+      sorted_domain = "maximum time";
+      sorted_func = [](EventItem& a, EventItem& b) {
+        return a.max_time > b.max_time;
+      };
+      break;
+    case EventSortingKey::kAve:
+      sorted_domain = "average time";
+      sorted_func = [](EventItem& a, EventItem& b) {
+        return a.ave_time > b.ave_time;
+      };
+      break;
+    default:
+      sorted_domain = "event end time";
+  }
+  std::vector<std::vector<EventItem>> events_table;
+  size_t max_name_width = 0;
+  for (size_t i = 0; i < events.size(); i++) {
+    std::list<Event> pushed_events;
+    std::vector<EventItem> event_items;
+    std::unordered_map<std::string, int> event_idx;
+    for (size_t j = 0; j < events[i].size(); j++) {
+      if (events[i][j].kind() == "push") {
+        pushed_events.push_back(events[i][j]);
+      } else if (events[i][j].kind() == "pop") {
+        std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
+        while (rit != pushed_events.rend() &&
+               rit->name() != events[i][j].name()) {
+          ++rit;
+        }
+        if (rit != pushed_events.rend()) {
+          double event_time = (g_profiler_place == "CUDA")
+                                  ? rit->CudaElapsedMs(events[i][j])
+                                  : rit->CpuElapsedMs(events[i][j]);
+          std::string event_name =
+              "thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
+          max_name_width = std::max(max_name_width, event_name.size());
+          if (event_idx.find(event_name) == event_idx.end()) {
+            event_idx[event_name] = event_items.size();
+            EventItem event_item = {event_name, 1,          event_time,
+                                    event_time, event_time, event_time};
+            event_items.push_back(event_item);
+          } else {
+            int index = event_idx[event_name];
+            event_items[index].calls += 1;
+            // total time
+            event_items[index].total_time += event_time;
+            // min time
+            event_items[index].min_time =
+                std::min(event_time, event_items[index].min_time);
+            // max time
+            event_items[index].max_time =
+                std::max(event_time, event_items[index].max_time);
+          }
+          // remove the push marker from the list
+          pushed_events.erase((++rit).base());
+        } else {
+          LOG(WARNING) << "Cannot find the push marker of event \'"
+                       << events[i][j].name()
+                       << "\', which will be ignored in profiling report.";
+        }
+      }
+    }
+    // average time
+    for (auto& item : event_items) {
+      item.ave_time = item.total_time / item.calls;
+    }
+    // sort
+    if (sorted_by != EventSortingKey::kDefault) {
+      std::sort(event_items.begin(), event_items.end(), sorted_func);
+    }
+    events_table.push_back(event_items);
+    // log warning if there are events with `push` but without `pop`
+    std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
+    while (rit != pushed_events.rend()) {
+      LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name()
+                   << "\', which will be ignored in profiling report.";
+      ++rit;
+    }
+  }
+  // Print report
+  PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12);
+}
+void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
+                          std::string& sorted_domain, const size_t name_width,
+                          const size_t data_width) {
+  // Output header information
+  std::cout << "\n------------------------->"
+            << "     Profiling Report     "
+            << "<-------------------------\n\n";
+  std::cout << "Place: " << g_profiler_place << std::endl;
+  std::cout << "Time unit: ms" << std::endl;
+  std::cout << "Sorted by " << sorted_domain
+            << " in descending order in the same thread\n\n";
+  // Output events table
+  std::cout.setf(std::ios::left);
+  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
+            << "Calls" << std::setw(data_width) << "Total"
+            << std::setw(data_width) << "Min." << std::setw(data_width)
+            << "Max." << std::setw(data_width) << "Ave." << std::endl;
+  for (size_t i = 0; i < events_table.size(); ++i) {
+    for (size_t j = 0; j < events_table[i].size(); ++j) {
+      EventItem& event_item = events_table[i][j];
+      std::cout << std::setw(name_width) << event_item.name
+                << std::setw(data_width) << event_item.calls
+                << std::setw(data_width) << event_item.total_time
+                << std::setw(data_width) << event_item.min_time
+                << std::setw(data_width) << event_item.max_time
+                << std::setw(data_width) << event_item.ave_time << std::endl;
+    }
+  }
+  std::cout << std::endl;
+}
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/profiler.h
+++ b/paddle/platform/profiler.h
@@ -33,6 +33,7 @@ class Event {
  std::string kind() const;
  std::string name() const { return name_; }
+  uint32_t thread_id() const { return thread_id_; }
  bool has_cuda() const { return has_cuda_; }
 #ifdef PADDLE_WITH_CUDA
@@ -40,8 +41,8 @@ class Event {
  int device() const { return device_; }
 #endif
-  double CpuElapsedUs(const Event& e) const;
+  double CpuElapsedMs(const Event& e) const;
-  double CudaElapsedUs(const Event& e) const;
+  double CudaElapsedMs(const Event& e) const;
 private:
  EventKind kind_;
@@ -94,6 +95,10 @@ enum ProfilerState {
 void Mark(const std::string& name, DeviceContext* dev_ctx);
+void PushEvent(const std::string& name, DeviceContext* dev_ctx);
+void PopEvent(const std::string& name, DeviceContext* dev_ctx);
 struct RecordEvent {
  explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
@@ -101,6 +106,8 @@ struct RecordEvent {
  // The device context is used by Event to get the current cuda stream.
  DeviceContext* dev_ctx_;
+  // Event name
+  std::string name_;
 };
 // Enable the profiling function.
@@ -110,5 +117,26 @@ void EnableProfiler(ProfilerState state);
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
 std::vector<std::vector<Event>> DisableProfiler();
+// The information of each event given in the profiling report
+struct EventItem {
+  std::string name;
+  int calls;
+  double total_time;
+  double min_time;
+  double max_time;
+  double ave_time;
+};
+// Candidate keys to sort the profiling report
+enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
+// Parse the event list and output the profiling report
+void ParseEvents(std::vector<std::vector<Event>>&,
+                 EventSortingKey sorted_by = EventSortingKey::kDefault);
+// Print results
+void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
+                          std::string& sorted_domain, const size_t name_width,
+                          const size_t data_width);
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/profiler_test.cc
+++ b/paddle/platform/profiler_test.cc
@@ -26,7 +26,7 @@ TEST(Event, CpuElapsedTime) {
    counter++;
  }
  Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
-  EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0);
+  EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
 }
 #ifdef PADDLE_WITH_CUDA
@@ -45,7 +45,7 @@ TEST(Event, CudaElapsedTime) {
    counter++;
  }
  Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx);
-  EXPECT_GT(start_event.CudaElapsedUs(stop_event), 0);
+  EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
 }
 #endif
@@ -55,6 +55,7 @@ TEST(RecordEvent, RecordEvent) {
  using paddle::platform::EventKind;
  using paddle::platform::RecordEvent;
  using paddle::platform::ProfilerState;
+  using paddle::platform::EventSortingKey;
  ProfilerState state = ProfilerState::kCPU;
  DeviceContext* dev_ctx = nullptr;
@@ -67,13 +68,45 @@ TEST(RecordEvent, RecordEvent) {
 #endif
  EnableProfiler(state);
+  /* Usage 1:
+  *  PushEvent(evt_name, dev_ctx);
+  *  ...
+  *  code to be analyzed
+  *  ...
+  * PopEvent(evt_name, dev_ctx);
+  */
+  for (int loop = 0; loop < 3; ++loop) {
    for (int i = 1; i < 5; ++i) {
      std::string name = "op_" + std::to_string(i);
+      PushEvent(name, dev_ctx);
+      int counter = 1;
+      while (counter != i * 1000) counter++;
+      PopEvent(name, dev_ctx);
+    }
+  }
+  /* Usage 2:
+   * {
+   *   RecordEvent record_event(name, dev_ctx);
+   *   ...
+   *   code to be analyzed
+   *   ...
+   * }
+   */
+  for (int i = 1; i < 5; ++i) {
+    std::string name = "evs_op_" + std::to_string(i);
    RecordEvent record_event(name, dev_ctx);
    int counter = 1;
    while (counter != i * 1000) counter++;
  }
+  // Bad Usage:
+  PushEvent("event_without_pop", dev_ctx);
+  PopEvent("event_without_push", dev_ctx);
  std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler();
+  // Will remove parsing-related code from test later
+  ParseEvents(events, EventSortingKey::kTotal);
  int cuda_startup_count = 0;
  int start_profiler_count = 0;
  int stop_profiler_count = 0;
@@ -85,9 +118,9 @@ TEST(RecordEvent, RecordEvent) {
      if (events[i][j].name() == "push") {
        EXPECT_EQ(events[i][j + 1].name(), "pop");
 #ifdef PADDLE_WITH_CUDA
-        EXPECT_GT(events[i][j].CudaElapsedUs(events[i][j + 1]), 0);
+        EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0);
 #else
-        EXPECT_GT(events[i][j].CpuElapsedUs(events[i][j + 1]), 0);
+        EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0);
 #endif
      }
    }