Add pe profiler Event (#24611)

dbfe5333 · wangchaochaohu · GitHub · 55b664a1 · dbfe5333 · dbfe5333
8 changed files
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -31,6 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h"
+#include "paddle/fluid/platform/event.h"
 #include "paddle/fluid/platform/profiler.h"

 DECLARE_double(eager_delete_tensor_gb);
@@ -820,6 +821,8 @@ void ParallelExecutor::BCastParamsToDevices(
 FetchResultType ParallelExecutor::Run(
    const std::vector<std::string> &fetch_tensors, bool return_merged) {
  VLOG(3) << "enter ParallelExecutor Run";
+  platform::RecordEvent parallel_executor_event(
+      "ParallelExecutor::Run", paddle::platform::EventRole::kSpecial);
 #ifdef WITH_GPERFTOOLS
  if (gProfileStarted) {
    ProfilerFlush();

--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -40,6 +40,9 @@ namespace {
 thread_local std::deque<int> block_id_stack;
 // Tracking the nested event stacks.
 thread_local std::deque<Event *> annotation_stack;
+// Stack to store events such as PE and so on.
+static std::deque<Event *> main_thread_annotation_stack{};
+static std::deque<std::string> main_thread_annotation_stack_name{};

 std::map<uint32_t, int32_t> system_thread_id_map;

@@ -638,15 +641,49 @@ DeviceTracer *GetDeviceTracer() {
  return tracer;
 }

-void SetCurAnnotation(Event *event) {
-  if (!annotation_stack.empty()) {
+std::string SetCurAnnotation(Event *event) {
+  std::string ret;
+  if (!annotation_stack.empty() && event->role() != EventRole::kSpecial) {
    event->set_parent(annotation_stack.back());
    event->set_name(annotation_stack.back()->name() + "/" + event->name());
  }
+
  annotation_stack.push_back(event);
+
+  if (!main_thread_annotation_stack_name.empty() && !annotation_stack.empty() &&
+      main_thread_annotation_stack.back()->thread_id() !=
+          annotation_stack.back()->thread_id()) {
+    ret = main_thread_annotation_stack_name.back() + "/" + event->name();
+  } else {
+    ret = event->name();
+  }
+  if (event->role() == EventRole::kSpecial) {
+    std::string name = event->name();
+    if (!main_thread_annotation_stack_name.empty()) {
+      name = main_thread_annotation_stack_name.back() + "/" + event->name();
+    }
+    main_thread_annotation_stack_name.push_back(name);
+    main_thread_annotation_stack.push_back(event);
+  }
+
+  return ret;
 }

-void ClearCurAnnotation() { annotation_stack.pop_back(); }
+void ClearCurAnnotation() {
+  if (!main_thread_annotation_stack_name.empty() && !annotation_stack.empty() &&
+      main_thread_annotation_stack.back()->thread_id() !=
+          annotation_stack.back()->thread_id()) {
+    annotation_stack.back()->set_name(main_thread_annotation_stack_name.back() +
+                                      "/" + annotation_stack.back()->name());
+  }
+  if (!main_thread_annotation_stack.empty() &&
+      main_thread_annotation_stack.back()->name() ==
+          annotation_stack.back()->name()) {
+    main_thread_annotation_stack_name.pop_back();
+    main_thread_annotation_stack.pop_back();
+  }
+  annotation_stack.pop_back();
+}

 Event *CurAnnotation() {
  if (annotation_stack.empty()) return nullptr;

--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -137,7 +137,7 @@ class DeviceTracer {
 DeviceTracer* GetDeviceTracer();

 // Set a name for the cuda kernel operation being launched by the thread.
-void SetCurAnnotation(Event* event);
+std::string SetCurAnnotation(Event* event);
 // Clear the name after the operation is done.
 void ClearCurAnnotation();
 // Current name of the operation being run in the thread.

--- a/paddle/fluid/platform/event.h
+++ b/paddle/fluid/platform/event.h
@@ -29,6 +29,7 @@ enum class EventRole {
  kOrdinary,  // only record op time with op type key
  kInnerOp,   // record op detail time with op type key
  kUniqueOp,  // record op detail time with op unique name key
+  kSpecial,   // record event such as PE which is outside of thread local
 };

 class Event {

--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -73,8 +73,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
  // lock is not needed, the code below is thread-safe
  Event *e = PushEvent(name, role);
  // Maybe need the same push/pop behavior.
-  SetCurAnnotation(e);
-  name_ = e->name();
+  name_ = SetCurAnnotation(e);
 }

 RecordEvent::~RecordEvent() {
@@ -86,7 +85,7 @@ RecordEvent::~RecordEvent() {
                          BlockDepth(), g_thread_id);
  }
  ClearCurAnnotation();
-  PopEvent(name_);
+  PopEvent(name_, role_);
 }

 void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
@@ -187,8 +186,8 @@ Event *PushEvent(const std::string &name, const EventRole role) {
  return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role);
 }

-void PopEvent(const std::string &name) {
-  GetEventList().Record(EventType::kPopRange, name, g_thread_id);
+void PopEvent(const std::string &name, const EventRole role) {
+  GetEventList().Record(EventType::kPopRange, name, g_thread_id, role);
 }
 void EnableProfiler(ProfilerState state) {
  PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled,

--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -197,7 +197,7 @@ void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
 void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
                 const Place& place, const std::string& annotation);
 Event* PushEvent(const std::string& name, const EventRole role);
-void PopEvent(const std::string& name);
+void PopEvent(const std::string& name, const EventRole role);
 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
 std::vector<std::vector<Event>> GetAllEvents();

--- a/paddle/fluid/platform/profiler_helper.h
+++ b/paddle/fluid/platform/profiler_helper.h
@@ -22,12 +22,12 @@ limitations under the License. */
 #include <memory>
 #include <mutex>  // NOLINT
 #include <random>
+#include <set>
 #include <stack>
 #include <string>
 #include <unordered_map>
 #include <utility>
 #include <vector>
-
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #endif  // PADDLE_WITH_CUDA
@@ -283,7 +283,8 @@ std::function<bool(const EventItem &, const EventItem &)> SetSortedFunc(
 void SetEvent(bool merge_thread, const Event &analyze_event,
              size_t *max_name_width, std::list<Event> *pushed_events,
              std::vector<EventItem> *event_items,
-              std::unordered_map<std::string, int> *event_idx) {
+              std::unordered_map<std::string, int> *event_idx,
+              const std::set<std::string> &main_thread_event_name) {
  if (analyze_event.type() == EventType::kPushRange) {
    pushed_events->push_back(analyze_event);
  } else if (analyze_event.type() == EventType::kPopRange) {
@@ -312,10 +313,37 @@ void SetEvent(bool merge_thread, const Event &analyze_event,
      std::string event_name;
      if (merge_thread) {
        event_name = rit->name();
+      } else {
+        if (!main_thread_event_name.empty()) {
+          auto origin_name = rit->name();
+          int index = 1;
+          int split_pos = 0;
+          while ((split_pos = FindNthReversePos(origin_name, '/', index)) !=
+                 -1) {
+            auto prefix_str = origin_name.substr(0, split_pos);
+            if (main_thread_event_name.count(prefix_str)) {
+              break;
+            }
+            index++;
+          }
+          if (split_pos == -1 && !main_thread_event_name.count(rit->name())) {
+            event_name = "thread" + std::to_string(rit->thread_id()) + "::" +
+                         rit->name();
+          } else {
+            if (!main_thread_event_name.count(rit->name())) {
+              event_name =
+                  origin_name.substr(0, split_pos + 1) + "thread" +
+                  std::to_string(rit->thread_id()) + "::" +
+                  origin_name.substr(split_pos + 1, origin_name.length() - 1);
+            } else {
+              event_name = rit->name();
+            }
+          }
        } else {
          event_name =
              "thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
        }
+      }
      auto print_name_size = event_name.size();
      int found_pos = 0;
      if (rit->role() == EventRole::kInnerOp &&
@@ -608,6 +636,16 @@ void AnalyzeEvent(
    std::function<bool(const EventItem &, const EventItem &)> sorted_func,
    EventSortingKey sorted_by, size_t *max_name_width, OverHead *overhead,
    bool merge_thread) {
+  // In order to deal with special events in the main thread
+  std::set<std::string> main_thread_event_name;
+  for (size_t i = 0; i < (*analyze_events).size(); i++) {
+    for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
+      Event event = (*analyze_events)[i][j];
+      if (event.role() == EventRole::kSpecial) {
+        main_thread_event_name.insert(event.name());
+      }
+    }
+  }
  for (size_t i = 0; i < (*analyze_events).size(); i++) {
    double total = 0.;  // the total time in one thread
    std::list<Event> pushed_events;
@@ -618,8 +656,10 @@ void AnalyzeEvent(

    for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
      Event analyze_event = (*analyze_events)[i][j];
+      if (!(analyze_event.role() == EventRole::kSpecial && !merge_thread)) {
        SetEvent(merge_thread, analyze_event, max_name_width, &pushed_events,
-               &event_items, &event_idx);
+                 &event_items, &event_idx, main_thread_event_name);
+      }
    }

    auto table_size = event_items.size();

--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@@ -59,7 +59,7 @@ TEST(RecordEvent, RecordEvent) {
      PushEvent(name, EventRole::kOrdinary);
      int counter = 1;
      while (counter != i * 1000) counter++;
-      PopEvent(name);
+      PopEvent(name, EventRole::kOrdinary);
    }
  }

@@ -109,7 +109,7 @@ TEST(RecordEvent, RecordEvent) {

  // Bad Usage:
  PushEvent("event_without_pop", EventRole::kOrdinary);
-  PopEvent("event_without_push");
+  PopEvent("event_without_push", EventRole::kOrdinary);
  std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();

  int cuda_startup_count = 0;