From dbfe5333c59ab01c374de258cdfc565cc091db34 Mon Sep 17 00:00:00 2001
From: wangchaochaohu <wangchao66@baidu.com>
Date: Mon, 25 May 2020 15:18:26 +0800
Subject: [PATCH] Add pe profiler Event (#24611)

---
 paddle/fluid/framework/parallel_executor.cc |  3 ++
 paddle/fluid/platform/device_tracer.cc      | 43 +++++++++++++++--
 paddle/fluid/platform/device_tracer.h       |  2 +-
 paddle/fluid/platform/event.h               |  1 +
 paddle/fluid/platform/profiler.cc           |  9 ++--
 paddle/fluid/platform/profiler.h            |  2 +-
 paddle/fluid/platform/profiler_helper.h     | 52 ++++++++++++++++++---
 paddle/fluid/platform/profiler_test.cc      |  4 +-
 8 files changed, 98 insertions(+), 18 deletions(-)
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index ea36e83038c..8c6dd628bb9 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -31,6 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h"
+#include "paddle/fluid/platform/event.h"
 #include "paddle/fluid/platform/profiler.h"
 
 DECLARE_double(eager_delete_tensor_gb);
@@ -820,6 +821,8 @@ void ParallelExecutor::BCastParamsToDevices(
 FetchResultType ParallelExecutor::Run(
     const std::vector<std::string> &fetch_tensors, bool return_merged) {
   VLOG(3) << "enter ParallelExecutor Run";
+  platform::RecordEvent parallel_executor_event(
+      "ParallelExecutor::Run", paddle::platform::EventRole::kSpecial);
 #ifdef WITH_GPERFTOOLS
   if (gProfileStarted) {
     ProfilerFlush();
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index 4679b7e1da3..6be8ed25e3f 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -40,6 +40,9 @@ namespace {
 thread_local std::deque<int> block_id_stack;
 // Tracking the nested event stacks.
 thread_local std::deque<Event *> annotation_stack;
+// stack to strore event sunch as pe and so on
+static std::deque<Event *> main_thread_annotation_stack{};
+static std::deque<std::string> main_thread_annotation_stack_name{};
 
 std::map<uint32_t, int32_t> system_thread_id_map;
 
@@ -638,15 +641,49 @@ DeviceTracer *GetDeviceTracer() {
   return tracer;
 }
 
-void SetCurAnnotation(Event *event) {
-  if (!annotation_stack.empty()) {
+std::string SetCurAnnotation(Event *event) {
+  std::string ret;
+  if (!annotation_stack.empty() && event->role() != EventRole::kSpecial) {
     event->set_parent(annotation_stack.back());
     event->set_name(annotation_stack.back()->name() + "/" + event->name());
   }
+
   annotation_stack.push_back(event);
+
+  if (!main_thread_annotation_stack_name.empty() && !annotation_stack.empty() &&
+      main_thread_annotation_stack.back()->thread_id() !=
+          annotation_stack.back()->thread_id()) {
+    ret = main_thread_annotation_stack_name.back() + "/" + event->name();
+  } else {
+    ret = event->name();
+  }
+  if (event->role() == EventRole::kSpecial) {
+    std::string name = event->name();
+    if (!main_thread_annotation_stack_name.empty()) {
+      name = main_thread_annotation_stack_name.back() + "/" + event->name();
+    }
+    main_thread_annotation_stack_name.push_back(name);
+    main_thread_annotation_stack.push_back(event);
+  }
+
+  return ret;
 }
 
-void ClearCurAnnotation() { annotation_stack.pop_back(); }
+void ClearCurAnnotation() {
+  if (!main_thread_annotation_stack_name.empty() && !annotation_stack.empty() &&
+      main_thread_annotation_stack.back()->thread_id() !=
+          annotation_stack.back()->thread_id()) {
+    annotation_stack.back()->set_name(main_thread_annotation_stack_name.back() +
+                                      "/" + annotation_stack.back()->name());
+  }
+  if (!main_thread_annotation_stack.empty() &&
+      main_thread_annotation_stack.back()->name() ==
+          annotation_stack.back()->name()) {
+    main_thread_annotation_stack_name.pop_back();
+    main_thread_annotation_stack.pop_back();
+  }
+  annotation_stack.pop_back();
+}
 
 Event *CurAnnotation() {
   if (annotation_stack.empty()) return nullptr;
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index 85168a046fb..44b7af149ef 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -137,7 +137,7 @@ class DeviceTracer {
 DeviceTracer* GetDeviceTracer();
 
 // Set a name for the cuda kernel operation being launched by the thread.
-void SetCurAnnotation(Event* event);
+std::string SetCurAnnotation(Event* event);
 // Clear the name after the operation is done.
 void ClearCurAnnotation();
 // Current name of the operation being run in the thread.
diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h
index ed2c05af453..9a482a63f5e 100644
--- a/paddle/fluid/platform/event.h
+++ b/paddle/fluid/platform/event.h
@@ -29,6 +29,7 @@ enum class EventRole {
   kOrdinary,  // only record op time with op type key
   kInnerOp,   // record op detail time with op type key
   kUniqueOp,  // record op detail time with op unique name key
+  kSpecial,   // record event such as PE which is outer of thread local
 };
 
 class Event {
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index a36d8456eea..be655255bd8 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -73,8 +73,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
   // lock is not needed, the code below is thread-safe
   Event *e = PushEvent(name, role);
   // Maybe need the same push/pop behavior.
-  SetCurAnnotation(e);
-  name_ = e->name();
+  name_ = SetCurAnnotation(e);
 }
 
 RecordEvent::~RecordEvent() {
@@ -86,7 +85,7 @@ RecordEvent::~RecordEvent() {
                           BlockDepth(), g_thread_id);
   }
   ClearCurAnnotation();
-  PopEvent(name_);
+  PopEvent(name_, role_);
 }
 
 void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
@@ -187,8 +186,8 @@ Event *PushEvent(const std::string &name, const EventRole role) {
   return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role);
 }
 
-void PopEvent(const std::string &name) {
-  GetEventList().Record(EventType::kPopRange, name, g_thread_id);
+void PopEvent(const std::string &name, const EventRole role) {
+  GetEventList().Record(EventType::kPopRange, name, g_thread_id, role);
 }
 void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled,
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index dcc9f1eee10..07844713eae 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -197,7 +197,7 @@ void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
 void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
                  const Place& place, const std::string& annotation);
 Event* PushEvent(const std::string& name, const EventRole role);
-void PopEvent(const std::string& name);
+void PopEvent(const std::string& name, const EventRole role);
 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
 std::vector<std::vector<Event>> GetAllEvents();
diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h
index 367676e77ef..9d99022f153 100644
--- a/paddle/fluid/platform/profiler_helper.h
+++ b/paddle/fluid/platform/profiler_helper.h
@@ -22,12 +22,12 @@ limitations under the License. */
 #include <memory>
 #include <mutex>  // NOLINT
 #include <random>
+#include <set>
 #include <stack>
 #include <string>
 #include <unordered_map>
 #include <utility>
 #include <vector>
-
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #endif  // PADDLE_WITH_CUDA
@@ -283,7 +283,8 @@ std::function<bool(const EventItem &, const EventItem &)> SetSortedFunc(
 void SetEvent(bool merge_thread, const Event &analyze_event,
               size_t *max_name_width, std::list<Event> *pushed_events,
               std::vector<EventItem> *event_items,
-              std::unordered_map<std::string, int> *event_idx) {
+              std::unordered_map<std::string, int> *event_idx,
+              const std::set<std::string> &main_thread_event_name) {
   if (analyze_event.type() == EventType::kPushRange) {
     pushed_events->push_back(analyze_event);
   } else if (analyze_event.type() == EventType::kPopRange) {
@@ -313,8 +314,35 @@ void SetEvent(bool merge_thread, const Event &analyze_event,
       if (merge_thread) {
         event_name = rit->name();
       } else {
-        event_name =
-            "thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
+        if (!main_thread_event_name.empty()) {
+          auto origin_name = rit->name();
+          int index = 1;
+          int split_pos = 0;
+          while ((split_pos = FindNthReversePos(origin_name, '/', index)) !=
+                 -1) {
+            auto prefix_str = origin_name.substr(0, split_pos);
+            if (main_thread_event_name.count(prefix_str)) {
+              break;
+            }
+            index++;
+          }
+          if (split_pos == -1 && !main_thread_event_name.count(rit->name())) {
+            event_name = "thread" + std::to_string(rit->thread_id()) + "::" +
+                         rit->name();
+          } else {
+            if (!main_thread_event_name.count(rit->name())) {
+              event_name =
+                  origin_name.substr(0, split_pos + 1) + "thread" +
+                  std::to_string(rit->thread_id()) + "::" +
+                  origin_name.substr(split_pos + 1, origin_name.length() - 1);
+            } else {
+              event_name = rit->name();
+            }
+          }
+        } else {
+          event_name =
+              "thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
+        }
       }
       auto print_name_size = event_name.size();
       int found_pos = 0;
@@ -608,6 +636,16 @@ void AnalyzeEvent(
     std::function<bool(const EventItem &, const EventItem &)> sorted_func,
     EventSortingKey sorted_by, size_t *max_name_width, OverHead *overhead,
     bool merge_thread) {
+  // In oreder to deal with special event in main thread
+  std::set<std::string> main_thread_event_name;
+  for (size_t i = 0; i < (*analyze_events).size(); i++) {
+    for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
+      Event event = (*analyze_events)[i][j];
+      if (event.role() == EventRole::kSpecial) {
+        main_thread_event_name.insert(event.name());
+      }
+    }
+  }
   for (size_t i = 0; i < (*analyze_events).size(); i++) {
     double total = 0.;  // the total time in one thread
     std::list<Event> pushed_events;
@@ -618,8 +656,10 @@ void AnalyzeEvent(
 
     for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
       Event analyze_event = (*analyze_events)[i][j];
-      SetEvent(merge_thread, analyze_event, max_name_width, &pushed_events,
-               &event_items, &event_idx);
+      if (!(analyze_event.role() == EventRole::kSpecial && !merge_thread)) {
+        SetEvent(merge_thread, analyze_event, max_name_width, &pushed_events,
+                 &event_items, &event_idx, main_thread_event_name);
+      }
     }
 
     auto table_size = event_items.size();
diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc
index daa69a77b42..d8bfa5445cc 100644
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@@ -59,7 +59,7 @@ TEST(RecordEvent, RecordEvent) {
       PushEvent(name, EventRole::kOrdinary);
       int counter = 1;
       while (counter != i * 1000) counter++;
-      PopEvent(name);
+      PopEvent(name, EventRole::kOrdinary);
     }
   }
 
@@ -109,7 +109,7 @@ TEST(RecordEvent, RecordEvent) {
 
   // Bad Usage:
   PushEvent("event_without_pop", EventRole::kOrdinary);
-  PopEvent("event_without_push");
+  PopEvent("event_without_push", EventRole::kOrdinary);
   std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
 
   int cuda_startup_count = 0;
-- 
GitLab