From c3876cf82d314dc8adc7c2758179b1bd62fd0acf Mon Sep 17 00:00:00 2001
From: wangchaochaohu
Date: Thu, 9 Jan 2020 13:23:50 +1400
Subject: [PATCH] add support for nested profiling event and printing in different level (#22061)

* add support for nested profiling event and printing in different level
---
 paddle/fluid/framework/operator.cc     |  28 +-
 paddle/fluid/platform/device_tracer.cc |  18 +-
 paddle/fluid/platform/event.h          |   7 +-
 paddle/fluid/platform/profiler.cc      | 345 ++++++++++++++++---------
 4 files changed, 267 insertions(+), 131 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 2cfc4d71a38..42095c7bdc3 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -166,16 +166,11 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #endif
     }
 
-    // The profile has a process-wide mutex, results in serious performance
-    // issue
-    // in concurrency scenerio. Here use an `if` to fix this issue.
-    // Please not remove the `if`, ask @Superjomn if there are any concern.
-    if (platform::IsProfileEnabled()) {
-      platform::RecordEvent record_event(Type());
-      RunImpl(scope, place);
-    } else {
+    {
+      platform::RecordEvent record_event(Type() + "_op");
       RunImpl(scope, place);
     }
+
     VLOG(3) << place << " " << DebugStringEx(&scope);
   } catch (platform::EnforceNotMet& exception) {
     framework::InsertCallStackInfo(Type(), Attrs(), &exception);
@@ -953,9 +948,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   // do data transformScope &transfer_scope;
   std::vector<std::string> transfered_inplace_vars;
-  auto* transfer_scope =
-      PrepareData(scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx);
-
+  Scope* transfer_scope = nullptr;
+  {
+    platform::RecordEvent record_event("prepare_data");
+    transfer_scope = PrepareData(scope, *kernel_type_, &transfered_inplace_vars,
+                                 runtime_ctx);
+  }
   // exec scope is the scope that kernel actually executed on.
   const Scope& exec_scope =
       (transfer_scope == nullptr ? scope : *transfer_scope);
@@ -965,6 +963,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 
   if (!all_kernels_must_compute_runtime_shape_) {
+    platform::RecordEvent record_event("infer_shape");
     RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx);
     this->InferShape(&infer_shape_ctx);
   }
@@ -975,8 +974,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
   // not Scope. Imperative mode only pass inputs and get outputs.
-  (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
-                                   kernel_configs));
+  {
+    platform::RecordEvent record_event("compute");
+    (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
+                                     kernel_configs));
+  }
 
   if (!transfered_inplace_vars.empty()) {
     // there is inplace variable has been transfered.
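The hunks above rely on platform::RecordEvent being an RAII guard: its constructor pushes an event onto the current annotation stack and its destructor pops it, so wrapping prepare_data, infer_shape and compute in extra braces nests one timed sub-event under the surrounding <op>_op event, with the parent name prefixed via "/". Below is a minimal standalone sketch of that idea, assuming a hypothetical ScopedEvent class and a thread-local name stack rather than the actual Paddle types:

```cpp
#include <chrono>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for platform::RecordEvent: a scoped, nestable timer.
class ScopedEvent {
 public:
  explicit ScopedEvent(const std::string& name)
      : start_(std::chrono::steady_clock::now()) {
    // Build a hierarchical name from whatever event is currently open.
    name_ = stack().empty() ? name : stack().back() + "/" + name;
    stack().push_back(name_);
  }
  ~ScopedEvent() {
    auto end = std::chrono::steady_clock::now();
    auto us =
        std::chrono::duration_cast<std::chrono::microseconds>(end - start_).count();
    std::cout << name_ << " took " << us << " us\n";
    stack().pop_back();
  }

 private:
  static std::vector<std::string>& stack() {
    thread_local std::vector<std::string> s;  // one annotation stack per thread
    return s;
  }
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  ScopedEvent op("conv2d_op");        // outer event, like Type() + "_op"
  { ScopedEvent e("prepare_data"); }  // reported as "conv2d_op/prepare_data"
  { ScopedEvent e("infer_shape"); }   // reported as "conv2d_op/infer_shape"
  { ScopedEvent e("compute"); }       // reported as "conv2d_op/compute"
}
```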
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 9aa6aa44b28..74ab56c07cf 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -446,6 +446,11 @@ class DeviceTracerImpl : public DeviceTracer { auto c = correlations_.find(r.correlation_id); if (c != correlations_.end() && c->second != nullptr) { Event *e = c->second; + Event *parent = e->parent(); + while (parent) { + parent->AddCudaElapsedTime(r.start_ns, r.end_ns); + parent = parent->parent(); + } e->AddCudaElapsedTime(r.start_ns, r.end_ns); } } @@ -453,6 +458,11 @@ class DeviceTracerImpl : public DeviceTracer { auto c = correlations_.find(r.correlation_id); if (c != correlations_.end() && c->second != nullptr) { Event *e = c->second; + Event *parent = e->parent(); + while (parent) { + parent->AddCudaElapsedTime(r.start_ns, r.end_ns); + parent = parent->parent(); + } e->AddCudaElapsedTime(r.start_ns, r.end_ns); } } @@ -622,7 +632,13 @@ DeviceTracer *GetDeviceTracer() { return tracer; } -void SetCurAnnotation(Event *event) { annotation_stack.push_back(event); } +void SetCurAnnotation(Event *event) { + if (!annotation_stack.empty()) { + event->set_parent(annotation_stack.back()); + event->set_name(annotation_stack.back()->name() + "/" + event->name()); + } + annotation_stack.push_back(event); +} void ClearCurAnnotation() { annotation_stack.pop_back(); } diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index e9bdb82a50f..f7dba693dc6 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -32,8 +32,11 @@ class Event { Event(EventType type, std::string name, uint32_t thread_id); const EventType& type() const; + Event* parent() const { return parent_; } + void set_parent(Event* parent) { parent_ = parent; } std::string name() const { return name_; } uint32_t thread_id() const { return thread_id_; } + void set_name(std::string name) { name_ = name; } #ifdef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUPTI @@ -47,9 +50,11 @@ class Event { private: EventType type_; - std::string name_; + std::string name_{}; + Event* parent_{nullptr}; uint32_t thread_id_; int64_t cpu_ns_; + bool visited_status_{false}; #ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUPTI int64_t gpu_ns_ = 0; diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 410a6db6b35..a32b89bc4ce 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include #include // NOLINT #include +#include #include #include @@ -138,14 +139,14 @@ void PopEvent(const std::string &name) { RecordEvent::RecordEvent(const std::string &name) : is_enabled_(false), start_ns_(PosixInNsec()) { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled || name.empty()) return; // lock is not needed, the code below is thread-safe is_enabled_ = true; - name_ = name; - Event *e = PushEvent(name_); + Event *e = PushEvent(name); // Maybe need the same push/pop behavior. 
SetCurAnnotation(e); + name_ = e->name(); } RecordEvent::~RecordEvent() { @@ -320,48 +321,86 @@ struct EventItem { // Print results void PrintProfiler(const std::vector> &events_table, + const std::multimap &child_map, const std::string &sorted_domain, const size_t name_width, - const size_t data_width, bool merge_thread) { - // Output header information - std::cout << "\n------------------------->" - << " Profiling Report " - << "<-------------------------\n\n"; - std::string place; - if (g_state == ProfilerState::kCPU) { - place = "CPU"; - } else if (g_state == ProfilerState::kCUDA) { - place = "CUDA"; - } else if (g_state == ProfilerState::kAll) { - place = "All"; - } else { - PADDLE_THROW("Invalid profiler state", g_state); - } + const size_t data_width, bool merge_thread, int print_depth, + int remove_len) { + if (print_depth == 0) { + // Output header information + std::cout << "\n------------------------->" + << " Profiling Report " + << "<-------------------------\n\n"; + std::string place; + if (g_state == ProfilerState::kCPU) { + place = "CPU"; + } else if (g_state == ProfilerState::kCUDA) { + place = "CUDA"; + } else if (g_state == ProfilerState::kAll) { + place = "All"; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Except profiler state must to be one of ['CPU', 'GPU' 'ALL'], but " + "received Invalid profiler state %s", + g_state)); + } - if (merge_thread) { - std::cout << "Note! This Report merge all thread info into one." - << std::endl; - } - std::cout << "Place: " << place << std::endl; - std::cout << "Time unit: ms" << std::endl; - std::cout << "Sorted by " << sorted_domain - << " in descending order in the same thread\n\n"; - // Output events table - std::cout.setf(std::ios::left); - std::cout << std::setw(name_width) << "Event" << std::setw(data_width) - << "Calls" << std::setw(data_width) << "Total"; - if (g_state == ProfilerState::kAll) { - std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)" - << std::setw(data_width * 2) << "GPU Time (Ratio)"; + if (merge_thread) { + std::cout << "Note! This Report merge all thread info into one." + << std::endl; + } + std::cout << "Place: " << place << std::endl; + std::cout << "Time unit: ms" << std::endl; + std::cout << "Sorted by " << sorted_domain + << " in descending order in the same thread\n\n"; + // Output events table + std::cout.setf(std::ios::left); + std::cout << std::setw(name_width) << "Event" << std::setw(data_width) + << "Calls" << std::setw(data_width) << "Total"; + if (g_state == ProfilerState::kAll) { + std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)" + << std::setw(data_width * 2) << "GPU Time (Ratio)"; + } + std::cout << std::setw(data_width) << "Min." << std::setw(data_width) + << "Max." << std::setw(data_width) << "Ave." + << std::setw(data_width) << "Ratio." << std::endl; } - std::cout << std::setw(data_width) << "Min." << std::setw(data_width) - << "Max." << std::setw(data_width) << "Ave." - << std::setw(data_width) << "Ratio." 
<< std::endl; + if (print_depth >= 100) return; + int print_depth_next = print_depth; for (size_t i = 0; i < events_table.size(); ++i) { for (size_t j = 0; j < events_table[i].size(); ++j) { - const EventItem &event_item = events_table[i][j]; - std::cout << std::setw(name_width) << event_item.name - << std::setw(data_width) << event_item.calls - << std::setw(data_width) << event_item.total_time; + auto event_item = events_table[i][j]; + std::vector> child_table; + std::vector table; + bool do_next = false; + std::string op_end_str = "_op"; + for (auto it = child_map.begin(); it != child_map.end(); it++) { + if (it->first == event_item.name) { + table.push_back(it->second); + do_next = it->second.name.rfind(op_end_str) == + (it->second.name.length() - op_end_str.length()); + } + } + child_table.push_back(table); + + if (do_next) + print_depth_next = print_depth + 1; + else + print_depth_next = print_depth + 100; + + auto name_len = event_item.name.length(); + std::string print_name = event_item.name.substr(remove_len, name_len); + std::string delimiter; + for (int i = 0; i < print_depth; i++) { + delimiter = " " + delimiter; + } + print_name = delimiter + print_name; + size_t pos = 0; + while ((pos = print_name.find(op_end_str)) != std::string::npos) { + print_name.erase(pos, op_end_str.length()); + } + std::cout << std::setw(name_width) << print_name << std::setw(data_width) + << event_item.calls << std::setw(data_width) + << event_item.total_time; if (g_state == ProfilerState::kAll) { std::cout << std::setw(data_width * 2) << string::Sprintf( @@ -376,19 +415,15 @@ void PrintProfiler(const std::vector> &events_table, << std::setw(data_width) << event_item.max_time << std::setw(data_width) << event_item.ave_time << std::setw(data_width) << event_item.ratio << std::endl; + PrintProfiler(child_table, child_map, sorted_domain, name_width, + data_width, merge_thread, print_depth_next, 0); } } - std::cout << std::endl; } -// Parse the event list and output the profiling report -void ParseEvents(const std::vector> &events, - bool merge_thread, - EventSortingKey sorted_by = EventSortingKey::kDefault) { - if (g_state == ProfilerState::kDisabled) return; - if (merge_thread && events.size() < 2) return; - - std::string sorted_domain; +std::function SetSortedFunc( + EventSortingKey sorted_by, std::string *domain) { + auto sorted_domain = *domain; std::function sorted_func; switch (sorted_by) { case EventSortingKey::kCalls: @@ -436,6 +471,88 @@ void ParseEvents(const std::vector> &events, default: sorted_domain = "event first end time"; } + return sorted_func; +} + +void SetEvent(bool merge_thread, Event analyze_event, size_t *max_name_width, + std::list *pushed_events, + std::vector *event_items, + std::unordered_map *event_idx) { + if (analyze_event.type() == EventType::kPushRange) { + pushed_events->push_back(analyze_event); + } else if (analyze_event.type() == EventType::kPopRange) { + std::list::reverse_iterator rit = pushed_events->rbegin(); + while (rit != pushed_events->rend() && + rit->name() != analyze_event.name()) { + ++rit; + } + // to find the father name event name + + if (rit != pushed_events->rend()) { + double event_time = 0; + double gpu_time = 0.0f; +#ifdef PADDLE_WITH_CUDA + gpu_time = rit->CudaElapsedMs(analyze_event); +#endif + double cpu_time = rit->CpuElapsedMs(analyze_event); + if (g_state == ProfilerState::kCUDA) { + event_time = gpu_time; + } else if (g_state == ProfilerState::kCPU) { + event_time = cpu_time; + } else { + event_time = gpu_time + cpu_time; + } + + std::string 
event_name; + if (merge_thread) { + event_name = rit->name(); + *max_name_width = std::max(*max_name_width, event_name.size()); + } else { + event_name = + "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); + *max_name_width = std::max(*max_name_width, event_name.size()); + } + + if (event_idx->find(event_name) == event_idx->end()) { + event_idx->insert({event_name, event_items->size()}); + EventItem event_item = {event_name, 1, event_time, + event_time, event_time, event_time, + cpu_time, gpu_time, 0.}; + event_items->push_back(event_item); + } else { + int index = event_idx->at(event_name); + event_items->at(index).calls += 1; + // total time + event_items->at(index).total_time += event_time; + // min time + event_items->at(index).min_time = + std::min(event_time, event_items->at(index).min_time); + // max time + event_items->at(index).max_time = + std::max(event_time, event_items->at(index).max_time); + event_items->at(index).gpu_time += gpu_time; + event_items->at(index).cpu_time += cpu_time; + } + + // remove the push marker from the list + pushed_events->erase((++rit).base()); + } else { + LOG(WARNING) << "Cannot find the push marker of event \'" + << analyze_event.name() + << "\', which will be ignored in profiling report."; + } + } +} +// Parse the event list and output the profiling report +void ParseEvents(const std::vector> &events, + bool merge_thread, + EventSortingKey sorted_by = EventSortingKey::kDefault) { + if (g_state == ProfilerState::kDisabled) return; + if (merge_thread && events.size() < 2) return; + + std::string sorted_domain; + std::function sorted_func; + sorted_func = SetSortedFunc(sorted_by, &sorted_domain); const std::vector> *analyze_events; std::vector> merged_events_list; @@ -453,91 +570,65 @@ void ParseEvents(const std::vector> &events, } std::vector> events_table; + std::multimap child_map; size_t max_name_width = 0; + for (size_t i = 0; i < (*analyze_events).size(); i++) { double total = 0.; // the total time in one thread std::list pushed_events; std::vector event_items; + std::vector main_event_items; std::unordered_map event_idx; for (size_t j = 0; j < (*analyze_events)[i].size(); j++) { - if ((*analyze_events)[i][j].type() == EventType::kPushRange) { - pushed_events.push_back((*analyze_events)[i][j]); - } else if ((*analyze_events)[i][j].type() == EventType::kPopRange) { - std::list::reverse_iterator rit = pushed_events.rbegin(); - while (rit != pushed_events.rend() && - rit->name() != (*analyze_events)[i][j].name()) { - ++rit; - } + Event analyze_event = (*analyze_events)[i][j]; + SetEvent(merge_thread, analyze_event, &max_name_width, &pushed_events, + &event_items, &event_idx); + } - if (rit != pushed_events.rend()) { - double event_time = 0; - double gpu_time = 0.0f; -#ifdef PADDLE_WITH_CUDA - gpu_time = rit->CudaElapsedMs((*analyze_events)[i][j]); -#endif - double cpu_time = rit->CpuElapsedMs((*analyze_events)[i][j]); - if (g_state == ProfilerState::kCUDA) { - event_time = gpu_time; - } else if (g_state == ProfilerState::kCPU) { - event_time = cpu_time; - } else { - event_time = gpu_time + cpu_time; - } - - total += event_time; - - std::string event_name; - if (merge_thread) { - event_name = rit->name(); - max_name_width = std::max(max_name_width, event_name.size()); - } else { - event_name = "thread" + std::to_string(rit->thread_id()) + "::" + - rit->name(); - max_name_width = std::max(max_name_width, event_name.size()); - } - - if (event_idx.find(event_name) == event_idx.end()) { - event_idx[event_name] = event_items.size(); - 
EventItem event_item = {event_name, 1, event_time, - event_time, event_time, event_time, - cpu_time, gpu_time, 0.}; - event_items.push_back(event_item); - } else { - int index = event_idx[event_name]; - event_items[index].calls += 1; - // total time - event_items[index].total_time += event_time; - // min time - event_items[index].min_time = - std::min(event_time, event_items[index].min_time); - // max time - event_items[index].max_time = - std::max(event_time, event_items[index].max_time); - event_items[index].gpu_time += gpu_time; - event_items[index].cpu_time += cpu_time; - } - - // remove the push marker from the list - pushed_events.erase((++rit).base()); - } else { - LOG(WARNING) << "Cannot find the push marker of event \'" - << (*analyze_events)[i][j].name() - << "\', which will be ignored in profiling report."; + auto table_size = event_items.size(); + std::vector child_index(table_size, 0); + for (size_t j = 0; j < table_size; ++j) { + std::string fname = event_items[j].name; + std::string grad_name = event_items[j].name + "_grad"; + for (size_t k = 0; k < table_size; ++k) { + std::string cname = event_items[k].name; + bool condition = cname.length() > fname.length() && + cname.rfind(fname, 0) == 0 && + !cname.rfind(grad_name, 0) == 0 && + (cname[fname.length()] == '/' && + cname.rfind('/') == fname.length()); + if (condition) { + child_map.insert( + std::pair(fname, event_items[k])); + child_index[k] = 1; } } } + + for (size_t j = 0; j < table_size; ++j) { + if (child_index[j] == 0) { + main_event_items.push_back(event_items[j]); + total += event_items[j].total_time; + } + } + // average time - for (auto &item : event_items) { + for (auto &item : main_event_items) { item.ave_time = item.total_time / item.calls; item.ratio = item.total_time / total; } + for (auto it = child_map.begin(); it != child_map.end(); it++) { + it->second.ratio = it->second.total_time / total; + it->second.ave_time = it->second.ave_time / it->second.calls; + } + // sort if (sorted_by != EventSortingKey::kDefault) { - std::sort(event_items.begin(), event_items.end(), sorted_func); + std::sort(main_event_items.begin(), main_event_items.end(), sorted_func); } - events_table.push_back(event_items); + events_table.push_back(main_event_items); // log warning if there are events with `push` but without `pop` std::list::reverse_iterator rit = pushed_events.rbegin(); while (rit != pushed_events.rend()) { @@ -548,8 +639,8 @@ void ParseEvents(const std::vector> &events, } // Print report - PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12, - merge_thread); + PrintProfiler(events_table, child_map, sorted_domain, max_name_width + 8, 12, + merge_thread, 0, 0); } struct MemoryProfierReport { @@ -630,6 +721,28 @@ void DisableProfiler(EventSortingKey sorted_key, } std::vector> all_events = GetAllEvents(); + + std::vector op_name; + for (size_t i = 0; i < (all_events).size(); i++) { + for (size_t j = 0; j < (all_events)[i].size(); j++) { + std::string event_name = (all_events)[i][j].name(); + size_t start = event_name.find('%', 0); + size_t end = event_name.rfind('%', event_name.length() - 1); + if (start == std::string::npos || end == std::string::npos) continue; + std::string search_str = event_name.substr(start, end - start + 1); + auto it = find(op_name.begin(), op_name.end(), search_str); + std::string replace_str; + if (it != op_name.end()) { + replace_str = std::to_string(std::distance(op_name.begin(), it)); + } else { + replace_str = std::to_string(op_name.size()); + op_name.push_back(search_str); + } 
+ event_name.replace(start, end - start + 1, replace_str); + (all_events)[i][j].set_name(event_name); + } + } + ParseEvents(all_events, true, sorted_key); ParseEvents(all_events, false, sorted_key); if (VLOG_IS_ON(5)) { -- GitLab
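For reference, the reworked ParseEvents/PrintProfiler above aggregate events into per-name items, group them into a parent-to-children multimap by matching the "parent/child" name prefix produced by SetCurAnnotation, and then print each child indented one level under its parent. The following is a simplified standalone sketch of that grouping-and-indented-printing step, using hypothetical Item/IsChildOf/Print names rather than the Paddle implementation (the grad-name exclusion and column formatting are omitted):

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Item {
  std::string name;  // hierarchical name, e.g. "conv2d/compute"
  double total_ms;
};

// A child is an item named "<parent>/<leaf>" with exactly one more level.
static bool IsChildOf(const std::string& child, const std::string& parent) {
  return child.size() > parent.size() + 1 &&
         child.compare(0, parent.size(), parent) == 0 &&
         child[parent.size()] == '/' &&
         child.find('/', parent.size() + 1) == std::string::npos;
}

static void Print(const std::vector<Item>& items,
                  const std::multimap<std::string, Item>& children,
                  const std::string& parent, int depth) {
  for (const auto& it : items) {
    // Indent by depth and strip the parent prefix from the printed name.
    std::cout << std::string(depth * 2, ' ')
              << it.name.substr(parent.empty() ? size_t{0} : parent.size() + 1)
              << "  " << it.total_ms << " ms\n";
    std::vector<Item> sub;
    auto range = children.equal_range(it.name);
    for (auto c = range.first; c != range.second; ++c) sub.push_back(c->second);
    Print(sub, children, it.name, depth + 1);
  }
}

int main() {
  std::vector<Item> all = {{"conv2d", 12.0},
                           {"conv2d/prepare_data", 1.0},
                           {"conv2d/infer_shape", 2.0},
                           {"conv2d/compute", 9.0}};
  std::multimap<std::string, Item> children;
  std::vector<Item> roots;
  for (const auto& item : all) {
    bool is_child = false;
    for (const auto& maybe_parent : all) {
      if (IsChildOf(item.name, maybe_parent.name)) {
        children.insert({maybe_parent.name, item});
        is_child = true;
      }
    }
    if (!is_child) roots.push_back(item);
  }
  Print(roots, children, "", 0);  // children appear indented under their parent
}
```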
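The DisableProfiler change above shortens event names before parsing: the fragment delimited by the first and last '%' in a name is interned into an op_name table and replaced by its index, so a long fragment that repeats across many events prints as a small number. A compact sketch of that interning step is below; the helper name and the sample event names are hypothetical:

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Replace the '%...%' fragment in `name` (if any) with the index of that
// fragment in `table`, appending it to `table` the first time it is seen.
std::string InternMarkedFragment(std::string name, std::vector<std::string>* table) {
  size_t start = name.find('%');
  size_t end = name.rfind('%');
  if (start == std::string::npos || end == std::string::npos || end <= start) return name;
  std::string fragment = name.substr(start, end - start + 1);
  auto it = std::find(table->begin(), table->end(), fragment);
  size_t index =
      it == table->end() ? table->size() : static_cast<size_t>(it - table->begin());
  if (it == table->end()) table->push_back(fragment);
  return name.replace(start, end - start + 1, std::to_string(index));
}

int main() {
  std::vector<std::string> table;
  // Same fragment maps to the same index across events.
  std::cout << InternMarkedFragment("fc%fc_0.w_0%_op/compute", &table) << "\n";      // fc0_op/compute
  std::cout << InternMarkedFragment("fc%fc_0.w_0%_op/infer_shape", &table) << "\n";  // fc0_op/infer_shape
}
```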