From dbfe5333c59ab01c374de258cdfc565cc091db34 Mon Sep 17 00:00:00 2001 From: wangchaochaohu Date: Mon, 25 May 2020 15:18:26 +0800 Subject: [PATCH] Add pe profiler Event (#24611) --- paddle/fluid/framework/parallel_executor.cc | 3 ++ paddle/fluid/platform/device_tracer.cc | 43 +++++++++++++++-- paddle/fluid/platform/device_tracer.h | 2 +- paddle/fluid/platform/event.h | 1 + paddle/fluid/platform/profiler.cc | 9 ++-- paddle/fluid/platform/profiler.h | 2 +- paddle/fluid/platform/profiler_helper.h | 52 ++++++++++++++++++--- paddle/fluid/platform/profiler_test.cc | 4 +- 8 files changed, 98 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ea36e83038..8c6dd628bb 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -31,6 +31,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h" +#include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/profiler.h" DECLARE_double(eager_delete_tensor_gb); @@ -820,6 +821,8 @@ void ParallelExecutor::BCastParamsToDevices( FetchResultType ParallelExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { VLOG(3) << "enter ParallelExecutor Run"; + platform::RecordEvent parallel_executor_event( + "ParallelExecutor::Run", paddle::platform::EventRole::kSpecial); #ifdef WITH_GPERFTOOLS if (gProfileStarted) { ProfilerFlush(); diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 4679b7e1da..6be8ed25e3 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -40,6 +40,9 @@ namespace { thread_local std::deque block_id_stack; // Tracking the nested event stacks. thread_local std::deque annotation_stack; +// stack to strore event sunch as pe and so on +static std::deque main_thread_annotation_stack{}; +static std::deque main_thread_annotation_stack_name{}; std::map system_thread_id_map; @@ -638,15 +641,49 @@ DeviceTracer *GetDeviceTracer() { return tracer; } -void SetCurAnnotation(Event *event) { - if (!annotation_stack.empty()) { +std::string SetCurAnnotation(Event *event) { + std::string ret; + if (!annotation_stack.empty() && event->role() != EventRole::kSpecial) { event->set_parent(annotation_stack.back()); event->set_name(annotation_stack.back()->name() + "/" + event->name()); } + annotation_stack.push_back(event); + + if (!main_thread_annotation_stack_name.empty() && !annotation_stack.empty() && + main_thread_annotation_stack.back()->thread_id() != + annotation_stack.back()->thread_id()) { + ret = main_thread_annotation_stack_name.back() + "/" + event->name(); + } else { + ret = event->name(); + } + if (event->role() == EventRole::kSpecial) { + std::string name = event->name(); + if (!main_thread_annotation_stack_name.empty()) { + name = main_thread_annotation_stack_name.back() + "/" + event->name(); + } + main_thread_annotation_stack_name.push_back(name); + main_thread_annotation_stack.push_back(event); + } + + return ret; } -void ClearCurAnnotation() { annotation_stack.pop_back(); } +void ClearCurAnnotation() { + if (!main_thread_annotation_stack_name.empty() && !annotation_stack.empty() && + main_thread_annotation_stack.back()->thread_id() != + annotation_stack.back()->thread_id()) { + annotation_stack.back()->set_name(main_thread_annotation_stack_name.back() + + "/" + annotation_stack.back()->name()); + } + if (!main_thread_annotation_stack.empty() && + main_thread_annotation_stack.back()->name() == + annotation_stack.back()->name()) { + main_thread_annotation_stack_name.pop_back(); + main_thread_annotation_stack.pop_back(); + } + annotation_stack.pop_back(); +} Event *CurAnnotation() { if (annotation_stack.empty()) return nullptr; diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 85168a046f..44b7af149e 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -137,7 +137,7 @@ class DeviceTracer { DeviceTracer* GetDeviceTracer(); // Set a name for the cuda kernel operation being launched by the thread. -void SetCurAnnotation(Event* event); +std::string SetCurAnnotation(Event* event); // Clear the name after the operation is done. void ClearCurAnnotation(); // Current name of the operation being run in the thread. diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index ed2c05af45..9a482a63f5 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -29,6 +29,7 @@ enum class EventRole { kOrdinary, // only record op time with op type key kInnerOp, // record op detail time with op type key kUniqueOp, // record op detail time with op unique name key + kSpecial, // record event such as PE which is outer of thread local }; class Event { diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index a36d8456ee..be655255bd 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -73,8 +73,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) { // lock is not needed, the code below is thread-safe Event *e = PushEvent(name, role); // Maybe need the same push/pop behavior. - SetCurAnnotation(e); - name_ = e->name(); + name_ = SetCurAnnotation(e); } RecordEvent::~RecordEvent() { @@ -86,7 +85,7 @@ RecordEvent::~RecordEvent() { BlockDepth(), g_thread_id); } ClearCurAnnotation(); - PopEvent(name_); + PopEvent(name_, role_); } void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, @@ -187,8 +186,8 @@ Event *PushEvent(const std::string &name, const EventRole role) { return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role); } -void PopEvent(const std::string &name) { - GetEventList().Record(EventType::kPopRange, name, g_thread_id); +void PopEvent(const std::string &name, const EventRole role) { + GetEventList().Record(EventType::kPopRange, name, g_thread_id, role); } void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled, diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index dcc9f1eee1..07844713ea 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -197,7 +197,7 @@ void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, const Place& place, const std::string& annotation); Event* PushEvent(const std::string& name, const EventRole role); -void PopEvent(const std::string& name); +void PopEvent(const std::string& name, const EventRole role); // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. std::vector> GetAllEvents(); diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 367676e77e..9d99022f15 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -22,12 +22,12 @@ limitations under the License. */ #include #include // NOLINT #include +#include #include #include #include #include #include - #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA @@ -283,7 +283,8 @@ std::function SetSortedFunc( void SetEvent(bool merge_thread, const Event &analyze_event, size_t *max_name_width, std::list *pushed_events, std::vector *event_items, - std::unordered_map *event_idx) { + std::unordered_map *event_idx, + const std::set &main_thread_event_name) { if (analyze_event.type() == EventType::kPushRange) { pushed_events->push_back(analyze_event); } else if (analyze_event.type() == EventType::kPopRange) { @@ -313,8 +314,35 @@ void SetEvent(bool merge_thread, const Event &analyze_event, if (merge_thread) { event_name = rit->name(); } else { - event_name = - "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); + if (!main_thread_event_name.empty()) { + auto origin_name = rit->name(); + int index = 1; + int split_pos = 0; + while ((split_pos = FindNthReversePos(origin_name, '/', index)) != + -1) { + auto prefix_str = origin_name.substr(0, split_pos); + if (main_thread_event_name.count(prefix_str)) { + break; + } + index++; + } + if (split_pos == -1 && !main_thread_event_name.count(rit->name())) { + event_name = "thread" + std::to_string(rit->thread_id()) + "::" + + rit->name(); + } else { + if (!main_thread_event_name.count(rit->name())) { + event_name = + origin_name.substr(0, split_pos + 1) + "thread" + + std::to_string(rit->thread_id()) + "::" + + origin_name.substr(split_pos + 1, origin_name.length() - 1); + } else { + event_name = rit->name(); + } + } + } else { + event_name = + "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); + } } auto print_name_size = event_name.size(); int found_pos = 0; @@ -608,6 +636,16 @@ void AnalyzeEvent( std::function sorted_func, EventSortingKey sorted_by, size_t *max_name_width, OverHead *overhead, bool merge_thread) { + // In oreder to deal with special event in main thread + std::set main_thread_event_name; + for (size_t i = 0; i < (*analyze_events).size(); i++) { + for (size_t j = 0; j < (*analyze_events)[i].size(); j++) { + Event event = (*analyze_events)[i][j]; + if (event.role() == EventRole::kSpecial) { + main_thread_event_name.insert(event.name()); + } + } + } for (size_t i = 0; i < (*analyze_events).size(); i++) { double total = 0.; // the total time in one thread std::list pushed_events; @@ -618,8 +656,10 @@ void AnalyzeEvent( for (size_t j = 0; j < (*analyze_events)[i].size(); j++) { Event analyze_event = (*analyze_events)[i][j]; - SetEvent(merge_thread, analyze_event, max_name_width, &pushed_events, - &event_items, &event_idx); + if (!(analyze_event.role() == EventRole::kSpecial && !merge_thread)) { + SetEvent(merge_thread, analyze_event, max_name_width, &pushed_events, + &event_items, &event_idx, main_thread_event_name); + } } auto table_size = event_items.size(); diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index daa69a77b4..d8bfa5445c 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -59,7 +59,7 @@ TEST(RecordEvent, RecordEvent) { PushEvent(name, EventRole::kOrdinary); int counter = 1; while (counter != i * 1000) counter++; - PopEvent(name); + PopEvent(name, EventRole::kOrdinary); } } @@ -109,7 +109,7 @@ TEST(RecordEvent, RecordEvent) { // Bad Usage: PushEvent("event_without_pop", EventRole::kOrdinary); - PopEvent("event_without_push"); + PopEvent("event_without_push", EventRole::kOrdinary); std::vector> events = paddle::platform::GetAllEvents(); int cuda_startup_count = 0; -- GitLab