From 72ff5a09c34b2c63266ff5e354516a83f96b4664 Mon Sep 17 00:00:00 2001 From: Zhang Ting <709968123@qq.com> Date: Tue, 3 Mar 2020 23:50:45 +0800 Subject: [PATCH] fix print bug of profile, test=develop (#22804) --- paddle/fluid/framework/operator.cc | 8 ++--- paddle/fluid/platform/event.h | 12 ++++++- paddle/fluid/platform/profiler.cc | 13 ++++---- paddle/fluid/platform/profiler.h | 13 +++----- paddle/fluid/platform/profiler_helper.h | 44 +++++++++++++++++++------ paddle/fluid/platform/profiler_test.cc | 5 +-- 6 files changed, 63 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 57bae3545c4..307279ebf54 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -173,7 +173,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { platform::RecordEvent op_type_record_event(Type()); auto op_name = platform::OpName(outputs_, Type()); platform::RecordEvent op_name_record_event( - op_name, platform::RecordRole::kUniqueOp); + op_name, platform::EventRole::kUniqueOp); RunImpl(scope, place); } @@ -957,7 +957,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, Scope* transfer_scope = nullptr; { platform::RecordEvent record_event("prepare_data", - platform::RecordRole::kInnerOp); + platform::EventRole::kInnerOp); transfer_scope = PrepareData(scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx); } @@ -971,7 +971,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, if (!all_kernels_must_compute_runtime_shape_) { platform::RecordEvent record_event("infer_shape", - platform::RecordRole::kInnerOp); + platform::EventRole::kInnerOp); RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx); this->InferShape(&infer_shape_ctx); } @@ -984,7 +984,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // not Scope. Imperative mode only pass inputs and get outputs. { platform::RecordEvent record_event("compute", - platform::RecordRole::kInnerOp); + platform::EventRole::kInnerOp); (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx, kernel_configs)); } diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 2fd02ac3a9e..ed2c05af453 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -25,18 +25,27 @@ namespace platform { enum class EventType { kMark, kPushRange, kPopRange }; +enum class EventRole { + kOrdinary, // only record op time with op type key + kInnerOp, // record op detail time with op type key + kUniqueOp, // record op detail time with op unique name key +}; + class Event { public: // The DeviceContext is used to get the cuda stream. // If CPU profiling mode, can pass nullptr. - Event(EventType type, std::string name, uint32_t thread_id); + Event(EventType type, std::string name, uint32_t thread_id, + EventRole role = EventRole::kOrdinary); const EventType& type() const; Event* parent() const { return parent_; } void set_parent(Event* parent) { parent_ = parent; } std::string name() const { return name_; } + EventRole role() const { return role_; } uint32_t thread_id() const { return thread_id_; } void set_name(std::string name) { name_ = name; } + void set_role(EventRole role) { role_ = role; } #ifdef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUPTI @@ -53,6 +62,7 @@ class Event { std::string name_{}; Event* parent_{nullptr}; uint32_t thread_id_; + EventRole role_{}; int64_t cpu_ns_; bool visited_status_{false}; #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index eae94926b92..394fdf13fa8 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -42,8 +42,9 @@ namespace platform { MemEvenRecorder MemEvenRecorder::recorder; -Event::Event(EventType type, std::string name, uint32_t thread_id) - : type_(type), name_(name), thread_id_(thread_id) { +Event::Event(EventType type, std::string name, uint32_t thread_id, + EventRole role) + : type_(type), name_(name), thread_id_(thread_id), role_(role) { cpu_ns_ = GetTimeInNsec(); } @@ -62,12 +63,12 @@ double Event::CudaElapsedMs(const Event &e) const { #endif } -RecordEvent::RecordEvent(const std::string &name, const RecordRole role) +RecordEvent::RecordEvent(const std::string &name, const EventRole role) : is_enabled_(false), start_ns_(PosixInNsec()), role_(role) { if (g_state == ProfilerState::kDisabled || name.empty()) return; // lock is not needed, the code below is thread-safe is_enabled_ = true; - Event *e = PushEvent(name); + Event *e = PushEvent(name, role); // Maybe need the same push/pop behavior. SetCurAnnotation(e); name_ = e->name(); @@ -179,8 +180,8 @@ void Mark(const std::string &name) { GetEventList().Record(EventType::kMark, name, g_thread_id); } -Event *PushEvent(const std::string &name) { - return GetEventList().Record(EventType::kPushRange, name, g_thread_id); +Event *PushEvent(const std::string &name, const EventRole role) { + return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role); } void PopEvent(const std::string &name) { diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 0a509a7afc4..9eddee2ab67 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -43,12 +43,6 @@ enum class ProfilerState { kAll, // Profile both CPU and GPU. (Currently experimental). }; -enum class RecordRole { - kOrdinary, // only record op time with op type key - kInnerOp, // record op detail time with op type key - kUniqueOp, // record op detail time with op unique name key -}; - // it is the flag to control to print the profiling result enum class TracerOption { kDefault, // print the different op type profiling result @@ -86,6 +80,7 @@ struct EventItem { double cpu_time; double gpu_time; float ratio; + EventRole role; }; struct OverHead { @@ -128,7 +123,7 @@ struct MemEvenRecorder { struct RecordEvent { RecordEvent(const std::string& name, - const RecordRole role = RecordRole::kOrdinary); + const EventRole role = EventRole::kOrdinary); ~RecordEvent(); @@ -139,7 +134,7 @@ struct RecordEvent { // Need to distinguish name by op type, block_id, program_id and perhaps // different kernel invocations within an op. std::string full_name_; - RecordRole role_{RecordRole::kOrdinary}; + EventRole role_{EventRole::kOrdinary}; }; class RecordRPCEvent { @@ -201,7 +196,7 @@ void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, const Place& place, const std::string& annotation); void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, const Place& place, const std::string& annotation); -Event* PushEvent(const std::string& name); +Event* PushEvent(const std::string& name, const EventRole role); void PopEvent(const std::string& name); // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 91a53178bfd..c90f815e914 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -304,9 +304,9 @@ void SetEvent(bool merge_thread, Event analyze_event, size_t *max_name_width, if (event_idx->find(event_name) == event_idx->end()) { event_idx->insert({event_name, event_items->size()}); - EventItem event_item = {event_name, 1, event_time, - event_time, event_time, event_time, - cpu_time, gpu_time, 0.}; + EventItem event_item = {event_name, 1, event_time, event_time, + event_time, event_time, cpu_time, gpu_time, + 0., rit->role()}; event_items->push_back(event_item); } else { int index = event_idx->at(event_name); @@ -335,8 +335,10 @@ void SetEvent(bool merge_thread, Event analyze_event, size_t *max_name_width, void ComputeOverhead(const std::multimap &sub_child_map, OverHead *overhead) { - EventItem memcpy_async = {"GpuMemcpyAsync", 0, 0., 0., 0., 0., 0., 0., 0.0f}; - EventItem memcpy_sync = {"GpuMemcpySync", 0, 0., 0., 0., 0., 0., 0., 0.0f}; + EventItem memcpy_async = { + "GpuMemcpyAsync", 0, 0., 0., 0., 0., 0., 0., 0.0f, EventRole::kOrdinary}; + EventItem memcpy_sync = {"GpuMemcpySync", 0, 0., 0., 0., 0., 0., 0., 0.0f, + EventRole::kOrdinary}; for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) { if (it->second.name.find("compute") != std::string::npos) { overhead->compute_ratio += it->second.ratio; @@ -359,6 +361,29 @@ void ComputeOverhead(const std::multimap &sub_child_map, overhead->sub_memcpy_items = {memcpy_async, memcpy_sync}; } +std::string FindOrdinaryParent( + const std::multimap &sub_child_map, + std::string name) { + bool find_name = false; + std::string parent = name; + EventRole role; + for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) { + if (it->second.name == name) { + role = it->second.role; + parent = it->first; + find_name = true; + break; + } + } + if (find_name && role == EventRole::kOrdinary) { + return name; + } else if (find_name && role != EventRole::kOrdinary) { + return FindOrdinaryParent(sub_child_map, parent); + } else { + return parent; + } +} + // When TracerOption is KDefault, OpDetail will be recorded but only default // profile result will be printed. // GpuMemcpy should be printed in kDefault setting, however it offten occurs @@ -376,11 +401,7 @@ void GetChildMap(const std::multimap &sub_child_map, } else { for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) { if (it->second.name.find("GpuMemcpy") != std::string::npos) { - std::string parent_name = it->first; - auto left_pos = it->first.find("/"); - if (left_pos != std::string::npos) { - parent_name = it->first.substr(0, left_pos); - } + std::string parent_name = FindOrdinaryParent(sub_child_map, it->first); auto item = it->second; auto right_pos = item.name.rfind("/"); if (right_pos != std::string::npos) { @@ -389,6 +410,9 @@ void GetChildMap(const std::multimap &sub_child_map, item.name = parent_name + "/" + child_name; } child_map->insert(std::pair(parent_name, item)); + } else if (it->second.role == EventRole::kOrdinary) { + child_map->insert( + std::pair(it->first, it->second)); } } } diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index a851488e72d..daa69a77b42 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -40,6 +40,7 @@ TEST(RecordEvent, RecordEvent) { using paddle::platform::PopEvent; using paddle::platform::ProfilerState; using paddle::platform::EventSortingKey; + using paddle::platform::EventRole; ProfilerState state = ProfilerState::kCPU; EnableProfiler(state); @@ -55,7 +56,7 @@ TEST(RecordEvent, RecordEvent) { for (int loop = 0; loop < 3; ++loop) { for (int i = 1; i < 5; ++i) { std::string name = "op_" + std::to_string(i); - PushEvent(name); + PushEvent(name, EventRole::kOrdinary); int counter = 1; while (counter != i * 1000) counter++; PopEvent(name); @@ -107,7 +108,7 @@ TEST(RecordEvent, RecordEvent) { } // Bad Usage: - PushEvent("event_without_pop"); + PushEvent("event_without_pop", EventRole::kOrdinary); PopEvent("event_without_push"); std::vector> events = paddle::platform::GetAllEvents(); -- GitLab