From c3876cf82d314dc8adc7c2758179b1bd62fd0acf Mon Sep 17 00:00:00 2001
From: wangchaochaohu
Date: Thu, 9 Jan 2020 13:23:50 +1400
Subject: [PATCH] add support for nested profiling event and printing in different level (#22061)

* add support for nested profiling event and printing in different level
---
 paddle/fluid/framework/operator.cc     |  28 +-
 paddle/fluid/platform/device_tracer.cc |  18 +-
 paddle/fluid/platform/event.h          |   7 +-
 paddle/fluid/platform/profiler.cc      | 345 ++++++++++++++++---------
 4 files changed, 267 insertions(+), 131 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 2cfc4d71a38..42095c7bdc3 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -166,16 +166,11 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #endif
     }
 
-    // The profile has a process-wide mutex, results in serious performance
-    // issue
-    // in concurrency scenerio. Here use an `if` to fix this issue.
-    // Please not remove the `if`, ask @Superjomn if there are any concern.
-    if (platform::IsProfileEnabled()) {
-      platform::RecordEvent record_event(Type());
-      RunImpl(scope, place);
-    } else {
+    {
+      platform::RecordEvent record_event(Type() + "_op");
       RunImpl(scope, place);
     }
+
     VLOG(3) << place << " " << DebugStringEx(&scope);
   } catch (platform::EnforceNotMet& exception) {
     framework::InsertCallStackInfo(Type(), Attrs(), &exception);
@@ -953,9 +948,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   // do data transformScope &transfer_scope;
   std::vector<std::string> transfered_inplace_vars;
-  auto* transfer_scope =
-      PrepareData(scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx);
-
+  Scope* transfer_scope = nullptr;
+  {
+    platform::RecordEvent record_event("prepare_data");
+    transfer_scope = PrepareData(scope, *kernel_type_, &transfered_inplace_vars,
+                                 runtime_ctx);
+  }
   // exec scope is the scope that kernel actually executed on.
   const Scope& exec_scope =
       (transfer_scope == nullptr ? scope : *transfer_scope);
@@ -965,6 +963,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 
   if (!all_kernels_must_compute_runtime_shape_) {
+    platform::RecordEvent record_event("infer_shape");
     RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx);
     this->InferShape(&infer_shape_ctx);
   }
@@ -975,8 +974,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
   // not Scope. Imperative mode only pass inputs and get outputs.
-  (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
-                                   kernel_configs));
+  {
+    platform::RecordEvent record_event("compute");
+    (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
+                                     kernel_configs));
+  }
 
   if (!transfered_inplace_vars.empty()) {
     // there is inplace variable has been transfered.
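The hunks above rely on platform::RecordEvent being an RAII guard: its constructor pushes an event onto the current annotation stack and its destructor pops it, so wrapping prepare_data, infer_shape and compute in extra braces nests one timed sub-event under the surrounding <op>_op event, with the parent name prefixed via "/". Below is a minimal standalone sketch of that idea, assuming a hypothetical ScopedEvent class and a thread-local name stack rather than the actual Paddle types:

```cpp
#include <chrono>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for platform::RecordEvent: a scoped, nestable timer.
class ScopedEvent {
 public:
  explicit ScopedEvent(const std::string& name)
      : start_(std::chrono::steady_clock::now()) {
    // Build a hierarchical name from whatever event is currently open.
    name_ = stack().empty() ? name : stack().back() + "/" + name;
    stack().push_back(name_);
  }
  ~ScopedEvent() {
    auto end = std::chrono::steady_clock::now();
    auto us =
        std::chrono::duration_cast<std::chrono::microseconds>(end - start_).count();
    std::cout << name_ << " took " << us << " us\n";
    stack().pop_back();
  }

 private:
  static std::vector<std::string>& stack() {
    thread_local std::vector<std::string> s;  // one annotation stack per thread
    return s;
  }
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  ScopedEvent op("conv2d_op");        // outer event, like Type() + "_op"
  { ScopedEvent e("prepare_data"); }  // reported as "conv2d_op/prepare_data"
  { ScopedEvent e("infer_shape"); }   // reported as "conv2d_op/infer_shape"
  { ScopedEvent e("compute"); }       // reported as "conv2d_op/compute"
}
```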
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 9aa6aa44b28..74ab56c07cf 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -446,6 +446,11 @@ class DeviceTracerImpl : public DeviceTracer { auto c = correlations_.find(r.correlation_id); if (c != correlations_.end() && c->second != nullptr) { Event *e = c->second; + Event *parent = e->parent(); + while (parent) { + parent->AddCudaElapsedTime(r.start_ns, r.end_ns); + parent = parent->parent(); + } e->AddCudaElapsedTime(r.start_ns, r.end_ns); } } @@ -453,6 +458,11 @@ class DeviceTracerImpl : public DeviceTracer { auto c = correlations_.find(r.correlation_id); if (c != correlations_.end() && c->second != nullptr) { Event *e = c->second; + Event *parent = e->parent(); + while (parent) { + parent->AddCudaElapsedTime(r.start_ns, r.end_ns); + parent = parent->parent(); + } e->AddCudaElapsedTime(r.start_ns, r.end_ns); } } @@ -622,7 +632,13 @@ DeviceTracer *GetDeviceTracer() { return tracer; } -void SetCurAnnotation(Event *event) { annotation_stack.push_back(event); } +void SetCurAnnotation(Event *event) { + if (!annotation_stack.empty()) { + event->set_parent(annotation_stack.back()); + event->set_name(annotation_stack.back()->name() + "/" + event->name()); + } + annotation_stack.push_back(event); +} void ClearCurAnnotation() { annotation_stack.pop_back(); } diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index e9bdb82a50f..f7dba693dc6 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -32,8 +32,11 @@ class Event { Event(EventType type, std::string name, uint32_t thread_id); const EventType& type() const; + Event* parent() const { return parent_; } + void set_parent(Event* parent) { parent_ = parent; } std::string name() const { return name_; } uint32_t thread_id() const { return thread_id_; } + void set_name(std::string name) { name_ = name; } #ifdef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUPTI @@ -47,9 +50,11 @@ class Event { private: EventType type_; - std::string name_; + std::string name_{}; + Event* parent_{nullptr}; uint32_t thread_id_; int64_t cpu_ns_; + bool visited_status_{false}; #ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUPTI int64_t gpu_ns_ = 0; diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 410a6db6b35..a32b89bc4ce 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include #include // NOLINT #include +#include #include #include @@ -138,14 +139,14 @@ void PopEvent(const std::string &name) { RecordEvent::RecordEvent(const std::string &name) : is_enabled_(false), start_ns_(PosixInNsec()) { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled || name.empty()) return; // lock is not needed, the code below is thread-safe is_enabled_ = true; - name_ = name; - Event *e = PushEvent(name_); + Event *e = PushEvent(name); // Maybe need the same push/pop behavior. 
SetCurAnnotation(e); + name_ = e->name(); } RecordEvent::~RecordEvent() { @@ -320,48 +321,86 @@ struct EventItem { // Print results void PrintProfiler(const std::vector> &events_table, + const std::multimap &child_map, const std::string &sorted_domain, const size_t name_width, - const size_t data_width, bool merge_thread) { - // Output header information - std::cout << "\n------------------------->" - << " Profiling Report " - << "<-------------------------\n\n"; - std::string place; - if (g_state == ProfilerState::kCPU) { - place = "CPU"; - } else if (g_state == ProfilerState::kCUDA) { - place = "CUDA"; - } else if (g_state == ProfilerState::kAll) { - place = "All"; - } else { - PADDLE_THROW("Invalid profiler state", g_state); - } + const size_t data_width, bool merge_thread, int print_depth, + int remove_len) { + if (print_depth == 0) { + // Output header information + std::cout << "\n------------------------->" + << " Profiling Report " + << "<-------------------------\n\n"; + std::string place; + if (g_state == ProfilerState::kCPU) { + place = "CPU"; + } else if (g_state == ProfilerState::kCUDA) { + place = "CUDA"; + } else if (g_state == ProfilerState::kAll) { + place = "All"; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Except profiler state must to be one of ['CPU', 'GPU' 'ALL'], but " + "received Invalid profiler state %s", + g_state)); + } - if (merge_thread) { - std::cout << "Note! This Report merge all thread info into one." - << std::endl; - } - std::cout << "Place: " << place << std::endl; - std::cout << "Time unit: ms" << std::endl; - std::cout << "Sorted by " << sorted_domain - << " in descending order in the same thread\n\n"; - // Output events table - std::cout.setf(std::ios::left); - std::cout << std::setw(name_width) << "Event" << std::setw(data_width) - << "Calls" << std::setw(data_width) << "Total"; - if (g_state == ProfilerState::kAll) { - std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)" - << std::setw(data_width * 2) << "GPU Time (Ratio)"; + if (merge_thread) { + std::cout << "Note! This Report merge all thread info into one." + << std::endl; + } + std::cout << "Place: " << place << std::endl; + std::cout << "Time unit: ms" << std::endl; + std::cout << "Sorted by " << sorted_domain + << " in descending order in the same thread\n\n"; + // Output events table + std::cout.setf(std::ios::left); + std::cout << std::setw(name_width) << "Event" << std::setw(data_width) + << "Calls" << std::setw(data_width) << "Total"; + if (g_state == ProfilerState::kAll) { + std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)" + << std::setw(data_width * 2) << "GPU Time (Ratio)"; + } + std::cout << std::setw(data_width) << "Min." << std::setw(data_width) + << "Max." << std::setw(data_width) << "Ave." + << std::setw(data_width) << "Ratio." << std::endl; } - std::cout << std::setw(data_width) << "Min." << std::setw(data_width) - << "Max." << std::setw(data_width) << "Ave." - << std::setw(data_width) << "Ratio." 
<< std::endl; + if (print_depth >= 100) return; + int print_depth_next = print_depth; for (size_t i = 0; i < events_table.size(); ++i) { for (size_t j = 0; j < events_table[i].size(); ++j) { - const EventItem &event_item = events_table[i][j]; - std::cout << std::setw(name_width) << event_item.name - << std::setw(data_width) << event_item.calls - << std::setw(data_width) << event_item.total_time; + auto event_item = events_table[i][j]; + std::vector> child_table; + std::vector table; + bool do_next = false; + std::string op_end_str = "_op"; + for (auto it = child_map.begin(); it != child_map.end(); it++) { + if (it->first == event_item.name) { + table.push_back(it->second); + do_next = it->second.name.rfind(op_end_str) == + (it->second.name.length() - op_end_str.length()); + } + } + child_table.push_back(table); + + if (do_next) + print_depth_next = print_depth + 1; + else + print_depth_next = print_depth + 100; + + auto name_len = event_item.name.length(); + std::string print_name = event_item.name.substr(remove_len, name_len); + std::string delimiter; + for (int i = 0; i < print_depth; i++) { + delimiter = " " + delimiter; + } + print_name = delimiter + print_name; + size_t pos = 0; + while ((pos = print_name.find(op_end_str)) != std::string::npos) { + print_name.erase(pos, op_end_str.length()); + } + std::cout << std::setw(name_width) << print_name << std::setw(data_width) + << event_item.calls << std::setw(data_width) + << event_item.total_time; if (g_state == ProfilerState::kAll) { std::cout << std::setw(data_width * 2) << string::Sprintf( @@ -376,19 +415,15 @@ void PrintProfiler(const std::vector> &events_table, << std::setw(data_width) << event_item.max_time << std::setw(data_width) << event_item.ave_time << std::setw(data_width) << event_item.ratio << std::endl; + PrintProfiler(child_table, child_map, sorted_domain, name_width, + data_width, merge_thread, print_depth_next, 0); } } - std::cout << std::endl; } -// Parse the event list and output the profiling report -void ParseEvents(const std::vector> &events, - bool merge_thread, - EventSortingKey sorted_by = EventSortingKey::kDefault) { - if (g_state == ProfilerState::kDisabled) return; - if (merge_thread && events.size() < 2) return; - - std::string sorted_domain; +std::function SetSortedFunc( + EventSortingKey sorted_by, std::string *domain) { + auto sorted_domain = *domain; std::function sorted_func; switch (sorted_by) { case EventSortingKey::kCalls: @@ -436,6 +471,88 @@ void ParseEvents(const std::vector> &events, default: sorted_domain = "event first end time"; } + return sorted_func; +} + +void SetEvent(bool merge_thread, Event analyze_event, size_t *max_name_width, + std::list *pushed_events, + std::vector *event_items, + std::unordered_map *event_idx) { + if (analyze_event.type() == EventType::kPushRange) { + pushed_events->push_back(analyze_event); + } else if (analyze_event.type() == EventType::kPopRange) { + std::list::reverse_iterator rit = pushed_events->rbegin(); + while (rit != pushed_events->rend() && + rit->name() != analyze_event.name()) { + ++rit; + } + // to find the father name event name + + if (rit != pushed_events->rend()) { + double event_time = 0; + double gpu_time = 0.0f; +#ifdef PADDLE_WITH_CUDA + gpu_time = rit->CudaElapsedMs(analyze_event); +#endif + double cpu_time = rit->CpuElapsedMs(analyze_event); + if (g_state == ProfilerState::kCUDA) { + event_time = gpu_time; + } else if (g_state == ProfilerState::kCPU) { + event_time = cpu_time; + } else { + event_time = gpu_time + cpu_time; + } + + std::string 
event_name; + if (merge_thread) { + event_name = rit->name(); + *max_name_width = std::max(*max_name_width, event_name.size()); + } else { + event_name = + "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); + *max_name_width = std::max(*max_name_width, event_name.size()); + } + + if (event_idx->find(event_name) == event_idx->end()) { + event_idx->insert({event_name, event_items->size()}); + EventItem event_item = {event_name, 1, event_time, + event_time, event_time, event_time, + cpu_time, gpu_time, 0.}; + event_items->push_back(event_item); + } else { + int index = event_idx->at(event_name); + event_items->at(index).calls += 1; + // total time + event_items->at(index).total_time += event_time; + // min time + event_items->at(index).min_time = + std::min(event_time, event_items->at(index).min_time); + // max time + event_items->at(index).max_time = + std::max(event_time, event_items->at(index).max_time); + event_items->at(index).gpu_time += gpu_time; + event_items->at(index).cpu_time += cpu_time; + } + + // remove the push marker from the list + pushed_events->erase((++rit).base()); + } else { + LOG(WARNING) << "Cannot find the push marker of event \'" + << analyze_event.name() + << "\', which will be ignored in profiling report."; + } + } +} +// Parse the event list and output the profiling report +void ParseEvents(const std::vector> &events, + bool merge_thread, + EventSortingKey sorted_by = EventSortingKey::kDefault) { + if (g_state == ProfilerState::kDisabled) return; + if (merge_thread && events.size() < 2) return; + + std::string sorted_domain; + std::function sorted_func; + sorted_func = SetSortedFunc(sorted_by, &sorted_domain); const std::vector> *analyze_events; std::vector> merged_events_list; @@ -453,91 +570,65 @@ void ParseEvents(const std::vector> &events, } std::vector> events_table; + std::multimap child_map; size_t max_name_width = 0; + for (size_t i = 0; i < (*analyze_events).size(); i++) { double total = 0.; // the total time in one thread std::list pushed_events; std::vector event_items; + std::vector main_event_items; std::unordered_map event_idx; for (size_t j = 0; j < (*analyze_events)[i].size(); j++) { - if ((*analyze_events)[i][j].type() == EventType::kPushRange) { - pushed_events.push_back((*analyze_events)[i][j]); - } else if ((*analyze_events)[i][j].type() == EventType::kPopRange) { - std::list::reverse_iterator rit = pushed_events.rbegin(); - while (rit != pushed_events.rend() && - rit->name() != (*analyze_events)[i][j].name()) { - ++rit; - } + Event analyze_event = (*analyze_events)[i][j]; + SetEvent(merge_thread, analyze_event, &max_name_width, &pushed_events, + &event_items, &event_idx); + } - if (rit != pushed_events.rend()) { - double event_time = 0; - double gpu_time = 0.0f; -#ifdef PADDLE_WITH_CUDA - gpu_time = rit->CudaElapsedMs((*analyze_events)[i][j]); -#endif - double cpu_time = rit->CpuElapsedMs((*analyze_events)[i][j]); - if (g_state == ProfilerState::kCUDA) { - event_time = gpu_time; - } else if (g_state == ProfilerState::kCPU) { - event_time = cpu_time; - } else { - event_time = gpu_time + cpu_time; - } - - total += event_time; - - std::string event_name; - if (merge_thread) { - event_name = rit->name(); - max_name_width = std::max(max_name_width, event_name.size()); - } else { - event_name = "thread" + std::to_string(rit->thread_id()) + "::" + - rit->name(); - max_name_width = std::max(max_name_width, event_name.size()); - } - - if (event_idx.find(event_name) == event_idx.end()) { - event_idx[event_name] = event_items.size(); - 
EventItem event_item = {event_name, 1, event_time, - event_time, event_time, event_time, - cpu_time, gpu_time, 0.}; - event_items.push_back(event_item); - } else { - int index = event_idx[event_name]; - event_items[index].calls += 1; - // total time - event_items[index].total_time += event_time; - // min time - event_items[index].min_time = - std::min(event_time, event_items[index].min_time); - // max time - event_items[index].max_time = - std::max(event_time, event_items[index].max_time); - event_items[index].gpu_time += gpu_time; - event_items[index].cpu_time += cpu_time; - } - - // remove the push marker from the list - pushed_events.erase((++rit).base()); - } else { - LOG(WARNING) << "Cannot find the push marker of event \'" - << (*analyze_events)[i][j].name() - << "\', which will be ignored in profiling report."; + auto table_size = event_items.size(); + std::vector child_index(table_size, 0); + for (size_t j = 0; j < table_size; ++j) { + std::string fname = event_items[j].name; + std::string grad_name = event_items[j].name + "_grad"; + for (size_t k = 0; k < table_size; ++k) { + std::string cname = event_items[k].name; + bool condition = cname.length() > fname.length() && + cname.rfind(fname, 0) == 0 && + !cname.rfind(grad_name, 0) == 0 && + (cname[fname.length()] == '/' && + cname.rfind('/') == fname.length()); + if (condition) { + child_map.insert( + std::pair(fname, event_items[k])); + child_index[k] = 1; } } } + + for (size_t j = 0; j < table_size; ++j) { + if (child_index[j] == 0) { + main_event_items.push_back(event_items[j]); + total += event_items[j].total_time; + } + } + // average time - for (auto &item : event_items) { + for (auto &item : main_event_items) { item.ave_time = item.total_time / item.calls; item.ratio = item.total_time / total; } + for (auto it = child_map.begin(); it != child_map.end(); it++) { + it->second.ratio = it->second.total_time / total; + it->second.ave_time = it->second.ave_time / it->second.calls; + } + // sort if (sorted_by != EventSortingKey::kDefault) { - std::sort(event_items.begin(), event_items.end(), sorted_func); + std::sort(main_event_items.begin(), main_event_items.end(), sorted_func); } - events_table.push_back(event_items); + events_table.push_back(main_event_items); // log warning if there are events with `push` but without `pop` std::list::reverse_iterator rit = pushed_events.rbegin(); while (rit != pushed_events.rend()) { @@ -548,8 +639,8 @@ void ParseEvents(const std::vector> &events, } // Print report - PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12, - merge_thread); + PrintProfiler(events_table, child_map, sorted_domain, max_name_width + 8, 12, + merge_thread, 0, 0); } struct MemoryProfierReport { @@ -630,6 +721,28 @@ void DisableProfiler(EventSortingKey sorted_key, } std::vector> all_events = GetAllEvents(); + + std::vector op_name; + for (size_t i = 0; i < (all_events).size(); i++) { + for (size_t j = 0; j < (all_events)[i].size(); j++) { + std::string event_name = (all_events)[i][j].name(); + size_t start = event_name.find('%', 0); + size_t end = event_name.rfind('%', event_name.length() - 1); + if (start == std::string::npos || end == std::string::npos) continue; + std::string search_str = event_name.substr(start, end - start + 1); + auto it = find(op_name.begin(), op_name.end(), search_str); + std::string replace_str; + if (it != op_name.end()) { + replace_str = std::to_string(std::distance(op_name.begin(), it)); + } else { + replace_str = std::to_string(op_name.size()); + op_name.push_back(search_str); + } 
+ event_name.replace(start, end - start + 1, replace_str); + (all_events)[i][j].set_name(event_name); + } + } + ParseEvents(all_events, true, sorted_key); ParseEvents(all_events, false, sorted_key); if (VLOG_IS_ON(5)) { -- GitLab
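For reference, the reworked ParseEvents/PrintProfiler above aggregate events into per-name items, group them into a parent-to-children multimap by matching the "parent/child" name prefix produced by SetCurAnnotation, and then print each child indented one level under its parent. The following is a simplified standalone sketch of that grouping-and-indented-printing step, using hypothetical Item/IsChildOf/Print names rather than the Paddle implementation (the grad-name exclusion and column formatting are omitted):

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Item {
  std::string name;  // hierarchical name, e.g. "conv2d/compute"
  double total_ms;
};

// A child is an item named "<parent>/<leaf>" with exactly one more level.
static bool IsChildOf(const std::string& child, const std::string& parent) {
  return child.size() > parent.size() + 1 &&
         child.compare(0, parent.size(), parent) == 0 &&
         child[parent.size()] == '/' &&
         child.find('/', parent.size() + 1) == std::string::npos;
}

static void Print(const std::vector<Item>& items,
                  const std::multimap<std::string, Item>& children,
                  const std::string& parent, int depth) {
  for (const auto& it : items) {
    // Indent by depth and strip the parent prefix from the printed name.
    std::cout << std::string(depth * 2, ' ')
              << it.name.substr(parent.empty() ? size_t{0} : parent.size() + 1)
              << "  " << it.total_ms << " ms\n";
    std::vector<Item> sub;
    auto range = children.equal_range(it.name);
    for (auto c = range.first; c != range.second; ++c) sub.push_back(c->second);
    Print(sub, children, it.name, depth + 1);
  }
}

int main() {
  std::vector<Item> all = {{"conv2d", 12.0},
                           {"conv2d/prepare_data", 1.0},
                           {"conv2d/infer_shape", 2.0},
                           {"conv2d/compute", 9.0}};
  std::multimap<std::string, Item> children;
  std::vector<Item> roots;
  for (const auto& item : all) {
    bool is_child = false;
    for (const auto& maybe_parent : all) {
      if (IsChildOf(item.name, maybe_parent.name)) {
        children.insert({maybe_parent.name, item});
        is_child = true;
      }
    }
    if (!is_child) roots.push_back(item);
  }
  Print(roots, children, "", 0);  // children appear indented under their parent
}
```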
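The DisableProfiler change above shortens event names before parsing: the fragment delimited by the first and last '%' in a name is interned into an op_name table and replaced by its index, so a long fragment that repeats across many events prints as a small number. A compact sketch of that interning step is below; the helper name and the sample event names are hypothetical:

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Replace the '%...%' fragment in `name` (if any) with the index of that
// fragment in `table`, appending it to `table` the first time it is seen.
std::string InternMarkedFragment(std::string name, std::vector<std::string>* table) {
  size_t start = name.find('%');
  size_t end = name.rfind('%');
  if (start == std::string::npos || end == std::string::npos || end <= start) return name;
  std::string fragment = name.substr(start, end - start + 1);
  auto it = std::find(table->begin(), table->end(), fragment);
  size_t index =
      it == table->end() ? table->size() : static_cast<size_t>(it - table->begin());
  if (it == table->end()) table->push_back(fragment);
  return name.replace(start, end - start + 1, std::to_string(index));
}

int main() {
  std::vector<std::string> table;
  // Same fragment maps to the same index across events.
  std::cout << InternMarkedFragment("fc%fc_0.w_0%_op/compute", &table) << "\n";      // fc0_op/compute
  std::cout << InternMarkedFragment("fc%fc_0.w_0%_op/infer_shape", &table) << "\n";  // fc0_op/infer_shape
}
```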