diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 44140d3e5faaf2169409bfd7b163cf6e01d237ee..91a53178bfd5cc26c130ed8168682cc02fa89363 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -394,6 +394,38 @@ void GetChildMap(const std::multimap &sub_child_map, } } +void PrintOverHead(const OverHead &overhead, const size_t data_width) { + double compute_time = overhead.total_time * overhead.compute_ratio; + double framework_time = overhead.total_time * overhead.framework_ratio; + std::cout.setf(std::ios::left); + std::cout << "Total time: " << overhead.total_time << std::endl; + std::cout << std::setw(25) << " Computation time" + << "Total: " << std::setw(data_width) << compute_time + << "Ratio: " << overhead.compute_ratio * 100 << "%" << std::endl; + std::cout << std::setw(25) << " Framework overhead" + << "Total: " << std::setw(data_width) << framework_time + << "Ratio: " << overhead.framework_ratio * 100 << "%" << std::endl; + + std::cout << "\n-------------------------" + << " GpuMemCpy Summary " + << "-------------------------\n\n"; + std::cout << std::setw(25) << "GpuMemcpy" + << "Calls: " << std::setw(data_width) << overhead.memcpy_item.calls + << "Total: " << std::setw(data_width) + << overhead.memcpy_item.total_time + << "Ratio: " << overhead.memcpy_item.ratio * 100 << "%" + << std::endl; + for (size_t i = 0; i < overhead.sub_memcpy_items.size(); ++i) { + EventItem item = overhead.sub_memcpy_items[i]; + if (item.calls != 0) { + std::cout << std::setw(25) << " " + item.name + << "Calls: " << std::setw(data_width) << item.calls + << "Total: " << std::setw(data_width) << item.total_time + << "Ratio: " << item.ratio * 100 << "%" << std::endl; + } + } +} + // Print results void PrintProfiler(const std::vector> &events_table, const std::multimap &child_map, @@ -428,38 +460,7 @@ void PrintProfiler(const std::vector> &events_table, << " in descending order in the same thread\n\n"; if (overhead.print) { - double compute_time = overhead.total_time * overhead.compute_ratio; - double framework_time = overhead.total_time * overhead.framework_ratio; - std::cout.setf(std::ios::left); - std::cout << "Total time: " << overhead.total_time << std::endl; - std::cout << std::setw(25) << " Computation time" - << "Total: " << std::setw(data_width) << compute_time - << "Ratio: " << overhead.compute_ratio * 100 << "%" - << std::endl; - std::cout << std::setw(25) << " Framework overhead" - << "Total: " << std::setw(data_width) << framework_time - << "Ratio: " << overhead.framework_ratio * 100 << "%" - << std::endl; - - std::cout << "\n-------------------------" - << " GpuMemCpy Summary " - << "-------------------------\n\n"; - std::cout << std::setw(25) << "GpuMemcpy" - << "Calls: " << std::setw(data_width) - << overhead.memcpy_item.calls - << "Total: " << std::setw(data_width) - << overhead.memcpy_item.total_time - << "Ratio: " << overhead.memcpy_item.ratio * 100 << "%" - << std::endl; - for (size_t i = 0; i < overhead.sub_memcpy_items.size(); ++i) { - EventItem item = overhead.sub_memcpy_items[i]; - if (item.calls != 0) { - std::cout << std::setw(25) << " " + item.name - << "Calls: " << std::setw(data_width) << item.calls - << "Total: " << std::setw(data_width) << item.total_time - << "Ratio: " << item.ratio * 100 << "%" << std::endl; - } - } + PrintOverHead(overhead, data_width); } std::cout << "\n-------------------------" << " Event Summary " @@ -522,37 +523,13 @@ void PrintProfiler(const std::vector> &events_table, } } -// Parse the event list and output the profiling report -void ParseEvents(const std::vector> &events, - bool merge_thread, - EventSortingKey sorted_by = EventSortingKey::kDefault) { - if (g_state == ProfilerState::kDisabled) return; - if (merge_thread && events.size() < 2) return; - - std::string sorted_domain; - std::function sorted_func; - sorted_func = SetSortedFunc(sorted_by, &sorted_domain); - - const std::vector> *analyze_events; - std::vector> merged_events_list; - if (merge_thread) { - std::vector merged_events; - for (size_t i = 0; i < events.size(); ++i) { - for (size_t j = 0; j < events[i].size(); ++j) { - merged_events.push_back(events[i][j]); - } - } - merged_events_list.push_back(merged_events); - analyze_events = &merged_events_list; - } else { - analyze_events = &events; - } - - std::vector> events_table; - std::multimap child_map; - size_t max_name_width = 0; - OverHead overhead; - +void AnalyzeEvent( + const std::vector> *analyze_events, + std::vector> *events_table, + std::multimap *child_map, + std::function sorted_func, + EventSortingKey sorted_by, size_t *max_name_width, OverHead *overhead, + bool merge_thread) { for (size_t i = 0; i < (*analyze_events).size(); i++) { double total = 0.; // the total time in one thread std::list pushed_events; @@ -563,7 +540,7 @@ void ParseEvents(const std::vector> &events, for (size_t j = 0; j < (*analyze_events)[i].size(); j++) { Event analyze_event = (*analyze_events)[i][j]; - SetEvent(merge_thread, analyze_event, &max_name_width, &pushed_events, + SetEvent(merge_thread, analyze_event, max_name_width, &pushed_events, &event_items, &event_idx); } @@ -593,7 +570,6 @@ void ParseEvents(const std::vector> &events, total += event_items[j].total_time; } } - // average time for (auto &item : main_event_items) { item.ave_time = item.total_time / item.calls; @@ -603,20 +579,18 @@ void ParseEvents(const std::vector> &events, it->second.ratio = it->second.total_time / total; it->second.ave_time = it->second.total_time / it->second.calls; } - // When multi-threaded, overhead are printed only if merge_thread is true if ((*analyze_events).size() == 1) { - overhead.total_time = total; - overhead.print = true; - ComputeOverhead(sub_child_map, &overhead); + overhead->total_time = total; + overhead->print = true; + ComputeOverhead(sub_child_map, overhead); } - // sort if (sorted_by != EventSortingKey::kDefault) { std::sort(main_event_items.begin(), main_event_items.end(), sorted_func); } - events_table.push_back(main_event_items); + events_table->push_back(main_event_items); // log warning if there are events with `push` but without `pop` std::list::reverse_iterator rit = pushed_events.rbegin(); while (rit != pushed_events.rend()) { @@ -625,9 +599,43 @@ void ParseEvents(const std::vector> &events, ++rit; } - GetChildMap(sub_child_map, &child_map); + GetChildMap(sub_child_map, child_map); + } +} +// Parse the event list and output the profiling report +void ParseEvents(const std::vector> &events, + bool merge_thread, + EventSortingKey sorted_by = EventSortingKey::kDefault) { + if (g_state == ProfilerState::kDisabled) return; + if (merge_thread && events.size() < 2) return; + + std::string sorted_domain; + std::function sorted_func; + sorted_func = SetSortedFunc(sorted_by, &sorted_domain); + + const std::vector> *analyze_events; + std::vector> merged_events_list; + if (merge_thread) { + std::vector merged_events; + for (size_t i = 0; i < events.size(); ++i) { + for (size_t j = 0; j < events[i].size(); ++j) { + merged_events.push_back(events[i][j]); + } + } + merged_events_list.push_back(merged_events); + analyze_events = &merged_events_list; + } else { + analyze_events = &events; } + std::vector> events_table; + std::multimap child_map; + size_t max_name_width = 0; + OverHead overhead; + + AnalyzeEvent(analyze_events, &events_table, &child_map, sorted_func, + sorted_by, &max_name_width, &overhead, merge_thread); + // Print report PrintProfiler(events_table, child_map, overhead, sorted_domain, max_name_width + 8, 12, merge_thread, 0, 0);