diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 07844713eae47d15ba5a3aae5b2d4e877ef4f30d..12049d815cf85dbd54d9188240dbfa5213e94447 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -84,10 +84,12 @@ struct EventItem {
 };
 
 struct OverHead {
-  bool print = false;
-  double total_time = 0.;
-  float compute_ratio = 0.0f;
-  float framework_ratio = 0.0f;
+  bool print_overhead = false;
+  bool print_explanation = false;
+  double elapsed_time = 0.;      // the elapsed time of all events
+  double accumulated_time = 0.;  // the accumulated time of all events
+  double compute_time = 0.0;
+  double framework_time = 0.0;
   EventItem memcpy_item;
   std::vector<EventItem> sub_memcpy_items;
 };
diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h
index 9d99022f153ad46a5bee630bb47b7a16bfd79128..36c577fa0503b7bc9d5bbdb23e9e1674331235a4 100644
--- a/paddle/fluid/platform/profiler_helper.h
+++ b/paddle/fluid/platform/profiler_helper.h
@@ -410,17 +410,24 @@ void ComputeOverhead(const std::vector<EventItem> &main_event_items,
                            EventRole::kOrdinary};
   // GpuMemcpy may be in main_event_items
   for (auto &item : main_event_items) {
+    if (item.role != EventRole::kSpecial) {
+      overhead->accumulated_time += item.total_time;
+    }
     UpdateGpuMemcpy(item, &memcpy_async, &memcpy_sync);
   }
 
   for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) {
+    if (it->first == "ParallelExecutor::Run") {
+      overhead->accumulated_time += it->second.total_time;
+    }
     if (it->second.name.find("compute") != std::string::npos &&
         it->second.name.find("compute/") == std::string::npos) {
-      overhead->compute_ratio += it->second.ratio;
+      overhead->compute_time += it->second.total_time;
     }
     UpdateGpuMemcpy(it->second, &memcpy_async, &memcpy_sync);
   }
-  overhead->framework_ratio = 1.0f - overhead->compute_ratio;
+  overhead->framework_time =
+      overhead->accumulated_time - overhead->compute_time;
   overhead->memcpy_item.calls = memcpy_async.calls + memcpy_sync.calls;
   overhead->memcpy_item.total_time =
       memcpy_async.total_time + memcpy_sync.total_time;
@@ -486,16 +493,39 @@ void GetChildMap(const std::multimap<std::string, EventItem> &sub_child_map,
 }
 
 void PrintOverHead(const OverHead &overhead, const size_t data_width) {
-  double compute_time = overhead.total_time * overhead.compute_ratio;
-  double framework_time = overhead.total_time * overhead.framework_ratio;
+  float compute_ratio = overhead.compute_time / overhead.accumulated_time;
+  float framework_ratio = 1 - compute_ratio;
+  std::cout << "-------------------------"
+            << " Overhead Summary "
+            << "-------------------------\n\n";
+  if (overhead.print_explanation) {
+    std::cout
+        << "The Overhead Summary divides the cost of each event into framework "
+           "overhead or computation time."
+        << "\nThe `Accumulated time of events` is higher than the `Elapsed "
+           "time of events`."
+        << "\nBecause the OP is executed asynchronously. For example,"
+        << "\nEvent Timeline"
+        << "\nParallelExecutor::Run "
+           "---------------------------------------------------------"
+        << "\n thread1::OP1 -----------------------------"
+        << "\n thread2::OP2 "
+           "---------------------------------------------"
+        << "\nOP1.time + OP2.time > ParallelExecutor::Run.time\n\n";
+    std::cout << "Elapsed time of events: " << overhead.elapsed_time
+              << std::endl;
+    std::cout << "Accumulated time of events: " << overhead.accumulated_time
+              << std::endl;
+  } else {
+    std::cout << "Total time: " << overhead.elapsed_time << std::endl;
+  }
   std::cout.setf(std::ios::left);
-  std::cout << "Total time: " << overhead.total_time << std::endl;
   std::cout << std::setw(25) << " Computation time"
-            << "Total: " << std::setw(data_width) << compute_time
-            << "Ratio: " << overhead.compute_ratio * 100 << "%" << std::endl;
+            << "Total: " << std::setw(data_width) << overhead.compute_time
+            << "Ratio: " << compute_ratio * 100 << "%" << std::endl;
   std::cout << std::setw(25) << " Framework overhead"
-            << "Total: " << std::setw(data_width) << framework_time
-            << "Ratio: " << overhead.framework_ratio * 100 << "%" << std::endl;
+            << "Total: " << std::setw(data_width) << overhead.framework_time
+            << "Ratio: " << framework_ratio * 100 << "%" << std::endl;
   std::cout << "\n-------------------------"
             << " GpuMemCpy Summary "
             << "-------------------------\n\n";
@@ -552,7 +582,7 @@ void PrintProfiler(
   std::cout << "Sorted by " << sorted_domain
             << " in descending order in the same thread\n\n";
-  if (overhead.print) {
+  if (overhead.print_overhead) {
     PrintOverHead(overhead, data_width);
   }
 
   std::cout << "\n-------------------------"
@@ -681,7 +711,6 @@ void AnalyzeEvent(
       }
     }
   }
-
   for (size_t j = 0; j < table_size; ++j) {
     if (child_index[j] == 0) {
       main_event_items.push_back(event_items[j]);
@@ -699,8 +728,11 @@ void AnalyzeEvent(
   }
   // When multi-threaded, overhead are printed only if merge_thread is true
   if ((*analyze_events).size() == 1) {
-    overhead->total_time = total;
-    overhead->print = true;
+    if (!main_thread_event_name.empty()) {
+      overhead->print_explanation = true;
+    }
+    overhead->elapsed_time = total;
+    overhead->print_overhead = true;
     ComputeOverhead(main_event_items, sub_child_map, overhead);
   }
   // sort