diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index f396a2240594f224e7861f6a1c739df1a3eb4c7f..c369023ccf3fbe0d32fb72745457d76b47e76416 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -355,25 +355,37 @@ void SetEvent(bool merge_thread, const Event &analyze_event, } } -void ComputeOverhead(const std::multimap &sub_child_map, +void UpdateGpuMemcpy(const EventItem &item, EventItem *memcpy_async, + EventItem *memcpy_sync) { + if (item.name.find("GpuMemcpyAsync") != std::string::npos) { + memcpy_async->calls += item.calls; + memcpy_async->total_time += item.total_time; + memcpy_async->ratio += item.ratio; + } else if (item.name.find("GpuMemcpySync") != std::string::npos) { + memcpy_sync->calls += item.calls; + memcpy_sync->total_time += item.total_time; + memcpy_sync->ratio += item.ratio; + } +} + +void ComputeOverhead(const std::vector &main_event_items, + const std::multimap &sub_child_map, OverHead *overhead) { EventItem memcpy_async = { "GpuMemcpyAsync", 0, 0., 0., 0., 0., 0., 0., 0.0f, EventRole::kOrdinary}; EventItem memcpy_sync = {"GpuMemcpySync", 0, 0., 0., 0., 0., 0., 0., 0.0f, EventRole::kOrdinary}; + // GpuMemcpy may be in main_event_items + for (auto &item : main_event_items) { + UpdateGpuMemcpy(item, &memcpy_async, &memcpy_sync); + } + for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) { - if (it->second.name.find("compute") != std::string::npos) { + if (it->second.name.find("compute") != std::string::npos && + it->second.name.find("compute/") == std::string::npos) { overhead->compute_ratio += it->second.ratio; } - if (it->second.name.find("GpuMemcpyAsync") != std::string::npos) { - memcpy_async.calls += it->second.calls; - memcpy_async.total_time += it->second.total_time; - memcpy_async.ratio += it->second.ratio; - } else if (it->second.name.find("GpuMemcpySync") != std::string::npos) { - memcpy_sync.calls += it->second.calls; - memcpy_sync.total_time += it->second.total_time; - memcpy_sync.ratio += it->second.ratio; - } + UpdateGpuMemcpy(it->second, &memcpy_async, &memcpy_sync); } overhead->framework_ratio = 1.0f - overhead->compute_ratio; overhead->memcpy_item.calls = memcpy_async.calls + memcpy_sync.calls; @@ -637,7 +649,7 @@ void AnalyzeEvent( if ((*analyze_events).size() == 1) { overhead->total_time = total; overhead->print = true; - ComputeOverhead(sub_child_map, overhead); + ComputeOverhead(main_event_items, sub_child_map, overhead); } // sort if (sorted_by != EventSortingKey::kDefault) {