diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h
index f396a2240594f224e7861f6a1c739df1a3eb4c7f..c369023ccf3fbe0d32fb72745457d76b47e76416 100644
--- a/paddle/fluid/platform/profiler_helper.h
+++ b/paddle/fluid/platform/profiler_helper.h
@@ -355,25 +355,37 @@ void SetEvent(bool merge_thread, const Event &analyze_event,
   }
 }
 
-void ComputeOverhead(const std::multimap<std::string, EventItem> &sub_child_map,
+void UpdateGpuMemcpy(const EventItem &item, EventItem *memcpy_async,
+                     EventItem *memcpy_sync) {
+  // Adds item's calls/total_time/ratio to the accumulator whose tag occurs in
+  // item.name (the async tag is checked first); other names change nothing.
+  const bool is_async = item.name.find("GpuMemcpyAsync") != std::string::npos;
+  const bool is_sync = item.name.find("GpuMemcpySync") != std::string::npos;
+  if (!is_async && !is_sync) return;
+  EventItem *acc = is_async ? memcpy_async : memcpy_sync;
+  acc->calls += item.calls;
+  acc->total_time += item.total_time;
+  acc->ratio += item.ratio;
+}
+
+void ComputeOverhead(const std::vector<EventItem> &main_event_items,
+                     const std::multimap<std::string, EventItem> &sub_child_map,
                      OverHead *overhead) {
   EventItem memcpy_async = {
       "GpuMemcpyAsync", 0, 0., 0., 0., 0., 0., 0., 0.0f, EventRole::kOrdinary};
   EventItem memcpy_sync = {"GpuMemcpySync",     0, 0., 0., 0., 0., 0., 0., 0.0f,
                            EventRole::kOrdinary};
+  // GpuMemcpy may be in main_event_items
+  for (auto &item : main_event_items) {
+    UpdateGpuMemcpy(item, &memcpy_async, &memcpy_sync);
+  }
+
   for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) {
-    if (it->second.name.find("compute") != std::string::npos) {
+    if (it->second.name.find("compute") != std::string::npos &&
+        it->second.name.find("compute/") == std::string::npos) {
       overhead->compute_ratio += it->second.ratio;
     }
-    if (it->second.name.find("GpuMemcpyAsync") != std::string::npos) {
-      memcpy_async.calls += it->second.calls;
-      memcpy_async.total_time += it->second.total_time;
-      memcpy_async.ratio += it->second.ratio;
-    } else if (it->second.name.find("GpuMemcpySync") != std::string::npos) {
-      memcpy_sync.calls += it->second.calls;
-      memcpy_sync.total_time += it->second.total_time;
-      memcpy_sync.ratio += it->second.ratio;
-    }
+    UpdateGpuMemcpy(it->second, &memcpy_async, &memcpy_sync);
   }
   overhead->framework_ratio = 1.0f - overhead->compute_ratio;
   overhead->memcpy_item.calls = memcpy_async.calls + memcpy_sync.calls;
@@ -637,7 +649,7 @@ void AnalyzeEvent(
     if ((*analyze_events).size() == 1) {
       overhead->total_time = total;
       overhead->print = true;
-      ComputeOverhead(sub_child_map, overhead);
+      ComputeOverhead(main_event_items, sub_child_map, overhead);
     }
     // sort
     if (sorted_by != EventSortingKey::kDefault) {