diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 07844713eae47d15ba5a3aae5b2d4e877ef4f30d..12049d815cf85dbd54d9188240dbfa5213e94447 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -84,10 +84,12 @@ struct EventItem {
 };
 
 struct OverHead {
-  bool print = false;
-  double total_time = 0.;
-  float compute_ratio = 0.0f;
-  float framework_ratio = 0.0f;
+  bool print_overhead = false;
+  bool print_explanation = false;
+  double elapsed_time = 0.;      // the elapsed time of all events
+  double accumulated_time = 0.;  // the accumulated time of all events
+  double compute_time = 0.0;
+  double framework_time = 0.0;
   EventItem memcpy_item;
   std::vector<EventItem> sub_memcpy_items;
 };
diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h
index 9d99022f153ad46a5bee630bb47b7a16bfd79128..36c577fa0503b7bc9d5bbdb23e9e1674331235a4 100644
--- a/paddle/fluid/platform/profiler_helper.h
+++ b/paddle/fluid/platform/profiler_helper.h
@@ -410,17 +410,24 @@ void ComputeOverhead(const std::vector<EventItem> &main_event_items,
                            EventRole::kOrdinary};
   // GpuMemcpy may be in main_event_items
   for (auto &item : main_event_items) {
+    if (item.role != EventRole::kSpecial) {
+      overhead->accumulated_time += item.total_time;
+    }
     UpdateGpuMemcpy(item, &memcpy_async, &memcpy_sync);
   }
 
   for (auto it = sub_child_map.begin(); it != sub_child_map.end(); it++) {
+    if (it->first == "ParallelExecutor::Run") {
+      overhead->accumulated_time += it->second.total_time;
+    }
     if (it->second.name.find("compute") != std::string::npos &&
         it->second.name.find("compute/") == std::string::npos) {
-      overhead->compute_ratio += it->second.ratio;
+      overhead->compute_time += it->second.total_time;
     }
     UpdateGpuMemcpy(it->second, &memcpy_async, &memcpy_sync);
   }
-  overhead->framework_ratio = 1.0f - overhead->compute_ratio;
+  overhead->framework_time =
+      overhead->accumulated_time - overhead->compute_time;
   overhead->memcpy_item.calls = memcpy_async.calls + memcpy_sync.calls;
   overhead->memcpy_item.total_time =
       memcpy_async.total_time + memcpy_sync.total_time;
@@ -486,16 +493,39 @@ void GetChildMap(const std::multimap<std::string, EventItem> &sub_child_map,
 }
 
 void PrintOverHead(const OverHead &overhead, const size_t data_width) {
-  double compute_time = overhead.total_time * overhead.compute_ratio;
-  double framework_time = overhead.total_time * overhead.framework_ratio;
+  float compute_ratio = overhead.compute_time / overhead.accumulated_time;
+  float framework_ratio = 1 - compute_ratio;
+  std::cout << "-------------------------"
+            << " Overhead Summary "
+            << "-------------------------\n\n";
+  if (overhead.print_explanation) {
+    std::cout
+        << "The Overhead Summary divides the cost of each event into framework "
+           "overhead or computation time."
+        << "\nThe `Accumulated time of events` is higher than the `Elapsed "
+           "time of events`."
+        << "\nBecause the OP is executed asynchronously. For example,"
+        << "\nEvent Timeline"
+        << "\nParallelExecutor::Run "
+           "---------------------------------------------------------"
+        << "\n thread1::OP1 -----------------------------"
+        << "\n thread2::OP2 "
+           "---------------------------------------------"
+        << "\nOP1.time + OP2.time > ParallelExecutor::Run.time\n\n";
+    std::cout << "Elapsed time of events: " << overhead.elapsed_time
+              << std::endl;
+    std::cout << "Accumulated time of events: " << overhead.accumulated_time
+              << std::endl;
+  } else {
+    std::cout << "Total time: " << overhead.elapsed_time << std::endl;
+  }
   std::cout.setf(std::ios::left);
-  std::cout << "Total time: " << overhead.total_time << std::endl;
   std::cout << std::setw(25) << " Computation time"
-            << "Total: " << std::setw(data_width) << compute_time
-            << "Ratio: " << overhead.compute_ratio * 100 << "%" << std::endl;
+            << "Total: " << std::setw(data_width) << overhead.compute_time
+            << "Ratio: " << compute_ratio * 100 << "%" << std::endl;
   std::cout << std::setw(25) << " Framework overhead"
-            << "Total: " << std::setw(data_width) << framework_time
-            << "Ratio: " << overhead.framework_ratio * 100 << "%" << std::endl;
+            << "Total: " << std::setw(data_width) << overhead.framework_time
+            << "Ratio: " << framework_ratio * 100 << "%" << std::endl;
   std::cout << "\n-------------------------"
             << " GpuMemCpy Summary "
             << "-------------------------\n\n";
@@ -552,7 +582,7 @@ void PrintProfiler(
   std::cout << "Sorted by " << sorted_domain
             << " in descending order in the same thread\n\n";
-  if (overhead.print) {
+  if (overhead.print_overhead) {
     PrintOverHead(overhead, data_width);
   }
 
   std::cout << "\n-------------------------"
@@ -681,7 +711,6 @@ void AnalyzeEvent(
       }
     }
   }
-
   for (size_t j = 0; j < table_size; ++j) {
     if (child_index[j] == 0) {
       main_event_items.push_back(event_items[j]);
@@ -699,8 +728,11 @@ void AnalyzeEvent(
   }
   // When multi-threaded, overhead are printed only if merge_thread is true
   if ((*analyze_events).size() == 1) {
-    overhead->total_time = total;
-    overhead->print = true;
+    if (!main_thread_event_name.empty()) {
+      overhead->print_explanation = true;
+    }
+    overhead->elapsed_time = total;
+    overhead->print_overhead = true;
     ComputeOverhead(main_event_items, sub_child_map, overhead);
   }
   // sort