diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc
index ea16e56076761d63246a9ef86f03604bc3f7085b..abb1e6a89cf9eab69d06a05c70489b6bcef92c8a 100644
--- a/paddle/platform/profiler.cc
+++ b/paddle/platform/profiler.cc
@@ -15,12 +15,16 @@ limitations under the License. */
 #include "paddle/platform/profiler.h"
 #include <iomanip>
 #include <map>
+#include "gflags/gflags.h"
+#include "glog/logging.h"
 
 namespace paddle {
 namespace platform {
 
 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
+// To record which timer the profiler used, CUDA or CPU.
+static std::string g_profiler_place = "";
 // The thread local event list only can be accessed by the specific thread
 // The thread index of each thread
 static thread_local int32_t g_thread_id;
@@ -45,10 +49,7 @@ inline uint64_t GetTimeInNsec() {
 
 Event::Event(EventKind kind, std::string name, uint32_t thread_id,
              DeviceContext* dev_ctx)
-    : kind_(kind),
-      name_(std::move(name)),
-      thread_id_(thread_id),
-      has_cuda_(false) {
+    : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
 #ifdef PADDLE_WITH_CUDA
   auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
   if (cuda_dev_ctx) {
@@ -115,22 +116,27 @@ inline EventList& GetEventList() {
 }
 
 void Mark(const std::string& name, DeviceContext* dev_ctx) {
-  GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id,
-                        dev_ctx);
+  GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
+}
+
+void PushEvent(const std::string& name, DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
+}
+
+void PopEvent(const std::string& name, DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
 }
 
 RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
   if (g_state == ProfilerState::kDisabled) return;
   dev_ctx_ = dev_ctx;
   name_ = name;
-  GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id,
-                        dev_ctx_);
+  PushEvent(name_, dev_ctx_);
 }
 
 RecordEvent::~RecordEvent() {
   if (g_state == ProfilerState::kDisabled) return;
-  GetEventList().Record(EventKind::kPopRange, std::move(name_), g_thread_id,
-                        dev_ctx_);
+  PopEvent(name_, dev_ctx_);
 }
 
 void EnableProfiler(ProfilerState state) {
@@ -141,6 +147,7 @@ void EnableProfiler(ProfilerState state) {
                  "The profiling state should be disabled when calling ",
                  "EnableProfiler.");
   g_state = state;
+  g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU";
 #ifdef PADDLE_WITH_CUDA
   if (g_state == ProfilerState::kCUDA) {
     // Generate some dummy evenets first to reduce the startup overhead.
@@ -172,56 +179,8 @@ std::vector<std::vector<Event>> DisableProfiler() {
   return result;
 }
 
-void PushEvent(const std::string& name, DeviceContext* dev_ctx) {
-  GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id,
-                        dev_ctx);
-}
-
-void PopEvent(const std::string& name, DeviceContext* dev_ctx) {
-  GetEventList().Record(EventKind::kPopRange, std::move(name), g_thread_id,
-                        dev_ctx);
-}
-
 void ParseEvents(std::vector<std::vector<Event>>& events,
                  EventSortingKey sorted_by) {
-  // Output header information
-  std::cout << "------------------------->"
-            << "     Profiling Report     "
-            << "<-------------------------"
-            << "\n\n";
-#ifdef PADDLE_WITH_CUDA
-  std::cout << "Place: GPU" << std::endl;
-#else
-  std::cout << "Place: CPU" << std::endl;
-#endif
-  std::cout << "Time unit: ms" << std::endl;
-  std::string sort_domain = "event end time";
-  switch (sorted_by) {
-    case EventSortingKey::kCalls:
-      sort_domain = "number of calls";
-      break;
-    case EventSortingKey::kTotal:
-      sort_domain = "total time";
-      break;
-    case EventSortingKey::kMin:
-      sort_domain = "minimum time";
-      break;
-    case EventSortingKey::kMax:
-      sort_domain = "maximum time";
-      break;
-    case EventSortingKey::kAve:
-      sort_domain = "average time";
-      break;
-    default:
-      if (sorted_by != EventSortingKey::kDefault) {
-        std::cout << "Warning: unkown sorting key. ";
-        sorted_by = EventSortingKey::kDefault;
-      }
-  }
-  std::cout << "Sorted by " << sort_domain
-            << " in descending order in the same thread\n\n";
-
-  // Parse events
   std::vector<std::vector<EventItem>> events_table;
   size_t max_name_width = 0;
   for (size_t i = 0; i < events.size(); i++) {
@@ -234,19 +193,19 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
         pushed_events.push_back(events[i][j]);
       } else if (events[i][j].kind() == "pop") {
         std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
-        while (rit->name() != events[i][j].name() &&
-               rit != pushed_events.rend()) {
+        while (rit != pushed_events.rend() &&
+               rit->name() != events[i][j].name()) {
           ++rit;
         }
+
         if (rit != pushed_events.rend()) {
-#ifdef PADDLE_WITH_CUDA
-          double event_time = rit->CudaElapsedMs(events[i][j]);
-#else
-          double event_time = rit->CpuElapsedMs(events[i][j]);
-#endif
+          double event_time = (g_state == ProfilerState::kCUDA)
+                                  ? rit->CudaElapsedMs(events[i][j])
+                                  : rit->CpuElapsedMs(events[i][j]);
           std::string event_name =
               "thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
           max_name_width = std::max(max_name_width, event_name.size());
+
           if (event_idx.find(event_name) == event_idx.end()) {
             event_idx[event_name] = event_items.size();
             EventItem event_item = {event_name, 1,          event_time,
@@ -264,11 +223,13 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
             event_items[index].max_time =
                 std::max(event_time, event_items[index].max_time);
           }
+
           // remove the start marker from the list
           pushed_events.erase((++rit).base());
         } else {
-          std::cout << "Warning: can not find the start marker of event "
-                    << events[i][j].name();
+          LOG(WARNING) << "Cannot find the push marker of event \'"
+                       << events[i][j].name()
+                       << "\', which will be ignored in profiling report.";
         }
       }
     }
@@ -294,19 +255,65 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
                   }
                 });
     }
+
     events_table.push_back(event_items);
+    // To check whether there are events with `push` but without `pop`
+    std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
+    while (rit != pushed_events.rend()) {
+      if (rit->kind() == "push") {
+        LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name()
+                     << "\', which will be ignored in profiling report.";
+      }
+      ++rit;
+    }
   }
-  // output events table
+
+  // Print report
+  PrintProfilingReport(events_table, sorted_by, max_name_width + 4, 12);
+}
+
+void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
+                          EventSortingKey sorted_by, const size_t name_width,
+                          const size_t data_width) {
+  if (g_profiler_place == "") return;
+  // Output header information
+  std::cout << "\n------------------------->"
+            << "     Profiling Report     "
+            << "<-------------------------\n\n";
+  std::cout << "Place: " << g_profiler_place << std::endl;
+  std::cout << "Time unit: ms" << std::endl;
+  std::string sort_domain = "event end time";
+  switch (sorted_by) {
+    case EventSortingKey::kCalls:
+      sort_domain = "number of calls";
+      break;
+    case EventSortingKey::kTotal:
+      sort_domain = "total time";
+      break;
+    case EventSortingKey::kMin:
+      sort_domain = "minimum time";
+      break;
+    case EventSortingKey::kMax:
+      sort_domain = "maximum time";
+      break;
+    case EventSortingKey::kAve:
+      sort_domain = "average time";
+      break;
+    default:
+      break;
+  }
+  std::cout << "Sorted by " << sort_domain
+            << " in descending order in the same thread\n\n";
+  // Output events table
   std::cout.setf(std::ios::left);
-  const int data_width = 12;
-  std::cout << std::setw(max_name_width + 4) << "Event" << std::setw(data_width)
+  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
             << "Calls" << std::setw(data_width) << "Total"
             << std::setw(data_width) << "Min." << std::setw(data_width)
             << "Max." << std::setw(data_width) << "Ave." << std::endl;
   for (size_t i = 0; i < events_table.size(); ++i) {
     for (size_t j = 0; j < events_table[i].size(); ++j) {
       EventItem& event_item = events_table[i][j];
-      std::cout << std::setw(max_name_width + 4) << event_item.name
+      std::cout << std::setw(name_width) << event_item.name
                 << std::setw(data_width) << event_item.calls
                 << std::setw(data_width) << event_item.total_time
                 << std::setw(data_width) << event_item.min_time
@@ -314,6 +321,7 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
                 << std::setw(data_width) << event_item.ave_time << std::endl;
     }
   }
+  std::cout << std::endl;
 }
 
 }  // namespace platform
diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h
index d51ec6443217c6fda882e22dd07f80c12f745d9d..f97a586787780aeea2aeeb970174a8b34544ac95 100644
--- a/paddle/platform/profiler.h
+++ b/paddle/platform/profiler.h
@@ -134,5 +134,9 @@ enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
 void ParseEvents(std::vector<std::vector<Event>>&,
                  EventSortingKey sorted_by = EventSortingKey::kDefault);
 
+// Print results
+void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
+                          EventSortingKey sorted_by, const size_t name_width,
+                          const size_t data_width);
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc
index 3800f3914f18881f336c5d9920d797fbafab232d..13dea713c71e147ed5dd8d090e92d86c96256c09 100644
--- a/paddle/platform/profiler_test.cc
+++ b/paddle/platform/profiler_test.cc
@@ -99,8 +99,12 @@ TEST(RecordEvent, RecordEvent) {
     int counter = 1;
     while (counter != i * 1000) counter++;
   }
+
+  // Bad Usage:
+  PushEvent("event_without_pop", dev_ctx);
+  PopEvent("event_without_push", dev_ctx);
   std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler();
-  // Will remove from test before merging
+  // Will remove parsing-related code from test later
   ParseEvents(events, EventSortingKey::kTotal);
 
   int cuda_startup_count = 0;