From 52d43ca2905c2df7ac9e1a99b06eec6f5835d3e4 Mon Sep 17 00:00:00 2001
From: chenjian
Date: Thu, 30 Jun 2022 16:42:22 +0800
Subject: [PATCH] Add statistic code for memory (#43960)

* add code

* add unit test
---
 paddle/fluid/platform/profiler.cc             | 305 +++++++++++++++---
 paddle/fluid/platform/profiler/mem_tracing.h  |  12 +
 .../unittests/test_profiler_statistic.py      |  39 +++
 python/paddle/profiler/profiler_statistic.py  | 153 ++++++++-
 4 files changed, 472 insertions(+), 37 deletions(-)

diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index ec33e9e8198..38471251ff4 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -308,6 +308,10 @@ RecordOpInfoSupplement::RecordOpInfoSupplement(
       PosixInNsec(), type, input_shapes, dtypes, callstack);
 }
 
+std::map<const char *, std::map<uint64_t, std::vector<uint64_t>>>
+    RecordMemEvent::size_cache;
+std::map<const char *, std::map<uint64_t, bool>>
+    RecordMemEvent::has_initialized;
 RecordMemEvent::RecordMemEvent(const void *ptr,
                                const phi::Place &place,
                                size_t size,
@@ -323,17 +327,75 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
     uint64_t peak_reserved = 0;  // 0 means keep the same as before
     if (platform::is_cpu_place(place) ||
         platform::is_cuda_pinned_place(place)) {
-      current_allocated =
-          HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
-      peak_allocated =
-          HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true;
+      } else {
+        current_allocated =
+            HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
+        peak_allocated =
+            HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0] =
+            current_allocated;
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2] =
+            peak_allocated;
+        current_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
+        peak_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
+      }
+
     } else {
-      current_allocated =
-          DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
-      peak_allocated =
-          DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true;
+      } else {
+        current_allocated =
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
+        peak_allocated =
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0] =
+            current_allocated;
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2] =
+            peak_allocated;
+        current_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
+        peak_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
+      }
     }
-
     platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
                                                         place,
                                                         size,
@@ -349,17 +411,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
     uint64_t peak_allocated = 0;  // 0 means keep the same as before
     if (platform::is_cpu_place(place) ||
         platform::is_cuda_pinned_place(place)) {
-      current_reserved =
-          HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
-      peak_reserved =
-          HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true;
+      } else {
+        current_reserved =
+            HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
+        peak_reserved =
+            HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1] =
+            current_reserved;
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3] =
+            peak_reserved;
+        current_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
+        peak_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
+      }
     } else {
-      current_reserved =
-          DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
-      peak_reserved =
-          DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true;
+      } else {
+        current_reserved =
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
+        peak_reserved =
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1] =
+            current_reserved;
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3] =
+            peak_reserved;
+        current_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
+        peak_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
+      }
     }
-
     platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
                                                         place,
                                                         size,
@@ -375,17 +494,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
     uint64_t peak_reserved = 0;  // 0 means keep the same as before
     if (platform::is_cpu_place(place) ||
         platform::is_cuda_pinned_place(place)) {
-      current_allocated =
-          HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
-      peak_allocated =
-          HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true;
+      } else {
+        current_allocated =
+            HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
+        peak_allocated =
+            HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0] =
+            current_allocated;
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2] =
+            peak_allocated;
+        current_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
+        peak_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
+      }
     } else {
-      current_allocated =
-          DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
-      peak_allocated =
-          DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true;
+      } else {
+        current_allocated =
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
+        peak_allocated =
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0] =
+            current_allocated;
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2] =
+            peak_allocated;
+        current_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
+        peak_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
+      }
     }
-
     platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
                                                        place,
                                                        size,
@@ -401,17 +577,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
     uint64_t peak_allocated = 0;  // 0 means keep the same as before
     if (platform::is_cpu_place(place) ||
         platform::is_cuda_pinned_place(place)) {
-      current_reserved =
-          HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
-      peak_reserved =
-          HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true;
+      } else {
+        current_reserved =
+            HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
+        peak_reserved =
+            HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1] =
+            current_reserved;
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3] =
+            peak_reserved;
+        current_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
+        peak_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
+      }
     } else {
-      current_reserved =
-          DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
-      peak_reserved =
-          DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true;
+      } else {
+        current_reserved =
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
+        peak_reserved =
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1] =
+            current_reserved;
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3] =
+            peak_reserved;
+        current_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
+        peak_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
+      }
     }
-
     platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
                                                        place,
                                                        size,
diff --git a/paddle/fluid/platform/profiler/mem_tracing.h b/paddle/fluid/platform/profiler/mem_tracing.h
index 3d3508c7bd5..5b2a2391c2e 100644
--- a/paddle/fluid/platform/profiler/mem_tracing.h
+++ b/paddle/fluid/platform/profiler/mem_tracing.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <map>
 #include <string>
 
 #include "paddle/fluid/platform/place.h"
@@ -37,6 +38,17 @@ class RecordMemEvent {
                  const Place& place,
                  size_t size,
                  const TracerMemEventType type = TracerMemEventType::Allocate);
+
+  // size_cache: In the outer map, the key is the device type, 'cpu' or 'gpu';
+  // in the inner map, the key is the device id.
+  // Values record memory sizes for current_allocated, current_reserved,
+  // peak_allocated and peak_reserved.
+  // has_initialized: Flags to denote whether the memory cache for a device has
+  // been collected once.
+
+  static std::map<const char *, std::map<uint64_t, std::vector<uint64_t>>>
+      size_cache;
+  static std::map<const char *, std::map<uint64_t, bool>> has_initialized;
 };
 
 }  // namespace platform
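The comment above is the key to reading the profiler.cc changes: each device keeps a four-slot vector [current_allocated, current_reserved, peak_allocated, peak_reserved], and every memory event refreshes only the two slots its stat macros observe, reusing the cached values for the other two. A minimal Python sketch of that layout and update rule (the names below are illustrative; the real cache is the pair of static C++ members declared above):

    # Illustrative sketch only: mirrors the C++ size_cache layout.
    # Slots per device: [current_allocated, current_reserved, peak_allocated, peak_reserved]
    size_cache = {"cpu": {}, "gpu": {}}

    def on_allocate(dev_type, dev_id, current_allocated, peak_allocated):
        slots = size_cache[dev_type].setdefault(dev_id, [0, 0, 0, 0])
        slots[0], slots[2] = current_allocated, peak_allocated  # refresh the allocated slots
        return tuple(slots)  # the reserved slots keep their last cached values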
diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
index e5463b1a90d..6481e0f825d 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
@@ -16,6 +16,7 @@ import unittest
 
 import paddle
 import paddle.profiler as profiler
+import paddle.profiler.profiler_statistic as profiler_statistic
 
 
 class HostPythonNode:
@@ -30,6 +31,7 @@ class HostPythonNode:
         self.children_node = []
         self.runtime_node = []
         self.device_node = []
+        self.mem_node = []
 
 
 class DevicePythonNode:
@@ -45,6 +47,22 @@ class DevicePythonNode:
         self.stream_id = stream_id
 
 
+class MemPythonNode:
+    def __init__(self, timestamp_ns, addr, type, process_id, thread_id, increase_bytes, place, current_allocated, \
+                 current_reserved, peak_allocated, peak_reserved):
+        self.timestamp_ns = timestamp_ns
+        self.addr = addr
+        self.type = type
+        self.process_id = process_id
+        self.thread_id = thread_id
+        self.increase_bytes = increase_bytes
+        self.place = place
+        self.current_allocated = current_allocated
+        self.current_reserved = current_reserved
+        self.peak_allocated = peak_allocated
+        self.peak_reserved = peak_reserved
+
+
 class TestProfilerStatistic(unittest.TestCase):
 
     def test_statistic_case1(self):
@@ -89,6 +107,9 @@ class TestProfilerStatistic(unittest.TestCase):
         conv2d_compute = HostPythonNode('conv2d::compute',
                                         profiler.TracerEventType.OperatorInner,
                                         30, 40, 1000, 1001)
+        conv2d_compute.mem_node.append(
+            MemPythonNode(33, 0, profiler_statistic.TracerMemEventType.Allocate,
+                          1000, 1001, 20, 'place(gpu:0)', 200, 200, 800, 800))
         conv2d_launchkernel = HostPythonNode(
             'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35,
             1000, 1001)
@@ -211,6 +232,24 @@ class TestProfilerStatistic(unittest.TestCase):
         self.assertEqual(
             event_summary.memory_manipulation_items['AsyncMemcpy'].
             general_gpu_time, 60)
+        self.assertEqual(
+            statistic_data.memory_summary.allocated_items['place(gpu:0)']
+            ['conv2d'].allocation_count, 1)
+        self.assertEqual(
+            statistic_data.memory_summary.allocated_items['place(gpu:0)']
+            ['conv2d'].allocation_size, 20)
+        self.assertEqual(
+            statistic_data.memory_summary.allocated_items['place(gpu:0)']
+            ['conv2d'].increase_size, 20)
+        self.assertEqual(
+            statistic_data.memory_summary.allocated_items['place(gpu:0)']
+            ['conv2d'].increase_size, 20)
+        self.assertEqual(
+            statistic_data.memory_summary.
+            peak_allocation_values['place(gpu:0)'], 800)
+        self.assertEqual(
+            statistic_data.memory_summary.peak_reserved_values['place(gpu:0)'],
+            800)
         print(
             profiler.profiler_statistic._build_table(
                 statistic_data,
diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py
index daa6925c4b9..f33335c907d 100755
--- a/python/paddle/profiler/profiler_statistic.py
+++ b/python/paddle/profiler/profiler_statistic.py
@@ -15,7 +15,7 @@ import collections
 from enum import Enum
 import re
 
-from paddle.fluid.core import TracerEventType
+from paddle.fluid.core import TracerEventType, TracerMemEventType
 
 from .statistic_helper import *
@@ -603,6 +603,83 @@ class EventSummary:
             self.kernel_items[name].add_item(device_node)
 
 
+class MemorySummary:
+    r"""
+    Analyse memory events in profiling data.
+    """
+
+    class MemoryItem:
+
+        def __init__(self, event_name, place, memory_type='Allocated'):
+            self.event_name = event_name
+            self.place = place
+            self.allocation_count = 0
+            self.free_count = 0
+            self.allocation_size = 0
+            self.free_size = 0
+            self.increase_size = 0
+            self.memory_type = memory_type
+
+        def add_memory_record(self, size, allocation_type):
+            if allocation_type == TracerMemEventType.Allocate or allocation_type == TracerMemEventType.ReservedAllocate:
+                self.allocation_count += 1
+                self.allocation_size += size
+
+            elif allocation_type == TracerMemEventType.Free or allocation_type == TracerMemEventType.ReservedFree:
+                self.free_count += 1
+                self.free_size -= size  # size is negative when freeing.
+
+            else:
+                print("No corresponding type.")
+            self.increase_size = self.allocation_size - self.free_size
+
+    def __init__(self):
+        self.allocated_items = collections.defaultdict(
+            dict)  # for memory summary, place: {event name: MemoryItem}
+        self.reserved_items = collections.defaultdict(
+            dict)  # for memory summary, place: {event name: MemoryItem}
+        self.peak_allocation_values = collections.defaultdict(int)
+        self.peak_reserved_values = collections.defaultdict(int)
+
+    def _analyse_node_memory(self, event_name, node):
+        for memnode in node.mem_node:  # memory events recorded on this node
+            if memnode.type == TracerMemEventType.Allocate or memnode.type == TracerMemEventType.Free:
+                if event_name not in self.allocated_items[memnode.place]:
+                    self.allocated_items[
+                        memnode.place][event_name] = MemorySummary.MemoryItem(
+                            event_name, memnode.place, 'Allocated')
+                self.allocated_items[
+                    memnode.place][event_name].add_memory_record(
+                        memnode.increase_bytes, memnode.type)
+            elif memnode.type == TracerMemEventType.ReservedAllocate or memnode.type == TracerMemEventType.ReservedFree:
+                if event_name not in self.reserved_items[memnode.place]:
+                    self.reserved_items[
+                        memnode.place][event_name] = MemorySummary.MemoryItem(
+                            event_name, memnode.place, 'Reserved')
+                self.reserved_items[
+                    memnode.place][event_name].add_memory_record(
+                        memnode.increase_bytes, memnode.type)
+            self.peak_allocation_values[memnode.place] = max(
+                self.peak_allocation_values[memnode.place],
+                memnode.peak_allocated)
+            self.peak_reserved_values[memnode.place] = max(
+                self.peak_reserved_values[memnode.place], memnode.peak_reserved)
+
+    def parse(self, nodetrees):
+        r"""
+        Analyse memory events in the node trees.
+        """
+        thread2hostnodes = traverse_tree(nodetrees)
+        for threadid, host_nodes in thread2hostnodes.items():
+            for host_node in host_nodes[1:]:  # skip root node
+                if host_node.type == TracerEventType.OperatorInner:
+                    continue
+                if host_node.type == TracerEventType.Operator:
+                    for child in host_node.children_node:
+                        self._analyse_node_memory(host_node.name, child)
+                self._analyse_node_memory(host_node.name, host_node)
+
+
 class StatisticData:
     r"""
     Hold all analysed results.
@@ -614,9 +691,11 @@ class StatisticData:
         self.time_range_summary = TimeRangeSummary()
         self.event_summary = EventSummary()
         self.distributed_summary = DistributedSummary()
+        self.memory_summary = MemorySummary()
         self.time_range_summary.parse(node_trees)
         self.event_summary.parse(node_trees)
         self.distributed_summary.parse(node_trees)
+        self.memory_summary.parse(node_trees)
 
 
 def _build_table(statistic_data,
@@ -1498,4 +1577,76 @@ def _build_table(statistic_data,
         append('')
         append('')
 
+    ###### Print Memory Summary Report ######
+    if statistic_data.memory_summary.allocated_items or statistic_data.memory_summary.reserved_items:
+        for device_type, memory_events in statistic_data.memory_summary.allocated_items.items(
+        ):
+            all_row_values = []
+            sorted_items = sorted(memory_events.items(),
+                                  key=lambda x: x[1].increase_size,
+                                  reverse=True)
+
+            for event_name, item in sorted_items:
+                row_values = [
+                    event_name, item.memory_type, item.allocation_count,
+                    item.free_count, item.allocation_size, item.free_size,
+                    item.increase_size
+                ]
+                all_row_values.append(row_values)
+
+            sorted_reserved_items = sorted(statistic_data.memory_summary.
+                                           reserved_items[device_type].items(),
+                                           key=lambda x: x[1].increase_size,
+                                           reverse=True)
+            for event_name, item in sorted_reserved_items:
+                row_values = [
+                    event_name, item.memory_type, item.allocation_count,
+                    item.free_count, item.allocation_size, item.free_size,
+                    item.increase_size
+                ]
+                all_row_values.append(row_values)
+
+            # Calculate the column width
+            headers = [
+                'Name', 'Type', 'Allocation Count', 'Free Count',
+                'Allocation Size', 'Free Size', 'Increased Size'
+            ]
+            row_format_list = [""]
+            header_sep_list = [""]
+            line_length_list = [-SPACING_SIZE]
+            name_column_width = 50
+            number_column_width = 15
+            add_column(name_column_width)
+            add_column(12)
+            add_column(number_column_width)
+            add_column(number_column_width)
+            add_column(number_column_width)
+            add_column(number_column_width)
+            add_column(number_column_width)
+
+            row_format = row_format_list[0]
+            header_sep = header_sep_list[0]
+            line_length = line_length_list[0]
+
+            # construct table string
+            append(
+                add_title(line_length,
+                          "Memory Summary - {}".format(device_type)))
+            append('Peak Allocated Memory: {}'.format(
+                statistic_data.memory_summary.
+                peak_allocation_values[device_type]))
+            append('Peak Reserved Memory: {}'.format(
+                statistic_data.memory_summary.peak_reserved_values[device_type])
+            )
+            append(header_sep)
+            append(row_format.format(*headers))
+            append(header_sep)
+            for row_values in all_row_values:
+                if isinstance(row_values, str):
+                    append(add_title(line_length, row_values))
+                else:
+                    append(row_format.format(*row_values))
+            append('')
+            append('')
+
     return ''.join(result)
-- 
GitLab
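For context, here is a minimal usage sketch of how the new report surfaces to users, assuming a Paddle build that contains both the C++ and Python halves of this patch; the exact summary arguments may differ between releases:

    import paddle
    import paddle.profiler as profiler

    conv = paddle.nn.Conv2D(3, 8, 3)
    x = paddle.randn([4, 3, 32, 32])

    prof = profiler.Profiler(targets=[profiler.ProfilerTarget.CPU])
    prof.start()
    for _ in range(3):
        y = conv(x)
    prof.stop()

    # With memory events recorded, the printed report now ends with one
    # "Memory Summary - <place>" table per place, listing allocation count,
    # free count, allocation size, free size and increased size per operator,
    # preceded by the peak allocated/reserved values for that place.
    prof.summary(op_detail=True, time_unit='ms')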