From 52d43ca2905c2df7ac9e1a99b06eec6f5835d3e4 Mon Sep 17 00:00:00 2001
From: chenjian
Date: Thu, 30 Jun 2022 16:42:22 +0800
Subject: [PATCH] Add statistic code for memory (#43960)

* add code

* add unit test
---
 paddle/fluid/platform/profiler.cc             | 305 +++++++++++++++---
 paddle/fluid/platform/profiler/mem_tracing.h  |  12 +
 .../unittests/test_profiler_statistic.py      |  39 +++
 python/paddle/profiler/profiler_statistic.py  | 153 ++++++++-
 4 files changed, 472 insertions(+), 37 deletions(-)

diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index ec33e9e8198..38471251ff4 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -308,6 +308,10 @@ RecordOpInfoSupplement::RecordOpInfoSupplement(
       PosixInNsec(), type, input_shapes, dtypes, callstack);
 }
 
+std::map<const char *, std::map<uint64_t, std::vector<uint64_t>>>
+    RecordMemEvent::size_cache;
+std::map<const char *, std::map<uint64_t, bool>>
+    RecordMemEvent::has_initialized;
 RecordMemEvent::RecordMemEvent(const void *ptr,
                                const phi::Place &place,
                                size_t size,
@@ -323,17 +327,75 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
     uint64_t peak_reserved = 0;  // 0 means keep the same as before
     if (platform::is_cpu_place(place) ||
         platform::is_cuda_pinned_place(place)) {
-      current_allocated =
-          HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
-      peak_allocated =
-          HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true;
+      } else {
+        current_allocated =
+            HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
+        peak_allocated =
+            HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0] =
+            current_allocated;
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2] =
+            peak_allocated;
+        current_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
+        peak_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
+      }
+
     } else {
-      current_allocated =
-          DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
-      peak_allocated =
-          DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true;
+      } else {
+        current_allocated =
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
+        peak_allocated =
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0] =
+            current_allocated;
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2] =
+            peak_allocated;
+        current_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
+        peak_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
+      }
     }
-
     platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
                                                         place,
                                                         size,
@@ -349,17 +411,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
     uint64_t peak_allocated = 0;  // 0 means keep the same as before
     if (platform::is_cpu_place(place) ||
         platform::is_cuda_pinned_place(place)) {
-      current_reserved =
-          HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
-      peak_reserved =
-          HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true;
+      } else {
+        current_reserved =
+            HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
+        peak_reserved =
+            HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1] =
+            current_reserved;
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3] =
+            peak_reserved;
+        current_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
+        peak_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
+      }
     } else {
-      current_reserved =
-          DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
-      peak_reserved =
-          DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true;
+      } else {
+        current_reserved =
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
+        peak_reserved =
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1] =
+            current_reserved;
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3] =
+            peak_reserved;
+        current_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
+        peak_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
+      }
     }
-
     platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
                                                         place,
                                                         size,
@@ -375,17 +494,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
     uint64_t peak_reserved = 0;  // 0 means keep the same as before
     if (platform::is_cpu_place(place) ||
         platform::is_cuda_pinned_place(place)) {
-      current_allocated =
-          HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
-      peak_allocated =
-          HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true;
+      } else {
+        current_allocated =
+            HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
+        peak_allocated =
+            HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0] =
+            current_allocated;
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2] =
+            peak_allocated;
+        current_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
+        peak_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
+      }
     } else {
-      current_allocated =
-          DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
-      peak_allocated =
-          DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true;
+      } else {
+        current_allocated =
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
+        peak_allocated =
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0] =
+            current_allocated;
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2] =
+            peak_allocated;
+        current_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
+        peak_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
+      }
     }
-
     platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
                                                        place,
                                                        size,
@@ -401,17 +577,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
     uint64_t peak_allocated = 0;  // 0 means keep the same as before
     if (platform::is_cpu_place(place) ||
         platform::is_cuda_pinned_place(place)) {
-      current_reserved =
-          HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
-      peak_reserved =
-          HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
+            HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true;
+      } else {
+        current_reserved =
+            HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
+        peak_reserved =
+            HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1] =
+            current_reserved;
+        RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3] =
+            peak_reserved;
+        current_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
+        peak_allocated =
+            RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
+      }
     } else {
-      current_reserved =
-          DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
-      peak_reserved =
-          DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+      if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] ==
+          false) {
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
+        current_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
+        current_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
+        peak_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
+        peak_reserved =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
+        RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true;
+      } else {
+        current_reserved =
+            DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
+        peak_reserved =
+            DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1] =
+            current_reserved;
+        RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3] =
+            peak_reserved;
+        current_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
+        peak_allocated =
+            RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
+      }
     }
-
     platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
                                                        place,
                                                        size,
diff --git a/paddle/fluid/platform/profiler/mem_tracing.h b/paddle/fluid/platform/profiler/mem_tracing.h
index 3d3508c7bd5..5b2a2391c2e 100644
--- a/paddle/fluid/platform/profiler/mem_tracing.h
+++ b/paddle/fluid/platform/profiler/mem_tracing.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <map>
 #include <string>
 
 #include "paddle/fluid/platform/place.h"
@@ -37,6 +38,17 @@ class RecordMemEvent {
                  const Place& place,
                  size_t size,
                  const TracerMemEventType type = TracerMemEventType::Allocate);
+
+  // size_cache: In the outer map, the key is the device type, 'cpu' or 'gpu';
+  // in the inner map, the key is the device id.
+  // Values record memory sizes for current_allocated, current_reserved,
+  // peak_allocated and peak_reserved.
+  // has_initialized: Flags to denote whether the memory cache for a device has
+  // been collected once.
+
+  static std::map<const char *, std::map<uint64_t, std::vector<uint64_t>>>
+      size_cache;
+  static std::map<const char *, std::map<uint64_t, bool>> has_initialized;
 };
 
 }  // namespace platform
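The comment above is the key to reading the profiler.cc changes: each device keeps a four-slot vector [current_allocated, current_reserved, peak_allocated, peak_reserved], and every memory event refreshes only the two slots its stat macros observe, reusing the cached values for the other two. A minimal Python sketch of that layout and update rule (the names below are illustrative; the real cache is the pair of static C++ members declared above):

    # Illustrative sketch only: mirrors the C++ size_cache layout.
    # Slots per device: [current_allocated, current_reserved, peak_allocated, peak_reserved]
    size_cache = {"cpu": {}, "gpu": {}}

    def on_allocate(dev_type, dev_id, current_allocated, peak_allocated):
        slots = size_cache[dev_type].setdefault(dev_id, [0, 0, 0, 0])
        slots[0], slots[2] = current_allocated, peak_allocated  # refresh the allocated slots
        return tuple(slots)  # the reserved slots keep their last cached values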
diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
index e5463b1a90d..6481e0f825d 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
@@ -16,6 +16,7 @@ import unittest
 
 import paddle
 import paddle.profiler as profiler
+import paddle.profiler.profiler_statistic as profiler_statistic
 
 
 class HostPythonNode:
@@ -30,6 +31,7 @@ class HostPythonNode:
         self.children_node = []
         self.runtime_node = []
         self.device_node = []
+        self.mem_node = []
 
 
 class DevicePythonNode:
@@ -45,6 +47,22 @@ class DevicePythonNode:
         self.stream_id = stream_id
 
 
+class MemPythonNode:
+    def __init__(self, timestamp_ns, addr, type, process_id, thread_id, increase_bytes, place, current_allocated, \
+                 current_reserved, peak_allocated, peak_reserved):
+        self.timestamp_ns = timestamp_ns
+        self.addr = addr
+        self.type = type
+        self.process_id = process_id
+        self.thread_id = thread_id
+        self.increase_bytes = increase_bytes
+        self.place = place
+        self.current_allocated = current_allocated
+        self.current_reserved = current_reserved
+        self.peak_allocated = peak_allocated
+        self.peak_reserved = peak_reserved
+
+
 class TestProfilerStatistic(unittest.TestCase):
 
     def test_statistic_case1(self):
@@ -89,6 +107,9 @@ class TestProfilerStatistic(unittest.TestCase):
         conv2d_compute = HostPythonNode('conv2d::compute',
                                         profiler.TracerEventType.OperatorInner,
                                         30, 40, 1000, 1001)
+        conv2d_compute.mem_node.append(
+            MemPythonNode(33, 0, profiler_statistic.TracerMemEventType.Allocate,
+                          1000, 1001, 20, 'place(gpu:0)', 200, 200, 800, 800))
         conv2d_launchkernel = HostPythonNode(
             'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35,
             1000, 1001)
@@ -211,6 +232,24 @@ class TestProfilerStatistic(unittest.TestCase):
         self.assertEqual(
             event_summary.memory_manipulation_items['AsyncMemcpy'].
             general_gpu_time, 60)
+        self.assertEqual(
+            statistic_data.memory_summary.allocated_items['place(gpu:0)']
+            ['conv2d'].allocation_count, 1)
+        self.assertEqual(
+            statistic_data.memory_summary.allocated_items['place(gpu:0)']
+            ['conv2d'].allocation_size, 20)
+        self.assertEqual(
+            statistic_data.memory_summary.allocated_items['place(gpu:0)']
+            ['conv2d'].increase_size, 20)
+        self.assertEqual(
+            statistic_data.memory_summary.allocated_items['place(gpu:0)']
+            ['conv2d'].increase_size, 20)
+        self.assertEqual(
+            statistic_data.memory_summary.
+            peak_allocation_values['place(gpu:0)'], 800)
+        self.assertEqual(
+            statistic_data.memory_summary.peak_reserved_values['place(gpu:0)'],
+            800)
         print(
             profiler.profiler_statistic._build_table(
                 statistic_data,
diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py
index daa6925c4b9..f33335c907d 100755
--- a/python/paddle/profiler/profiler_statistic.py
+++ b/python/paddle/profiler/profiler_statistic.py
@@ -15,7 +15,7 @@ import collections
 from enum import Enum
 import re
 
-from paddle.fluid.core import TracerEventType
+from paddle.fluid.core import TracerEventType, TracerMemEventType
 
 from .statistic_helper import *
@@ -603,6 +603,83 @@ class EventSummary:
             self.kernel_items[name].add_item(device_node)
 
 
+class MemorySummary:
+    r"""
+    Analyse memory events in profiling data.
+    """
+
+    class MemoryItem:
+
+        def __init__(self, event_name, place, memory_type='Allocated'):
+            self.event_name = event_name
+            self.place = place
+            self.allocation_count = 0
+            self.free_count = 0
+            self.allocation_size = 0
+            self.free_size = 0
+            self.increase_size = 0
+            self.memory_type = memory_type
+
+        def add_memory_record(self, size, allocation_type):
+            if allocation_type == TracerMemEventType.Allocate or allocation_type == TracerMemEventType.ReservedAllocate:
+                self.allocation_count += 1
+                self.allocation_size += size
+
+            elif allocation_type == TracerMemEventType.Free or allocation_type == TracerMemEventType.ReservedFree:
+                self.free_count += 1
+                self.free_size -= size  # size is negative when freeing.
+
+            else:
+                print("No corresponding type.")
+            self.increase_size = self.allocation_size - self.free_size
+
+    def __init__(self):
+        self.allocated_items = collections.defaultdict(
+            dict)  # for memory summary, place: {event name: MemoryItem}
+        self.reserved_items = collections.defaultdict(
+            dict)  # for memory summary, place: {event name: MemoryItem}
+        self.peak_allocation_values = collections.defaultdict(int)
+        self.peak_reserved_values = collections.defaultdict(int)
+
+    def _analyse_node_memory(self, event_name, node):
+        for memnode in node.mem_node:  # memory events recorded on this node
+            if memnode.type == TracerMemEventType.Allocate or memnode.type == TracerMemEventType.Free:
+                if event_name not in self.allocated_items[memnode.place]:
+                    self.allocated_items[
+                        memnode.place][event_name] = MemorySummary.MemoryItem(
+                            event_name, memnode.place, 'Allocated')
+                self.allocated_items[
+                    memnode.place][event_name].add_memory_record(
+                        memnode.increase_bytes, memnode.type)
+            elif memnode.type == TracerMemEventType.ReservedAllocate or memnode.type == TracerMemEventType.ReservedFree:
+                if event_name not in self.reserved_items[memnode.place]:
+                    self.reserved_items[
+                        memnode.place][event_name] = MemorySummary.MemoryItem(
+                            event_name, memnode.place, 'Reserved')
+                self.reserved_items[
+                    memnode.place][event_name].add_memory_record(
+                        memnode.increase_bytes, memnode.type)
+            self.peak_allocation_values[memnode.place] = max(
+                self.peak_allocation_values[memnode.place],
+                memnode.peak_allocated)
+            self.peak_reserved_values[memnode.place] = max(
+                self.peak_reserved_values[memnode.place], memnode.peak_reserved)
+
+    def parse(self, nodetrees):
+        r"""
+        Analyse memory events in the node trees.
+        """
+        thread2hostnodes = traverse_tree(nodetrees)
+        for threadid, host_nodes in thread2hostnodes.items():
+            for host_node in host_nodes[1:]:  # skip root node
+                if host_node.type == TracerEventType.OperatorInner:
+                    continue
+                if host_node.type == TracerEventType.Operator:
+                    for child in host_node.children_node:
+                        self._analyse_node_memory(host_node.name, child)
+                self._analyse_node_memory(host_node.name, host_node)
+
+
 class StatisticData:
     r"""
     Hold all analysed results.
@@ -614,9 +691,11 @@ class StatisticData:
         self.time_range_summary = TimeRangeSummary()
         self.event_summary = EventSummary()
         self.distributed_summary = DistributedSummary()
+        self.memory_summary = MemorySummary()
         self.time_range_summary.parse(node_trees)
         self.event_summary.parse(node_trees)
         self.distributed_summary.parse(node_trees)
+        self.memory_summary.parse(node_trees)
 
 
 def _build_table(statistic_data,
@@ -1498,4 +1577,76 @@ def _build_table(statistic_data,
         append('')
         append('')
 
+    ###### Print Memory Summary Report ######
+    if statistic_data.memory_summary.allocated_items or statistic_data.memory_summary.reserved_items:
+        for device_type, memory_events in statistic_data.memory_summary.allocated_items.items(
+        ):
+            all_row_values = []
+            sorted_items = sorted(memory_events.items(),
+                                  key=lambda x: x[1].increase_size,
+                                  reverse=True)
+
+            for event_name, item in sorted_items:
+                row_values = [
+                    event_name, item.memory_type, item.allocation_count,
+                    item.free_count, item.allocation_size, item.free_size,
+                    item.increase_size
+                ]
+                all_row_values.append(row_values)
+
+            sorted_reserved_items = sorted(statistic_data.memory_summary.
+                                           reserved_items[device_type].items(),
+                                           key=lambda x: x[1].increase_size,
+                                           reverse=True)
+            for event_name, item in sorted_reserved_items:
+                row_values = [
+                    event_name, item.memory_type, item.allocation_count,
+                    item.free_count, item.allocation_size, item.free_size,
+                    item.increase_size
+                ]
+                all_row_values.append(row_values)
+
+            # Calculate the column width
+            headers = [
+                'Name', 'Type', 'Allocation Count', 'Free Count',
+                'Allocation Size', 'Free Size', 'Increased Size'
+            ]
+            row_format_list = [""]
+            header_sep_list = [""]
+            line_length_list = [-SPACING_SIZE]
+            name_column_width = 50
+            number_column_width = 15
+            add_column(name_column_width)
+            add_column(12)
+            add_column(number_column_width)
+            add_column(number_column_width)
+            add_column(number_column_width)
+            add_column(number_column_width)
+            add_column(number_column_width)
+
+            row_format = row_format_list[0]
+            header_sep = header_sep_list[0]
+            line_length = line_length_list[0]
+
+            # construct table string
+            append(
+                add_title(line_length,
+                          "Memory Summary - {}".format(device_type)))
+            append('Peak Allocated Memory: {}'.format(
+                statistic_data.memory_summary.
+                peak_allocation_values[device_type]))
+            append('Peak Reserved Memory: {}'.format(
+                statistic_data.memory_summary.peak_reserved_values[device_type])
+            )
+            append(header_sep)
+            append(row_format.format(*headers))
+            append(header_sep)
+            for row_values in all_row_values:
+                if isinstance(row_values, str):
+                    append(add_title(line_length, row_values))
+                else:
+                    append(row_format.format(*row_values))
+            append('')
+            append('')
+
     return ''.join(result)
-- 
GitLab
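For context, here is a minimal usage sketch of how the new report surfaces to users, assuming a Paddle build that contains both the C++ and Python halves of this patch; the exact summary arguments may differ between releases:

    import paddle
    import paddle.profiler as profiler

    conv = paddle.nn.Conv2D(3, 8, 3)
    x = paddle.randn([4, 3, 32, 32])

    prof = profiler.Profiler(targets=[profiler.ProfilerTarget.CPU])
    prof.start()
    for _ in range(3):
        y = conv(x)
    prof.stop()

    # With memory events recorded, the printed report now ends with one
    # "Memory Summary - <place>" table per place, listing allocation count,
    # free count, allocation size, free size and increased size per operator,
    # preceded by the peak allocated/reserved values for that place.
    prof.summary(op_detail=True, time_unit='ms')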