未验证 提交 52d43ca2 编写于 作者: C chenjian 提交者: GitHub

Add statistic code for memory (#43960)

* add code

* add unit test
上级 35ca3009
...@@ -308,6 +308,10 @@ RecordOpInfoSupplement::RecordOpInfoSupplement( ...@@ -308,6 +308,10 @@ RecordOpInfoSupplement::RecordOpInfoSupplement(
PosixInNsec(), type, input_shapes, dtypes, callstack); PosixInNsec(), type, input_shapes, dtypes, callstack);
} }
// Definitions of RecordMemEvent's static caches (declared in the header).
// size_cache[device_type][device_id] holds the last observed statistics in
// the order: current_allocated, current_reserved, peak_allocated,
// peak_reserved.
// NOTE(review): the outer key is a const char*, so std::map compares pointer
// identity, not string contents; this only works while every access uses the
// same in-binary string literals "cpu"/"gpu" — consider std::string keys.
std::map<const char *, std::map<uint64_t, std::vector<uint64_t>>>
RecordMemEvent::size_cache;
std::map<const char *, std::map<uint64_t, bool>>
RecordMemEvent::has_initialized;
RecordMemEvent::RecordMemEvent(const void *ptr, RecordMemEvent::RecordMemEvent(const void *ptr,
const phi::Place &place, const phi::Place &place,
size_t size, size_t size,
...@@ -323,17 +327,75 @@ RecordMemEvent::RecordMemEvent(const void *ptr, ...@@ -323,17 +327,75 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
uint64_t peak_reserved = 0; // 0 means keep the same as before uint64_t peak_reserved = 0; // 0 means keep the same as before
if (platform::is_cpu_place(place) || if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) { platform::is_cuda_pinned_place(place)) {
current_allocated = if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] ==
HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); false) {
peak_allocated = RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
current_allocated =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
current_reserved =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
peak_allocated =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
peak_reserved =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true;
} else {
current_allocated =
HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
peak_allocated =
HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0] =
current_allocated;
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2] =
peak_allocated;
current_reserved =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
peak_reserved =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
}
} else { } else {
current_allocated = if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] ==
DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); false) {
peak_allocated = RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
current_allocated =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
current_reserved =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
peak_allocated =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
peak_reserved =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true;
} else {
current_allocated =
DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
peak_allocated =
DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0] =
current_allocated;
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2] =
peak_allocated;
current_reserved =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
peak_reserved =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
}
} }
platform::MemEvenRecorder::Instance().PushMemRecord(ptr, platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
place, place,
size, size,
...@@ -349,17 +411,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr, ...@@ -349,17 +411,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
uint64_t peak_allocated = 0; // 0 means keep the same as before uint64_t peak_allocated = 0; // 0 means keep the same as before
if (platform::is_cpu_place(place) || if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) { platform::is_cuda_pinned_place(place)) {
current_reserved = if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] ==
HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); false) {
peak_reserved = RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
current_allocated =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
current_reserved =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
peak_allocated =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
peak_reserved =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true;
} else {
current_reserved =
HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
peak_reserved =
HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1] =
current_reserved;
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3] =
peak_reserved;
current_allocated =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
peak_allocated =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
}
} else { } else {
current_reserved = if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] ==
DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); false) {
peak_reserved = RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
current_allocated =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
current_reserved =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
peak_allocated =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
peak_reserved =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true;
} else {
current_reserved =
DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
peak_reserved =
DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1] =
current_reserved;
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3] =
peak_reserved;
current_allocated =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
peak_allocated =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
}
} }
platform::MemEvenRecorder::Instance().PushMemRecord(ptr, platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
place, place,
size, size,
...@@ -375,17 +494,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr, ...@@ -375,17 +494,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
uint64_t peak_reserved = 0; // 0 means keep the same as before uint64_t peak_reserved = 0; // 0 means keep the same as before
if (platform::is_cpu_place(place) || if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) { platform::is_cuda_pinned_place(place)) {
current_allocated = if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] ==
HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); false) {
peak_allocated = RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
current_allocated =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
current_reserved =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
peak_allocated =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
peak_reserved =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true;
} else {
current_allocated =
HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
peak_allocated =
HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0] =
current_allocated;
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2] =
peak_allocated;
current_reserved =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
peak_reserved =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
}
} else { } else {
current_allocated = if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] ==
DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); false) {
peak_allocated = RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
current_allocated =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
current_reserved =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
peak_allocated =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
peak_reserved =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true;
} else {
current_allocated =
DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
peak_allocated =
DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0] =
current_allocated;
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2] =
peak_allocated;
current_reserved =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
peak_reserved =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
}
} }
platform::MemEvenRecorder::Instance().PopMemRecord(ptr, platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
place, place,
size, size,
...@@ -401,17 +577,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr, ...@@ -401,17 +577,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
uint64_t peak_allocated = 0; // 0 means keep the same as before uint64_t peak_allocated = 0; // 0 means keep the same as before
if (platform::is_cpu_place(place) || if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) { platform::is_cuda_pinned_place(place)) {
current_reserved = if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] ==
HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); false) {
peak_reserved = RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back(
HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
current_allocated =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
current_reserved =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1];
peak_allocated =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
peak_reserved =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3];
RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true;
} else {
current_reserved =
HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
peak_reserved =
HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1] =
current_reserved;
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3] =
peak_reserved;
current_allocated =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0];
peak_allocated =
RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2];
}
} else { } else {
current_reserved = if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] ==
DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); false) {
peak_reserved = RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()));
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()));
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back(
DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()));
current_allocated =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
current_reserved =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1];
peak_allocated =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
peak_reserved =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true;
} else {
current_reserved =
DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
peak_reserved =
DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1] =
current_reserved;
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3] =
peak_reserved;
current_allocated =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0];
peak_allocated =
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
}
} }
platform::MemEvenRecorder::Instance().PopMemRecord(ptr, platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
place, place,
size, size,
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <map>
#include <string> #include <string>
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
...@@ -37,6 +38,17 @@ class RecordMemEvent { ...@@ -37,6 +38,17 @@ class RecordMemEvent {
const Place& place, const Place& place,
size_t size, size_t size,
const TracerMemEventType type = TracerMemEventType::Allocate); const TracerMemEventType type = TracerMemEventType::Allocate);
// size_cache: Per-device cache of the last observed memory statistics.
// Outer map key is the device type literal, "cpu" or "gpu"; inner map key
// is the device id. The cached vector holds, in order: current_allocated,
// current_reserved, peak_allocated and peak_reserved.
// has_initialized: Flags denoting whether the cache for a given device has
// been populated at least once.
// NOTE(review): the const char* key compares pointer identity rather than
// string contents; correct only while all lookups use the same string
// literals — confirm, or switch to std::string keys.
static std::map<const char*, std::map<uint64_t, std::vector<uint64_t>>>
size_cache;
static std::map<const char*, std::map<uint64_t, bool>> has_initialized;
}; };
} // namespace platform } // namespace platform
......
...@@ -16,6 +16,7 @@ import unittest ...@@ -16,6 +16,7 @@ import unittest
import paddle import paddle
import paddle.profiler as profiler import paddle.profiler as profiler
import paddle.profiler.profiler_statistic as profiler_statistic
class HostPythonNode: class HostPythonNode:
...@@ -30,6 +31,7 @@ class HostPythonNode: ...@@ -30,6 +31,7 @@ class HostPythonNode:
self.children_node = [] self.children_node = []
self.runtime_node = [] self.runtime_node = []
self.device_node = [] self.device_node = []
self.mem_node = []
class DevicePythonNode: class DevicePythonNode:
...@@ -45,6 +47,22 @@ class DevicePythonNode: ...@@ -45,6 +47,22 @@ class DevicePythonNode:
self.stream_id = stream_id self.stream_id = stream_id
class MemPythonNode:
    """Plain record of a single memory event for building test node trees.

    Mirrors the fields of a profiler memory event: when/where the event
    happened, which process/thread emitted it, how many bytes it changed,
    and the allocator statistics snapshot at that moment.
    """

    def __init__(self, timestamp_ns, addr, type, process_id, thread_id,
                 increase_bytes, place, current_allocated, current_reserved,
                 peak_allocated, peak_reserved):
        # Event identity: time, address and event kind.
        self.timestamp_ns = timestamp_ns
        self.addr = addr
        self.type = type
        # Origin of the event.
        self.process_id = process_id
        self.thread_id = thread_id
        # Size delta (negative for frees) and target device place.
        self.increase_bytes = increase_bytes
        self.place = place
        # Allocator statistics snapshot at event time.
        self.current_allocated = current_allocated
        self.current_reserved = current_reserved
        self.peak_allocated = peak_allocated
        self.peak_reserved = peak_reserved
class TestProfilerStatistic(unittest.TestCase): class TestProfilerStatistic(unittest.TestCase):
def test_statistic_case1(self): def test_statistic_case1(self):
...@@ -89,6 +107,9 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -89,6 +107,9 @@ class TestProfilerStatistic(unittest.TestCase):
conv2d_compute = HostPythonNode('conv2d::compute', conv2d_compute = HostPythonNode('conv2d::compute',
profiler.TracerEventType.OperatorInner, profiler.TracerEventType.OperatorInner,
30, 40, 1000, 1001) 30, 40, 1000, 1001)
conv2d_compute.mem_node.append(
MemPythonNode(33, 0, profiler_statistic.TracerMemEventType.Allocate,
1000, 1001, 20, 'place(gpu:0)', 200, 200, 800, 800))
conv2d_launchkernel = HostPythonNode( conv2d_launchkernel = HostPythonNode(
'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35, 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35,
1000, 1001) 1000, 1001)
...@@ -211,6 +232,24 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -211,6 +232,24 @@ class TestProfilerStatistic(unittest.TestCase):
self.assertEqual( self.assertEqual(
event_summary.memory_manipulation_items['AsyncMemcpy']. event_summary.memory_manipulation_items['AsyncMemcpy'].
general_gpu_time, 60) general_gpu_time, 60)
self.assertEqual(
statistic_data.memory_summary.allocated_items['place(gpu:0)']
['conv2d'].allocation_count, 1)
self.assertEqual(
statistic_data.memory_summary.allocated_items['place(gpu:0)']
['conv2d'].allocation_size, 20)
self.assertEqual(
statistic_data.memory_summary.allocated_items['place(gpu:0)']
['conv2d'].increase_size, 20)
self.assertEqual(
statistic_data.memory_summary.allocated_items['place(gpu:0)']
['conv2d'].increase_size, 20)
self.assertEqual(
statistic_data.memory_summary.
peak_allocation_values['place(gpu:0)'], 800)
self.assertEqual(
statistic_data.memory_summary.peak_reserved_values['place(gpu:0)'],
800)
print( print(
profiler.profiler_statistic._build_table( profiler.profiler_statistic._build_table(
statistic_data, statistic_data,
......
...@@ -15,7 +15,7 @@ import collections ...@@ -15,7 +15,7 @@ import collections
from enum import Enum from enum import Enum
import re import re
from paddle.fluid.core import TracerEventType from paddle.fluid.core import TracerEventType, TracerMemEventType
from .statistic_helper import * from .statistic_helper import *
...@@ -603,6 +603,83 @@ class EventSummary: ...@@ -603,6 +603,83 @@ class EventSummary:
self.kernel_items[name].add_item(device_node) self.kernel_items[name].add_item(device_node)
class MemorySummary:
    r"""
    Analyse memory events in profiling data.
    """

    class MemoryItem:
        """Aggregated allocation/free statistics for one event name on one place."""

        def __init__(self, event_name, place, memory_type='Allocated'):
            self.event_name = event_name
            self.place = place
            self.memory_type = memory_type
            self.allocation_count = 0
            self.allocation_size = 0
            self.free_count = 0
            self.free_size = 0
            self.increase_size = 0

        def add_memory_record(self, size, allocation_type):
            # Fold a single memory event into the running totals.
            if allocation_type in (TracerMemEventType.Allocate,
                                   TracerMemEventType.ReservedAllocate):
                self.allocation_count += 1
                self.allocation_size += size
            elif allocation_type in (TracerMemEventType.Free,
                                     TracerMemEventType.ReservedFree):
                self.free_count += 1
                self.free_size -= size  # size is sign(-) when free.
            else:
                print("No corresponding type.")
            self.increase_size = self.allocation_size - self.free_size

    def __init__(self):
        # place -> {event_name: MemoryItem}, split by allocated vs reserved.
        self.allocated_items = collections.defaultdict(
            dict)  # for memory summary, device type: event
        self.reserved_items = collections.defaultdict(
            dict)  # for memory summary, device type: event
        # place -> highest peak value seen across all events.
        self.peak_allocation_values = collections.defaultdict(int)
        self.peak_reserved_values = collections.defaultdict(int)

    def _analyse_node_memory(self, event_name, node):
        # Attribute every memory event hanging directly off `node` to
        # `event_name`, and track per-place peak values.
        for memnode in node.mem_node:  # self mem node
            if memnode.type in (TracerMemEventType.Allocate,
                                TracerMemEventType.Free):
                per_place = self.allocated_items[memnode.place]
                if event_name not in per_place:
                    per_place[event_name] = MemorySummary.MemoryItem(
                        event_name, memnode.place, 'Allocated')
                per_place[event_name].add_memory_record(
                    memnode.increase_bytes, memnode.type)
            elif memnode.type in (TracerMemEventType.ReservedAllocate,
                                  TracerMemEventType.ReservedFree):
                per_place = self.reserved_items[memnode.place]
                if event_name not in per_place:
                    per_place[event_name] = MemorySummary.MemoryItem(
                        event_name, memnode.place, 'Reserved')
                per_place[event_name].add_memory_record(
                    memnode.increase_bytes, memnode.type)
            self.peak_allocation_values[memnode.place] = max(
                self.peak_allocation_values[memnode.place],
                memnode.peak_allocated)
            self.peak_reserved_values[memnode.place] = max(
                self.peak_reserved_values[memnode.place],
                memnode.peak_reserved)

    def parse(self, nodetrees):
        r"""
        Analyse memory events in the node trees.
        """
        thread2hostnodes = traverse_tree(nodetrees)
        for host_nodes in thread2hostnodes.values():
            for host_node in host_nodes[1:]:  # skip root node
                if host_node.type == TracerEventType.OperatorInner:
                    continue
                if host_node.type == TracerEventType.Operator:
                    # Operators also absorb the memory events of their
                    # immediate children.
                    for child in host_node.children_node:
                        self._analyse_node_memory(host_node.name, child)
                self._analyse_node_memory(host_node.name, host_node)
class StatisticData: class StatisticData:
r""" r"""
Hold all analysed results. Hold all analysed results.
...@@ -614,9 +691,11 @@ class StatisticData: ...@@ -614,9 +691,11 @@ class StatisticData:
self.time_range_summary = TimeRangeSummary() self.time_range_summary = TimeRangeSummary()
self.event_summary = EventSummary() self.event_summary = EventSummary()
self.distributed_summary = DistributedSummary() self.distributed_summary = DistributedSummary()
self.memory_summary = MemorySummary()
self.time_range_summary.parse(node_trees) self.time_range_summary.parse(node_trees)
self.event_summary.parse(node_trees) self.event_summary.parse(node_trees)
self.distributed_summary.parse(node_trees) self.distributed_summary.parse(node_trees)
self.memory_summary.parse(node_trees)
def _build_table(statistic_data, def _build_table(statistic_data,
...@@ -1498,4 +1577,76 @@ def _build_table(statistic_data, ...@@ -1498,4 +1577,76 @@ def _build_table(statistic_data,
append('') append('')
append('') append('')
###### Print Memory Summary Report ######
if statistic_data.memory_summary.allocated_items or statistic_data.memory_summary.reserved_items:
for device_type, memory_events in statistic_data.memory_summary.allocated_items.items(
):
all_row_values = []
sorted_items = sorted(memory_events.items(),
key=lambda x: x[1].increase_size,
reverse=True)
for event_name, item in sorted_items:
row_values = [
event_name, item.memory_type, item.allocation_count,
item.free_count, item.allocation_size, item.free_size,
item.increase_size
]
all_row_values.append(row_values)
sorted_reserved_items = sorted(statistic_data.memory_summary.
reserved_items[device_type].items(),
key=lambda x: x[1].increase_size,
reverse=True)
for event_name, item in sorted_reserved_items:
row_values = [
event_name, item.memory_type, item.allocation_count,
item.free_count, item.allocation_size, item.free_size,
item.increase_size
]
all_row_values.append(row_values)
# Calculate the column width
headers = [
'Name', 'Type', 'Allocation Count', 'Free Count',
'Allocation Size', 'Free Size', 'Increased Size'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 50
number_column_width = 15
add_column(name_column_width)
add_column(12)
add_column(number_column_width)
add_column(number_column_width)
add_column(number_column_width)
add_column(number_column_width)
add_column(number_column_width)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(
add_title(line_length,
"Memory Summary - {}".format(device_type)))
append('Peak Allocated Memory: {}'.format(
statistic_data.memory_summary.
peak_allocation_values[device_type]))
append('Peak Reserved Memory: {}'.format(
statistic_data.memory_summary.peak_reserved_values[device_type])
)
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
for row_values in all_row_values:
if isinstance(row_values, str):
append(add_title(line_length, row_values))
else:
append(row_format.format(*row_values))
append('')
append('')
return ''.join(result) return ''.join(result)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册