未验证 提交 2ea5e02c 编写于 作者: C chenjian 提交者: GitHub

[cherry-pick] Refine user experience for profiler (#41989)

* fix divide zero error when cpu only (#41794)

* reduce performance influence by RecordEvent in Python (#41822)

* reduce performance influence

* add unit test

* fix

* Rebase for profiler statistic ratio (#41939)

* fix according to suggestion

* add kernel summary

* improve coverage
上级 85a4ecb6
...@@ -20,6 +20,7 @@ import tempfile ...@@ -20,6 +20,7 @@ import tempfile
import paddle import paddle
import paddle.profiler as profiler import paddle.profiler as profiler
import paddle.profiler.utils as utils
import paddle.nn as nn import paddle.nn as nn
import paddle.nn.functional as F import paddle.nn.functional as F
from paddle.io import Dataset, DataLoader from paddle.io import Dataset, DataLoader
...@@ -40,11 +41,17 @@ class TestProfiler(unittest.TestCase): ...@@ -40,11 +41,17 @@ class TestProfiler(unittest.TestCase):
with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU], ) as prof: with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU], ) as prof:
y = x / 2.0 y = x / 2.0
prof = None prof = None
self.assertEqual(utils._is_profiler_used, False)
with profiler.RecordEvent(name='test'):
y = x / 2.0
with profiler.Profiler( with profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU], targets=[profiler.ProfilerTarget.CPU],
scheduler=(1, 2)) as prof: scheduler=(1, 2)) as prof:
self.assertEqual(utils._is_profiler_used, True)
with profiler.RecordEvent(name='test'): with profiler.RecordEvent(name='test'):
y = x / 2.0 y = x / 2.0
prof = None prof = None
with profiler.Profiler( with profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU], targets=[profiler.ProfilerTarget.CPU],
......
...@@ -51,8 +51,9 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -51,8 +51,9 @@ class TestProfilerStatistic(unittest.TestCase):
profilerstep_node = HostPythonNode('ProfileStep#1', profilerstep_node = HostPythonNode('ProfileStep#1',
profiler.TracerEventType.ProfileStep, profiler.TracerEventType.ProfileStep,
0, 400, 1000, 1001) 0, 400, 1000, 1001)
dataloader_node = HostPythonNode( dataloader_node = HostPythonNode('Dataloader',
'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001) profiler.TracerEventType.Dataloader, 5,
15, 1000, 1001)
mobilenet_node = HostPythonNode( mobilenet_node = HostPythonNode(
'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
yolonet_node = HostPythonNode( yolonet_node = HostPythonNode(
...@@ -155,7 +156,7 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -155,7 +156,7 @@ class TestProfilerStatistic(unittest.TestCase):
profiler.TracerEventType.ProfileStep), 400) profiler.TracerEventType.ProfileStep), 400)
self.assertEqual( self.assertEqual(
time_range_summary.get_cpu_range_sum( time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Forward), 100) profiler.TracerEventType.Forward), 90)
self.assertEqual( self.assertEqual(
time_range_summary.get_cpu_range_sum( time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Backward), 80) profiler.TracerEventType.Backward), 80)
...@@ -185,12 +186,12 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -185,12 +186,12 @@ class TestProfilerStatistic(unittest.TestCase):
profiler.TracerEventType.Communication), 5) profiler.TracerEventType.Communication), 5)
self.assertEqual(len(event_summary.items), 2) self.assertEqual(len(event_summary.items), 2)
self.assertEqual(len(event_summary.userdefined_items), 1) self.assertEqual(len(event_summary.userdefined_items), 1)
self.assertEqual(len(event_summary.model_perspective_items), 4) self.assertEqual(len(event_summary.model_perspective_items), 5)
self.assertEqual(len(event_summary.memory_manipulation_items), 1) self.assertEqual(len(event_summary.memory_manipulation_items), 1)
self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25) self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25)
self.assertEqual( self.assertEqual(
event_summary.model_perspective_items['Forward'].cpu_time, 100) event_summary.model_perspective_items['Forward'].cpu_time, 90)
self.assertEqual( self.assertEqual(
event_summary.model_perspective_items['Forward'].general_gpu_time, event_summary.model_perspective_items['Forward'].general_gpu_time,
135) 135)
...@@ -217,8 +218,9 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -217,8 +218,9 @@ class TestProfilerStatistic(unittest.TestCase):
profiler.TracerEventType.ProfileStep, profiler.TracerEventType.ProfileStep,
0, 400, 1000, 1001) 0, 400, 1000, 1001)
dataloader_node = HostPythonNode( dataloader_node = HostPythonNode('Dataloader',
'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001) profiler.TracerEventType.Dataloader, 5,
15, 1000, 1001)
mobilenet_node = HostPythonNode( mobilenet_node = HostPythonNode(
'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
...@@ -372,7 +374,7 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -372,7 +374,7 @@ class TestProfilerStatistic(unittest.TestCase):
profiler.TracerEventType.ProfileStep), 400) profiler.TracerEventType.ProfileStep), 400)
self.assertEqual( self.assertEqual(
time_range_summary.get_cpu_range_sum( time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Forward), 100) profiler.TracerEventType.Forward), 90)
self.assertEqual( self.assertEqual(
time_range_summary.get_cpu_range_sum( time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Backward), 80) profiler.TracerEventType.Backward), 80)
...@@ -417,12 +419,12 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -417,12 +419,12 @@ class TestProfilerStatistic(unittest.TestCase):
distributed_summary.overlap_range), 85) distributed_summary.overlap_range), 85)
self.assertEqual(len(event_summary.items), 4) self.assertEqual(len(event_summary.items), 4)
self.assertEqual(len(event_summary.userdefined_items), 1) self.assertEqual(len(event_summary.userdefined_items), 1)
self.assertEqual(len(event_summary.model_perspective_items), 4) self.assertEqual(len(event_summary.model_perspective_items), 5)
self.assertEqual(len(event_summary.memory_manipulation_items), 1) self.assertEqual(len(event_summary.memory_manipulation_items), 1)
self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25) self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25)
self.assertEqual( self.assertEqual(
event_summary.model_perspective_items['Forward'].cpu_time, 100) event_summary.model_perspective_items['Forward'].cpu_time, 90)
self.assertEqual( self.assertEqual(
event_summary.model_perspective_items['Forward'].general_gpu_time, event_summary.model_perspective_items['Forward'].general_gpu_time,
315) 315)
...@@ -441,6 +443,86 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -441,6 +443,86 @@ class TestProfilerStatistic(unittest.TestCase):
thread_sep=False, thread_sep=False,
time_unit='ms')) time_unit='ms'))
def test_statistic_case3(self):
    # Coverage test: build a synthetic profiler tree in which every
    # operator/kernel event has zero duration (start == end), to exercise
    # the divide-by-zero guards in the statistic / _build_table code paths.
    root_node = HostPythonNode('Root Node',
                               profiler.TracerEventType.UserDefined, 0,
                               float('inf'), 1000, 1001)
    profilerstep_node = HostPythonNode('ProfileStep#1',
                                       profiler.TracerEventType.ProfileStep,
                                       0, 400, 1000, 1001)
    dataloader_node = HostPythonNode('Dataloader',
                                     profiler.TracerEventType.Dataloader, 5,
                                     15, 1000, 1001)
    mobilenet_node = HostPythonNode(
        'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
    backward_node = HostPythonNode('Gradient Backward',
                                   profiler.TracerEventType.Backward, 120,
                                   200, 1000, 1001)
    optimization_node = HostPythonNode(
        'Optimization', profiler.TracerEventType.Optimization, 220, 300,
        1000, 1001)
    userdefined_node = HostPythonNode('Communication Time',
                                      profiler.TracerEventType.UserDefined,
                                      60, 70, 1000, 1001)
    # Zero-duration operator (25, 25) and its inner events — the cases the
    # ratio computations must survive without raising ZeroDivisionError.
    conv2d_node = HostPythonNode(
        'conv2d', profiler.TracerEventType.Operator, 25, 25, 1000, 1001)
    conv2d_infer_shape = HostPythonNode(
        'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25,
        25, 1000, 1001)
    conv2d_compute = HostPythonNode('conv2d::compute',
                                    profiler.TracerEventType.OperatorInner,
                                    25, 25, 1000, 1001)
    conv2d_launchkernel = HostPythonNode(
        'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 25, 25,
        1000, 1001)
    # Zero-duration device kernels (35, 35); the long templated name also
    # exercises kernel-name truncation in the Kernel Summary table.
    conv2d_kernel = DevicePythonNode(
        'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
    another_kernel = DevicePythonNode(
        'void phi::funcs::VectorizedBroadcastKernel<float, float, phi::funcs::AddFunctor<float>, phi::funcs::AddFunctor<float>>()',
        profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
    # Wire the tree: root -> step -> {dataloader, forward, userdefined,
    # backward, optimization}; conv2d hangs under MobileNet with a runtime
    # launch that owns both device kernels.
    root_node.children_node.append(profilerstep_node)
    profilerstep_node.children_node.extend([
        dataloader_node, mobilenet_node, userdefined_node, backward_node,
        optimization_node
    ])
    mobilenet_node.children_node.append(conv2d_node)
    conv2d_node.children_node.extend([conv2d_infer_shape, conv2d_compute])
    conv2d_compute.runtime_node.append(conv2d_launchkernel)
    conv2d_launchkernel.device_node.append(conv2d_kernel)
    conv2d_launchkernel.device_node.append(another_kernel)
    thread_tree = {'thread1001': root_node}
    extra_info = {
        'Process Cpu Utilization': '1.02',
        'System Cpu Utilization': '0.68'
    }
    statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
                                                               extra_info)
    time_range_summary = statistic_data.time_range_summary
    event_summary = statistic_data.event_summary
    # All aggregated times must come out as 0 (not crash).
    self.assertEqual(event_summary.items['conv2d'].cpu_time, 0)
    self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 0)
    self.assertEqual(event_summary.userdefined_items['Communication Time']
                     .general_gpu_time, 0)
    # Render the report under every sort key: each branch contains ratio
    # divisions that must be guarded when the denominator is 0.
    for sort_key in [
            profiler.SortedKeys.CPUTotal, profiler.SortedKeys.CPUMax,
            profiler.SortedKeys.CPUMin, profiler.SortedKeys.CPUAvg,
            profiler.SortedKeys.GPUTotal, profiler.SortedKeys.GPUMax,
            profiler.SortedKeys.GPUMin, profiler.SortedKeys.GPUAvg
    ]:
        print(
            profiler.profiler_statistic._build_table(
                statistic_data,
                sorted_by=sort_key,
                op_detail=True,
                thread_sep=False,
                time_unit='ms'))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -27,6 +27,7 @@ from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions, ...@@ -27,6 +27,7 @@ from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions,
from .utils import RecordEvent, wrap_optimizers from .utils import RecordEvent, wrap_optimizers
from .profiler_statistic import StatisticData, _build_table, SortedKeys from .profiler_statistic import StatisticData, _build_table, SortedKeys
from paddle.profiler import utils
from .timer import benchmark from .timer import benchmark
...@@ -475,6 +476,7 @@ class Profiler: ...@@ -475,6 +476,7 @@ class Profiler:
if self.timer_only: if self.timer_only:
return return
# CLOSED -> self.current_state # CLOSED -> self.current_state
utils._is_profiler_used = True
if self.current_state == ProfilerState.READY: if self.current_state == ProfilerState.READY:
self.profiler.prepare() self.profiler.prepare()
elif self.current_state == ProfilerState.RECORD: elif self.current_state == ProfilerState.RECORD:
...@@ -527,6 +529,7 @@ class Profiler: ...@@ -527,6 +529,7 @@ class Profiler:
self.profiler_result = self.profiler.stop() self.profiler_result = self.profiler.stop()
if self.on_trace_ready: if self.on_trace_ready:
self.on_trace_ready(self) self.on_trace_ready(self)
utils._is_profiler_used = False
def step(self, num_samples: Optional[int]=None): def step(self, num_samples: Optional[int]=None):
r""" r"""
......
...@@ -78,15 +78,19 @@ class HostStatisticNode: ...@@ -78,15 +78,19 @@ class HostStatisticNode:
self.self_gpu_time = 0 self.self_gpu_time = 0
self.general_gpu_time = 0 # besides kernel, include time of gpu events like memcpy and memset self.general_gpu_time = 0 # besides kernel, include time of gpu events like memcpy and memset
self.self_general_gpu_time = 0 self.self_general_gpu_time = 0
self.is_terminal_operator_node = True
def cal_statistic(self): def cal_statistic(self):
for child in self.children_node: for child in self.children_node:
child.cal_statistic() child.cal_statistic()
if child.is_terminal_operator_node == False:
self.is_terminal_operator_node = False
for rt in self.runtime_node: for rt in self.runtime_node:
rt.cal_statistic() rt.cal_statistic()
self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
for child in self.children_node: for child in self.children_node:
if child.type == TracerEventType.Operator:
self.is_terminal_operator_node = False
self.gpu_time += child.gpu_time self.gpu_time += child.gpu_time
self.general_gpu_time += child.general_gpu_time self.general_gpu_time += child.general_gpu_time
self.self_cpu_time -= (child.end_ns - child.start_ns) self.self_cpu_time -= (child.end_ns - child.start_ns)
...@@ -419,10 +423,10 @@ class EventSummary: ...@@ -419,10 +423,10 @@ class EventSummary:
for runtimenode in node.runtime_node: for runtimenode in node.runtime_node:
for devicenode in runtimenode.device_node: for devicenode in runtimenode.device_node:
if devicenode.name not in self.devices: name = devicenode.name
self.devices[devicenode.name] = EventSummary.DeviceItem( if name not in self.devices:
devicenode.name) self.devices[name] = EventSummary.DeviceItem(name)
self.devices[devicenode.name].add_item(devicenode) self.devices[name].add_item(devicenode)
class GeneralItem: class GeneralItem:
def __init__(self, name): def __init__(self, name):
...@@ -489,6 +493,7 @@ class EventSummary: ...@@ -489,6 +493,7 @@ class EventSummary:
dict) # for userdefined summary dict) # for userdefined summary
self.model_perspective_items = {} # for model summary self.model_perspective_items = {} # for model summary
self.memory_manipulation_items = {} # for memory manipulation summary self.memory_manipulation_items = {} # for memory manipulation summary
self.kernel_items = {} # for kernel summary
def parse(self, nodetrees): def parse(self, nodetrees):
r""" r"""
...@@ -508,6 +513,7 @@ class EventSummary: ...@@ -508,6 +513,7 @@ class EventSummary:
self.add_memory_manipulation_item(host_statistic_node) self.add_memory_manipulation_item(host_statistic_node)
else: else:
self.add_userdefined_item(host_statistic_node) self.add_userdefined_item(host_statistic_node)
self.add_kernel_item(host_statistic_nodes[0])
for threadid, root_statistic_node in node_statistic_trees.items(): for threadid, root_statistic_node in node_statistic_trees.items():
deque = collections.deque() deque = collections.deque()
...@@ -525,11 +531,7 @@ class EventSummary: ...@@ -525,11 +531,7 @@ class EventSummary:
deque.append(child) deque.append(child)
def add_operator_item(self, operator_node): def add_operator_item(self, operator_node):
have_inner = False if operator_node.is_terminal_operator_node == False:
for child in operator_node.children_node:
if child.type == TracerEventType.OperatorInner:
have_inner = True
if have_inner == False:
return return
if operator_node.name not in self.items: if operator_node.name not in self.items:
self.items[operator_node.name] = EventSummary.OperatorItem( self.items[operator_node.name] = EventSummary.OperatorItem(
...@@ -585,6 +587,15 @@ class EventSummary: ...@@ -585,6 +587,15 @@ class EventSummary:
self.model_perspective_items[name] = EventSummary.GeneralItem(name) self.model_perspective_items[name] = EventSummary.GeneralItem(name)
self.model_perspective_items[name].add_item(model_perspective_node) self.model_perspective_items[name].add_item(model_perspective_node)
def add_kernel_item(self, root_node):
    """Aggregate every device event of type Kernel under *root_node*
    into ``self.kernel_items``, keyed by kernel name, for the
    Kernel Summary report."""
    for device_node in get_device_nodes(root_node):
        if device_node.type != TracerEventType.Kernel:
            continue
        item = self.kernel_items.setdefault(
            device_node.name, EventSummary.DeviceItem(device_node.name))
        item.add_item(device_node)
class StatisticData: class StatisticData:
r""" r"""
...@@ -752,6 +763,9 @@ def _build_table(statistic_data, ...@@ -752,6 +763,9 @@ def _build_table(statistic_data,
cpu_call_times[ cpu_call_times[
event_type] = statistic_data.event_summary.model_perspective_items[ event_type] = statistic_data.event_summary.model_perspective_items[
event_type_name].call event_type_name].call
cpu_type_time[
event_type] = statistic_data.event_summary.model_perspective_items[
event_type_name].cpu_time
gpu_time_range = collections.defaultdict(list) gpu_time_range = collections.defaultdict(list)
for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
...@@ -800,7 +814,6 @@ def _build_table(statistic_data, ...@@ -800,7 +814,6 @@ def _build_table(statistic_data,
append( append(
"Note:\nIn this table, We sum up all collected events in terms of event type.\n" "Note:\nIn this table, We sum up all collected events in terms of event type.\n"
"The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n" "The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n"
"The time with ratio 100% is the base time for calculating ratio. \n"
"Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.\n" "Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.\n"
"The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n" "The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n"
"Example:\n" "Example:\n"
...@@ -820,13 +833,18 @@ def _build_table(statistic_data, ...@@ -820,13 +833,18 @@ def _build_table(statistic_data,
all_row_values = [] all_row_values = []
accmulation_time = 0 accmulation_time = 0
gpu_accmulation_time = 0 gpu_accmulation_time = 0
gpu_total_time = 0 gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].general_gpu_time
for name in [ for name in [
'ProfileStep', 'Dataloader', 'Forward', 'Backward', 'ProfileStep', 'Dataloader', 'Forward', 'Backward',
'Optimization' 'Optimization'
]: ]:
if name in model_perspective_items: if name in model_perspective_items:
item = model_perspective_items[name] item = model_perspective_items[name]
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.general_gpu_time) / gpu_total_time
name = '{}'.format( name = '{}'.format(
name) if 'ProfileStep' in name else ' {}'.format(name) name) if 'ProfileStep' in name else ' {}'.format(name)
row_values = [ row_values = [
...@@ -850,17 +868,19 @@ def _build_table(statistic_data, ...@@ -850,17 +868,19 @@ def _build_table(statistic_data,
item.max_gpu_time, unit=time_unit), item.max_gpu_time, unit=time_unit),
format_time( format_time(
item.min_gpu_time, unit=time_unit), item.min_gpu_time, unit=time_unit),
format_ratio(float(item.gpu_time) / total_time)) format_ratio(gpu_ratio))
] ]
all_row_values.append(row_values) all_row_values.append(row_values)
if 'ProfileStep' not in name: if 'ProfileStep' not in name:
accmulation_time += item.cpu_time accmulation_time += item.cpu_time
gpu_accmulation_time += item.gpu_time gpu_accmulation_time += item.general_gpu_time
else:
gpu_total_time = item.gpu_time
other_time = total_time - accmulation_time other_time = total_time - accmulation_time
other_gpu_time = gpu_total_time - gpu_accmulation_time other_gpu_time = gpu_total_time - gpu_accmulation_time
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(other_gpu_time) / gpu_total_time
row_values = [ row_values = [
' Others', '-', '{} / - / - / - / {}'.format( ' Others', '-', '{} / - / - / - / {}'.format(
format_time( format_time(
...@@ -869,7 +889,7 @@ def _build_table(statistic_data, ...@@ -869,7 +889,7 @@ def _build_table(statistic_data,
'{} / - / - / - / {}'.format( '{} / - / - / - / {}'.format(
format_time( format_time(
other_gpu_time, unit=time_unit), other_gpu_time, unit=time_unit),
format_ratio(float(other_gpu_time) / gpu_total_time)) format_ratio(gpu_ratio))
] ]
all_row_values.append(row_values) all_row_values.append(row_values)
# Calculate the column width # Calculate the column width
...@@ -913,7 +933,6 @@ def _build_table(statistic_data, ...@@ -913,7 +933,6 @@ def _build_table(statistic_data,
append( append(
"Note:\nIn this table, GPU time is the sum of all device(GPU) events called in the phase.\n" "Note:\nIn this table, GPU time is the sum of all device(GPU) events called in the phase.\n"
"Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.\n" "Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.\n"
"The time with ratio 100% is the base time for calculating ratio. \n"
) )
append('-' * line_length) append('-' * line_length)
append('') append('')
...@@ -981,7 +1000,6 @@ def _build_table(statistic_data, ...@@ -981,7 +1000,6 @@ def _build_table(statistic_data,
"Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n" "Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n"
"Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n" "Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n"
"Overlap time: Communication time intersects with computation time.\n" "Overlap time: Communication time intersects with computation time.\n"
"The time with ratio 100% is the base time for calculating ratio. \n"
"Example:\n" "Example:\n"
"Communication:\n" "Communication:\n"
" CPU: |_________________|\n" " CPU: |_________________|\n"
...@@ -1040,8 +1058,22 @@ def _build_table(statistic_data, ...@@ -1040,8 +1058,22 @@ def _build_table(statistic_data,
elif sorted_by == SortedKeys.GPUMin: elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted( sorted_items = sorted(
items.items(), key=lambda x: x[1].min_general_gpu_time) items.items(), key=lambda x: x[1].min_general_gpu_time)
total_op_cpu_time = 0
total_op_gpu_time = 0
for name, item in sorted_items:
total_op_cpu_time += item.cpu_time
total_op_gpu_time += item.general_gpu_time
for name, item in sorted_items: for name, item in sorted_items:
if total_op_cpu_time == 0:
cpu_ratio = 0
else:
cpu_ratio = float(item.cpu_time) / total_op_cpu_time
if total_op_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.general_gpu_time) / total_op_gpu_time
row_values = [ row_values = [
name, item.call, '{} / {} / {} / {} / {}'.format( name, item.call, '{} / {} / {} / {} / {}'.format(
format_time( format_time(
...@@ -1052,7 +1084,7 @@ def _build_table(statistic_data, ...@@ -1052,7 +1084,7 @@ def _build_table(statistic_data,
item.max_cpu_time, unit=time_unit), item.max_cpu_time, unit=time_unit),
format_time( format_time(
item.min_cpu_time, unit=time_unit), item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_time)), format_ratio(cpu_ratio)),
'{} / {} / {} / {} / {}'.format( '{} / {} / {} / {} / {}'.format(
format_time( format_time(
item.general_gpu_time, unit=time_unit), item.general_gpu_time, unit=time_unit),
...@@ -1062,13 +1094,22 @@ def _build_table(statistic_data, ...@@ -1062,13 +1094,22 @@ def _build_table(statistic_data,
item.max_general_gpu_time, unit=time_unit), item.max_general_gpu_time, unit=time_unit),
format_time( format_time(
item.min_general_gpu_time, unit=time_unit), item.min_general_gpu_time, unit=time_unit),
format_ratio( format_ratio(gpu_ratio))
float(item.general_gpu_time) / total_time))
] ]
all_row_values.append(row_values) all_row_values.append(row_values)
if op_detail: if op_detail:
for innerop_name, innerop_node in item.operator_inners.items( for innerop_name, innerop_node in item.operator_inners.items(
): ):
if item.cpu_time == 0:
cpu_ratio = 0
else:
cpu_ratio = float(
innerop_node.cpu_time) / item.cpu_time
if item.general_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(innerop_node.general_gpu_time
) / item.general_gpu_time
if len(innerop_name) + 2 > name_column_width: if len(innerop_name) + 2 > name_column_width:
innerop_name = innerop_name[:name_column_width - 5] innerop_name = innerop_name[:name_column_width - 5]
innerop_name += "..." innerop_name += "..."
...@@ -1083,8 +1124,7 @@ def _build_table(statistic_data, ...@@ -1083,8 +1124,7 @@ def _build_table(statistic_data,
innerop_node.max_cpu_time, unit=time_unit), innerop_node.max_cpu_time, unit=time_unit),
format_time( format_time(
innerop_node.min_cpu_time, unit=time_unit), innerop_node.min_cpu_time, unit=time_unit),
format_ratio( format_ratio(cpu_ratio)),
float(innerop_node.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format( '{} / {} / {} / {} / {}'.format(
format_time( format_time(
innerop_node.general_gpu_time, innerop_node.general_gpu_time,
...@@ -1098,13 +1138,17 @@ def _build_table(statistic_data, ...@@ -1098,13 +1138,17 @@ def _build_table(statistic_data,
format_time( format_time(
innerop_node.min_general_gpu_time, innerop_node.min_general_gpu_time,
unit=time_unit), unit=time_unit),
format_ratio( format_ratio(gpu_ratio))
float(innerop_node.general_gpu_time) /
total_time))
] ]
all_row_values.append(row_values) all_row_values.append(row_values)
for device_node_name, device_node in innerop_node.devices.items( for device_node_name, device_node in innerop_node.devices.items(
): ):
if innerop_node.general_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(
device_node.
gpu_time) / innerop_node.general_gpu_time
if len(device_node_name) + 4 > name_column_width: if len(device_node_name) + 4 > name_column_width:
device_node_name = device_node_name[: device_node_name = device_node_name[:
name_column_width name_column_width
...@@ -1125,12 +1169,15 @@ def _build_table(statistic_data, ...@@ -1125,12 +1169,15 @@ def _build_table(statistic_data,
format_time( format_time(
device_node.min_gpu_time, device_node.min_gpu_time,
unit=time_unit), unit=time_unit),
format_ratio( format_ratio(gpu_ratio))
float(device_node.gpu_time) /
total_time))
] ]
all_row_values.append(row_values) all_row_values.append(row_values)
for device_node_name, device_node in item.devices.items(): for device_node_name, device_node in item.devices.items():
if item.general_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(
device_node.gpu_time) / item.general_gpu_time
if len(device_node_name) + 2 > name_column_width: if len(device_node_name) + 2 > name_column_width:
device_node_name = device_node_name[: device_node_name = device_node_name[:
name_column_width name_column_width
...@@ -1148,8 +1195,7 @@ def _build_table(statistic_data, ...@@ -1148,8 +1195,7 @@ def _build_table(statistic_data,
device_node.max_gpu_time, unit=time_unit), device_node.max_gpu_time, unit=time_unit),
format_time( format_time(
device_node.min_gpu_time, unit=time_unit), device_node.min_gpu_time, unit=time_unit),
format_ratio( format_ratio(gpu_ratio))
float(device_node.gpu_time) / total_time))
] ]
all_row_values.append(row_values) all_row_values.append(row_values)
# Calculate the column width # Calculate the column width
...@@ -1197,11 +1243,106 @@ def _build_table(statistic_data, ...@@ -1197,11 +1243,106 @@ def _build_table(statistic_data,
append('') append('')
append('') append('')
###### Print Kernel Summary Report ######
if statistic_data.event_summary.kernel_items:
all_row_values = []
kernel_items = statistic_data.event_summary.kernel_items
if sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(
kernel_items.items(),
key=lambda x: x[1].avg_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(
kernel_items.items(),
key=lambda x: x[1].max_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(
kernel_items.items(), key=lambda x: x[1].min_gpu_time)
else:
sorted_items = sorted(
kernel_items.items(), key=lambda x: x[1].gpu_time, reverse=True)
total_kernel_gpu_time = 0
for name, item in sorted_items:
total_kernel_gpu_time += item.gpu_time
for name, item in sorted_items:
if total_kernel_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.gpu_time) / total_kernel_gpu_time
row_values = [
name,
item.call,
'{} / {} / {} / {} / {}'.format(
format_time(
item.gpu_time, unit=time_unit),
format_time(
item.avg_gpu_time, unit=time_unit),
format_time(
item.max_gpu_time, unit=time_unit),
format_time(
item.min_gpu_time, unit=time_unit),
format_ratio(gpu_ratio)),
]
all_row_values.append(row_values)
headers = ['Name', 'Calls', 'GPU Total / Avg / Max / Min / Ratio(%)']
# Calculate the column width
name_column_width = 90
calltime_width = 6
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[2])
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
add_column(name_column_width)
add_column(calltime_width)
add_column(gpu_data_description_width)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Kernel Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
for row_values in all_row_values:
indx = row_values[0].find('(')
if indx != -1:
name = row_values[0][:indx]
else:
name = row_values[0]
if len(name) > name_column_width:
row_values[0] = name[:name_column_width - 3] + '...'
else:
row_values[0] = name
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
###### Print Memory Manipulation Summary Report ###### ###### Print Memory Manipulation Summary Report ######
if statistic_data.event_summary.memory_manipulation_items: if statistic_data.event_summary.memory_manipulation_items:
all_row_values = [] all_row_values = []
memory_manipulation_items = statistic_data.event_summary.memory_manipulation_items memory_manipulation_items = statistic_data.event_summary.memory_manipulation_items
gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].general_gpu_time
for name, item in memory_manipulation_items.items(): for name, item in memory_manipulation_items.items():
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.general_gpu_time) / gpu_total_time
row_values = [ row_values = [
name, name,
item.call, item.call,
...@@ -1224,7 +1365,7 @@ def _build_table(statistic_data, ...@@ -1224,7 +1365,7 @@ def _build_table(statistic_data,
item.max_general_gpu_time, unit=time_unit), item.max_general_gpu_time, unit=time_unit),
format_time( format_time(
item.min_general_gpu_time, unit=time_unit), item.min_general_gpu_time, unit=time_unit),
format_ratio(float(item.general_gpu_time) / total_time)), format_ratio(gpu_ratio)),
] ]
all_row_values.append(row_values) all_row_values.append(row_values)
...@@ -1274,6 +1415,8 @@ def _build_table(statistic_data, ...@@ -1274,6 +1415,8 @@ def _build_table(statistic_data,
###### Print UserDefined Summary Report ###### ###### Print UserDefined Summary Report ######
if statistic_data.event_summary.userdefined_items: if statistic_data.event_summary.userdefined_items:
all_row_values = [] all_row_values = []
gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].general_gpu_time
if thread_sep == True: if thread_sep == True:
userdefined_thread_items = statistic_data.event_summary.userdefined_thread_items userdefined_thread_items = statistic_data.event_summary.userdefined_thread_items
else: else:
...@@ -1319,6 +1462,10 @@ def _build_table(statistic_data, ...@@ -1319,6 +1462,10 @@ def _build_table(statistic_data,
items.items(), key=lambda x: x[1].min_general_gpu_time) items.items(), key=lambda x: x[1].min_general_gpu_time)
for name, item in sorted_items: for name, item in sorted_items:
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.general_gpu_time) / gpu_total_time
row_values = [ row_values = [
name, name,
item.call, item.call,
...@@ -1341,8 +1488,7 @@ def _build_table(statistic_data, ...@@ -1341,8 +1488,7 @@ def _build_table(statistic_data,
item.max_general_gpu_time, unit=time_unit), item.max_general_gpu_time, unit=time_unit),
format_time( format_time(
item.min_general_gpu_time, unit=time_unit), item.min_general_gpu_time, unit=time_unit),
format_ratio( format_ratio(gpu_ratio)),
float(item.general_gpu_time) / total_time)),
] ]
all_row_values.append(row_values) all_row_values.append(row_values)
......
...@@ -20,6 +20,8 @@ from contextlib import ContextDecorator ...@@ -20,6 +20,8 @@ from contextlib import ContextDecorator
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.core import (_RecordEvent, TracerEventType) from paddle.fluid.core import (_RecordEvent, TracerEventType)
_is_profiler_used = False
_AllowedEventTypeList = [ _AllowedEventTypeList = [
TracerEventType.Dataloader, TracerEventType.ProfileStep, TracerEventType.Dataloader, TracerEventType.ProfileStep,
TracerEventType.UserDefined, TracerEventType.Forward, TracerEventType.UserDefined, TracerEventType.Forward,
...@@ -91,6 +93,8 @@ class RecordEvent(ContextDecorator): ...@@ -91,6 +93,8 @@ class RecordEvent(ContextDecorator):
result = data1 - data2 result = data1 - data2
record_event.end() record_event.end()
""" """
if not _is_profiler_used:
return
if self.event_type not in _AllowedEventTypeList: if self.event_type not in _AllowedEventTypeList:
warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\ warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\
can be recorded.".format(*_AllowedEventTypeList)) can be recorded.".format(*_AllowedEventTypeList))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册