Unverified commit 194d16c1, authored by kuizhiqing, committed via GitHub

[Profiler] add views in summary API (#45225)

* add views in summary api

* add args in the last position
Parent commit: 1aa6adb1
@@ -15,6 +15,7 @@
 from .profiler import ProfilerState, ProfilerTarget
 from .profiler import make_scheduler, export_chrome_tracing, export_protobuf
 from .profiler import Profiler
+from .profiler import SummaryView
 from .profiler import TracerEventType
 from .utils import RecordEvent, load_profiler_result
 from .profiler_statistic import SortedKeys
@@ -22,5 +23,5 @@ from .profiler_statistic import SortedKeys
 __all__ = [
     'ProfilerState', 'ProfilerTarget', 'make_scheduler',
     'export_chrome_tracing', 'export_protobuf', 'Profiler', 'RecordEvent',
-    'load_profiler_result', 'SortedKeys'
+    'load_profiler_result', 'SortedKeys', 'SummaryView'
 ]
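With this export in place, SummaryView becomes part of paddle.profiler's public API. A minimal sketch (not part of the diff) that only assumes the import added above:

    # Illustrative sketch: list the selectable views programmatically.
    from paddle.profiler import SummaryView

    print([view.name for view in SummaryView])
    # ['DeviceView', 'OverView', 'ModelView', 'DistributedView', 'KernelView',
    #  'OperatorView', 'MemoryView', 'MemoryManipulationView', 'UDFView']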
@@ -34,6 +34,22 @@ from paddle.profiler import utils
 from .timer import benchmark
 
+
+class SummaryView(Enum):
+    r"""
+    SummaryView define the summary view of different contents.
+    """
+    DeviceView = 0
+    OverView = 1
+    ModelView = 2
+    DistributedView = 3
+    KernelView = 4
+    OperatorView = 5
+    MemoryView = 6
+    MemoryManipulationView = 7
+    UDFView = 8
+
+
 class ProfilerState(Enum):
     r"""
     ProfilerState is used to present the state of :ref:`Profiler <api_paddle_profiler_Profiler>` .
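Because SummaryView is an ordinary Enum, a views list can also be assembled from member names. The helper below is hypothetical and does not appear in this diff; it only illustrates the by-name lookup:

    # Hypothetical helper (not part of the diff): resolve SummaryView members by name.
    def views_from_names(names):
        return [SummaryView[name] for name in names]

    # views_from_names(['KernelView', 'MemoryView'])
    #   -> [SummaryView.KernelView, SummaryView.MemoryView]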
@@ -734,7 +750,8 @@ class Profiler:
                 sorted_by=SortedKeys.CPUTotal,
                 op_detail=True,
                 thread_sep=False,
-                time_unit='ms'):
+                time_unit='ms',
+                views=None):
         r"""
         Print the Summary table. Currently support overview, model, distributed, operator, memory manipulation and userdefined summary.
@@ -743,6 +760,7 @@ class Profiler:
             op_detail(bool, optional): expand each operator detail information, default value is True.
             thread_sep(bool, optional): print op table each thread, default value is False.
             time_unit(str, optional): time unit for display, can be chosen form ['s', 'ms', 'us', 'ns'], default value is 'ms'.
+            views(list[SummaryView], optional): summary tables to print, default to None means all views to be printed.
 
         Examples:
             .. code-block:: python
@@ -770,7 +788,8 @@ class Profiler:
                              sorted_by=sorted_by,
                              op_detail=op_detail,
                              thread_sep=thread_sep,
-                             time_unit=time_unit))
+                             time_unit=time_unit,
+                             views=views))
 
 
 def get_profiler(config_path):
...
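A hedged usage sketch of the extended API: the profiler setup below follows the standard paddle.profiler flow, and only the views argument is new in this commit.

    import paddle.profiler as profiler

    # Sketch only: profile steps 2-5 of a short loop; the per-step workload is omitted.
    prof = profiler.Profiler(targets=[profiler.ProfilerTarget.CPU],
                             scheduler=(2, 5))
    prof.start()
    for step in range(7):
        # ... run one training or inference step here ...
        prof.step()
    prof.stop()

    # views=None (the default) keeps the old behaviour and prints every table;
    # passing a list restricts the output to the selected summary tables.
    prof.summary(sorted_by=profiler.SortedKeys.CPUTotal,
                 time_unit='ms',
                 views=[profiler.SummaryView.OperatorView,
                        profiler.SummaryView.KernelView])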
@@ -700,7 +700,10 @@ def _build_table(statistic_data,
                  thread_sep=False,
                  time_unit='ms',
                  row_limit=100,
-                 max_src_column_width=75):
+                 max_src_column_width=75,
+                 views=None):
+    from .profiler import SummaryView
+
     """Prints a summary of events."""
     # format table row
     SPACING_SIZE = 2
@@ -749,277 +752,62 @@ def _build_table(statistic_data,
     total_time = statistic_data.time_range_summary.get_cpu_range_sum(
         TracerEventType.ProfileStep)
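Every table-printing block in the body below is wrapped in the same guard. The helper here is hypothetical (it is not introduced by the diff) and only restates that gating rule:

    # Hypothetical helper, for illustration only: a table is emitted when no filter is
    # requested (views is None) or when its SummaryView member is in the requested list.
    def _should_print(view, views):
        return views is None or view in views

    # _should_print(SummaryView.DeviceView, None)                      -> True
    # _should_print(SummaryView.DeviceView, [SummaryView.KernelView])  -> False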
###### Print Device Summary ######
headers = ['Device', 'Utilization (%)']
name_column_width = 30
DEFAULT_COLUMN_WIDTH = 20
add_column(name_column_width)
for _ in headers[1:]:
add_column(DEFAULT_COLUMN_WIDTH)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Device Summary"))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
row_values = [
'CPU(Process)',
format_ratio(float(
statistic_data.extra_info['Process Cpu Utilization']))
]
append(row_format.format(*row_values))
row_values = [
'CPU(System)',
format_ratio(float(statistic_data.extra_info['System Cpu Utilization']))
]
append(row_format.format(*row_values))
for gpu_name in statistic_data.time_range_summary.get_gpu_devices():
gpu_time = float(
statistic_data.time_range_summary.get_gpu_range_sum(
gpu_name, TracerEventType.Kernel))
utilization = gpu_time / total_time
row_values = ['GPU{}'.format(gpu_name), format_ratio(utilization)]
append(row_format.format(*row_values))
append(header_sep)
append(
"Note:\nCPU(Process) Utilization = Current process CPU time over all cpu cores / elapsed time, so max utilization can be reached 100% * number of cpu cores.\n"
"CPU(System) Utilization = All processes CPU time over all cpu cores(busy time) / (busy time + idle time).\n"
"GPU Utilization = Current process GPU time / elapsed time.")
append('-' * line_length)
append('')
append('')
if total_time == 0:
return ''.join(result)
###### Print Overview Summary ###### if views is None or SummaryView.DeviceView in views:
headers = ['Event Type', 'Calls', 'CPU Time', 'Ratio (%)']
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
DEFAULT_COLUMN_WIDTH = 25 ###### Print Device Summary ######
for _ in headers: headers = ['Device', 'Utilization (%)']
add_column(DEFAULT_COLUMN_WIDTH) name_column_width = 30
DEFAULT_COLUMN_WIDTH = 20
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Overview Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
cpu_type_time = collections.defaultdict(int)
gpu_type_time = collections.defaultdict(int)
cpu_call_times = collections.defaultdict(int)
gpu_call_times = collections.defaultdict(int)
cpu_call_times.update(statistic_data.time_range_summary.call_times)
gpu_call_times.update(statistic_data.time_range_summary.call_times)
for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items(
):
if event_type != TracerEventType.Communication:
cpu_type_time[event_type] = value
if statistic_data.distributed_summary.cpu_communication_range:
cpu_type_time[TracerEventType.Communication] = sum_ranges(
statistic_data.distributed_summary.cpu_communication_range)
cpu_call_times[
TracerEventType.
Communication] = statistic_data.distributed_summary.cpu_calls
for event_type in [
TracerEventType.Dataloader, TracerEventType.Forward,
TracerEventType.Backward, TracerEventType.Optimization
]:
event_type_name = str(event_type).split('.')[1]
if event_type in cpu_call_times and event_type_name in statistic_data.event_summary.model_perspective_items:
cpu_call_times[
event_type] = statistic_data.event_summary.model_perspective_items[
event_type_name].call
cpu_type_time[
event_type] = statistic_data.event_summary.model_perspective_items[
event_type_name].cpu_time
gpu_time_range = collections.defaultdict(list)
for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
):
for event_type, time_range in device_time_ranges.items():
gpu_time_range[event_type] = merge_ranges(
gpu_time_range[event_type], time_range, is_sorted=True)
for event_type, time_range in gpu_time_range.items():
gpu_type_time[event_type] = sum_ranges(time_range)
if statistic_data.distributed_summary.gpu_communication_range:
gpu_type_time[TracerEventType.Communication] = sum_ranges(
statistic_data.distributed_summary.gpu_communication_range)
gpu_call_times[
TracerEventType.
Communication] = statistic_data.distributed_summary.gpu_calls
sorted_items = sorted(cpu_type_time.items(),
key=lambda x: x[1],
reverse=True)
event_type, time = sorted_items[0]
row_values = [
'{}'.format(str(event_type).split('.')[1]), cpu_call_times[event_type],
format_time(time, unit=time_unit),
format_ratio(float(time) / total_time)
]
append(row_format.format(*row_values))
for event_type, time in sorted_items[1:]:
row_values = [
' {}'.format(str(event_type).split('.')[1]),
cpu_call_times[event_type],
format_time(time, unit=time_unit),
format_ratio(float(time) / total_time)
]
append(row_format.format(*row_values))
append(header_sep)
headers = ['', 'Calls', 'GPU Time', 'Ratio (%)']
append(row_format.format(*headers))
append(header_sep)
for event_type, time in gpu_type_time.items():
row_values = [
' {}'.format(str(event_type).split('.')[1]),
gpu_call_times[event_type],
format_time(time, unit=time_unit),
format_ratio(float(time) / total_time)
]
append(row_format.format(*row_values))
append(header_sep)
append(
"Note:\nIn this table, We sum up all collected events in terms of event type.\n"
"The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n"
"Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.\n"
"The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n"
"Example:\n"
"Thread 1:\n"
" Operator: |___________| |__________|\n"
"Thread 2:\n"
" Operator: |____________| |___|\n"
"After merged:\n"
" Result: |______________| |__________|\n")
append('-' * line_length)
append('')
append('')
###### Print Model Summary Report ######
model_perspective_items = statistic_data.event_summary.model_perspective_items
if len(model_perspective_items) > 1:
all_row_values = []
accmulation_time = 0
gpu_accmulation_time = 0
gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].gpu_time
for name in [
'ProfileStep', 'Dataloader', 'Forward', 'Backward',
'Optimization'
]:
if name in model_perspective_items:
item = model_perspective_items[name]
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.gpu_time) / gpu_total_time
name = '{}'.format(
name) if 'ProfileStep' in name else ' {}'.format(name)
row_values = [
'{}'.format(name), item.call,
'{} / {} / {} / {} / {}'.format(
format_time(item.cpu_time, unit=time_unit),
format_time(item.avg_cpu_time, unit=time_unit),
format_time(item.max_cpu_time, unit=time_unit),
format_time(item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format(
format_time(item.gpu_time, unit=time_unit),
format_time(item.avg_gpu_time, unit=time_unit),
format_time(item.max_gpu_time, unit=time_unit),
format_time(item.min_gpu_time, unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
if 'ProfileStep' not in name:
accmulation_time += item.cpu_time
gpu_accmulation_time += item.gpu_time
other_time = total_time - accmulation_time
other_gpu_time = gpu_total_time - gpu_accmulation_time
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(other_gpu_time) / gpu_total_time
row_values = [
' Others', '-', '{} / - / - / - / {}'.format(
format_time(other_time, unit=time_unit),
format_ratio(float(other_time) / total_time)),
'{} / - / - / - / {}'.format(
format_time(other_gpu_time, unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
# Calculate the column width
calltime_width = 6
cpu_data_description_width = 40
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > cpu_data_description_width:
cpu_data_description_width = len(row_values[2])
if len(row_values[3]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[3])
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 15
add_column(name_column_width) add_column(name_column_width)
add_column(calltime_width) for _ in headers[1:]:
add_column(cpu_data_description_width) add_column(DEFAULT_COLUMN_WIDTH)
add_column(gpu_data_description_width)
row_format = row_format_list[0] row_format = row_format_list[0]
header_sep = header_sep_list[0] header_sep = header_sep_list[0]
line_length = line_length_list[0] line_length = line_length_list[0]
# construct table string # construct table string
append(add_title(line_length, "Model Summary"))
append('Time unit: {}'.format(time_unit)) append(add_title(line_length, "Device Summary"))
append(header_sep) append(header_sep)
append(row_format.format(*headers)) append(row_format.format(*headers))
append(header_sep) append(header_sep)
for row_values in all_row_values: row_values = [
'CPU(Process)',
format_ratio(
float(statistic_data.extra_info['Process Cpu Utilization']))
]
append(row_format.format(*row_values))
row_values = [
'CPU(System)',
format_ratio(
float(statistic_data.extra_info['System Cpu Utilization']))
]
append(row_format.format(*row_values))
for gpu_name in statistic_data.time_range_summary.get_gpu_devices():
gpu_time = float(
statistic_data.time_range_summary.get_gpu_range_sum(
gpu_name, TracerEventType.Kernel))
utilization = gpu_time / total_time
row_values = ['GPU{}'.format(gpu_name), format_ratio(utilization)]
append(row_format.format(*row_values)) append(row_format.format(*row_values))
append(header_sep) append(header_sep)
append( append(
"Note:\nIn this table, GPU time is the sum of all device(GPU) events called in the phase.\n" "Note:\nCPU(Process) Utilization = Current process CPU time over all cpu cores / elapsed time, so max utilization can be reached 100% * number of cpu cores.\n"
"Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.\n" "CPU(System) Utilization = All processes CPU time over all cpu cores(busy time) / (busy time + idle time).\n"
) "GPU Utilization = Current process GPU time / elapsed time.")
append('-' * line_length) append('-' * line_length)
append('') append('')
append('') append('')
###### Print Distribution Summary Report ###### if total_time == 0:
if statistic_data.distributed_summary.communication_range: return ''.join(result)
headers = [
'Name', if views is None or SummaryView.OverView in views:
'Total Time', ###### Print Overview Summary ######
'Ratio (%)', headers = ['Event Type', 'Calls', 'CPU Time', 'Ratio (%)']
]
row_format_list = [""] row_format_list = [""]
header_sep_list = [""] header_sep_list = [""]
line_length_list = [-SPACING_SIZE] line_length_list = [-SPACING_SIZE]
@@ -1033,188 +821,453 @@ def _build_table(statistic_data,
        line_length = line_length_list[0]
        # construct table string
append(add_title(line_length, "Distribution Summary")) append(add_title(line_length, "Overview Summary"))
append('Time unit: {}'.format(time_unit)) append('Time unit: {}'.format(time_unit))
append(header_sep) append(header_sep)
append(row_format.format(*headers)) append(row_format.format(*headers))
append(header_sep) append(header_sep)
communication_time = sum_ranges( cpu_type_time = collections.defaultdict(int)
statistic_data.distributed_summary.communication_range) gpu_type_time = collections.defaultdict(int)
computation_time = sum_ranges( cpu_call_times = collections.defaultdict(int)
statistic_data.distributed_summary.computation_range) gpu_call_times = collections.defaultdict(int)
overlap_time = sum_ranges( cpu_call_times.update(statistic_data.time_range_summary.call_times)
statistic_data.distributed_summary.overlap_range) gpu_call_times.update(statistic_data.time_range_summary.call_times)
row_values = [
'ProfileStep', for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items(
format_time(total_time, unit=time_unit), ):
format_ratio(float(total_time) / total_time) if event_type != TracerEventType.Communication:
] cpu_type_time[event_type] = value
append(row_format.format(*row_values)) if statistic_data.distributed_summary.cpu_communication_range:
row_values = [ cpu_type_time[TracerEventType.Communication] = sum_ranges(
' Communication', statistic_data.distributed_summary.cpu_communication_range)
format_time(communication_time, unit=time_unit), cpu_call_times[
format_ratio(float(communication_time) / total_time) TracerEventType.
] Communication] = statistic_data.distributed_summary.cpu_calls
append(row_format.format(*row_values))
for event_type in [
TracerEventType.Dataloader, TracerEventType.Forward,
TracerEventType.Backward, TracerEventType.Optimization
]:
event_type_name = str(event_type).split('.')[1]
if event_type in cpu_call_times and event_type_name in statistic_data.event_summary.model_perspective_items:
cpu_call_times[
event_type] = statistic_data.event_summary.model_perspective_items[
event_type_name].call
cpu_type_time[
event_type] = statistic_data.event_summary.model_perspective_items[
event_type_name].cpu_time
gpu_time_range = collections.defaultdict(list)
for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
):
for event_type, time_range in device_time_ranges.items():
gpu_time_range[event_type] = merge_ranges(
gpu_time_range[event_type], time_range, is_sorted=True)
for event_type, time_range in gpu_time_range.items():
gpu_type_time[event_type] = sum_ranges(time_range)
if statistic_data.distributed_summary.gpu_communication_range:
gpu_type_time[TracerEventType.Communication] = sum_ranges(
statistic_data.distributed_summary.gpu_communication_range)
gpu_call_times[
TracerEventType.
Communication] = statistic_data.distributed_summary.gpu_calls
sorted_items = sorted(cpu_type_time.items(),
key=lambda x: x[1],
reverse=True)
event_type, time = sorted_items[0]
row_values = [ row_values = [
' Computation', '{}'.format(str(event_type).split('.')[1]),
format_time(computation_time, unit=time_unit), cpu_call_times[event_type],
format_ratio(float(computation_time) / total_time) format_time(time, unit=time_unit),
format_ratio(float(time) / total_time)
] ]
append(row_format.format(*row_values)) append(row_format.format(*row_values))
for event_type, time in sorted_items[1:]:
row_values = [
' {}'.format(str(event_type).split('.')[1]),
cpu_call_times[event_type],
format_time(time, unit=time_unit),
format_ratio(float(time) / total_time)
]
append(row_format.format(*row_values))
append(header_sep)
headers = ['', 'Calls', 'GPU Time', 'Ratio (%)']
append(row_format.format(*headers))
append(header_sep)
for event_type, time in gpu_type_time.items():
row_values = [
' {}'.format(str(event_type).split('.')[1]),
gpu_call_times[event_type],
format_time(time, unit=time_unit),
format_ratio(float(time) / total_time)
]
append(row_format.format(*row_values))
row_values = [
' Overlap',
format_time(overlap_time, unit=time_unit),
format_ratio(float(overlap_time) / total_time)
]
append(row_format.format(*row_values))
append(header_sep) append(header_sep)
append( append(
"Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n" "Note:\nIn this table, We sum up all collected events in terms of event type.\n"
"Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n" "The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n"
"Overlap time: Communication time intersects with computation time.\n" "Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.\n"
"The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n"
"Example:\n" "Example:\n"
"Communication:\n" "Thread 1:\n"
" CPU: |_________________|\n" " Operator: |___________| |__________|\n"
" GPU: |______________|\n" "Thread 2:\n"
" Total: |_________________| |______________|\n" " Operator: |____________| |___|\n"
"Computation time(Kernel):\n" "After merged:\n"
" GPU: |________________|\n" " Result: |______________| |__________|\n")
"Overlap time: |___________|\n")
append('-' * line_length) append('-' * line_length)
append('') append('')
append('') append('')
###### Print Operator Summary Report ###### if views is None or SummaryView.ModelView in views:
if statistic_data.event_summary.items:
all_row_values = []
name_column_width = 52
if thread_sep == True:
thread_items = statistic_data.event_summary.thread_items
else:
thread_items = {
'All threads merged': statistic_data.event_summary.items
}
for thread_id, items in thread_items.items():
all_row_values.append("Thread: {}".format(thread_id))
if sorted_by == SortedKeys.CPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUAvg:
sorted_items = sorted(items.items(),
key=lambda x: x[1].avg_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMax:
sorted_items = sorted(items.items(),
key=lambda x: x[1].max_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMin:
sorted_items = sorted(items.items(),
key=lambda x: x[1].min_cpu_time)
elif sorted_by == SortedKeys.GPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(items.items(),
key=lambda x: x[1].avg_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(items.items(),
key=lambda x: x[1].max_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(items.items(),
key=lambda x: x[1].min_general_gpu_time)
total_op_cpu_time = 0
total_op_gpu_time = 0
for name, item in sorted_items: ###### Print Model Summary Report ######
total_op_cpu_time += item.cpu_time model_perspective_items = statistic_data.event_summary.model_perspective_items
total_op_gpu_time += item.general_gpu_time if len(model_perspective_items) > 1:
all_row_values = []
accmulation_time = 0
gpu_accmulation_time = 0
gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].gpu_time
for name in [
'ProfileStep', 'Dataloader', 'Forward', 'Backward',
'Optimization'
]:
if name in model_perspective_items:
item = model_perspective_items[name]
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.gpu_time) / gpu_total_time
name = '{}'.format(
name) if 'ProfileStep' in name else ' {}'.format(name)
row_values = [
'{}'.format(name), item.call,
'{} / {} / {} / {} / {}'.format(
format_time(item.cpu_time, unit=time_unit),
format_time(item.avg_cpu_time, unit=time_unit),
format_time(item.max_cpu_time, unit=time_unit),
format_time(item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format(
format_time(item.gpu_time, unit=time_unit),
format_time(item.avg_gpu_time, unit=time_unit),
format_time(item.max_gpu_time, unit=time_unit),
format_time(item.min_gpu_time, unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
if 'ProfileStep' not in name:
accmulation_time += item.cpu_time
gpu_accmulation_time += item.gpu_time
other_time = total_time - accmulation_time
other_gpu_time = gpu_total_time - gpu_accmulation_time
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(other_gpu_time) / gpu_total_time
row_values = [
' Others', '-', '{} / - / - / - / {}'.format(
format_time(other_time, unit=time_unit),
format_ratio(float(other_time) / total_time)),
'{} / - / - / - / {}'.format(
format_time(other_gpu_time, unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
# Calculate the column width
calltime_width = 6
cpu_data_description_width = 40
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > cpu_data_description_width:
cpu_data_description_width = len(row_values[2])
if len(row_values[3]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[3])
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 15
add_column(name_column_width)
add_column(calltime_width)
add_column(cpu_data_description_width)
add_column(gpu_data_description_width)
for name, item in sorted_items: row_format = row_format_list[0]
if total_op_cpu_time == 0: header_sep = header_sep_list[0]
cpu_ratio = 0 line_length = line_length_list[0]
else:
cpu_ratio = float(item.cpu_time) / total_op_cpu_time # construct table string
if total_op_gpu_time == 0: append(add_title(line_length, "Model Summary"))
gpu_ratio = 0 append('Time unit: {}'.format(time_unit))
else: append(header_sep)
gpu_ratio = float(item.general_gpu_time) / total_op_gpu_time append(row_format.format(*headers))
row_values = [ append(header_sep)
name, item.call, '{} / {} / {} / {} / {}'.format( for row_values in all_row_values:
format_time(item.cpu_time, unit=time_unit), append(row_format.format(*row_values))
format_time(item.avg_cpu_time, unit=time_unit), append(header_sep)
format_time(item.max_cpu_time, unit=time_unit), append(
format_time(item.min_cpu_time, unit=time_unit), "Note:\nIn this table, GPU time is the sum of all device(GPU) events called in the phase.\n"
format_ratio(cpu_ratio)), "Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.\n"
'{} / {} / {} / {} / {}'.format( )
format_time(item.general_gpu_time, unit=time_unit), append('-' * line_length)
format_time(item.avg_general_gpu_time, unit=time_unit), append('')
format_time(item.max_general_gpu_time, unit=time_unit), append('')
format_time(item.min_general_gpu_time, unit=time_unit),
format_ratio(gpu_ratio)) if views is None or SummaryView.DistributedView in views:
]
all_row_values.append(row_values) ###### Print Distribution Summary Report ######
if op_detail: if statistic_data.distributed_summary.communication_range:
for innerop_name, innerop_node in item.operator_inners.items( headers = [
): 'Name',
if item.cpu_time == 0: 'Total Time',
cpu_ratio = 0 'Ratio (%)',
else: ]
cpu_ratio = float( row_format_list = [""]
innerop_node.cpu_time) / item.cpu_time header_sep_list = [""]
if item.general_gpu_time == 0: line_length_list = [-SPACING_SIZE]
gpu_ratio = 0
else: DEFAULT_COLUMN_WIDTH = 25
gpu_ratio = float(innerop_node.general_gpu_time for _ in headers:
) / item.general_gpu_time add_column(DEFAULT_COLUMN_WIDTH)
if len(innerop_name) + 2 > name_column_width:
innerop_name = innerop_name[:name_column_width - 5] row_format = row_format_list[0]
innerop_name += "..." header_sep = header_sep_list[0]
row_values = [ line_length = line_length_list[0]
' {}'.format(innerop_name), innerop_node.call,
'{} / {} / {} / {} / {}'.format( # construct table string
format_time(innerop_node.cpu_time, append(add_title(line_length, "Distribution Summary"))
unit=time_unit), append('Time unit: {}'.format(time_unit))
format_time(innerop_node.avg_cpu_time, append(header_sep)
unit=time_unit), append(row_format.format(*headers))
format_time(innerop_node.max_cpu_time, append(header_sep)
unit=time_unit), communication_time = sum_ranges(
format_time(innerop_node.min_cpu_time, statistic_data.distributed_summary.communication_range)
unit=time_unit), computation_time = sum_ranges(
format_ratio(cpu_ratio)), statistic_data.distributed_summary.computation_range)
'{} / {} / {} / {} / {}'.format( overlap_time = sum_ranges(
format_time(innerop_node.general_gpu_time, statistic_data.distributed_summary.overlap_range)
unit=time_unit), row_values = [
format_time(innerop_node.avg_general_gpu_time, 'ProfileStep',
unit=time_unit), format_time(total_time, unit=time_unit),
format_time(innerop_node.max_general_gpu_time, format_ratio(float(total_time) / total_time)
unit=time_unit), ]
format_time(innerop_node.min_general_gpu_time, append(row_format.format(*row_values))
unit=time_unit), row_values = [
format_ratio(gpu_ratio)) ' Communication',
] format_time(communication_time, unit=time_unit),
all_row_values.append(row_values) format_ratio(float(communication_time) / total_time)
for device_node_name, device_node in innerop_node.devices.items( ]
append(row_format.format(*row_values))
row_values = [
' Computation',
format_time(computation_time, unit=time_unit),
format_ratio(float(computation_time) / total_time)
]
append(row_format.format(*row_values))
row_values = [
' Overlap',
format_time(overlap_time, unit=time_unit),
format_ratio(float(overlap_time) / total_time)
]
append(row_format.format(*row_values))
append(header_sep)
append(
"Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n"
"Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n"
"Overlap time: Communication time intersects with computation time.\n"
"Example:\n"
"Communication:\n"
" CPU: |_________________|\n"
" GPU: |______________|\n"
" Total: |_________________| |______________|\n"
"Computation time(Kernel):\n"
" GPU: |________________|\n"
"Overlap time: |___________|\n")
append('-' * line_length)
append('')
append('')
if views is None or SummaryView.OperatorView in views:
###### Print Operator Summary Report ######
if statistic_data.event_summary.items:
all_row_values = []
name_column_width = 52
if thread_sep == True:
thread_items = statistic_data.event_summary.thread_items
else:
thread_items = {
'All threads merged': statistic_data.event_summary.items
}
for thread_id, items in thread_items.items():
all_row_values.append("Thread: {}".format(thread_id))
if sorted_by == SortedKeys.CPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUAvg:
sorted_items = sorted(items.items(),
key=lambda x: x[1].avg_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMax:
sorted_items = sorted(items.items(),
key=lambda x: x[1].max_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMin:
sorted_items = sorted(items.items(),
key=lambda x: x[1].min_cpu_time)
elif sorted_by == SortedKeys.GPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].avg_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].max_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(
items.items(), key=lambda x: x[1].min_general_gpu_time)
total_op_cpu_time = 0
total_op_gpu_time = 0
for name, item in sorted_items:
total_op_cpu_time += item.cpu_time
total_op_gpu_time += item.general_gpu_time
for name, item in sorted_items:
if total_op_cpu_time == 0:
cpu_ratio = 0
else:
cpu_ratio = float(item.cpu_time) / total_op_cpu_time
if total_op_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(
item.general_gpu_time) / total_op_gpu_time
row_values = [
name, item.call, '{} / {} / {} / {} / {}'.format(
format_time(item.cpu_time, unit=time_unit),
format_time(item.avg_cpu_time, unit=time_unit),
format_time(item.max_cpu_time, unit=time_unit),
format_time(item.min_cpu_time, unit=time_unit),
format_ratio(cpu_ratio)),
'{} / {} / {} / {} / {}'.format(
format_time(item.general_gpu_time, unit=time_unit),
format_time(item.avg_general_gpu_time,
unit=time_unit),
format_time(item.max_general_gpu_time,
unit=time_unit),
format_time(item.min_general_gpu_time,
unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
if op_detail:
for innerop_name, innerop_node in item.operator_inners.items(
):
if item.cpu_time == 0:
cpu_ratio = 0
else:
cpu_ratio = float(
innerop_node.cpu_time) / item.cpu_time
if item.general_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(innerop_node.general_gpu_time
) / item.general_gpu_time
if len(innerop_name) + 2 > name_column_width:
innerop_name = innerop_name[:name_column_width -
5]
innerop_name += "..."
row_values = [
' {}'.format(innerop_name), innerop_node.call,
'{} / {} / {} / {} / {}'.format(
format_time(innerop_node.cpu_time,
unit=time_unit),
format_time(innerop_node.avg_cpu_time,
unit=time_unit),
format_time(innerop_node.max_cpu_time,
unit=time_unit),
format_time(innerop_node.min_cpu_time,
unit=time_unit),
format_ratio(cpu_ratio)),
'{} / {} / {} / {} / {}'.format(
format_time(innerop_node.general_gpu_time,
unit=time_unit),
format_time(
innerop_node.avg_general_gpu_time,
unit=time_unit),
format_time(
innerop_node.max_general_gpu_time,
unit=time_unit),
format_time(
innerop_node.min_general_gpu_time,
unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
for device_node_name, device_node in innerop_node.devices.items(
):
if innerop_node.general_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(
device_node.gpu_time
) / innerop_node.general_gpu_time
if len(device_node_name
) + 4 > name_column_width:
device_node_name = device_node_name[:
name_column_width
- 7]
device_node_name += "..."
row_values = [
' {}'.format(device_node_name),
device_node.call, '- / - / - / - / -',
'{} / {} / {} / {} / {}'.format(
format_time(device_node.gpu_time,
unit=time_unit),
format_time(device_node.avg_gpu_time,
unit=time_unit),
format_time(device_node.max_gpu_time,
unit=time_unit),
format_time(device_node.min_gpu_time,
unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
for device_node_name, device_node in item.devices.items(
): ):
if innerop_node.general_gpu_time == 0: if item.general_gpu_time == 0:
gpu_ratio = 0 gpu_ratio = 0
else: else:
gpu_ratio = float( gpu_ratio = float(device_node.gpu_time
device_node.gpu_time ) / item.general_gpu_time
) / innerop_node.general_gpu_time if len(device_node_name) + 2 > name_column_width:
if len(device_node_name) + 4 > name_column_width:
device_node_name = device_node_name[: device_node_name = device_node_name[:
name_column_width name_column_width
- 7] - 5]
device_node_name += "..." device_node_name += "..."
row_values = [ row_values = [
' {}'.format(device_node_name), ' {}'.format(device_node_name),
device_node.call, '- / - / - / - / -', device_node.call, '- / - / - / - / -',
'{} / {} / {} / {} / {}'.format( '{} / {} / {} / {} / {}'.format(
format_time(device_node.gpu_time, format_time(device_node.gpu_time,
@@ -1228,280 +1281,149 @@ def _build_table(statistic_data,
                                format_ratio(gpu_ratio))
                        ]
                        all_row_values.append(row_values)
for device_node_name, device_node in item.devices.items(): # Calculate the column width
if item.general_gpu_time == 0: calltime_width = 6
gpu_ratio = 0 cpu_data_description_width = 40
else: gpu_data_description_width = 40
gpu_ratio = float( for row_values in all_row_values:
device_node.gpu_time) / item.general_gpu_time if isinstance(row_values, str):
if len(device_node_name) + 2 > name_column_width: continue
device_node_name = device_node_name[: if isinstance(row_values[1],
name_column_width int) and len(str(row_values[1])) > calltime_width:
- 5] calltime_width = len(str(row_values[1]))
device_node_name += "..." if len(row_values[2]) > cpu_data_description_width:
row_values = [ cpu_data_description_width = len(row_values[2])
' {}'.format(device_node_name), device_node.call, if len(row_values[3]) > gpu_data_description_width:
'- / - / - / - / -', gpu_data_description_width = len(row_values[3])
'{} / {} / {} / {} / {}'.format( headers = [
format_time(device_node.gpu_time, 'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
unit=time_unit), 'GPU Total / Avg / Max / Min / Ratio(%)'
format_time(device_node.avg_gpu_time,
unit=time_unit),
format_time(device_node.max_gpu_time,
unit=time_unit),
format_time(device_node.min_gpu_time,
unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
# Calculate the column width
calltime_width = 6
cpu_data_description_width = 40
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values, str):
continue
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > cpu_data_description_width:
cpu_data_description_width = len(row_values[2])
if len(row_values[3]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[3])
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
add_column(name_column_width)
add_column(calltime_width)
add_column(cpu_data_description_width)
add_column(gpu_data_description_width)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Operator Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
for row_values in all_row_values:
if isinstance(row_values, str):
append(add_title(line_length, row_values))
else:
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
###### Print Kernel Summary Report ######
if statistic_data.event_summary.kernel_items:
all_row_values = []
kernel_items = statistic_data.event_summary.kernel_items
if sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(kernel_items.items(),
key=lambda x: x[1].avg_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(kernel_items.items(),
key=lambda x: x[1].max_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(kernel_items.items(),
key=lambda x: x[1].min_gpu_time)
else:
sorted_items = sorted(kernel_items.items(),
key=lambda x: x[1].gpu_time,
reverse=True)
total_kernel_gpu_time = 0
for name, item in sorted_items:
total_kernel_gpu_time += item.gpu_time
for name, item in sorted_items:
if total_kernel_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.gpu_time) / total_kernel_gpu_time
row_values = [
name,
item.call,
'{} / {} / {} / {} / {}'.format(
format_time(item.gpu_time, unit=time_unit),
format_time(item.avg_gpu_time, unit=time_unit),
format_time(item.max_gpu_time, unit=time_unit),
format_time(item.min_gpu_time, unit=time_unit),
format_ratio(gpu_ratio)),
] ]
all_row_values.append(row_values) row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
add_column(name_column_width)
add_column(calltime_width)
add_column(cpu_data_description_width)
add_column(gpu_data_description_width)
headers = ['Name', 'Calls', 'GPU Total / Avg / Max / Min / Ratio(%)'] row_format = row_format_list[0]
# Calculate the column width header_sep = header_sep_list[0]
name_column_width = 90 line_length = line_length_list[0]
calltime_width = 6
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[2])
row_format_list = [""] # construct table string
header_sep_list = [""] append(add_title(line_length, "Operator Summary"))
line_length_list = [-SPACING_SIZE] append('Time unit: {}'.format(time_unit))
add_column(name_column_width) append(header_sep)
add_column(calltime_width) append(row_format.format(*headers))
add_column(gpu_data_description_width) append(header_sep)
for row_values in all_row_values:
if isinstance(row_values, str):
append(add_title(line_length, row_values))
else:
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
row_format = row_format_list[0] if views is None or SummaryView.KernelView in views:
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string ###### Print Kernel Summary Report ######
append(add_title(line_length, "Kernel Summary")) if statistic_data.event_summary.kernel_items:
append('Time unit: {}'.format(time_unit)) all_row_values = []
append(header_sep) kernel_items = statistic_data.event_summary.kernel_items
append(row_format.format(*headers)) if sorted_by == SortedKeys.GPUAvg:
append(header_sep) sorted_items = sorted(kernel_items.items(),
kernel_name_pattern = re.compile('(.+?)(<.*>)(\(.*\))') key=lambda x: x[1].avg_gpu_time,
for row_values in all_row_values: reverse=True)
match = kernel_name_pattern.match(row_values[0]) elif sorted_by == SortedKeys.GPUMax:
if match: sorted_items = sorted(kernel_items.items(),
name = match.group(1) + match.group(2) key=lambda x: x[1].max_gpu_time,
else: reverse=True)
name = row_values[0] elif sorted_by == SortedKeys.GPUMin:
if len(name) > name_column_width: sorted_items = sorted(kernel_items.items(),
row_values[0] = name[:name_column_width - 3] + '...' key=lambda x: x[1].min_gpu_time)
else: else:
row_values[0] = name sorted_items = sorted(kernel_items.items(),
append(row_format.format(*row_values)) key=lambda x: x[1].gpu_time,
append(header_sep) reverse=True)
append('')
append('')
###### Print Memory Manipulation Summary Report ###### total_kernel_gpu_time = 0
if statistic_data.event_summary.memory_manipulation_items: for name, item in sorted_items:
all_row_values = [] total_kernel_gpu_time += item.gpu_time
memory_manipulation_items = statistic_data.event_summary.memory_manipulation_items for name, item in sorted_items:
gpu_total_time = statistic_data.event_summary.model_perspective_items[ if total_kernel_gpu_time == 0:
'ProfileStep'].general_gpu_time gpu_ratio = 0
for name, item in memory_manipulation_items.items(): else:
if gpu_total_time == 0: gpu_ratio = float(item.gpu_time) / total_kernel_gpu_time
gpu_ratio = 0 row_values = [
else: name,
gpu_ratio = float(item.general_gpu_time) / gpu_total_time item.call,
row_values = [ '{} / {} / {} / {} / {}'.format(
name, format_time(item.gpu_time, unit=time_unit),
item.call, format_time(item.avg_gpu_time, unit=time_unit),
'{} / {} / {} / {} / {}'.format( format_time(item.max_gpu_time, unit=time_unit),
format_time(item.cpu_time, unit=time_unit), format_time(item.min_gpu_time, unit=time_unit),
format_time(item.avg_cpu_time, unit=time_unit), format_ratio(gpu_ratio)),
format_time(item.max_cpu_time, unit=time_unit), ]
format_time(item.min_cpu_time, unit=time_unit), all_row_values.append(row_values)
format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format( headers = [
format_time(item.general_gpu_time, unit=time_unit), 'Name', 'Calls', 'GPU Total / Avg / Max / Min / Ratio(%)'
format_time(item.avg_general_gpu_time, unit=time_unit),
format_time(item.max_general_gpu_time, unit=time_unit),
format_time(item.min_general_gpu_time, unit=time_unit),
format_ratio(gpu_ratio)),
] ]
all_row_values.append(row_values) # Calculate the column width
name_column_width = 90
calltime_width = 6
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[2])
headers = [ row_format_list = [""]
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)', header_sep_list = [""]
'GPU Total / Avg / Max / Min / Ratio(%)' line_length_list = [-SPACING_SIZE]
] add_column(name_column_width)
# Calculate the column width add_column(calltime_width)
name_column_width = 0 add_column(gpu_data_description_width)
calltime_width = 6
cpu_data_description_width = 40
gpu_data_description_width = 40
for row_values in all_row_values:
if len(row_values[0]) > name_column_width:
name_column_width = len(row_values[0])
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > cpu_data_description_width:
cpu_data_description_width = len(row_values[2])
if len(row_values[3]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[3])
row_format_list = [""] row_format = row_format_list[0]
header_sep_list = [""] header_sep = header_sep_list[0]
line_length_list = [-SPACING_SIZE] line_length = line_length_list[0]
add_column(name_column_width)
add_column(calltime_width)
add_column(cpu_data_description_width)
add_column(gpu_data_description_width)
row_format = row_format_list[0] # construct table string
header_sep = header_sep_list[0] append(add_title(line_length, "Kernel Summary"))
line_length = line_length_list[0] append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
kernel_name_pattern = re.compile('(.+?)(<.*>)(\(.*\))')
for row_values in all_row_values:
match = kernel_name_pattern.match(row_values[0])
if match:
name = match.group(1) + match.group(2)
else:
name = row_values[0]
if len(name) > name_column_width:
row_values[0] = name[:name_column_width - 3] + '...'
else:
row_values[0] = name
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
# construct table string if views is None or SummaryView.MemoryManipulationView in views:
append(add_title(line_length, "Memory Manipulation Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
for row_values in all_row_values:
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
###### Print UserDefined Summary Report ######
if statistic_data.event_summary.userdefined_items:
all_row_values = []
gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].general_gpu_time
if thread_sep == True:
userdefined_thread_items = statistic_data.event_summary.userdefined_thread_items
else:
userdefined_thread_items = {
'All threads merged':
statistic_data.event_summary.userdefined_items
}
for thread_id, items in userdefined_thread_items.items():
all_row_values.append("Thread: {}".format(thread_id))
if sorted_by == SortedKeys.CPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUAvg:
sorted_items = sorted(items.items(),
key=lambda x: x[1].avg_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMax:
sorted_items = sorted(items.items(),
key=lambda x: x[1].max_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMin:
sorted_items = sorted(items.items(),
key=lambda x: x[1].min_cpu_time)
elif sorted_by == SortedKeys.GPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(items.items(),
key=lambda x: x[1].avg_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(items.items(),
key=lambda x: x[1].max_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(items.items(),
key=lambda x: x[1].min_general_gpu_time)
for name, item in sorted_items: ###### Print Memory Manipulation Summary Report ######
if statistic_data.event_summary.memory_manipulation_items:
all_row_values = []
memory_manipulation_items = statistic_data.event_summary.memory_manipulation_items
gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].general_gpu_time
for name, item in memory_manipulation_items.items():
if gpu_total_time == 0: if gpu_total_time == 0:
gpu_ratio = 0 gpu_ratio = 0
else: else:
@@ -1524,116 +1446,164 @@ def _build_table(statistic_data,
                ]
                all_row_values.append(row_values)
# Calculate the column width headers = [
name_column_width = 0 'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
calltime_width = 6 'GPU Total / Avg / Max / Min / Ratio(%)'
cpu_data_description_width = 40 ]
gpu_data_description_width = 40 # Calculate the column width
for row_values in all_row_values: name_column_width = 0
if isinstance(row_values, str): calltime_width = 6
continue cpu_data_description_width = 40
if len(row_values[0]) > name_column_width: gpu_data_description_width = 40
name_column_width = len(row_values[0]) for row_values in all_row_values:
if isinstance(row_values[1], if len(row_values[0]) > name_column_width:
int) and len(str(row_values[1])) > calltime_width: name_column_width = len(row_values[0])
calltime_width = len(str(row_values[1])) if isinstance(row_values[1],
if len(row_values[2]) > cpu_data_description_width: int) and len(str(row_values[1])) > calltime_width:
cpu_data_description_width = len(row_values[2]) calltime_width = len(str(row_values[1]))
if len(row_values[3]) > gpu_data_description_width: if len(row_values[2]) > cpu_data_description_width:
gpu_data_description_width = len(row_values[3]) cpu_data_description_width = len(row_values[2])
if len(row_values[3]) > gpu_data_description_width:
headers = [ gpu_data_description_width = len(row_values[3])
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
add_column(name_column_width) row_format_list = [""]
add_column(calltime_width) header_sep_list = [""]
add_column(cpu_data_description_width) line_length_list = [-SPACING_SIZE]
add_column(gpu_data_description_width) add_column(name_column_width)
add_column(calltime_width)
add_column(cpu_data_description_width)
add_column(gpu_data_description_width)
row_format = row_format_list[0] row_format = row_format_list[0]
header_sep = header_sep_list[0] header_sep = header_sep_list[0]
line_length = line_length_list[0] line_length = line_length_list[0]
# construct table string # construct table string
append(add_title(line_length, "UserDefined Summary")) append(add_title(line_length, "Memory Manipulation Summary"))
append('Time unit: {}'.format(time_unit)) append('Time unit: {}'.format(time_unit))
append(header_sep) append(header_sep)
append(row_format.format(*headers)) append(row_format.format(*headers))
append(header_sep) append(header_sep)
for row_values in all_row_values: for row_values in all_row_values:
if isinstance(row_values, str):
append(add_title(line_length, row_values))
else:
append(row_format.format(*row_values)) append(row_format.format(*row_values))
append('') append(header_sep)
append('') append('')
append('')
###### Print Memory Summary Report ######
if statistic_data.memory_summary.allocated_items or statistic_data.memory_summary.reserved_items:
for device_type, memory_events in statistic_data.memory_summary.allocated_items.items(
):
all_row_values = []
sorted_items = sorted(memory_events.items(),
key=lambda x: x[1].increase_size,
reverse=True)
for event_name, item in sorted_items: if views is None or SummaryView.UDFView in views:
row_values = [
event_name, item.memory_type, item.allocation_count,
item.free_count, item.allocation_size, item.free_size,
item.increase_size
]
all_row_values.append(row_values)
sorted_reserved_items = sorted(statistic_data.memory_summary. ###### Print UserDefined Summary Report ######
reserved_items[device_type].items(), if statistic_data.event_summary.userdefined_items:
key=lambda x: x[1].increase_size, all_row_values = []
reverse=True) gpu_total_time = statistic_data.event_summary.model_perspective_items[
for event_name, item in sorted_reserved_items: 'ProfileStep'].general_gpu_time
row_values = [ if thread_sep == True:
event_name, item.memory_type, item.allocation_count, userdefined_thread_items = statistic_data.event_summary.userdefined_thread_items
item.free_count, item.allocation_size, item.free_size, else:
item.increase_size userdefined_thread_items = {
] 'All threads merged':
all_row_values.append(row_values) statistic_data.event_summary.userdefined_items
}
for thread_id, items in userdefined_thread_items.items():
all_row_values.append("Thread: {}".format(thread_id))
if sorted_by == SortedKeys.CPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUAvg:
sorted_items = sorted(items.items(),
key=lambda x: x[1].avg_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMax:
sorted_items = sorted(items.items(),
key=lambda x: x[1].max_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMin:
sorted_items = sorted(items.items(),
key=lambda x: x[1].min_cpu_time)
elif sorted_by == SortedKeys.GPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].avg_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].max_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(
items.items(), key=lambda x: x[1].min_general_gpu_time)
for name, item in sorted_items:
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(
item.general_gpu_time) / gpu_total_time
row_values = [
name,
item.call,
'{} / {} / {} / {} / {}'.format(
format_time(item.cpu_time, unit=time_unit),
format_time(item.avg_cpu_time, unit=time_unit),
format_time(item.max_cpu_time, unit=time_unit),
format_time(item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format(
format_time(item.general_gpu_time, unit=time_unit),
format_time(item.avg_general_gpu_time,
unit=time_unit),
format_time(item.max_general_gpu_time,
unit=time_unit),
format_time(item.min_general_gpu_time,
unit=time_unit),
format_ratio(gpu_ratio)),
]
all_row_values.append(row_values)
# Calculate the column width # Calculate the column width
name_column_width = 0
calltime_width = 6
cpu_data_description_width = 40
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values, str):
continue
if len(row_values[0]) > name_column_width:
name_column_width = len(row_values[0])
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > cpu_data_description_width:
cpu_data_description_width = len(row_values[2])
if len(row_values[3]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[3])
headers = [ headers = [
'Name', 'Type', 'Allocation Count', 'Free Count', 'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'Allocation Size', 'Free Size', 'Increased Size' 'GPU Total / Avg / Max / Min / Ratio(%)'
] ]
row_format_list = [""] row_format_list = [""]
header_sep_list = [""] header_sep_list = [""]
line_length_list = [-SPACING_SIZE] line_length_list = [-SPACING_SIZE]
name_column_width = 50
number_column_width = 15
add_column(name_column_width) add_column(name_column_width)
add_column(12) add_column(calltime_width)
add_column(number_column_width) add_column(cpu_data_description_width)
add_column(number_column_width) add_column(gpu_data_description_width)
add_column(number_column_width)
add_column(number_column_width)
add_column(number_column_width)
row_format = row_format_list[0] row_format = row_format_list[0]
header_sep = header_sep_list[0] header_sep = header_sep_list[0]
line_length = line_length_list[0] line_length = line_length_list[0]
# construct table string # construct table string
append( append(add_title(line_length, "UserDefined Summary"))
add_title(line_length, append('Time unit: {}'.format(time_unit))
"Memory Summary - {}".format(device_type)))
append('Peak Allocated Memory: {}'.format(
statistic_data.memory_summary.
peak_allocation_values[device_type]))
append('Peak Reserved Memory: {}'.format(
statistic_data.memory_summary.peak_reserved_values[device_type])
)
append(header_sep) append(header_sep)
append(row_format.format(*headers)) append(row_format.format(*headers))
append(header_sep) append(header_sep)
@@ -1645,4 +1615,79 @@ def _build_table(statistic_data,
                append('')
                append('')
if views is None or SummaryView.MemoryView in views:
###### Print Memory Summary Report ######
if statistic_data.memory_summary.allocated_items or statistic_data.memory_summary.reserved_items:
for device_type, memory_events in statistic_data.memory_summary.allocated_items.items(
):
all_row_values = []
sorted_items = sorted(memory_events.items(),
key=lambda x: x[1].increase_size,
reverse=True)
for event_name, item in sorted_items:
row_values = [
event_name, item.memory_type, item.allocation_count,
item.free_count, item.allocation_size, item.free_size,
item.increase_size
]
all_row_values.append(row_values)
sorted_reserved_items = sorted(
statistic_data.memory_summary.reserved_items[device_type].
items(),
key=lambda x: x[1].increase_size,
reverse=True)
for event_name, item in sorted_reserved_items:
row_values = [
event_name, item.memory_type, item.allocation_count,
item.free_count, item.allocation_size, item.free_size,
item.increase_size
]
all_row_values.append(row_values)
# Calculate the column width
headers = [
'Name', 'Type', 'Allocation Count', 'Free Count',
'Allocation Size', 'Free Size', 'Increased Size'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 50
number_column_width = 15
add_column(name_column_width)
add_column(12)
add_column(number_column_width)
add_column(number_column_width)
add_column(number_column_width)
add_column(number_column_width)
add_column(number_column_width)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(
add_title(line_length,
"Memory Summary - {}".format(device_type)))
append('Peak Allocated Memory: {}'.format(
statistic_data.memory_summary.
peak_allocation_values[device_type]))
append('Peak Reserved Memory: {}'.format(
statistic_data.memory_summary.
peak_reserved_values[device_type]))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
for row_values in all_row_values:
if isinstance(row_values, str):
append(add_title(line_length, row_values))
else:
append(row_format.format(*row_values))
append('')
append('')
    return ''.join(result)