未验证 提交 c1d81ec1 编写于 作者: C chenjian 提交者: GitHub

Add profiler statistic (#40249)

* add python profiler package

* update according to review

* fix bug

* fix bug

* fix bug

* add unit test

* Revert "add unit test"

This reverts commit 4e69ff71b0645e069afe5dd8fea0d07717852c48.

* reduce for pr

* add unit test

* modify for pr

* fix unittest

* update for ci coverage

* modify according to review

* fix bug

* improve coverage

* add profiler code

* add statistic code

* reduce content for pr
上级 3a77d027
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import paddle.profiler as profiler
class HostPythonNode:
def __init__(self, name, type, start_ns, end_ns, process_id, thread_id):
self.name = name
self.type = type
self.start_ns = start_ns
self.end_ns = end_ns
self.process_id = process_id
self.thread_id = thread_id
self.children_node = []
self.runtime_node = []
self.device_node = []
class DevicePythonNode:
def __init__(self, name, type, start_ns, end_ns, device_id, context_id,
stream_id):
self.name = name
self.type = type
self.start_ns = start_ns
self.end_ns = end_ns
self.device_id = device_id
self.context_id = context_id
self.stream_id = stream_id
class TestProfilerStatistic(unittest.TestCase):
def test_statistic_case1(self):
root_node = HostPythonNode('Root Node',
profiler.TracerEventType.UserDefined, 0,
float('inf'), 1000, 1001)
profilerstep_node = HostPythonNode('ProfileStep#1',
profiler.TracerEventType.ProfileStep,
0, 400, 1000, 1001)
dataloader_node = HostPythonNode(
'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001)
mobilenet_node = HostPythonNode(
'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
yolonet_node = HostPythonNode(
'Yolov3Net', profiler.TracerEventType.Forward, 50, 100, 1000, 1001)
backward_node = HostPythonNode('Gradient Backward',
profiler.TracerEventType.Backward, 120,
200, 1000, 1001)
optimization_node = HostPythonNode(
'Optimization', profiler.TracerEventType.Optimization, 220, 300,
1000, 1001)
conv2d_node = HostPythonNode(
'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001)
sync_batch_norm_node = HostPythonNode('sync_batch_norm',
profiler.TracerEventType.Operator,
60, 100, 1000, 1001)
conv2d_infer_shape = HostPythonNode(
'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25,
30, 1000, 1001)
conv2d_compute = HostPythonNode('conv2d::compute',
profiler.TracerEventType.OperatorInner,
30, 40, 1000, 1001)
conv2d_launchkernel = HostPythonNode(
'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35,
1000, 1001)
conv2d_MemCpy = HostPythonNode('AsyncMemcpy',
profiler.TracerEventType.UserDefined, 35,
40, 1000, 1001)
conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
profiler.TracerEventType.CudaRuntime,
35, 40, 1000, 1001)
conv2d_kernel = DevicePythonNode(
'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0)
conv2d_memcpy = DevicePythonNode(
'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0)
sync_batch_norm_infer_shape = HostPythonNode(
'sync_batch_norm::infer_shape',
profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
sync_batch_norm_compute = HostPythonNode(
'sync_batch_norm::compute', profiler.TracerEventType.OperatorInner,
80, 100, 1000, 1001)
sync_batch_norm_launchkernel = HostPythonNode(
'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 80, 90,
1000, 1001)
sync_batch_norm_MemCpy = HostPythonNode(
'AsyncMemcpy', profiler.TracerEventType.UserDefined, 90, 100, 1000,
1001)
sync_batch_norm_cudaMemCpy = HostPythonNode(
'cudaMemcpy', profiler.TracerEventType.CudaRuntime, 90, 100, 1000,
1001)
sync_batch_norm_kernel = DevicePythonNode(
'sync_batch_norm_kernel', profiler.TracerEventType.Kernel, 95, 155,
0, 0, 0)
sync_batch_norm_memcpy = DevicePythonNode(
'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150, 200,
0, 0, 1)
root_node.children_node.append(profilerstep_node)
profilerstep_node.children_node.extend([
dataloader_node, mobilenet_node, yolonet_node, backward_node,
optimization_node
])
mobilenet_node.children_node.append(conv2d_node)
yolonet_node.children_node.append(sync_batch_norm_node)
conv2d_node.children_node.extend(
[conv2d_infer_shape, conv2d_compute, conv2d_MemCpy])
conv2d_compute.runtime_node.append(conv2d_launchkernel)
conv2d_MemCpy.runtime_node.append(conv2d_cudaMemCpy)
conv2d_launchkernel.device_node.append(conv2d_kernel)
conv2d_cudaMemCpy.device_node.append(conv2d_memcpy)
sync_batch_norm_node.children_node.extend([
sync_batch_norm_infer_shape, sync_batch_norm_compute,
sync_batch_norm_MemCpy
])
sync_batch_norm_compute.runtime_node.append(
sync_batch_norm_launchkernel)
sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy)
sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel)
sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy)
thread_tree = {'thread1001': root_node}
extra_info = {
'Process Cpu Utilization': '1.02',
'System Cpu Utilization': '0.68'
}
statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
extra_info)
time_range_summary = statistic_data.time_range_summary
event_summary = statistic_data.event_summary
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.ProfileStep), 400)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Forward), 90)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Backward), 80)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Optimization), 80)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Operator), 55)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.OperatorInner), 45)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.CudaRuntime), 30)
self.assertEqual(
time_range_summary.get_gpu_range_sum(
0, profiler.TracerEventType.Kernel), 75)
self.assertEqual(
time_range_summary.get_gpu_range_sum(
0, profiler.TracerEventType.Memcpy), 60)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.UserDefined), 15)
self.assertEqual(len(event_summary.items), 2)
self.assertEqual(len(event_summary.userdefined_items), 0)
self.assertEqual(len(event_summary.model_perspective_items), 3)
self.assertEqual(len(event_summary.memory_manipulation_items), 1)
self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
self.assertEqual(event_summary.items['conv2d'].gpu_time, 25)
self.assertEqual(
event_summary.model_perspective_items['Forward'].cpu_time, 90)
self.assertEqual(
event_summary.model_perspective_items['Forward'].gpu_time, 135)
self.assertEqual(
event_summary.model_perspective_items['Backward'].gpu_time, 0)
self.assertEqual(
event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
self.assertEqual(
event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60)
print(
profiler.profiler_statistic._build_table(
statistic_data,
sorted_by=profiler.SortedKeys.CPUTotal,
op_detail=True,
thread_sep=False,
time_unit='ms'))
if __name__ == '__main__':
unittest.main()
...@@ -16,6 +16,20 @@ from enum import Enum ...@@ -16,6 +16,20 @@ from enum import Enum
from paddle.fluid.core import TracerEventType from paddle.fluid.core import TracerEventType
from .statistic_helper import *
_AllTracerEventType = [
TracerEventType.Operator, TracerEventType.Dataloader,
TracerEventType.ProfileStep, TracerEventType.CudaRuntime,
TracerEventType.Kernel, TracerEventType.Memcpy, TracerEventType.Memset,
TracerEventType.UserDefined, TracerEventType.OperatorInner,
TracerEventType.Forward, TracerEventType.Backward,
TracerEventType.Optimization, TracerEventType.Communication,
TracerEventType.PythonOp, TracerEventType.PythonUserDefined
]
_CommunicationOpName = ['reduce', 'broadcast', 'rpc']
class SortedKeys(Enum): class SortedKeys(Enum):
r""" r"""
...@@ -29,3 +43,782 @@ class SortedKeys(Enum): ...@@ -29,3 +43,782 @@ class SortedKeys(Enum):
GPUAvg = 5 GPUAvg = 5
GPUMax = 6 GPUMax = 6
GPUMin = 7 GPUMin = 7
class HostStatisticNode:
r'''
Wrap original node for calculating statistic metrics.
'''
def __init__(self, hostnode):
self.hostnode = hostnode
self.children_node = []
self.runtime_node = []
self.cpu_time = 0
self.self_cpu_time = 0
self.gpu_time = 0
self.self_gpu_time = 0
def cal_statistic(self):
for child in self.children_node:
child.cal_statistic()
for rt in self.runtime_node:
rt.cal_statistic()
self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
for child in self.children_node:
self.gpu_time += child.gpu_time
self.self_cpu_time -= (child.end_ns - child.start_ns)
for rt in self.runtime_node:
self.self_cpu_time -= (rt.end_ns - rt.start_ns)
self.gpu_time += rt.gpu_time
self.self_gpu_time += rt.gpu_time
for device in self.hostnode.device_node:
self.gpu_time += (device.end_ns - device.start_ns)
self.self_gpu_time += (device.end_ns - device.start_ns)
@property
def end_ns(self):
return self.hostnode.end_ns
@property
def start_ns(self):
return self.hostnode.start_ns
def __getattr__(self, name):
return getattr(self.hostnode, name)
def traverse_tree(nodetrees):
results = collections.defaultdict(list)
for thread_id, rootnode in nodetrees.items():
stack = []
stack.append(rootnode)
threadlist = results[thread_id]
while stack:
current_node = stack.pop()
threadlist.append(current_node)
for childnode in current_node.children_node:
stack.append(childnode)
return results
def wrap_tree(nodetrees):
'''
Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics.
'''
node_statistic_tree = {}
results = collections.defaultdict(list)
newresults = collections.defaultdict(list)
for thread_id, rootnode in nodetrees.items():
stack = []
stack.append(rootnode)
root_statistic_node = HostStatisticNode(rootnode)
newstack = []
newstack.append(root_statistic_node)
node_statistic_tree[thread_id] = root_statistic_node
threadlist = results[thread_id]
newthreadlist = newresults[thread_id]
while stack:
current_node = stack.pop()
threadlist.append(current_node)
current_statistic_node = newstack.pop()
newthreadlist.append(current_statistic_node)
for childnode in current_node.children_node:
stack.append(childnode)
child_statistic_node = HostStatisticNode(childnode)
current_statistic_node.children_node.append(
child_statistic_node)
newstack.append(child_statistic_node)
for runtimenode in current_node.runtime_node:
runtime_statistic_node = HostStatisticNode(runtimenode)
current_statistic_node.runtime_node.append(
runtime_statistic_node)
# recursive calculate node statistic values
for thread_id, root_statistic_node in node_statistic_tree.items():
root_statistic_node.cal_statistic()
return node_statistic_tree, newresults
class TimeRangeSummary:
r"""
Analyse time ranges for each TracerEventType, and summarize the time.
"""
def __init__(self):
self.CPUTimeRange = collections.defaultdict(list)
self.GPUTimeRange = collections.defaultdict(
lambda: collections.defaultdict(list)
) # GPU events should be divided into different devices
self.CPUTimeRangeSum = collections.defaultdict(int)
self.GPUTimeRangeSum = collections.defaultdict(
lambda: collections.defaultdict(int))
self.call_times = collections.defaultdict(int)
def parse(self, nodetrees):
r"""
Analysis node trees in profiler result, and get time range for different tracer event type.
"""
thread2hostnodes = traverse_tree(nodetrees)
for threadid, hostnodes in thread2hostnodes.items():
CPUTimeRange = collections.defaultdict(list)
GPUTimeRange = collections.defaultdict(
lambda: collections.defaultdict(lambda: collections.defaultdict(list))
) # device_id/type/stream_id
for hostnode in hostnodes[1:]: #skip root node
CPUTimeRange[hostnode.type].append(
(hostnode.start_ns, hostnode.end_ns))
self.call_times[hostnode.type] += 1
if hostnode.type == TracerEventType.Operator and any(
[name in hostnode.name for name in
_CommunicationOpName]): # special case, communication op
CPUTimeRange[TracerEventType.Communication].append(
(hostnode.start_ns, hostnode.end_ns))
self.call_times[TracerEventType.Communication] += 1
is_communication_node = (
hostnode.type == TracerEventType.Communication
) or (hostnode.type == TracerEventType.Operator and any(
[name in hostnode.name for name in _CommunicationOpName]))
for runtimenode in hostnode.runtime_node:
CPUTimeRange[runtimenode.type].append(
(runtimenode.start_ns, runtimenode.end_ns))
self.call_times[runtimenode.type] += 1
for devicenode in runtimenode.device_node:
GPUTimeRange[devicenode.device_id][devicenode.type][
devicenode.stream_id].append(
(devicenode.start_ns, devicenode.end_ns))
self.call_times[devicenode.type] += 1
if is_communication_node: # gpu activity for communication node
GPUTimeRange[devicenode.device_id][
TracerEventType.Communication][
devicenode.stream_id].append((
devicenode.start_ns, devicenode.end_ns))
self.call_times[TracerEventType.Communication] += 1
for event_type, time_ranges in CPUTimeRange.items():
time_ranges = merge_self_ranges(time_ranges, is_sorted=False)
self.CPUTimeRange[event_type] = merge_ranges(
self.CPUTimeRange[event_type], time_ranges, is_sorted=True)
for device_id, device_time_ranges in GPUTimeRange.items():
for event_type, event_time_ranges in device_time_ranges.items():
for stream_id, time_ranges in event_time_ranges.items():
time_ranges = merge_self_ranges(
time_ranges, is_sorted=False)
self.GPUTimeRange[device_id][event_type] = merge_ranges(
self.GPUTimeRange[device_id][event_type],
time_ranges,
is_sorted=True)
for event_type, time_ranges in self.CPUTimeRange.items():
self.CPUTimeRangeSum[event_type] = sum_ranges(time_ranges)
for device_id, device_time_ranges in self.GPUTimeRange.items():
for event_type, time_ranges in device_time_ranges.items():
self.GPUTimeRangeSum[device_id][event_type] = sum_ranges(
time_ranges)
def get_gpu_devices(self):
return self.GPUTimeRange.keys()
def get_gpu_range_sum(self, device_id, event_type):
return self.GPUTimeRangeSum[device_id][event_type]
def get_cpu_range_sum(self, event_type):
return self.CPUTimeRangeSum[event_type]
class EventSummary:
r"""
Analyse operator event in profiling data, correlate with its device event.
"""
class DeviceItem:
def __init__(self, name):
self.name = name
self.call = 0
self.gpu_time = 0
self.max_gpu_time = 0
self.min_gpu_time = float('inf')
@property
def avg_gpu_time(self):
return self.gpu_time / self.call
def add_gpu_time(self, time):
if time > self.max_gpu_time:
self.max_gpu_time = time
if time < self.min_gpu_time:
self.min_gpu_time = time
self.gpu_time += time
def add_item(self, node):
self.call += 1
self.add_gpu_time(node.end_ns - node.start_ns)
class OperatorItem:
def __init__(self, name):
self.name = name
self.call = 0
self.cpu_time = 0
self.gpu_time = 0
self.max_cpu_time = 0
self.min_cpu_time = float('inf')
self.max_gpu_time = 0
self.min_gpu_time = float('inf')
self.devices = {}
self.operator_inners = {}
@property
def avg_cpu_time(self):
return self.cpu_time / self.call
@property
def avg_gpu_time(self):
return self.gpu_time / self.call
def add_cpu_time(self, time):
if time > self.max_cpu_time:
self.max_cpu_time = time
if time < self.min_cpu_time:
self.min_cpu_time = time
self.cpu_time += time
def add_gpu_time(self, time):
if time > self.max_gpu_time:
self.max_gpu_time = time
if time < self.min_gpu_time:
self.min_gpu_time = time
self.gpu_time += time
def add_call(self):
self.call += 1
def add_item(self, node):
self.add_call()
self.add_cpu_time(node.cpu_time)
self.add_gpu_time(node.gpu_time)
for child in node.children_node:
if child.name not in self.operator_inners:
self.operator_inners[
child.name] = EventSummary.OperatorItem(child.name)
self.operator_inners[child.name].add_item(child)
for runtimenode in node.runtime_node:
for devicenode in runtimenode.device_node:
if devicenode.name not in self.devices:
self.devices[devicenode.name] = EventSummary.DeviceItem(
devicenode.name)
self.devices[devicenode.name].add_item(devicenode)
class GeneralItem:
def __init__(self, name):
self.name = name
self.call = 0
self.cpu_time = 0
self.max_cpu_time = 0
self.min_cpu_time = float('inf')
self.gpu_time = 0
self.max_gpu_time = 0
self.min_gpu_time = float('inf')
@property
def avg_cpu_time(self):
return self.cpu_time / self.call
@property
def avg_gpu_time(self):
return self.gpu_time / self.call
def add_cpu_time(self, time):
if time > self.max_cpu_time:
self.max_cpu_time = time
if time < self.min_cpu_time:
self.min_cpu_time = time
self.cpu_time += time
def add_gpu_time(self, time):
if time > self.max_gpu_time:
self.max_gpu_time = time
if time < self.min_gpu_time:
self.min_gpu_time = time
self.gpu_time += time
def add_call(self):
self.call += 1
def add_item(self, node):
self.add_call()
self.add_cpu_time(node.cpu_time)
self.add_gpu_time(node.gpu_time)
def __init__(self):
self.items = {} # for operator summary
self.thread_items = collections.defaultdict(
dict) # for operator summary
self.userdefined_items = {} # for userdefined summary
self.userdefined_thread_items = collections.defaultdict(
dict) # for userdefined summary
self.model_perspective_items = {} # for model summary
self.memory_manipulation_items = {} # for memory manipulation summary
def parse(self, nodetrees):
r"""
Analysis operator event in the nodetress.
"""
node_statistic_trees, thread2host_statistic_nodes = wrap_tree(nodetrees)
for threadid, host_statistic_nodes in thread2host_statistic_nodes.items(
):
for host_statistic_node in host_statistic_nodes[
1:]: #skip root node
if host_statistic_node.type == TracerEventType.Operator:
self.add_operator_item(host_statistic_node)
if host_statistic_node.type == TracerEventType.UserDefined\
or host_statistic_node.type == TracerEventType.PythonUserDefined:
if 'memcpy' in host_statistic_node.name.lower() or 'memorycopy' in host_statistic_node.name.lower()\
or 'memset' in host_statistic_node.name.lower():
self.add_memory_manipulation_item(host_statistic_node)
else:
self.add_userdefined_item(host_statistic_node)
for threadid, root_statistic_node in node_statistic_trees.items():
deque = collections.deque()
deque.append(root_statistic_node)
while deque:
current_node = deque.popleft()
for child in current_node.children_node:
if child.type == TracerEventType.Forward or child.type == TracerEventType.Dataloader\
or child.type == TracerEventType.Backward or child.type == TracerEventType.Optimization:
self.add_model_perspective_item(
child) #find first model perspective node
else:
deque.append(child)
def add_operator_item(self, operator_node):
if operator_node.name not in self.items:
self.items[operator_node.name] = EventSummary.OperatorItem(
operator_node.name)
self.items[operator_node.name].add_item(operator_node)
if operator_node.name not in self.thread_items[operator_node.thread_id]:
self.thread_items[operator_node.thread_id][
operator_node.name] = EventSummary.OperatorItem(
operator_node.name)
self.thread_items[operator_node.thread_id][operator_node.name].add_item(
operator_node)
def add_userdefined_item(self, userdefined_node):
if userdefined_node.name not in self.userdefined_items:
self.userdefined_items[
userdefined_node.name] = EventSummary.GeneralItem(
userdefined_node.name)
self.userdefined_items[userdefined_node.name].add_item(userdefined_node)
if userdefined_node.name not in self.userdefined_thread_items[
userdefined_node.thread_id]:
self.userdefined_thread_items[userdefined_node.thread_id][
userdefined_node.name] = EventSummary.GeneralItem(
userdefined_node.name)
self.userdefined_thread_items[userdefined_node.thread_id][
userdefined_node.name].add_item(userdefined_node)
def add_memory_manipulation_item(self, memory_manipulation_node):
if memory_manipulation_node.name not in self.memory_manipulation_items:
self.memory_manipulation_items[
memory_manipulation_node.name] = EventSummary.GeneralItem(
memory_manipulation_node.name)
self.memory_manipulation_items[memory_manipulation_node.name].add_item(
memory_manipulation_node)
def add_model_perspective_item(self, model_perspective_node):
if model_perspective_node.type == TracerEventType.Forward:
name = 'Forward'
elif model_perspective_node.type == TracerEventType.Backward:
name = 'Backward'
elif model_perspective_node.type == TracerEventType.Optimization:
name = 'Optimization'
elif model_perspective_node.type == TracerEventType.Dataloader:
name = 'Dataloader'
else:
return
if name not in self.model_perspective_items:
self.model_perspective_items[name] = EventSummary.GeneralItem(name)
self.model_perspective_items[name].add_item(model_perspective_node)
class StatisticData:
r"""
Hold all analysed results.
"""
def __init__(self, node_trees, extra_info):
self.node_trees = node_trees
self.extra_info = extra_info
self.time_range_summary = TimeRangeSummary()
self.event_summary = EventSummary()
self.time_range_summary.parse(node_trees)
self.event_summary.parse(node_trees)
def _build_table(statistic_data,
sorted_by=SortedKeys.CPUTotal,
op_detail=True,
thread_sep=False,
time_unit='ms',
row_limit=100,
max_src_column_width=75):
"""Prints a summary of events."""
# format table row
SPACING_SIZE = 2
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
def add_column(padding, text_dir='<'):
row_format_list[0] += '{: ' + text_dir + str(padding) + '}' + (
' ' * SPACING_SIZE)
header_sep_list[0] += '-' * padding + (' ' * SPACING_SIZE)
line_length_list[0] += padding + SPACING_SIZE
def add_title(padding, text):
left_length = padding - len(text)
half = left_length // 2
return '-' * half + text + '-' * (left_length - half)
result = []
def append(s):
result.append(s)
result.append('\n')
def format_time(time, unit='ms', indent=0):
r"""
Transform time in ns to time in unit.
"""
if time == float('inf'):
return '-'
else:
result = float(time)
if unit == 's':
result /= 1e9
elif unit == 'ms':
result /= 1e6
elif unit == 'us':
result /= 1e3
return '{}{:.2f}'.format(' ' * indent, result)
def format_ratio(ratio, indent=0):
r"""
Transform ratio within [0, 1] to percentage presentation.
"""
return '{}{:.2f}'.format(' ' * indent, ratio * 100)
total_time = statistic_data.time_range_summary.get_cpu_range_sum(
TracerEventType.ProfileStep)
###### Print Device Summary ######
headers = ['Device', 'Utilization (%)']
name_column_width = 30
DEFAULT_COLUMN_WIDTH = 20
add_column(name_column_width)
for _ in headers[1:]:
add_column(DEFAULT_COLUMN_WIDTH)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Device Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
row_values = [
'CPU(Process)', format_ratio(
float(statistic_data.extra_info['Process Cpu Utilization']))
]
append(row_format.format(*row_values))
row_values = [
'CPU(System)', format_ratio(
float(statistic_data.extra_info['System Cpu Utilization']))
]
append(row_format.format(*row_values))
for gpu_name in statistic_data.time_range_summary.get_gpu_devices():
gpu_time = float(
statistic_data.time_range_summary.get_gpu_range_sum(
gpu_name, TracerEventType.Kernel))
utilization = gpu_time / total_time
row_values = ['GPU{}'.format(gpu_name), format_ratio(utilization)]
append(row_format.format(*row_values))
append(header_sep)
append(
"Note:\nCPU(Process) Utilization = Current process CPU time over all cpu cores / elapsed time, so max utilization can be reached 100% * number of cpu cores.\n"
"CPU(System) Utilization = All processes CPU time over all cpu cores(busy time) / (busy time + idle time).\n"
"GPU Utilization = Current process GPU time / elapsed time")
append('-' * line_length)
append('')
append('')
if total_time == 0:
return ''.join(result)
###### Print Overview Summary ######
headers = ['Event Type', 'CPU Time', 'Ratio (%)']
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
DEFAULT_COLUMN_WIDTH = 25
for _ in headers:
add_column(DEFAULT_COLUMN_WIDTH)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Overview Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
row_values = [
'Total Time', format_time(
total_time, unit=time_unit), format_ratio(1)
]
append(row_format.format(*row_values))
cpu_type_time = collections.defaultdict(int)
gpu_type_time = collections.defaultdict(int)
for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items(
):
cpu_type_time[event_type] = value
gpu_time_range = collections.defaultdict(list)
for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
):
for event_type, time_range in device_time_ranges.items():
gpu_time_range[event_type] = merge_ranges(
gpu_time_range[event_type], time_range, is_sorted=True)
for event_type, time_range in gpu_time_range.items():
gpu_type_time[event_type] = sum_ranges(time_range)
sorted_items = sorted(
cpu_type_time.items(), key=lambda x: x[1], reverse=True)
for event_type, time in sorted_items:
row_values = [
' {}'.format(str(event_type).split('.')[1]), format_time(
time, unit=time_unit), format_ratio(float(time) / total_time)
]
append(row_format.format(*row_values))
append(header_sep)
headers = ['', 'GPU Time', 'Ratio (%)']
append(row_format.format(*headers))
append(header_sep)
for event_type, time in gpu_type_time.items():
row_values = [
' {}'.format(str(event_type).split('.')[1]), format_time(
time, unit=time_unit), format_ratio(float(time) / total_time)
]
append(row_format.format(*row_values))
append(header_sep)
append(
"Note:\nIn this table, We sum up all collected events in terms of event type.\n"
"The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n"
"ratio = CPU(GPU) Time / Total Time."
"Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.\n"
"The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n"
"Example:\n"
"Thread 1:\n"
" Operator: |___________| |__________|\n"
"Thread 2:\n"
" Operator: |____________| |___|\n"
"After merged:\n"
" Result: |______________| |__________|\n")
append('-' * line_length)
append('')
append('')
###### Print Operator Summary Report ######
if statistic_data.event_summary.items:
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 50
add_column(name_column_width)
add_column(6)
add_column(40)
add_column(40)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Operator Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
if thread_sep == True:
thread_items = statistic_data.event_summary.thread_items
else:
thread_items = {
'All threads merged': statistic_data.event_summary.items
}
for thread_id, items in thread_items.items():
append(add_title(line_length, "Thread: {}".format(thread_id)))
if sorted_by == SortedKeys.CPUTotal:
sorted_items = sorted(
items.items(), key=lambda x: x[1].cpu_time, reverse=True)
elif sorted_by == SortedKeys.CPUAvg:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].avg_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMax:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].max_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMin:
sorted_items = sorted(
items.items(), key=lambda x: x[1].min_cpu_time)
elif sorted_by == SortedKeys.GPUTotal:
sorted_items = sorted(
items.items(), key=lambda x: x[1].gpu_time, reverse=True)
elif sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].avg_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].max_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(
items.items(), key=lambda x: x[1].min_gpu_time)
total_cpu_time = 0
total_gpu_time = 0
for name, item in sorted_items:
total_cpu_time += item.cpu_time
total_gpu_time += item.gpu_time
for name, item in sorted_items:
row_values = [
name, item.call, '{} / {} / {} / {} / {}'.format(
format_time(
item.cpu_time, unit=time_unit),
format_time(
item.avg_cpu_time, unit=time_unit),
format_time(
item.max_cpu_time, unit=time_unit),
format_time(
item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_cpu_time)),
'{} / {} / {} / {} / {}'.format(
format_time(
item.gpu_time, unit=time_unit),
format_time(
item.avg_gpu_time, unit=time_unit),
format_time(
item.max_gpu_time, unit=time_unit),
format_time(
item.min_gpu_time, unit=time_unit),
format_ratio(float(item.gpu_time) / total_gpu_time))
]
append(row_format.format(*row_values))
if op_detail:
for innerop_name, innerop_node in item.operator_inners.items(
):
row_values = [
' {}'.format(innerop_name), innerop_node.call,
'{} / {} / {} / {} / {}'.format(
format_time(
innerop_node.cpu_time, unit=time_unit),
format_time(
innerop_node.avg_cpu_time, unit=time_unit),
format_time(
innerop_node.max_cpu_time, unit=time_unit),
format_time(
innerop_node.min_cpu_time, unit=time_unit),
format_ratio(
float(innerop_node.cpu_time) /
total_cpu_time)),
'{} / {} / {} / {} / {}'.format(
format_time(
innerop_node.gpu_time, unit=time_unit),
format_time(
innerop_node.avg_gpu_time, unit=time_unit),
format_time(
innerop_node.max_gpu_time, unit=time_unit),
format_time(
innerop_node.min_gpu_time, unit=time_unit),
format_ratio(
float(innerop_node.gpu_time) /
total_gpu_time))
]
append(row_format.format(*row_values))
for device_node_name, devicenode in innerop_node.devices.items(
):
if len(device_node_name) + 4 > name_column_width:
device_node_name = device_node_name[:
name_column_width
- 7]
device_node_name += "..."
row_values = [
' {}'.format(device_node_name),
devicenode.call, '- / - / - / - / -',
'{} / {} / {} / {} / {}'.format(
format_time(
devicenode.gpu_time, unit=time_unit),
format_time(
devicenode.avg_gpu_time,
unit=time_unit),
format_time(
devicenode.max_gpu_time,
unit=time_unit),
format_time(
devicenode.min_gpu_time,
unit=time_unit),
format_ratio(
float(devicenode.gpu_time) /
total_gpu_time))
]
append(row_format.format(*row_values))
for device_node_name, device_node in item.devices.items():
if len(device_node_name) + 2 > name_column_width:
device_node_name = device_node_name[:
name_column_width
- 5]
device_node_name += "..."
row_values = [
' {}'.format(device_node_name), devicenode.call,
'- / - / - / - / -',
'{} / {} / {} / {} / {}'.format(
format_time(
devicenode.gpu_time, unit=time_unit),
format_time(
devicenode.avg_gpu_time, unit=time_unit),
format_time(
devicenode.max_gpu_time, unit=time_unit),
format_time(
devicenode.min_gpu_time, unit=time_unit),
format_ratio(
float(devicenode.gpu_time) /
total_gpu_time))
]
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
return ''.join(result)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册