diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
new file mode 100644
index 0000000000000000000000000000000000000000..838ccae37cfa5fb7dbdedcb5d39655cb62ad429f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle
+import paddle.profiler as profiler
+
+
+class HostPythonNode:
+    def __init__(self, name, type, start_ns, end_ns, process_id, thread_id):
+        self.name = name
+        self.type = type
+        self.start_ns = start_ns
+        self.end_ns = end_ns
+        self.process_id = process_id
+        self.thread_id = thread_id
+        self.children_node = []
+        self.runtime_node = []
+        self.device_node = []
+
+
+class DevicePythonNode:
+    def __init__(self, name, type, start_ns, end_ns, device_id, context_id,
+                 stream_id):
+        self.name = name
+        self.type = type
+        self.start_ns = start_ns
+        self.end_ns = end_ns
+        self.device_id = device_id
+        self.context_id = context_id
+        self.stream_id = stream_id
+
+
+class TestProfilerStatistic(unittest.TestCase):
+    def test_statistic_case1(self):
+        root_node = HostPythonNode('Root Node',
+                                   profiler.TracerEventType.UserDefined, 0,
+                                   float('inf'), 1000, 1001)
+        profilerstep_node = HostPythonNode(
+            'ProfileStep#1', profiler.TracerEventType.ProfileStep, 0, 400,
+            1000, 1001)
+        dataloader_node = HostPythonNode(
+            'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001)
+        mobilenet_node = HostPythonNode(
+            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
+        yolonet_node = HostPythonNode(
+            'Yolov3Net', profiler.TracerEventType.Forward, 50, 100, 1000, 1001)
+        backward_node = HostPythonNode('Gradient Backward',
+                                       profiler.TracerEventType.Backward, 120,
+                                       200, 1000, 1001)
+        optimization_node = HostPythonNode(
+            'Optimization', profiler.TracerEventType.Optimization, 220, 300,
+            1000, 1001)
+        conv2d_node = HostPythonNode(
+            'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001)
+        sync_batch_norm_node = HostPythonNode(
+            'sync_batch_norm', profiler.TracerEventType.Operator, 60, 100,
+            1000, 1001)
+        conv2d_infer_shape = HostPythonNode(
+            'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25,
+            30, 1000, 1001)
+        conv2d_compute = HostPythonNode('conv2d::compute',
+                                        profiler.TracerEventType.OperatorInner,
+                                        30, 40, 1000, 1001)
+        conv2d_launchkernel = HostPythonNode(
+            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35,
+            1000, 1001)
+        conv2d_MemCpy = HostPythonNode('AsyncMemcpy',
+                                       profiler.TracerEventType.UserDefined,
+                                       35, 40, 1000, 1001)
+        conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
+                                           profiler.TracerEventType.CudaRuntime,
+                                           35, 40, 1000, 1001)
+        conv2d_kernel = DevicePythonNode(
+            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0)
+        conv2d_memcpy = DevicePythonNode(
+            'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0)
+        sync_batch_norm_infer_shape = HostPythonNode(
+            'sync_batch_norm::infer_shape',
+            profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
+        sync_batch_norm_compute = HostPythonNode(
+            'sync_batch_norm::compute', profiler.TracerEventType.OperatorInner,
+            80, 100, 1000, 1001)
+        sync_batch_norm_launchkernel = HostPythonNode(
+            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 80, 90,
+            1000, 1001)
+        sync_batch_norm_MemCpy = HostPythonNode(
+            'AsyncMemcpy', profiler.TracerEventType.UserDefined, 90, 100, 1000,
+            1001)
+        sync_batch_norm_cudaMemCpy = HostPythonNode(
+            'cudaMemcpy', profiler.TracerEventType.CudaRuntime, 90, 100, 1000,
+            1001)
+        sync_batch_norm_kernel = DevicePythonNode(
+            'sync_batch_norm_kernel', profiler.TracerEventType.Kernel, 95, 155,
+            0, 0, 0)
+        sync_batch_norm_memcpy = DevicePythonNode(
+            'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150,
+            200, 0, 0, 1)
+        root_node.children_node.append(profilerstep_node)
+        profilerstep_node.children_node.extend([
+            dataloader_node, mobilenet_node, yolonet_node, backward_node,
+            optimization_node
+        ])
+        mobilenet_node.children_node.append(conv2d_node)
+        yolonet_node.children_node.append(sync_batch_norm_node)
+        conv2d_node.children_node.extend(
+            [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy])
+        conv2d_compute.runtime_node.append(conv2d_launchkernel)
+        conv2d_MemCpy.runtime_node.append(conv2d_cudaMemCpy)
+        conv2d_launchkernel.device_node.append(conv2d_kernel)
+        conv2d_cudaMemCpy.device_node.append(conv2d_memcpy)
+        sync_batch_norm_node.children_node.extend([
+            sync_batch_norm_infer_shape, sync_batch_norm_compute,
+            sync_batch_norm_MemCpy
+        ])
+        sync_batch_norm_compute.runtime_node.append(
+            sync_batch_norm_launchkernel)
+        sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy)
+        sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel)
+        sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy)
+        thread_tree = {'thread1001': root_node}
+        extra_info = {
+            'Process Cpu Utilization': '1.02',
+            'System Cpu Utilization': '0.68'
+        }
+        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
+                                                                   extra_info)
+        time_range_summary = statistic_data.time_range_summary
+        event_summary = statistic_data.event_summary
+
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.ProfileStep), 400)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.Forward), 90)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.Backward), 80)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.Optimization), 80)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.Operator), 55)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.OperatorInner), 45)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.CudaRuntime), 30)
+        self.assertEqual(
+            time_range_summary.get_gpu_range_sum(
+                0, profiler.TracerEventType.Kernel), 75)
+        self.assertEqual(
+            time_range_summary.get_gpu_range_sum(
+                0, profiler.TracerEventType.Memcpy), 60)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.UserDefined), 15)
+        self.assertEqual(len(event_summary.items), 2)
+        self.assertEqual(len(event_summary.userdefined_items), 0)
+        self.assertEqual(len(event_summary.model_perspective_items), 3)
+        self.assertEqual(len(event_summary.memory_manipulation_items), 1)
+        self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
+        self.assertEqual(event_summary.items['conv2d'].gpu_time, 25)
+        self.assertEqual(
+            event_summary.model_perspective_items['Forward'].cpu_time, 90)
+        self.assertEqual(
+            event_summary.model_perspective_items['Forward'].gpu_time, 135)
+        self.assertEqual(
+            event_summary.model_perspective_items['Backward'].gpu_time, 0)
+        self.assertEqual(
+            event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time,
+            15)
+        self.assertEqual(
+            event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time,
+            60)
+        print(
+            profiler.profiler_statistic._build_table(
+                statistic_data,
+                sorted_by=profiler.SortedKeys.CPUTotal,
+                op_detail=True,
+                thread_sep=False,
+                time_unit='ms'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py
old mode 100644
new mode 100755
index 29d586268a01485606ae1b1e142b1928e2fbc0f3..e39871c7365ba8eebdb33c2a1323b7459aaa52ed
--- a/python/paddle/profiler/profiler_statistic.py
+++ b/python/paddle/profiler/profiler_statistic.py
@@ -16,6 +16,21 @@
 from enum import Enum
 
 from paddle.fluid.core import TracerEventType
+import collections
+from .statistic_helper import *
+
+_AllTracerEventType = [
+    TracerEventType.Operator, TracerEventType.Dataloader,
+    TracerEventType.ProfileStep, TracerEventType.CudaRuntime,
+    TracerEventType.Kernel, TracerEventType.Memcpy, TracerEventType.Memset,
+    TracerEventType.UserDefined, TracerEventType.OperatorInner,
+    TracerEventType.Forward, TracerEventType.Backward,
+    TracerEventType.Optimization, TracerEventType.Communication,
+    TracerEventType.PythonOp, TracerEventType.PythonUserDefined
+]
+
+_CommunicationOpName = ['reduce', 'broadcast', 'rpc']
+
 
 class SortedKeys(Enum):
     r"""
@@ -29,3 +44,783 @@ class SortedKeys(Enum):
     GPUAvg = 5
     GPUMax = 6
     GPUMin = 7
+
+
+class HostStatisticNode:
+    r'''
+    Wrap an original profiler node for calculating statistic metrics.
+    '''
+
+    def __init__(self, hostnode):
+        self.hostnode = hostnode
+        self.children_node = []
+        self.runtime_node = []
+        self.cpu_time = 0
+        self.self_cpu_time = 0
+        self.gpu_time = 0
+        self.self_gpu_time = 0
+
+    def cal_statistic(self):
+        for child in self.children_node:
+            child.cal_statistic()
+        for rt in self.runtime_node:
+            rt.cal_statistic()
+
+        self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
+        self.self_cpu_time = self.cpu_time  # self time excludes children and runtime
+        for child in self.children_node:
+            self.gpu_time += child.gpu_time
+            self.self_cpu_time -= (child.end_ns - child.start_ns)
+        for rt in self.runtime_node:
+            self.self_cpu_time -= (rt.end_ns - rt.start_ns)
+            self.gpu_time += rt.gpu_time
+            self.self_gpu_time += rt.gpu_time
+        for device in self.hostnode.device_node:
+            self.gpu_time += (device.end_ns - device.start_ns)
+            self.self_gpu_time += (device.end_ns - device.start_ns)
+
+    @property
+    def end_ns(self):
+        return self.hostnode.end_ns
+
+    @property
+    def start_ns(self):
+        return self.hostnode.start_ns
+
+    def __getattr__(self, name):
+        return getattr(self.hostnode, name)
+
+
+def traverse_tree(nodetrees):
+    results = collections.defaultdict(list)
+    for thread_id, rootnode in nodetrees.items():
+        stack = []
+        stack.append(rootnode)
+        threadlist = results[thread_id]
+        while stack:
+            current_node = stack.pop()
+            threadlist.append(current_node)
+            for childnode in current_node.children_node:
+                stack.append(childnode)
+    return results
+
+
+def wrap_tree(nodetrees):
+    '''
+    Wrap the original profiler result tree with HostStatisticNode and calculate statistic metrics for every node.
+    '''
+    node_statistic_tree = {}
+    results = collections.defaultdict(list)
+    newresults = collections.defaultdict(list)
+    for thread_id, rootnode in nodetrees.items():
+        stack = []
+        stack.append(rootnode)
+        root_statistic_node = HostStatisticNode(rootnode)
+        newstack = []
+        newstack.append(root_statistic_node)
+        node_statistic_tree[thread_id] = root_statistic_node
+        threadlist = results[thread_id]
+        newthreadlist = newresults[thread_id]
+        while stack:
+            current_node = stack.pop()
+            threadlist.append(current_node)
+            current_statistic_node = newstack.pop()
+            newthreadlist.append(current_statistic_node)
+            for childnode in current_node.children_node:
+                stack.append(childnode)
+                child_statistic_node = HostStatisticNode(childnode)
+                current_statistic_node.children_node.append(
+                    child_statistic_node)
+                newstack.append(child_statistic_node)
+            for runtimenode in current_node.runtime_node:
+                runtime_statistic_node = HostStatisticNode(runtimenode)
+                current_statistic_node.runtime_node.append(
+                    runtime_statistic_node)
+    # recursively calculate statistic values for each node
+    for thread_id, root_statistic_node in node_statistic_tree.items():
+        root_statistic_node.cal_statistic()
+
+    return node_statistic_tree, newresults
+
+
+class TimeRangeSummary:
+    r"""
+    Analyse the time ranges for each TracerEventType and summarize the time.
+    """
+
+    def __init__(self):
+        self.CPUTimeRange = collections.defaultdict(list)
+        self.GPUTimeRange = collections.defaultdict(
+            lambda: collections.defaultdict(list)
+        )  # GPU events should be divided into different devices
+        self.CPUTimeRangeSum = collections.defaultdict(int)
+        self.GPUTimeRangeSum = collections.defaultdict(
+            lambda: collections.defaultdict(int))
+        self.call_times = collections.defaultdict(int)
+ """ + thread2hostnodes = traverse_tree(nodetrees) + for threadid, hostnodes in thread2hostnodes.items(): + CPUTimeRange = collections.defaultdict(list) + GPUTimeRange = collections.defaultdict( + lambda: collections.defaultdict(lambda: collections.defaultdict(list)) + ) # device_id/type/stream_id + for hostnode in hostnodes[1:]: #skip root node + CPUTimeRange[hostnode.type].append( + (hostnode.start_ns, hostnode.end_ns)) + self.call_times[hostnode.type] += 1 + if hostnode.type == TracerEventType.Operator and any( + [name in hostnode.name for name in + _CommunicationOpName]): # special case, communication op + CPUTimeRange[TracerEventType.Communication].append( + (hostnode.start_ns, hostnode.end_ns)) + self.call_times[TracerEventType.Communication] += 1 + is_communication_node = ( + hostnode.type == TracerEventType.Communication + ) or (hostnode.type == TracerEventType.Operator and any( + [name in hostnode.name for name in _CommunicationOpName])) + for runtimenode in hostnode.runtime_node: + CPUTimeRange[runtimenode.type].append( + (runtimenode.start_ns, runtimenode.end_ns)) + self.call_times[runtimenode.type] += 1 + for devicenode in runtimenode.device_node: + GPUTimeRange[devicenode.device_id][devicenode.type][ + devicenode.stream_id].append( + (devicenode.start_ns, devicenode.end_ns)) + self.call_times[devicenode.type] += 1 + if is_communication_node: # gpu activity for communication node + GPUTimeRange[devicenode.device_id][ + TracerEventType.Communication][ + devicenode.stream_id].append(( + devicenode.start_ns, devicenode.end_ns)) + self.call_times[TracerEventType.Communication] += 1 + + for event_type, time_ranges in CPUTimeRange.items(): + time_ranges = merge_self_ranges(time_ranges, is_sorted=False) + self.CPUTimeRange[event_type] = merge_ranges( + self.CPUTimeRange[event_type], time_ranges, is_sorted=True) + for device_id, device_time_ranges in GPUTimeRange.items(): + for event_type, event_time_ranges in device_time_ranges.items(): + for stream_id, time_ranges in event_time_ranges.items(): + time_ranges = merge_self_ranges( + time_ranges, is_sorted=False) + self.GPUTimeRange[device_id][event_type] = merge_ranges( + self.GPUTimeRange[device_id][event_type], + time_ranges, + is_sorted=True) + + for event_type, time_ranges in self.CPUTimeRange.items(): + self.CPUTimeRangeSum[event_type] = sum_ranges(time_ranges) + for device_id, device_time_ranges in self.GPUTimeRange.items(): + for event_type, time_ranges in device_time_ranges.items(): + self.GPUTimeRangeSum[device_id][event_type] = sum_ranges( + time_ranges) + + def get_gpu_devices(self): + return self.GPUTimeRange.keys() + + def get_gpu_range_sum(self, device_id, event_type): + return self.GPUTimeRangeSum[device_id][event_type] + + def get_cpu_range_sum(self, event_type): + return self.CPUTimeRangeSum[event_type] + + +class EventSummary: + r""" + Analyse operator event in profiling data, correlate with its device event. 
+ """ + + class DeviceItem: + def __init__(self, name): + self.name = name + self.call = 0 + self.gpu_time = 0 + self.max_gpu_time = 0 + self.min_gpu_time = float('inf') + + @property + def avg_gpu_time(self): + return self.gpu_time / self.call + + def add_gpu_time(self, time): + if time > self.max_gpu_time: + self.max_gpu_time = time + if time < self.min_gpu_time: + self.min_gpu_time = time + self.gpu_time += time + + def add_item(self, node): + self.call += 1 + self.add_gpu_time(node.end_ns - node.start_ns) + + class OperatorItem: + def __init__(self, name): + self.name = name + self.call = 0 + self.cpu_time = 0 + self.gpu_time = 0 + self.max_cpu_time = 0 + self.min_cpu_time = float('inf') + self.max_gpu_time = 0 + self.min_gpu_time = float('inf') + self.devices = {} + self.operator_inners = {} + + @property + def avg_cpu_time(self): + return self.cpu_time / self.call + + @property + def avg_gpu_time(self): + return self.gpu_time / self.call + + def add_cpu_time(self, time): + if time > self.max_cpu_time: + self.max_cpu_time = time + if time < self.min_cpu_time: + self.min_cpu_time = time + self.cpu_time += time + + def add_gpu_time(self, time): + if time > self.max_gpu_time: + self.max_gpu_time = time + if time < self.min_gpu_time: + self.min_gpu_time = time + self.gpu_time += time + + def add_call(self): + self.call += 1 + + def add_item(self, node): + self.add_call() + self.add_cpu_time(node.cpu_time) + self.add_gpu_time(node.gpu_time) + for child in node.children_node: + if child.name not in self.operator_inners: + self.operator_inners[ + child.name] = EventSummary.OperatorItem(child.name) + self.operator_inners[child.name].add_item(child) + + for runtimenode in node.runtime_node: + for devicenode in runtimenode.device_node: + if devicenode.name not in self.devices: + self.devices[devicenode.name] = EventSummary.DeviceItem( + devicenode.name) + self.devices[devicenode.name].add_item(devicenode) + + class GeneralItem: + def __init__(self, name): + self.name = name + self.call = 0 + self.cpu_time = 0 + self.max_cpu_time = 0 + self.min_cpu_time = float('inf') + self.gpu_time = 0 + self.max_gpu_time = 0 + self.min_gpu_time = float('inf') + + @property + def avg_cpu_time(self): + return self.cpu_time / self.call + + @property + def avg_gpu_time(self): + return self.gpu_time / self.call + + def add_cpu_time(self, time): + if time > self.max_cpu_time: + self.max_cpu_time = time + if time < self.min_cpu_time: + self.min_cpu_time = time + self.cpu_time += time + + def add_gpu_time(self, time): + if time > self.max_gpu_time: + self.max_gpu_time = time + if time < self.min_gpu_time: + self.min_gpu_time = time + self.gpu_time += time + + def add_call(self): + self.call += 1 + + def add_item(self, node): + self.add_call() + self.add_cpu_time(node.cpu_time) + self.add_gpu_time(node.gpu_time) + + def __init__(self): + self.items = {} # for operator summary + self.thread_items = collections.defaultdict( + dict) # for operator summary + self.userdefined_items = {} # for userdefined summary + self.userdefined_thread_items = collections.defaultdict( + dict) # for userdefined summary + self.model_perspective_items = {} # for model summary + self.memory_manipulation_items = {} # for memory manipulation summary + + def parse(self, nodetrees): + r""" + Analysis operator event in the nodetress. 
+ """ + node_statistic_trees, thread2host_statistic_nodes = wrap_tree(nodetrees) + for threadid, host_statistic_nodes in thread2host_statistic_nodes.items( + ): + for host_statistic_node in host_statistic_nodes[ + 1:]: #skip root node + if host_statistic_node.type == TracerEventType.Operator: + self.add_operator_item(host_statistic_node) + if host_statistic_node.type == TracerEventType.UserDefined\ + or host_statistic_node.type == TracerEventType.PythonUserDefined: + if 'memcpy' in host_statistic_node.name.lower() or 'memorycopy' in host_statistic_node.name.lower()\ + or 'memset' in host_statistic_node.name.lower(): + self.add_memory_manipulation_item(host_statistic_node) + else: + self.add_userdefined_item(host_statistic_node) + + for threadid, root_statistic_node in node_statistic_trees.items(): + deque = collections.deque() + deque.append(root_statistic_node) + while deque: + current_node = deque.popleft() + for child in current_node.children_node: + if child.type == TracerEventType.Forward or child.type == TracerEventType.Dataloader\ + or child.type == TracerEventType.Backward or child.type == TracerEventType.Optimization: + self.add_model_perspective_item( + child) #find first model perspective node + else: + deque.append(child) + + def add_operator_item(self, operator_node): + if operator_node.name not in self.items: + self.items[operator_node.name] = EventSummary.OperatorItem( + operator_node.name) + + self.items[operator_node.name].add_item(operator_node) + + if operator_node.name not in self.thread_items[operator_node.thread_id]: + self.thread_items[operator_node.thread_id][ + operator_node.name] = EventSummary.OperatorItem( + operator_node.name) + self.thread_items[operator_node.thread_id][operator_node.name].add_item( + operator_node) + + def add_userdefined_item(self, userdefined_node): + if userdefined_node.name not in self.userdefined_items: + self.userdefined_items[ + userdefined_node.name] = EventSummary.GeneralItem( + userdefined_node.name) + + self.userdefined_items[userdefined_node.name].add_item(userdefined_node) + + if userdefined_node.name not in self.userdefined_thread_items[ + userdefined_node.thread_id]: + self.userdefined_thread_items[userdefined_node.thread_id][ + userdefined_node.name] = EventSummary.GeneralItem( + userdefined_node.name) + self.userdefined_thread_items[userdefined_node.thread_id][ + userdefined_node.name].add_item(userdefined_node) + + def add_memory_manipulation_item(self, memory_manipulation_node): + if memory_manipulation_node.name not in self.memory_manipulation_items: + self.memory_manipulation_items[ + memory_manipulation_node.name] = EventSummary.GeneralItem( + memory_manipulation_node.name) + self.memory_manipulation_items[memory_manipulation_node.name].add_item( + memory_manipulation_node) + + def add_model_perspective_item(self, model_perspective_node): + if model_perspective_node.type == TracerEventType.Forward: + name = 'Forward' + elif model_perspective_node.type == TracerEventType.Backward: + name = 'Backward' + elif model_perspective_node.type == TracerEventType.Optimization: + name = 'Optimization' + elif model_perspective_node.type == TracerEventType.Dataloader: + name = 'Dataloader' + else: + return + if name not in self.model_perspective_items: + self.model_perspective_items[name] = EventSummary.GeneralItem(name) + self.model_perspective_items[name].add_item(model_perspective_node) + + +class StatisticData: + r""" + Hold all analysed results. 
+ """ + + def __init__(self, node_trees, extra_info): + self.node_trees = node_trees + self.extra_info = extra_info + self.time_range_summary = TimeRangeSummary() + self.event_summary = EventSummary() + self.time_range_summary.parse(node_trees) + self.event_summary.parse(node_trees) + + +def _build_table(statistic_data, + sorted_by=SortedKeys.CPUTotal, + op_detail=True, + thread_sep=False, + time_unit='ms', + row_limit=100, + max_src_column_width=75): + """Prints a summary of events.""" + # format table row + SPACING_SIZE = 2 + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + + def add_column(padding, text_dir='<'): + row_format_list[0] += '{: ' + text_dir + str(padding) + '}' + ( + ' ' * SPACING_SIZE) + header_sep_list[0] += '-' * padding + (' ' * SPACING_SIZE) + line_length_list[0] += padding + SPACING_SIZE + + def add_title(padding, text): + left_length = padding - len(text) + half = left_length // 2 + return '-' * half + text + '-' * (left_length - half) + + result = [] + + def append(s): + result.append(s) + result.append('\n') + + def format_time(time, unit='ms', indent=0): + r""" + Transform time in ns to time in unit. + """ + if time == float('inf'): + return '-' + else: + result = float(time) + if unit == 's': + result /= 1e9 + elif unit == 'ms': + result /= 1e6 + elif unit == 'us': + result /= 1e3 + return '{}{:.2f}'.format(' ' * indent, result) + + def format_ratio(ratio, indent=0): + r""" + Transform ratio within [0, 1] to percentage presentation. + """ + return '{}{:.2f}'.format(' ' * indent, ratio * 100) + + total_time = statistic_data.time_range_summary.get_cpu_range_sum( + TracerEventType.ProfileStep) + ###### Print Device Summary ###### + headers = ['Device', 'Utilization (%)'] + name_column_width = 30 + DEFAULT_COLUMN_WIDTH = 20 + add_column(name_column_width) + for _ in headers[1:]: + add_column(DEFAULT_COLUMN_WIDTH) + + row_format = row_format_list[0] + header_sep = header_sep_list[0] + line_length = line_length_list[0] + + # construct table string + + append(add_title(line_length, "Device Summary")) + append('Time unit: {}'.format(time_unit)) + append(header_sep) + append(row_format.format(*headers)) + append(header_sep) + row_values = [ + 'CPU(Process)', format_ratio( + float(statistic_data.extra_info['Process Cpu Utilization'])) + ] + append(row_format.format(*row_values)) + row_values = [ + 'CPU(System)', format_ratio( + float(statistic_data.extra_info['System Cpu Utilization'])) + ] + append(row_format.format(*row_values)) + for gpu_name in statistic_data.time_range_summary.get_gpu_devices(): + gpu_time = float( + statistic_data.time_range_summary.get_gpu_range_sum( + gpu_name, TracerEventType.Kernel)) + utilization = gpu_time / total_time + row_values = ['GPU{}'.format(gpu_name), format_ratio(utilization)] + append(row_format.format(*row_values)) + + append(header_sep) + append( + "Note:\nCPU(Process) Utilization = Current process CPU time over all cpu cores / elapsed time, so max utilization can be reached 100% * number of cpu cores.\n" + "CPU(System) Utilization = All processes CPU time over all cpu cores(busy time) / (busy time + idle time).\n" + "GPU Utilization = Current process GPU time / elapsed time") + append('-' * line_length) + append('') + append('') + + if total_time == 0: + return ''.join(result) + + ###### Print Overview Summary ###### + headers = ['Event Type', 'CPU Time', 'Ratio (%)'] + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + + DEFAULT_COLUMN_WIDTH = 25 + for _ in 
+    for _ in headers:
+        add_column(DEFAULT_COLUMN_WIDTH)
+
+    row_format = row_format_list[0]
+    header_sep = header_sep_list[0]
+    line_length = line_length_list[0]
+
+    # construct table string
+    append(add_title(line_length, "Overview Summary"))
+    append('Time unit: {}'.format(time_unit))
+    append(header_sep)
+    append(row_format.format(*headers))
+    append(header_sep)
+    row_values = [
+        'Total Time', format_time(
+            total_time, unit=time_unit), format_ratio(1)
+    ]
+    append(row_format.format(*row_values))
+    cpu_type_time = collections.defaultdict(int)
+    gpu_type_time = collections.defaultdict(int)
+    for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items(
+    ):
+        cpu_type_time[event_type] = value
+
+    gpu_time_range = collections.defaultdict(list)
+    for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
+    ):
+        for event_type, time_range in device_time_ranges.items():
+            gpu_time_range[event_type] = merge_ranges(
+                gpu_time_range[event_type], time_range, is_sorted=True)
+    for event_type, time_range in gpu_time_range.items():
+        gpu_type_time[event_type] = sum_ranges(time_range)
+
+    sorted_items = sorted(
+        cpu_type_time.items(), key=lambda x: x[1], reverse=True)
+    for event_type, time in sorted_items:
+        row_values = [
+            '  {}'.format(str(event_type).split('.')[1]), format_time(
+                time, unit=time_unit), format_ratio(float(time) / total_time)
+        ]
+        append(row_format.format(*row_values))
+    append(header_sep)
+    headers = ['', 'GPU Time', 'Ratio (%)']
+    append(row_format.format(*headers))
+    append(header_sep)
+    for event_type, time in gpu_type_time.items():
+        row_values = [
+            '  {}'.format(str(event_type).split('.')[1]), format_time(
+                time, unit=time_unit), format_ratio(float(time) / total_time)
+        ]
+        append(row_format.format(*row_values))
+
+    append(header_sep)
+    append(
+        "Note:\nIn this table, we sum up all collected events by event type.\n"
+        "The time of events collected on the host is presented as CPU Time, and that on the device as GPU Time.\n"
+        "ratio = CPU(GPU) Time / Total Time.\n"
+        "Events of different types may overlap or include one another, e.g. Operator includes OperatorInner, so the sum of the ratios is not 100%.\n"
+        "Overlapping events of the same type are not counted twice; their time ranges are merged before summation.\n"
+        "Example:\n"
+        "Thread 1:\n"
+        "  Operator: |___________|     |__________|\n"
+        "Thread 2:\n"
+        "  Operator:   |____________|     |___|\n"
+        "After merged:\n"
+        "  Result:   |______________|  |__________|\n")
+    append('-' * line_length)
+    append('')
+    append('')
+
+    ###### Print Operator Summary Report ######
+    if statistic_data.event_summary.items:
+        headers = [
+            'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
+            'GPU Total / Avg / Max / Min / Ratio(%)'
+        ]
+        row_format_list = [""]
+        header_sep_list = [""]
+        line_length_list = [-SPACING_SIZE]
+        name_column_width = 50
+        add_column(name_column_width)
+        add_column(6)
+        add_column(40)
+        add_column(40)
+
+        row_format = row_format_list[0]
+        header_sep = header_sep_list[0]
+        line_length = line_length_list[0]
+
+        # construct table string
+        append(add_title(line_length, "Operator Summary"))
+        append('Time unit: {}'.format(time_unit))
+        append(header_sep)
+        append(row_format.format(*headers))
+        append(header_sep)
+        if thread_sep:
+            thread_items = statistic_data.event_summary.thread_items
+        else:
+            thread_items = {
+                'All threads merged': statistic_data.event_summary.items
+            }
+        for thread_id, items in thread_items.items():
+            append(add_title(line_length, "Thread: {}".format(thread_id)))
+            if sorted_by == SortedKeys.CPUTotal:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].cpu_time, reverse=True)
+            elif sorted_by == SortedKeys.CPUAvg:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].avg_cpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.CPUMax:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].max_cpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.CPUMin:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].min_cpu_time)
+            elif sorted_by == SortedKeys.GPUTotal:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].gpu_time, reverse=True)
+            elif sorted_by == SortedKeys.GPUAvg:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].avg_gpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.GPUMax:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].max_gpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.GPUMin:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].min_gpu_time)
+
+            total_cpu_time = 0
+            total_gpu_time = 0
+            for name, item in sorted_items:
+                total_cpu_time += item.cpu_time
+                total_gpu_time += item.gpu_time
+            for name, item in sorted_items:
+                row_values = [
+                    name, item.call, '{} / {} / {} / {} / {}'.format(
+                        format_time(
+                            item.cpu_time, unit=time_unit),
+                        format_time(
+                            item.avg_cpu_time, unit=time_unit),
+                        format_time(
+                            item.max_cpu_time, unit=time_unit),
+                        format_time(
+                            item.min_cpu_time, unit=time_unit),
+                        format_ratio(float(item.cpu_time) / total_cpu_time)),
+                    '{} / {} / {} / {} / {}'.format(
+                        format_time(
+                            item.gpu_time, unit=time_unit),
+                        format_time(
+                            item.avg_gpu_time, unit=time_unit),
+                        format_time(
+                            item.max_gpu_time, unit=time_unit),
+                        format_time(
+                            item.min_gpu_time, unit=time_unit),
+                        format_ratio(float(item.gpu_time) / total_gpu_time))
+                ]
+                append(row_format.format(*row_values))
+                if op_detail:
+                    for innerop_name, innerop_node in item.operator_inners.items(
+                    ):
+                        row_values = [
+                            '  {}'.format(innerop_name), innerop_node.call,
+                            '{} / {} / {} / {} / {}'.format(
+                                format_time(
+                                    innerop_node.cpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.avg_cpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.max_cpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.min_cpu_time, unit=time_unit),
+                                format_ratio(
+                                    float(innerop_node.cpu_time) /
+                                    total_cpu_time)),
+                            '{} / {} / {} / {} / {}'.format(
+                                format_time(
+                                    innerop_node.gpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.avg_gpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.max_gpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.min_gpu_time, unit=time_unit),
+                                format_ratio(
+                                    float(innerop_node.gpu_time) /
+                                    total_gpu_time))
+                        ]
+                        append(row_format.format(*row_values))
+                        for device_node_name, devicenode in innerop_node.devices.items(
+                        ):
+                            if len(device_node_name) + 4 > name_column_width:
+                                device_node_name = device_node_name[:
+                                                                    name_column_width
+                                                                    - 7]
+                                device_node_name += "..."
+                            row_values = [
+                                '    {}'.format(device_node_name),
+                                devicenode.call, '- / - / - / - / -',
+                                '{} / {} / {} / {} / {}'.format(
+                                    format_time(
+                                        devicenode.gpu_time, unit=time_unit),
+                                    format_time(
+                                        devicenode.avg_gpu_time,
+                                        unit=time_unit),
+                                    format_time(
+                                        devicenode.max_gpu_time,
+                                        unit=time_unit),
+                                    format_time(
+                                        devicenode.min_gpu_time,
+                                        unit=time_unit),
+                                    format_ratio(
+                                        float(devicenode.gpu_time) /
+                                        total_gpu_time))
+                            ]
+                            append(row_format.format(*row_values))
+                for device_node_name, device_node in item.devices.items():
+                    if len(device_node_name) + 2 > name_column_width:
+                        device_node_name = device_node_name[:name_column_width
+                                                            - 5]
+                        device_node_name += "..."
+                    row_values = [
+                        '  {}'.format(device_node_name), device_node.call,
+                        '- / - / - / - / -',
+                        '{} / {} / {} / {} / {}'.format(
+                            format_time(
+                                device_node.gpu_time, unit=time_unit),
+                            format_time(
+                                device_node.avg_gpu_time, unit=time_unit),
+                            format_time(
+                                device_node.max_gpu_time, unit=time_unit),
+                            format_time(
+                                device_node.min_gpu_time, unit=time_unit),
+                            format_ratio(
+                                float(device_node.gpu_time) /
+                                total_gpu_time))
+                    ]
+                    append(row_format.format(*row_values))
+        append(header_sep)
+        append('')
+        append('')
+    return ''.join(result)
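The helpers `merge_self_ranges`, `merge_ranges`, and `sum_ranges` are pulled in via `from .statistic_helper import *` but are not part of this diff, so their exact implementation is not shown here. A minimal sketch of the interval semantics that `TimeRangeSummary.parse` appears to rely on, assuming overlapping `(start_ns, end_ns)` pairs are merged before summing so overlapped time is counted once; the real helpers in `statistic_helper.py` may differ in detail:

```python
# Hypothetical sketch of the statistic_helper interval semantics assumed
# by TimeRangeSummary.parse; not the actual Paddle implementation.
def merge_self_ranges(ranges, is_sorted=False):
    # Merge overlapping (start_ns, end_ns) pairs within a single list.
    if not is_sorted:
        ranges = sorted(ranges)
    merged = []
    for start, end in ranges:
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


def sum_ranges(ranges):
    # Total covered time of non-overlapping, merged ranges.
    return sum(end - start for start, end in ranges)


# The Overview Summary note's example: two threads' Operator ranges are
# merged so the overlapped region is only counted once.
thread1 = [(10, 30), (40, 60)]
thread2 = [(25, 35), (45, 50)]
merged = merge_self_ranges(thread1 + thread2)
print(merged)              # [(10, 35), (40, 60)]
print(sum_ranges(merged))  # 45, not the naive 55
```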
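The cpu_time/gpu_time accounting in `HostStatisticNode.cal_statistic` can be checked by hand against the conv2d subtree in `test_statistic_case1`. A self-contained sketch of that arithmetic using only the raw timestamps from the test:

```python
# conv2d subtree from test_statistic_case1, all times in ns.
conv2d = (25, 40)         # Operator host node
conv2d_kernel = (35, 50)  # device Kernel, via conv2d::compute -> cudalaunchkernel
conv2d_memcpy = (50, 60)  # device Memcpy, via AsyncMemcpy -> cudaMemcpy

# cal_statistic: cpu_time is the host node's own wall time ...
cpu_time = conv2d[1] - conv2d[0]
# ... while gpu_time accumulates every device node hanging off the
# subtree's runtime nodes.
gpu_time = (conv2d_kernel[1] - conv2d_kernel[0]) + (
    conv2d_memcpy[1] - conv2d_memcpy[0])

assert cpu_time == 15  # matches event_summary.items['conv2d'].cpu_time
assert gpu_time == 25  # matches event_summary.items['conv2d'].gpu_time
```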
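Likewise, the Device Summary utilization figures are plain ratios over the ProfileStep wall time. A quick check with this test's numbers, kernel time on device 0 over the 400 ns step:

```python
total_time = 400       # ProfileStep#1 wall time in the test, ns
gpu0_kernel_time = 75  # conv2d_kernel (15) + sync_batch_norm_kernel (60)
utilization = gpu0_kernel_time / total_time
print('GPU0 {:.2f}%'.format(utilization * 100))  # GPU0 18.75%
```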