Unverified commit 77a455c7 authored by chenjian, committed by GitHub

Fix profiler package bug (#40888)

* no

* fix bugs

* fix doc according to review

* fix api doc format

* fix api doc according to review

* fix bug and add unit test

* fix record event bug
Parent 00448f0e
......@@ -162,7 +162,7 @@ void RecordEvent::OriginalConstruct(const std::string &name,
void RecordEvent::End() {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook && is_pushed_) {
if (g_enable_nvprof_hook && is_pushed_ && is_enabled_) {
dynload::nvtxRangePop();
}
#endif
......
......@@ -20,7 +20,7 @@
// Used to filter events, works like glog VLOG(level).
// RecordEvent will work if host_trace_level >= level.
PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 2,
PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1,
"RecordEvent will works "
"if host_trace_level >= level.");
......
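The default trace level drops from 2 to 1 in this hunk, so fewer RecordEvents are kept by default. A minimal sketch of restoring the stricter old default, assuming the exported flag is reachable through the usual FLAGS_* environment variable (an assumption, not confirmed by this diff):

```python
# Hedged sketch, not part of this change: exported Paddle flags are usually
# adjustable through a FLAGS_* environment variable read at import time; the
# exact knob name for host_trace_level is an assumption here.
import os

os.environ['FLAGS_host_trace_level'] = '2'   # keep events with level <= 2 again
import paddle                                # flag must be set before import
```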
......@@ -207,6 +207,228 @@ class TestProfilerStatistic(unittest.TestCase):
thread_sep=False,
time_unit='ms'))
def test_statistic_case2(self):
root_node = HostPythonNode('Root Node',
profiler.TracerEventType.UserDefined, 0,
float('inf'), 1000, 1001)
profilerstep_node = HostPythonNode('ProfileStep#1',
profiler.TracerEventType.ProfileStep,
0, 400, 1000, 1001)
dataloader_node = HostPythonNode(
'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001)
mobilenet_node = HostPythonNode(
'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
yolonet_node = HostPythonNode(
'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001)
userdefined_node = HostPythonNode('Communication Time',
profiler.TracerEventType.UserDefined,
100, 110, 1000, 1001)
reduce_all_launchkernel0 = HostPythonNode(
'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104,
1000, 1001)
nccl_reduce_all_kernel0 = DevicePythonNode(
'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 105, 120,
0, 0, 2)
communication_node = HostPythonNode(
'Communication', profiler.TracerEventType.Communication, 105, 110,
1000, 1001)
reduce_all_op1 = HostPythonNode('cudalaunchkernel',
profiler.TracerEventType.Operator, 105,
108, 1000, 1001)
reduce_all_launchkernel1 = HostPythonNode(
'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 106, 107,
1000, 1001)
nccl_reduce_all_kernel1 = DevicePythonNode(
'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 130, 150,
0, 0, 2)
backward_node = HostPythonNode('Gradient Backward',
profiler.TracerEventType.Backward, 120,
200, 1000, 1001)
optimization_node = HostPythonNode(
'Optimization', profiler.TracerEventType.Optimization, 220, 300,
1000, 1001)
conv2d_node = HostPythonNode(
'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001)
sync_batch_norm_node = HostPythonNode('sync_batch_norm',
profiler.TracerEventType.Operator,
60, 100, 1000, 1001)
conv2d_infer_shape = HostPythonNode(
'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25,
30, 1000, 1001)
conv2d_compute = HostPythonNode('conv2d::compute',
profiler.TracerEventType.OperatorInner,
30, 40, 1000, 1001)
conv2d_launchkernel = HostPythonNode(
'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35,
1000, 1001)
conv2d_MemCpy = HostPythonNode('AsyncMemcpy',
profiler.TracerEventType.UserDefined, 35,
40, 1000, 1001)
conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
profiler.TracerEventType.CudaRuntime,
35, 40, 1000, 1001)
conv2d_kernel = DevicePythonNode(
'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0)
conv2d_memcpy = DevicePythonNode(
'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0)
sync_batch_norm_infer_shape = HostPythonNode(
'sync_batch_norm::infer_shape',
profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
sync_batch_norm_compute = HostPythonNode(
'sync_batch_norm::compute', profiler.TracerEventType.OperatorInner,
80, 100, 1000, 1001)
sync_batch_norm_launchkernel = HostPythonNode(
'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 80, 90,
1000, 1001)
sync_batch_norm_MemCpy = HostPythonNode(
'AsyncMemcpy', profiler.TracerEventType.UserDefined, 90, 100, 1000,
1001)
sync_batch_norm_cudaMemCpy = HostPythonNode(
'cudaMemcpy', profiler.TracerEventType.CudaRuntime, 90, 100, 1000,
1001)
sync_batch_norm_kernel = DevicePythonNode(
'sync_batch_norm_kernel', profiler.TracerEventType.Kernel, 95, 300,
0, 0, 0)
sync_batch_norm_memcpy = DevicePythonNode(
'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150, 200,
0, 0, 1)
reduce_all_node2 = HostPythonNode('reduce_all',
profiler.TracerEventType.Operator,
230, 250, 1000, 1001)
reduce_all_launchkernel2 = HostPythonNode(
'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 235, 240,
1000, 1001)
nccl_reduce_all_kernel2 = DevicePythonNode(
'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 250, 280,
0, 0, 2)
root_node.children_node.append(profilerstep_node)
profilerstep_node.children_node.extend([
dataloader_node, mobilenet_node, yolonet_node, backward_node,
optimization_node
])
mobilenet_node.children_node.append(conv2d_node)
yolonet_node.children_node.extend(
[sync_batch_norm_node, userdefined_node])
userdefined_node.children_node.append(communication_node)
userdefined_node.runtime_node.append(reduce_all_launchkernel0)
reduce_all_launchkernel0.device_node.append(nccl_reduce_all_kernel0)
communication_node.children_node.append(reduce_all_op1)
reduce_all_op1.runtime_node.append(reduce_all_launchkernel1)
reduce_all_launchkernel1.device_node.append(nccl_reduce_all_kernel1)
conv2d_node.children_node.extend(
[conv2d_infer_shape, conv2d_compute, conv2d_MemCpy])
conv2d_compute.runtime_node.append(conv2d_launchkernel)
conv2d_MemCpy.runtime_node.append(conv2d_cudaMemCpy)
conv2d_launchkernel.device_node.append(conv2d_kernel)
conv2d_cudaMemCpy.device_node.append(conv2d_memcpy)
sync_batch_norm_node.children_node.extend([
sync_batch_norm_infer_shape, sync_batch_norm_compute,
sync_batch_norm_MemCpy
])
sync_batch_norm_compute.runtime_node.append(
sync_batch_norm_launchkernel)
sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy)
sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel)
sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy)
optimization_node.children_node.append(reduce_all_node2)
reduce_all_node2.runtime_node.append(reduce_all_launchkernel2)
reduce_all_launchkernel2.device_node.append(nccl_reduce_all_kernel2)
thread_tree = {'thread1001': root_node}
extra_info = {
'Process Cpu Utilization': '1.02',
'System Cpu Utilization': '0.68'
}
statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
extra_info)
time_range_summary = statistic_data.time_range_summary
event_summary = statistic_data.event_summary
distributed_summary = statistic_data.distributed_summary
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.ProfileStep), 400)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Forward), 100)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Backward), 80)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Optimization), 80)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Operator), 78)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.OperatorInner), 45)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.CudaRuntime), 38)
self.assertEqual(
time_range_summary.get_gpu_range_sum(
0, profiler.TracerEventType.Kernel), 220)
self.assertEqual(
time_range_summary.get_gpu_range_sum(
0, profiler.TracerEventType.Memcpy), 60)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.UserDefined), 25)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Communication), 5)
self.assertEqual(
profiler.statistic_helper.sum_ranges(
distributed_summary.cpu_communication_range), 25)
self.assertEqual(
profiler.statistic_helper.sum_ranges(
distributed_summary.gpu_communication_range), 65)
self.assertEqual(
profiler.statistic_helper.sum_ranges(
distributed_summary.communication_range), 85)
self.assertEqual(
profiler.statistic_helper.sum_ranges(
distributed_summary.computation_range), 220)
self.assertEqual(
profiler.statistic_helper.sum_ranges(
distributed_summary.overlap_range), 85)
self.assertEqual(len(event_summary.items), 4)
self.assertEqual(len(event_summary.userdefined_items), 1)
self.assertEqual(len(event_summary.model_perspective_items), 3)
self.assertEqual(len(event_summary.memory_manipulation_items), 1)
self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
self.assertEqual(event_summary.items['conv2d'].gpu_time, 25)
self.assertEqual(
event_summary.model_perspective_items['Forward'].cpu_time, 100)
self.assertEqual(
event_summary.model_perspective_items['Forward'].gpu_time, 315)
self.assertEqual(
event_summary.model_perspective_items['Backward'].gpu_time, 0)
self.assertEqual(
event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
self.assertEqual(
event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60)
print(
profiler.profiler_statistic._build_table(
statistic_data,
sorted_by=profiler.SortedKeys.CPUTotal,
op_detail=True,
thread_sep=False,
time_unit='ms'))
if __name__ == '__main__':
unittest.main()
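The Forward total of 100 asserted in the new test comes from merging the three Forward ranges (Dataloader 5-15, MobileNet 20-50, Yolov3Net 50-110) before summing. A small standalone sketch of that interval arithmetic, assuming the summary merges overlapping ranges the way statistic_helper does:

```python
# Standalone sketch (not part of the test): merge overlapping intervals and
# sum their lengths, mirroring how the Forward CPU total of 100 arises.
def merge_and_sum(ranges):
    ranges = sorted(ranges)
    merged = [list(ranges[0])]
    for start, end in ranges[1:]:
        if start <= merged[-1][1]:                 # overlaps or touches: extend
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return sum(end - start for start, end in merged)

# Dataloader 5-15, MobileNet 20-50, Yolov3Net 50-110 -> 10 + 90 = 100
assert merge_and_sum([(5, 15), (20, 50), (50, 110)]) == 100
```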
......@@ -29,17 +29,17 @@ from .profiler_statistic import StatisticData, _build_table, SortedKeys
class ProfilerState(Enum):
r"""
Profiler state that can be specified to control profiler action.
ProfilerState is used to represent the state of :ref:`Profiler <api_paddle_profiler_Profiler>` .
CLOSED: The profilers are closed.
The meaning of each ProfilerState is as follows:
READY: The profilers are open, but the data will not be recorded.
This state is used for reducing overhead influence when profilers start.
- **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded.
RECORD: The profilers are open, and the data will be recorded.
- **ProfilerState.READY** : The profiler is open, but the data will not be recorded. This state is used for reducing overhead influence when profiler starts.
RECORD_AND_RETURN: The profilers are open, and at the last batch of current profiler period,
the collected data will be returned.
- **ProfilerState.RECORD** : The profiler is open, and the data will be recorded.
- **ProfilerState.RECORD_AND_RETURN** : The profiler is open, and this state stands for the last batch of "RECORD" state in current profiling period. The collected data will be returned in this state.
"""
CLOSED = 0
READY = 1
......@@ -49,11 +49,13 @@ class ProfilerState(Enum):
class ProfilerTarget(Enum):
r"""
Target device for profiling.
ProfilerTarget is used to specify target device for :ref:`profiling <api_paddle_profiler_Profiler>` . Only CPU and GPU are supported currently.
CPU: Profile events on CPU.
GPU: Profile events on GPU.
The meaning of each ProfilerTarget is as follows:
- **ProfilerTarget.CPU** : Profile events on CPU.
- **ProfilerTarget.GPU** : Profile events on GPU.
"""
CPU = 0
GPU = 1
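A brief usage sketch for the two enums documented above; the Profiler signature follows the docstring later in this diff, and GPU availability is assumed:

```python
# Hedged sketch: choose profiling targets explicitly instead of the default
# "all supported devices"; ProfilerTarget.GPU assumes a GPU build.
import paddle.profiler as profiler

prof = profiler.Profiler(
    targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
    scheduler=(2, 5))    # profile batches [2, 5)
```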
......@@ -66,7 +68,7 @@ def make_scheduler(*,
repeat: int=0,
skip_first: int=0) -> Callable:
r"""
Return a scheduler function, which scheduler the state according to the setting.
Return a scheduler function, which schedules the :ref:`state <api_paddle_profiler_ProfilerState>` according to the setting.
The state transform conforms to:
.. code-block:: text
......@@ -78,22 +80,23 @@ def make_scheduler(*,
- - - - - - - - - - - -
Note that repeat <= 0 means the cycle will continue until the profiler exits.
Parameters:
Args:
closed(int): The number of steps in state ProfilerState.CLOSED.
ready(int): The number of steps in state ProfilerState.READY.
record(int): The number of steps in state ProfilerState.RECORD.
repeat(int): The number of cycles to repeat above state transform.
skip_first(int): The number of first steps to drop, not participate in the state transform.
record(int): The number of steps in state ProfilerState.RECORD, and the state in the last step will be set to ProfilerState.RECORD_AND_RETURN.
repeat(int, optional): The number of cycles to repeat the above state transform. Default value is 0, which means the cycle repeats until the profiler exits.
skip_first(int, optional): The number of initial steps to drop; they do not participate in the state transform and stay in ProfilerState.CLOSED state. Default value is 0.
Returns:
A scheduler function, conforms to above state transform setting.
A scheduler function that conforms to the above state transform setting. The function takes one parameter, step_num, and returns the corresponding ProfilerState.
Examples:
1. profiling range [2, 5]
batch 0: closed, batch 1: ready, batch [2, 5] record
Assume batch 0: closed, batch 1: ready, batch [2, 5] record
.. code-block:: python
:name: code-example1
import paddle.profiler as profiler
profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1)
......@@ -101,9 +104,10 @@ def make_scheduler(*,
2. profiling range [3,6], [9,12], [15,18]...
batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat
Assume batch 0: skipped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat
.. code-block:: python
:name: code-example2
import paddle.profiler as profiler
profiler.make_scheduler(closed=1, ready=1, record=4, skip_first=1)
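For reference, a hand-rolled scheduler with the shape the second example describes (closed=1, ready=1, record=4, skip_first=1, repeating until exit); this is a sketch of the documented semantics, not the library implementation:

```python
import paddle.profiler as profiler

def my_scheduler(step_num):
    # step 0 skipped, 1 closed, 2 ready, 3-5 record, 6 record_and_return, repeat
    if step_num < 1:                      # skip_first steps stay CLOSED
        return profiler.ProfilerState.CLOSED
    pos = (step_num - 1) % 6              # cycle length = closed + ready + record
    if pos < 1:
        return profiler.ProfilerState.CLOSED
    if pos < 2:
        return profiler.ProfilerState.READY
    if pos < 5:
        return profiler.ProfilerState.RECORD
    return profiler.ProfilerState.RECORD_AND_RETURN
```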
......@@ -148,15 +152,21 @@ def export_chrome_tracing(dir_name: str,
worker_name: Optional[str]=None) -> Callable:
r"""
Return a callable, used for outputting tracing data to a Chrome tracing format file.
The output file will be saved in directory 'dir_name', and file name will be set as worker_name.
The output file will be saved in directory ``dir_name``, and file name will be set as worker_name.
if worker_name is not set, the default name is [hostname]_[pid].
Parameters:
Args:
dir_name(str): Directory to save profiling data.
worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid].
worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid].
Returns:
A callable, which takes a Profiler object as parameter and calls its export method to save data to chrome tracing format file.
Examples:
The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler <api_paddle_profiler_Profiler>` .
.. code-block:: python
:name: code-example1
# required: gpu
import paddle.profiler as profiler
......@@ -192,15 +202,21 @@ def export_chrome_tracing(dir_name: str,
def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable:
r"""
Return a callable, used for outputting tracing data to a protobuf file.
The output file will be saved in directory 'dir_name', and file name will be set as worker_name.
The output file will be saved in directory ``dir_name``, and file name will be set as worker_name.
if worker_name is not set, the default name is [hostname]_[pid].
Parameters:
Args:
dir_name(str): Directory to save profiling data.
worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid].
worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid].
Returns:
A callable, which takes a Profiler object as parameter and calls its export method to save data to protobuf file.
Examples:
The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler <api_paddle_profiler_Profiler>` .
.. code-block:: python
:name: code-example1
# required: gpu
import paddle.profiler as profiler
......@@ -244,20 +260,21 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]:
class Profiler:
r"""
Profiler context manager, user interface to manage profile process.
Profiler context manager, user interface to manage the profiling process: start, stop, export profiling data and print a summary table.
Parameters:
targets (iterable): list of tracing targets, currently supported values, ``ProfilerTarget.CPU``, ``ProfilerTarget.GPU`` .
scheduler (callable or tuple): If it is a callable object, it takes a step number as parameter and return the corresponding ``ProfilerState``.
If not provided, the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch,
Args:
targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU <api_paddle_profiler_ProfilerTarget>` and :ref:`ProfilerTarget.GPU <api_paddle_profiler_ProfilerTarget>` .
scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState <api_paddle_profiler_ProfilerState>`. This callable object can be generated by :ref:`make_scheduler <api_paddle_profiler_make_scheduler>` function.
If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch,
which means profiling range [start_batch, end_batch).
on_trace_ready (callable): callable object, takes the Profiler object as parameter, which provides a way for users to do post-processing.
This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``.
on_trace_ready (Callable, optional): Callable object, serves as callback function, and takes the Profiler object as parameter, which provides a way for users to do post-processing.
This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing <api_paddle_profiler_export_chrome_tracing>` (./profiler_log/).
Examples:
1. profiling range [2, 5)
1. profiling range [2, 5).
.. code-block:: python
:name: code-example1
# required: gpu
import paddle.profiler as profiler
......@@ -272,6 +289,7 @@ class Profiler:
2. profiling range [2,4], [7, 9], [11,13]
.. code-block:: python
:name: code-example2
# required: gpu
import paddle.profiler as profiler
......@@ -286,6 +304,7 @@ class Profiler:
3. Use profiler without context manager, and use default parameters
.. code-block:: python
:name: code-example3
# required: gpu
import paddle.profiler as profiler
......@@ -367,6 +386,7 @@ class Profiler:
Examples:
.. code-block:: python
:name: code-example4
# required: gpu
import paddle.profiler as profiler
......@@ -401,6 +421,7 @@ class Profiler:
Examples:
.. code-block:: python
:name: code-example5
# required: gpu
import paddle.profiler as profiler
......@@ -437,6 +458,7 @@ class Profiler:
Examples:
.. code-block:: python
:name: code-example6
# required: gpu
import paddle.profiler as profiler
......@@ -522,10 +544,16 @@ class Profiler:
def export(self, path="", format="json"):
r"""
Exports the tracing data in Chrome tracing data format.
Exports the tracing data to file.
Args:
path(str): file path of the output.
format(str, optional): output format, can be chosen from ['json', 'pb'], 'json' for chrome tracing and 'pb' for protobuf, default value is "json".
Examples:
.. code-block:: python
:name: code-example7
# required: gpu
import paddle.profiler as profiler
......@@ -548,16 +576,17 @@ class Profiler:
thread_sep=False,
time_unit='ms'):
r"""
Print the Summary table.
Print the summary table. It currently supports overview, model, distributed, operator, memory manipulation and user-defined summaries.
Parameters:
sorted_by(SortedKeys): how to rank the op table items.
op_detail(bool): expand each operator detail information.
thread_sep(bool): print op table each thread.
time_unit(str): can be chosen form ['s', 'ms', 'us', 'ns']
Args:
sorted_by( :ref:`SortedKeys <api_paddle_profiler_SortedKeys>` , optional): how to rank the op table items, default value is SortedKeys.CPUTotal.
op_detail(bool, optional): expand each operator detail information, default value is True.
thread_sep(bool, optional): print op table each thread, default value is False.
time_unit(str, optional): time unit for display, can be chosen from ['s', 'ms', 'us', 'ns'], default value is 'ms'.
Examples:
.. code-block:: python
:name: code-example8
# required: gpu
import paddle.profiler as profiler
......
......@@ -33,23 +33,25 @@ _CommunicationOpName = ['reduce', 'broadcast', 'rpc']
class SortedKeys(Enum):
r"""
Sorted keys for printing summary table.
SortedKeys is used to specify how to sort items when printing :ref:`summary <api_paddle_profiler_profiler_summary>` table.
CPUTotal: Sorted by CPU total time.
The meaning of each SortedKeys is as follows:
CPUAvg: Sorted by CPU average time.
- **SortedKeys.CPUTotal** : Sorted by CPU total time.
CPUMax: Sorted by CPU max time.
- **SortedKeys.CPUAvg** : Sorted by CPU average time.
CPUMin: Sorted by CPU min time.
- **SortedKeys.CPUMax** : Sorted by CPU max time.
GPUTotal: Sorted by GPU total time.
- **SortedKeys.CPUMin** : Sorted by CPU min time.
GPUAvg: Sorted by GPU average time.
- **SortedKeys.GPUTotal** : Sorted by GPU total time.
GPUMax: Sorted by GPU max time.
- **SortedKeys.GPUAvg** : Sorted by GPU average time.
GPUMin: Sorted by GPU min time.
- **SortedKeys.GPUMax** : Sorted by GPU max time.
- **SortedKeys.GPUMin** : Sorted by GPU min time.
"""
CPUTotal = 0
CPUAvg = 1
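A short usage sketch for picking a sort key when printing the table; the start/step/stop/summary workflow is taken from the Profiler docstrings earlier in this diff:

```python
# Hedged sketch: rank the operator table by total GPU time instead of the
# default SortedKeys.CPUTotal.
import paddle.profiler as profiler

prof = profiler.Profiler(targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 3))
prof.start()
for _ in range(4):
    prof.step()
prof.stop()
prof.summary(sorted_by=profiler.SortedKeys.GPUTotal, time_unit='us')
```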
......@@ -119,6 +121,23 @@ def traverse_tree(nodetrees):
return results
def get_device_nodes(hostnode):
'''
Get all device nodes called in the time range of hostnode.
'''
stack = []
device_nodes = []
stack.append(hostnode)
while stack:
current_node = stack.pop()
for childnode in current_node.children_node:
stack.append(childnode)
for runtimenode in current_node.runtime_node:
for devicenode in runtimenode.device_node:
device_nodes.append(devicenode)
return device_nodes
def wrap_tree(nodetrees):
'''
Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics.
......@@ -186,16 +205,6 @@ class TimeRangeSummary:
CPUTimeRange[hostnode.type].append(
(hostnode.start_ns, hostnode.end_ns))
self.call_times[hostnode.type] += 1
if hostnode.type == TracerEventType.Operator and any([
name in hostnode.name for name in _CommunicationOpName
]): # special case, communication op
CPUTimeRange[TracerEventType.Communication].append(
(hostnode.start_ns, hostnode.end_ns))
self.call_times[TracerEventType.Communication] += 1
is_communication_node = (
hostnode.type == TracerEventType.Communication
) or (hostnode.type == TracerEventType.Operator and any(
[name in hostnode.name for name in _CommunicationOpName]))
for runtimenode in hostnode.runtime_node:
CPUTimeRange[runtimenode.type].append(
(runtimenode.start_ns, runtimenode.end_ns))
......@@ -205,12 +214,6 @@ class TimeRangeSummary:
devicenode.stream_id].append(
(devicenode.start_ns, devicenode.end_ns))
self.call_times[devicenode.type] += 1
if is_communication_node: # gpu activity for communication node
GPUTimeRange[devicenode.device_id][
TracerEventType.Communication][
devicenode.stream_id].append((
devicenode.start_ns, devicenode.end_ns))
self.call_times[TracerEventType.Communication] += 1
for event_type, time_ranges in CPUTimeRange.items():
time_ranges = merge_self_ranges(time_ranges, is_sorted=False)
......@@ -243,6 +246,74 @@ class TimeRangeSummary:
return self.CPUTimeRangeSum[event_type]
class DistributedSummary:
r"""
Analyse communication and computation time ranges, and their overlap.
The computation time is all kernel time except kernels used for communication, such as nccl.
"""
def __init__(self):
self.cpu_communication_range = []
self.gpu_communication_range = []
self.communication_range = []
self.computation_range = []
self.overlap_range = []
def parse(self, nodetrees):
'''
Collect all communication and computation time ranges.
'''
thread2hostnodes = traverse_tree(nodetrees)
for threadid, hostnodes in thread2hostnodes.items():
for hostnode in hostnodes[1:]: #skip root node
# case 1: TracerEventType is Communication
if hostnode.type == TracerEventType.Communication:
self.cpu_communication_range.append(
(hostnode.start_ns, hostnode.end_ns))
device_nodes = get_device_nodes(hostnode)
for device_node in device_nodes:
if device_node.type == TracerEventType.Kernel:
self.gpu_communication_range.append(
(device_node.start_ns, device_node.end_ns))
#case 2: TracerEventType is Operator but is communication op
elif hostnode.type == TracerEventType.Operator and any([
name in hostnode.name.lower()
for name in _CommunicationOpName
]):
self.cpu_communication_range.append(
(hostnode.start_ns, hostnode.end_ns))
device_nodes = get_device_nodes(hostnode)
for device_node in device_nodes:
if device_node.type == TracerEventType.Kernel:
self.gpu_communication_range.append(
(device_node.start_ns, device_node.end_ns))
#case 3: Others, filter kernels named with nccl
else:
for runtimenode in hostnode.runtime_node:
for devicenode in runtimenode.device_node:
if devicenode.type == TracerEventType.Kernel:
if 'nccl' in devicenode.name.lower():
self.gpu_communication_range.append((
devicenode.start_ns, devicenode.end_ns))
else:
self.computation_range.append((
devicenode.start_ns, devicenode.end_ns))
self.cpu_communication_range = merge_self_ranges(
self.cpu_communication_range, is_sorted=False)
self.gpu_communication_range = merge_self_ranges(
self.gpu_communication_range, is_sorted=False)
self.communication_range = merge_ranges(
self.cpu_communication_range,
self.gpu_communication_range,
is_sorted=True)
self.computation_range = merge_self_ranges(
self.computation_range, is_sorted=False)
self.overlap_range = intersection_ranges(
self.communication_range, self.computation_range, is_sorted=True)
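The overlap_range computed above is the pairwise intersection of the merged communication ranges with the merged computation ranges. A minimal illustration of that idea (not the intersection_ranges helper itself, which lives in statistic_helper):

```python
# Sketch: intersect two lists of disjoint, sorted (start, end) intervals.
def intersect(ranges_a, ranges_b):
    out = []
    for a0, a1 in ranges_a:
        for b0, b1 in ranges_b:
            lo, hi = max(a0, b0), min(a1, b1)
            if lo < hi:
                out.append((lo, hi))
    return out

# communication on [105, 110] and [130, 150], computation on [100, 140]
print(intersect([(105, 110), (130, 150)], [(100, 140)]))  # [(105, 110), (130, 140)]
```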
class EventSummary:
r"""
Analyse operator event in profiling data, correlate with its device event.
......@@ -473,8 +544,10 @@ class StatisticData:
self.extra_info = extra_info
self.time_range_summary = TimeRangeSummary()
self.event_summary = EventSummary()
self.distributed_summary = DistributedSummary()
self.time_range_summary.parse(node_trees)
self.event_summary.parse(node_trees)
self.distributed_summary.parse(node_trees)
def _build_table(statistic_data,
......@@ -610,7 +683,11 @@ def _build_table(statistic_data,
gpu_type_time = collections.defaultdict(int)
for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items(
):
cpu_type_time[event_type] = value
if event_type != TracerEventType.Communication:
cpu_type_time[event_type] = value
if statistic_data.distributed_summary.cpu_communication_range:
cpu_type_time[TracerEventType.Communication] = sum_ranges(
statistic_data.distributed_summary.cpu_communication_range)
gpu_time_range = collections.defaultdict(list)
for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
......@@ -620,6 +697,9 @@ def _build_table(statistic_data,
gpu_time_range[event_type], time_range, is_sorted=True)
for event_type, time_range in gpu_time_range.items():
gpu_type_time[event_type] = sum_ranges(time_range)
if statistic_data.distributed_summary.gpu_communication_range:
gpu_type_time[TracerEventType.Communication] = sum_ranges(
statistic_data.distributed_summary.gpu_communication_range)
sorted_items = sorted(
cpu_type_time.items(), key=lambda x: x[1], reverse=True)
......@@ -735,7 +815,7 @@ def _build_table(statistic_data,
append('')
###### Print Distribution Summary Report ######
if TracerEventType.Communication in statistic_data.time_range_summary.CPUTimeRange:
if statistic_data.distributed_summary.communication_range:
headers = [
'Name',
'Total Time',
......@@ -759,33 +839,12 @@ def _build_table(statistic_data,
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
cpu_communication_time_range = []
gpu_communication_time_range = []
cpu_communication_time_range = merge_ranges(
statistic_data.time_range_summary.CPUTimeRange[
TracerEventType.Communication], cpu_communication_time_range)
kernel_time_range = []
for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
):
kernel_time_range = merge_ranges(
device_time_ranges[TracerEventType.Kernel],
kernel_time_range,
is_sorted=True)
gpu_communication_time_range = merge_ranges(
device_time_ranges[TracerEventType.Communication],
gpu_communication_time_range,
is_sorted=True)
communication_time_range = merge_ranges(
cpu_communication_time_range,
gpu_communication_time_range,
is_sorted=True)
computation_time_range = subtract_ranges(kernel_time_range,
gpu_communication_time_range)
overlap_time_range = intersection_ranges(communication_time_range,
computation_time_range)
communication_time = sum_ranges(communication_time_range)
computation_time = sum_ranges(computation_time_range)
overlap_time = sum_ranges(overlap_time_range)
communication_time = sum_ranges(
statistic_data.distributed_summary.communication_range)
computation_time = sum_ranges(
statistic_data.distributed_summary.computation_range)
overlap_time = sum_ranges(
statistic_data.distributed_summary.overlap_range)
row_values = [
'Communication', format_time(
communication_time, unit=time_unit),
......@@ -808,9 +867,9 @@ def _build_table(statistic_data,
append(row_format.format(*row_values))
append(header_sep)
append(
"Note:\nCommunication time: Communication Op time and its kernel time on gpu.\n"
"Computation time: Kernel time, substract kernels belong to communication op.\n"
"Overlap time: Communication time intersect with computation time.\n"
"Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n"
"Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n"
"Overlap time: Communication time intersects with computation time.\n"
"Example:\n"
"Communication:\n"
" CPU: |_________________|\n"
......@@ -976,7 +1035,7 @@ def _build_table(statistic_data,
- 5]
device_node_name += "..."
row_values = [
' {}'.format(device_node_name), devicenode.call,
' {}'.format(device_node_name), devicenode.call,
'- / - / - / - / -',
'{} / {} / {} / {} / {}'.format(
format_time(
......@@ -1001,10 +1060,14 @@ def _build_table(statistic_data,
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
name_column_width = 0
for name, item in statistic_data.event_summary.memory_manipulation_items.items(
):
name_column_width = max(len(name), name_column_width)
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 30
name_column_width = min(name_column_width, 45)
add_column(name_column_width)
add_column(6)
add_column(40)
......@@ -1059,7 +1122,7 @@ def _build_table(statistic_data,
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 30
name_column_width = 35
add_column(name_column_width)
add_column(6)
add_column(40)
......
......@@ -96,6 +96,7 @@ def merge_ranges(range_list1, range_list2, is_sorted=False):
else:
indx1 += 1
while indx2 < len2:
range2 = range_list2[indx2]
if range2[1] > merged_ranges[-1][1]:
if range2[0] <= merged_ranges[-1][1]:
merged_ranges[-1] = (merged_ranges[-1][0], range2[1])
......
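The added line re-reads range_list2[indx2] on every pass of the tail loop; without it, each iteration keeps comparing the stale range2 left over from the earlier loop. A self-contained sketch of the corrected tail behaviour (a simplification, not the library's merge_ranges):

```python
# Sketch: append a sorted tail of intervals onto an already-merged list,
# re-reading the current interval on every pass.
def merge_tail(merged, tail):
    for rng in tail:                      # fresh interval each iteration
        if rng[1] > merged[-1][1]:
            if rng[0] <= merged[-1][1]:
                merged[-1] = (merged[-1][0], rng[1])
            else:
                merged.append(rng)
    return merged

print(merge_tail([(0, 5)], [(3, 8), (10, 12)]))  # [(0, 8), (10, 12)]
```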
......@@ -17,8 +17,8 @@ from warnings import warn
import functools
from contextlib import ContextDecorator
from paddle.fluid import core
from paddle.fluid.core import (_RecordEvent, TracerEventType)
import paddle.fluid.core as core
_AllowedEventTypeList = [
TracerEventType.Dataloader, TracerEventType.ProfileStep,
......@@ -30,13 +30,15 @@ _AllowedEventTypeList = [
class RecordEvent(ContextDecorator):
r"""
Interface for recording a time range.
Interface for recording a user-defined time range.
Parameters:
Args:
name(str): Name of the record event
event_type(TracerEventType, optional): Default value is TracerEventType.UserDefined. It is reserved for internal use, and it is better not to specify this parameter.
Examples:
.. code-block:: python
:name: code-example1
import paddle
import paddle.profiler as profiler
......@@ -53,8 +55,8 @@ class RecordEvent(ContextDecorator):
result = data1 + data2
record_event.end()
Note:
RecordEvent will take effect only when profiler is on and at the state of RECORD.
**Note**:
RecordEvent will take effect only when :ref:`Profiler <api_paddle_profiler_Profiler>` is on and at the state of RECORD.
"""
def __init__(self,
......@@ -76,6 +78,7 @@ class RecordEvent(ContextDecorator):
Record the time of beginning.
.. code-block:: python
:name: code-example2
import paddle
import paddle.profiler as profiler
......@@ -100,6 +103,7 @@ class RecordEvent(ContextDecorator):
Record the time of ending.
.. code-block:: python
:name: code-example3
import paddle
import paddle.profiler as profiler
......@@ -118,14 +122,15 @@ def load_profiler_result(filename: str):
r"""
Load dumped profiler data back to memory.
Parameters:
Args:
filename(str): Name of the exported protobuf file of profiler data.
Returns:
ProfilerResult object.
ProfilerResult object, which stores profiling data.
Examples:
.. code-block:: python
:name: code-example1
# required: gpu
import paddle.profiler as profiler
......
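A short sketch pairing Profiler.export with load_profiler_result, per the two docstrings touched in this change; the output path is hypothetical:

```python
# Hedged sketch: export collected data as protobuf, then load it back.
import paddle.profiler as profiler

with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU],
                       scheduler=(1, 3)) as prof:
    for _ in range(4):
        prof.step()
prof.export(path='./profiler_demo.pb', format='pb')     # hypothetical path
result = profiler.load_profiler_result('./profiler_demo.pb')
```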