未验证 提交 77a455c7 编写于 作者: C chenjian 提交者: GitHub

Fix profiler package bug (#40888)

* no

* fix bugs

* fix doc according to review

* fix api doc format

* fix api doc according to review

* fix bug and add unit test

* fix record event bug
上级 00448f0e
...@@ -162,7 +162,7 @@ void RecordEvent::OriginalConstruct(const std::string &name, ...@@ -162,7 +162,7 @@ void RecordEvent::OriginalConstruct(const std::string &name,
void RecordEvent::End() { void RecordEvent::End() {
#ifndef _WIN32 #ifndef _WIN32
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook && is_pushed_) { if (g_enable_nvprof_hook && is_pushed_ && is_enabled_) {
dynload::nvtxRangePop(); dynload::nvtxRangePop();
} }
#endif #endif
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
// Used to filter events, works like glog VLOG(level). // Used to filter events, works like glog VLOG(level).
// RecordEvent will works if host_trace_level >= level. // RecordEvent will works if host_trace_level >= level.
PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 2, PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1,
"RecordEvent will works " "RecordEvent will works "
"if host_trace_level >= level."); "if host_trace_level >= level.");
......
...@@ -207,6 +207,228 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -207,6 +207,228 @@ class TestProfilerStatistic(unittest.TestCase):
thread_sep=False, thread_sep=False,
time_unit='ms')) time_unit='ms'))
    def test_statistic_case2(self):
        """Statistics over a synthetic event tree containing communication.

        Builds a hand-crafted host/device node tree (one profiler step with
        forward, backward and optimization phases, plus explicit Communication
        events and nccl-named kernels), runs StatisticData over it, and checks
        every derived time-range, distributed and event summary value.
        """
        # ---- Host-side nodes ----
        # HostPythonNode args (as used here): name, event type, start_ns,
        # end_ns, process_id, thread_id.
        root_node = HostPythonNode('Root Node',
                                   profiler.TracerEventType.UserDefined, 0,
                                   float('inf'), 1000, 1001)
        profilerstep_node = HostPythonNode('ProfileStep#1',
                                           profiler.TracerEventType.ProfileStep,
                                           0, 400, 1000, 1001)
        dataloader_node = HostPythonNode(
            'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001)
        mobilenet_node = HostPythonNode(
            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
        yolonet_node = HostPythonNode(
            'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001)
        # UserDefined wrapper holding a communication launch + kernel.
        userdefined_node = HostPythonNode('Communication Time',
                                          profiler.TracerEventType.UserDefined,
                                          100, 110, 1000, 1001)
        reduce_all_launchkernel0 = HostPythonNode(
            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104,
            1000, 1001)
        # DevicePythonNode args (as used here): name, event type, start_ns,
        # end_ns, device_id, context_id, stream_id.
        nccl_reduce_all_kernel0 = DevicePythonNode(
            'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 105, 120,
            0, 0, 2)
        # Explicit Communication event nested under the user-defined node.
        communication_node = HostPythonNode(
            'Communication', profiler.TracerEventType.Communication, 105, 110,
            1000, 1001)
        reduce_all_op1 = HostPythonNode('cudalaunchkernel',
                                        profiler.TracerEventType.Operator, 105,
                                        108, 1000, 1001)
        reduce_all_launchkernel1 = HostPythonNode(
            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 106, 107,
            1000, 1001)
        nccl_reduce_all_kernel1 = DevicePythonNode(
            'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 130, 150,
            0, 0, 2)
        backward_node = HostPythonNode('Gradient Backward',
                                       profiler.TracerEventType.Backward, 120,
                                       200, 1000, 1001)
        optimization_node = HostPythonNode(
            'Optimization', profiler.TracerEventType.Optimization, 220, 300,
            1000, 1001)
        conv2d_node = HostPythonNode(
            'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001)
        sync_batch_norm_node = HostPythonNode('sync_batch_norm',
                                              profiler.TracerEventType.Operator,
                                              60, 100, 1000, 1001)
        conv2d_infer_shape = HostPythonNode(
            'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25,
            30, 1000, 1001)
        conv2d_compute = HostPythonNode('conv2d::compute',
                                        profiler.TracerEventType.OperatorInner,
                                        30, 40, 1000, 1001)
        conv2d_launchkernel = HostPythonNode(
            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35,
            1000, 1001)
        conv2d_MemCpy = HostPythonNode('AsyncMemcpy',
                                       profiler.TracerEventType.UserDefined, 35,
                                       40, 1000, 1001)
        conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
                                           profiler.TracerEventType.CudaRuntime,
                                           35, 40, 1000, 1001)
        conv2d_kernel = DevicePythonNode(
            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0)
        conv2d_memcpy = DevicePythonNode(
            'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0)
        sync_batch_norm_infer_shape = HostPythonNode(
            'sync_batch_norm::infer_shape',
            profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
        sync_batch_norm_compute = HostPythonNode(
            'sync_batch_norm::compute', profiler.TracerEventType.OperatorInner,
            80, 100, 1000, 1001)
        sync_batch_norm_launchkernel = HostPythonNode(
            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 80, 90,
            1000, 1001)
        sync_batch_norm_MemCpy = HostPythonNode(
            'AsyncMemcpy', profiler.TracerEventType.UserDefined, 90, 100, 1000,
            1001)
        sync_batch_norm_cudaMemCpy = HostPythonNode(
            'cudaMemcpy', profiler.TracerEventType.CudaRuntime, 90, 100, 1000,
            1001)
        sync_batch_norm_kernel = DevicePythonNode(
            'sync_batch_norm_kernel', profiler.TracerEventType.Kernel, 95, 300,
            0, 0, 0)
        sync_batch_norm_memcpy = DevicePythonNode(
            'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150, 200,
            0, 0, 1)
        # 'reduce_all' is an Operator whose name matches a communication op,
        # so it must be classified as communication by DistributedSummary.
        reduce_all_node2 = HostPythonNode('reduce_all',
                                          profiler.TracerEventType.Operator,
                                          230, 250, 1000, 1001)
        reduce_all_launchkernel2 = HostPythonNode(
            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 235, 240,
            1000, 1001)
        nccl_reduce_all_kernel2 = DevicePythonNode(
            'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 250, 280,
            0, 0, 2)
        # ---- Wire up the tree: children, runtime launches, device nodes ----
        root_node.children_node.append(profilerstep_node)
        profilerstep_node.children_node.extend([
            dataloader_node, mobilenet_node, yolonet_node, backward_node,
            optimization_node
        ])
        mobilenet_node.children_node.append(conv2d_node)
        yolonet_node.children_node.extend(
            [sync_batch_norm_node, userdefined_node])
        userdefined_node.children_node.append(communication_node)
        userdefined_node.runtime_node.append(reduce_all_launchkernel0)
        reduce_all_launchkernel0.device_node.append(nccl_reduce_all_kernel0)
        communication_node.children_node.append(reduce_all_op1)
        reduce_all_op1.runtime_node.append(reduce_all_launchkernel1)
        reduce_all_launchkernel1.device_node.append(nccl_reduce_all_kernel1)
        conv2d_node.children_node.extend(
            [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy])
        conv2d_compute.runtime_node.append(conv2d_launchkernel)
        conv2d_MemCpy.runtime_node.append(conv2d_cudaMemCpy)
        conv2d_launchkernel.device_node.append(conv2d_kernel)
        conv2d_cudaMemCpy.device_node.append(conv2d_memcpy)
        sync_batch_norm_node.children_node.extend([
            sync_batch_norm_infer_shape, sync_batch_norm_compute,
            sync_batch_norm_MemCpy
        ])
        sync_batch_norm_compute.runtime_node.append(
            sync_batch_norm_launchkernel)
        sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy)
        sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel)
        sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy)
        optimization_node.children_node.append(reduce_all_node2)
        reduce_all_node2.runtime_node.append(reduce_all_launchkernel2)
        reduce_all_launchkernel2.device_node.append(nccl_reduce_all_kernel2)
        thread_tree = {'thread1001': root_node}
        extra_info = {
            'Process Cpu Utilization': '1.02',
            'System Cpu Utilization': '0.68'
        }
        # ---- Run the statistics pipeline over the synthetic tree ----
        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
                                                                   extra_info)
        time_range_summary = statistic_data.time_range_summary
        event_summary = statistic_data.event_summary
        distributed_summary = statistic_data.distributed_summary
        # ---- Per-event-type CPU/GPU time sums ----
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.ProfileStep), 400)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.Forward), 100)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.Backward), 80)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.Optimization), 80)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.Operator), 78)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.OperatorInner), 45)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.CudaRuntime), 38)
        self.assertEqual(
            time_range_summary.get_gpu_range_sum(
                0, profiler.TracerEventType.Kernel), 220)
        self.assertEqual(
            time_range_summary.get_gpu_range_sum(
                0, profiler.TracerEventType.Memcpy), 60)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.UserDefined), 25)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.Communication), 5)
        # ---- Distributed (communication vs computation) summary ----
        self.assertEqual(
            profiler.statistic_helper.sum_ranges(
                distributed_summary.cpu_communication_range), 25)
        self.assertEqual(
            profiler.statistic_helper.sum_ranges(
                distributed_summary.gpu_communication_range), 65)
        self.assertEqual(
            profiler.statistic_helper.sum_ranges(
                distributed_summary.communication_range), 85)
        self.assertEqual(
            profiler.statistic_helper.sum_ranges(
                distributed_summary.computation_range), 220)
        self.assertEqual(
            profiler.statistic_helper.sum_ranges(
                distributed_summary.overlap_range), 85)
        # ---- Event summary (operator / model / memory views) ----
        self.assertEqual(len(event_summary.items), 4)
        self.assertEqual(len(event_summary.userdefined_items), 1)
        self.assertEqual(len(event_summary.model_perspective_items), 3)
        self.assertEqual(len(event_summary.memory_manipulation_items), 1)
        self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
        self.assertEqual(event_summary.items['conv2d'].gpu_time, 25)
        self.assertEqual(
            event_summary.model_perspective_items['Forward'].cpu_time, 100)
        self.assertEqual(
            event_summary.model_perspective_items['Forward'].gpu_time, 315)
        self.assertEqual(
            event_summary.model_perspective_items['Backward'].gpu_time, 0)
        self.assertEqual(
            event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
        self.assertEqual(
            event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60)
        # Smoke-check that the summary table can be rendered for this data.
        print(
            profiler.profiler_statistic._build_table(
                statistic_data,
                sorted_by=profiler.SortedKeys.CPUTotal,
                op_detail=True,
                thread_sep=False,
                time_unit='ms'))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -29,17 +29,17 @@ from .profiler_statistic import StatisticData, _build_table, SortedKeys ...@@ -29,17 +29,17 @@ from .profiler_statistic import StatisticData, _build_table, SortedKeys
class ProfilerState(Enum): class ProfilerState(Enum):
r""" r"""
Profiler state that can be specified to control profiler action. ProfilerState is used to present the state of :ref:`Profiler <api_paddle_profiler_Profiler>` .
CLOSED: The profilers are closed. The meaning of each ProfilerState is as following
READY: The profilers are open, but the data will not be recorded. - **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded.
This state is used for reducing overhead influence when profilers start.
RECORD: The profilers are open, and the data will be recorded. - **ProfilerState.READY** : The profiler is open, but the data will not be recorded. This state is used for reducing overhead influence when profiler starts.
RECORD_AND_RETURN: The profilers are open, and at the last batch of current profiler period, - **ProfilerState.RECORD** : The profiler is open, and the data will be recorded.
the collected data will be returned.
- **ProfilerState.RECORD_AND_RETURN** : The profiler is open, and this state stands for the last batch of "RECORD" state in current profiling period. The collected data will be returned in this state.
""" """
CLOSED = 0 CLOSED = 0
READY = 1 READY = 1
...@@ -49,11 +49,13 @@ class ProfilerState(Enum): ...@@ -49,11 +49,13 @@ class ProfilerState(Enum):
class ProfilerTarget(Enum): class ProfilerTarget(Enum):
r""" r"""
Target device for profiling. ProfilerTarget is used to specify target device for :ref:`profiling <api_paddle_profiler_Profiler>` . Only CPU and GPU are supported currently.
The meaning of each ProfilerState is as following
CPU: Profile events on CPU. - **ProfilerTarget.CPU** : Profile events on CPU.
GPU: Profile events on GPU. - **ProfilerTarget.GPU** : Profile events on GPU.
""" """
CPU = 0 CPU = 0
GPU = 1 GPU = 1
...@@ -66,7 +68,7 @@ def make_scheduler(*, ...@@ -66,7 +68,7 @@ def make_scheduler(*,
repeat: int=0, repeat: int=0,
skip_first: int=0) -> Callable: skip_first: int=0) -> Callable:
r""" r"""
Return a scheduler function, which scheduler the state according to the setting. Return a scheduler function, which scheduler the :ref:`state <api_paddle_profiler_ProfilerState>` according to the setting.
The state transform confirms to: The state transform confirms to:
.. code-block:: text .. code-block:: text
...@@ -78,22 +80,23 @@ def make_scheduler(*, ...@@ -78,22 +80,23 @@ def make_scheduler(*,
- - - - - - - - - - - - - - - - - - - - - - - -
Note that repeat <= 0 means the cycle will continue until the profiler exits. Note that repeat <= 0 means the cycle will continue until the profiler exits.
Parameters: Args:
closed(int): The number of steps in state ProfilerState.CLOSED. closed(int): The number of steps in state ProfilerState.CLOSED.
ready(int): The number of steps in state ProfilerState.READY. ready(int): The number of steps in state ProfilerState.READY.
record(int): The number of steps in state ProfilerState.RECORD. record(int): The number of steps in state ProfilerState.RECORD, and the state in last step will be set as ProfilerState.RECORD_AND_RETURN.
repeat(int): The number of cycles to repeat above state transform. repeat(int, optional): The number of cycles to repeat above state transform. Default value is 0, which means it will repeat this cycle until profiler exits.
skip_first(int): The number of first steps to drop, not participate in the state transform. skip_first(int, optional): The number of first steps to drop, not participate in the state transform, and at ProfilerState.CLOSED state. Default value is 0.
Returns: Returns:
A scheduler function, conforms to above state transform setting. A scheduler function, conforms to above state transform setting. The function will takes one parameter step_num, and returns corresponding ProfilerState.
Examples: Examples:
1. profiling range [2, 5] 1. profiling range [2, 5]
batch 0: closed, batch 1: ready, batch [2, 5] record Assume batch 0: closed, batch 1: ready, batch [2, 5] record
.. code-block:: python .. code-block:: python
:name: code-example1
import paddle.profiler as profiler import paddle.profiler as profiler
profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1) profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1)
...@@ -101,9 +104,10 @@ def make_scheduler(*, ...@@ -101,9 +104,10 @@ def make_scheduler(*,
2. profiling range [3,6], [9,12], [15,18]... 2. profiling range [3,6], [9,12], [15,18]...
batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat Assume batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat
.. code-block:: python .. code-block:: python
:name: code-example2
import paddle.profiler as profiler import paddle.profiler as profiler
profiler.make_scheduler(closed=1, ready=1, record=4, skip_first=1) profiler.make_scheduler(closed=1, ready=1, record=4, skip_first=1)
...@@ -148,15 +152,21 @@ def export_chrome_tracing(dir_name: str, ...@@ -148,15 +152,21 @@ def export_chrome_tracing(dir_name: str,
worker_name: Optional[str]=None) -> Callable: worker_name: Optional[str]=None) -> Callable:
r""" r"""
Return a callable, used for outputing tracing data to chrome tracing format file. Return a callable, used for outputing tracing data to chrome tracing format file.
The output file will be saved in directory 'dir_name', and file name will be set as worker_name. The output file will be saved in directory ``dir_name``, and file name will be set as worker_name.
if worker_name is not set, the default name is [hostname]_[pid]. if worker_name is not set, the default name is [hostname]_[pid].
Parameters: Args:
dir_name(str): Directory to save profiling data. dir_name(str): Directory to save profiling data.
worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid].
Returns:
A callable, which takes a Profiler object as parameter and calls its export method to save data to chrome tracing format file.
Examples: Examples:
The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler <api_paddle_profiler_Profiler>` .
.. code-block:: python .. code-block:: python
:name: code-example1
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -192,15 +202,21 @@ def export_chrome_tracing(dir_name: str, ...@@ -192,15 +202,21 @@ def export_chrome_tracing(dir_name: str,
def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable:
r""" r"""
Return a callable, used for outputing tracing data to protobuf file. Return a callable, used for outputing tracing data to protobuf file.
The output file will be saved in directory 'dir_name', and file name will be set as worker_name. The output file will be saved in directory ``dir_name``, and file name will be set as worker_name.
if worker_name is not set, the default name is [hostname]_[pid]. if worker_name is not set, the default name is [hostname]_[pid].
Parameters: Args:
dir_name(str): Directory to save profiling data. dir_name(str): Directory to save profiling data.
worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid].
Returns:
A callable, which takes a Profiler object as parameter and calls its export method to save data to protobuf file.
Examples: Examples:
The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler <api_paddle_profiler_Profiler>` .
.. code-block:: python .. code-block:: python
:name: code-example1
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -244,20 +260,21 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: ...@@ -244,20 +260,21 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]:
class Profiler: class Profiler:
r""" r"""
Profiler context manager, user interface to manage profile process. Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table.
Parameters: Args:
targets (iterable): list of tracing targets, currently supported values, ``ProfilerTarget.CPU``, ``ProfilerTarget.GPU`` . targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU <api_paddle_profiler_ProfilerTarget>` and :ref:`ProfilerTarget.GPU <api_paddle_profiler_ProfilerTarget>` .
scheduler (callable or tuple): If it is a callable object, it takes a step number as parameter and return the corresponding ``ProfilerState``. scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState <api_paddle_profiler_ProfilerState>`. This callable object can be generated by :ref:`make_scheduler <api_paddle_profiler_make_scheduler>` function.
If not provided, the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch,
which means profiling range [start_batch, end_batch). which means profiling range [start_batch, end_batch).
on_trace_ready (callable): callable object, takes the Profiler object as parameter, which provides a way for users to do post-processing. on_trace_ready (Callable, optional): Callable object, serves as callback function, and takes the Profiler object as parameter, which provides a way for users to do post-processing.
This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing <api_paddle_profiler_export_chrome_tracing>` (./profiler_log/).
Examples: Examples:
1. profiling range [2, 5) 1. profiling range [2, 5).
.. code-block:: python .. code-block:: python
:name: code-example1
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -272,6 +289,7 @@ class Profiler: ...@@ -272,6 +289,7 @@ class Profiler:
2. profiling range [2,4], [7, 9], [11,13] 2. profiling range [2,4], [7, 9], [11,13]
.. code-block:: python .. code-block:: python
:name: code-example2
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -286,6 +304,7 @@ class Profiler: ...@@ -286,6 +304,7 @@ class Profiler:
3. Use profiler without context manager, and use default parameters 3. Use profiler without context manager, and use default parameters
.. code-block:: python .. code-block:: python
:name: code-example3
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -367,6 +386,7 @@ class Profiler: ...@@ -367,6 +386,7 @@ class Profiler:
Examples: Examples:
.. code-block:: python .. code-block:: python
:name: code-example4
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -401,6 +421,7 @@ class Profiler: ...@@ -401,6 +421,7 @@ class Profiler:
Examples: Examples:
.. code-block:: python .. code-block:: python
:name: code-example5
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -437,6 +458,7 @@ class Profiler: ...@@ -437,6 +458,7 @@ class Profiler:
Examples: Examples:
.. code-block:: python .. code-block:: python
:name: code-example6
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -522,10 +544,16 @@ class Profiler: ...@@ -522,10 +544,16 @@ class Profiler:
def export(self, path="", format="json"): def export(self, path="", format="json"):
r""" r"""
Exports the tracing data in Chrome tracing data format. Exports the tracing data to file.
Args:
path(str): file path of the output.
format(str, optional): output format, can be chosen from ['json', 'pb], 'json' for chrome tracing and 'pb' for protobuf, default value is "json".
Examples: Examples:
.. code-block:: python .. code-block:: python
:name: code-example7
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -548,16 +576,17 @@ class Profiler: ...@@ -548,16 +576,17 @@ class Profiler:
thread_sep=False, thread_sep=False,
time_unit='ms'): time_unit='ms'):
r""" r"""
Print the Summary table. Print the Summary table. Currently support overview, model, distributed, operator, memory manipulation and userdefined summary.
Parameters: Args:
sorted_by(SortedKeys): how to rank the op table items. sorted_by( :ref:`SortedKeys <api_paddle_profiler_SortedKeys>` , optional): how to rank the op table items, default value is SortedKeys.CPUTotal.
op_detail(bool): expand each operator detail information. op_detail(bool, optional): expand each operator detail information, default value is True.
thread_sep(bool): print op table each thread. thread_sep(bool, optional): print op table each thread, default value is False.
time_unit(str): can be chosen form ['s', 'ms', 'us', 'ns'] time_unit(str, optional): time unit for display, can be chosen form ['s', 'ms', 'us', 'ns'], default value is 'ms'.
Examples: Examples:
.. code-block:: python .. code-block:: python
:name: code-example8
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
......
...@@ -33,23 +33,25 @@ _CommunicationOpName = ['reduce', 'broadcast', 'rpc'] ...@@ -33,23 +33,25 @@ _CommunicationOpName = ['reduce', 'broadcast', 'rpc']
class SortedKeys(Enum): class SortedKeys(Enum):
r""" r"""
Sorted keys for printing summary table. SortedKeys is used to specify how to sort items when printing :ref:`summary <api_paddle_profiler_profiler_summary>` table.
CPUTotal: Sorted by CPU total time. The meaning of each SortedKeys is as following
CPUAvg: Sorted by CPU average time. - **SortedKeys.CPUTotal** : Sorted by CPU total time.
CPUMax: Sorted by CPU max time. - **SortedKeys.CPUAvg** : Sorted by CPU average time.
CPUMin: Sorted by CPU min time. - **SortedKeys.CPUMax** : Sorted by CPU max time.
GPUTotal: Sorted by GPU total time. - **SortedKeys.CPUMin** : Sorted by CPU min time.
GPUAvg: Sorted by GPU average time. - **SortedKeys.GPUTotal** : Sorted by GPU total time.
GPUMax: Sorted by GPU max time. - **SortedKeys.GPUAvg** : Sorted by GPU average time.
GPUMin: Sorted by GPU min time. - **SortedKeys.GPUMax** : Sorted by GPU max time.
- **SortedKeys.GPUMin** : Sorted by GPU min time.
""" """
CPUTotal = 0 CPUTotal = 0
CPUAvg = 1 CPUAvg = 1
...@@ -119,6 +121,23 @@ def traverse_tree(nodetrees): ...@@ -119,6 +121,23 @@ def traverse_tree(nodetrees):
return results return results
def get_device_nodes(hostnode):
    '''
    Collect every device node launched within the subtree rooted at hostnode.

    Walks the host-node tree iteratively (depth-first, stack based) and, for
    each visited host node, gathers the device nodes attached to its runtime
    (launch) nodes.
    '''
    collected = []
    pending = [hostnode]
    while pending:
        node = pending.pop()
        # Defer children; harvest this node's launched device events now.
        pending.extend(node.children_node)
        for launch in node.runtime_node:
            collected.extend(launch.device_node)
    return collected
def wrap_tree(nodetrees): def wrap_tree(nodetrees):
''' '''
Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics. Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics.
...@@ -186,16 +205,6 @@ class TimeRangeSummary: ...@@ -186,16 +205,6 @@ class TimeRangeSummary:
CPUTimeRange[hostnode.type].append( CPUTimeRange[hostnode.type].append(
(hostnode.start_ns, hostnode.end_ns)) (hostnode.start_ns, hostnode.end_ns))
self.call_times[hostnode.type] += 1 self.call_times[hostnode.type] += 1
if hostnode.type == TracerEventType.Operator and any([
name in hostnode.name for name in _CommunicationOpName
]): # special case, communication op
CPUTimeRange[TracerEventType.Communication].append(
(hostnode.start_ns, hostnode.end_ns))
self.call_times[TracerEventType.Communication] += 1
is_communication_node = (
hostnode.type == TracerEventType.Communication
) or (hostnode.type == TracerEventType.Operator and any(
[name in hostnode.name for name in _CommunicationOpName]))
for runtimenode in hostnode.runtime_node: for runtimenode in hostnode.runtime_node:
CPUTimeRange[runtimenode.type].append( CPUTimeRange[runtimenode.type].append(
(runtimenode.start_ns, runtimenode.end_ns)) (runtimenode.start_ns, runtimenode.end_ns))
...@@ -205,12 +214,6 @@ class TimeRangeSummary: ...@@ -205,12 +214,6 @@ class TimeRangeSummary:
devicenode.stream_id].append( devicenode.stream_id].append(
(devicenode.start_ns, devicenode.end_ns)) (devicenode.start_ns, devicenode.end_ns))
self.call_times[devicenode.type] += 1 self.call_times[devicenode.type] += 1
if is_communication_node: # gpu activity for communication node
GPUTimeRange[devicenode.device_id][
TracerEventType.Communication][
devicenode.stream_id].append((
devicenode.start_ns, devicenode.end_ns))
self.call_times[TracerEventType.Communication] += 1
for event_type, time_ranges in CPUTimeRange.items(): for event_type, time_ranges in CPUTimeRange.items():
time_ranges = merge_self_ranges(time_ranges, is_sorted=False) time_ranges = merge_self_ranges(time_ranges, is_sorted=False)
...@@ -243,6 +246,74 @@ class TimeRangeSummary: ...@@ -243,6 +246,74 @@ class TimeRangeSummary:
return self.CPUTimeRangeSum[event_type] return self.CPUTimeRangeSum[event_type]
class DistributedSummary:
    r"""
    Analyze communication and computation time ranges, and their overlap.

    Communication time covers explicit Communication events, Operator events
    whose names match known communication ops, and nccl-named kernels.
    Computation time is all remaining kernel time.
    """

    def __init__(self):
        # Host-side ranges of communication events, (start_ns, end_ns) pairs.
        self.cpu_communication_range = []
        # Device-side (kernel) ranges attributed to communication.
        self.gpu_communication_range = []
        # Union of CPU and GPU communication ranges.
        self.communication_range = []
        # Device-side ranges of non-communication kernels.
        self.computation_range = []
        # Intersection of communication and computation ranges.
        self.overlap_range = []

    def parse(self, nodetrees):
        '''
        Collect all communication and computation time ranges.

        Args:
            nodetrees: dict mapping thread id to the root host node of that
                thread's event tree (same structure accepted by traverse_tree).
        '''
        thread2hostnodes = traverse_tree(nodetrees)
        for threadid, hostnodes in thread2hostnodes.items():
            for hostnode in hostnodes[1:]:  # skip root node
                # A host node counts as communication when it is either an
                # explicit Communication event, or an Operator whose name
                # contains a known communication op name. (The two former
                # duplicated branches are merged into this single predicate.)
                is_communication = (
                    hostnode.type == TracerEventType.Communication) or (
                        hostnode.type == TracerEventType.Operator and any(
                            name in hostnode.name.lower()
                            for name in _CommunicationOpName))
                if is_communication:
                    self.cpu_communication_range.append(
                        (hostnode.start_ns, hostnode.end_ns))
                    # Every kernel launched anywhere in this node's subtree
                    # belongs to GPU communication time.
                    for device_node in get_device_nodes(hostnode):
                        if device_node.type == TracerEventType.Kernel:
                            self.gpu_communication_range.append(
                                (device_node.start_ns, device_node.end_ns))
                else:
                    # Other host nodes: split their directly-launched kernels
                    # into communication (nccl-named) vs computation.
                    for runtimenode in hostnode.runtime_node:
                        for devicenode in runtimenode.device_node:
                            if devicenode.type == TracerEventType.Kernel:
                                if 'nccl' in devicenode.name.lower():
                                    self.gpu_communication_range.append(
                                        (devicenode.start_ns,
                                         devicenode.end_ns))
                                else:
                                    self.computation_range.append(
                                        (devicenode.start_ns,
                                         devicenode.end_ns))
        # Normalize: merge overlapping ranges, then derive union/intersection.
        self.cpu_communication_range = merge_self_ranges(
            self.cpu_communication_range, is_sorted=False)
        self.gpu_communication_range = merge_self_ranges(
            self.gpu_communication_range, is_sorted=False)
        self.communication_range = merge_ranges(
            self.cpu_communication_range,
            self.gpu_communication_range,
            is_sorted=True)
        self.computation_range = merge_self_ranges(
            self.computation_range, is_sorted=False)
        self.overlap_range = intersection_ranges(
            self.communication_range, self.computation_range, is_sorted=True)
class EventSummary: class EventSummary:
r""" r"""
Analyse operator event in profiling data, correlate with its device event. Analyse operator event in profiling data, correlate with its device event.
...@@ -473,8 +544,10 @@ class StatisticData: ...@@ -473,8 +544,10 @@ class StatisticData:
self.extra_info = extra_info self.extra_info = extra_info
self.time_range_summary = TimeRangeSummary() self.time_range_summary = TimeRangeSummary()
self.event_summary = EventSummary() self.event_summary = EventSummary()
self.distributed_summary = DistributedSummary()
self.time_range_summary.parse(node_trees) self.time_range_summary.parse(node_trees)
self.event_summary.parse(node_trees) self.event_summary.parse(node_trees)
self.distributed_summary.parse(node_trees)
def _build_table(statistic_data, def _build_table(statistic_data,
...@@ -610,7 +683,11 @@ def _build_table(statistic_data, ...@@ -610,7 +683,11 @@ def _build_table(statistic_data,
gpu_type_time = collections.defaultdict(int) gpu_type_time = collections.defaultdict(int)
for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items( for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items(
): ):
if event_type != TracerEventType.Communication:
cpu_type_time[event_type] = value cpu_type_time[event_type] = value
if statistic_data.distributed_summary.cpu_communication_range:
cpu_type_time[TracerEventType.Communication] = sum_ranges(
statistic_data.distributed_summary.cpu_communication_range)
gpu_time_range = collections.defaultdict(list) gpu_time_range = collections.defaultdict(list)
for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
...@@ -620,6 +697,9 @@ def _build_table(statistic_data, ...@@ -620,6 +697,9 @@ def _build_table(statistic_data,
gpu_time_range[event_type], time_range, is_sorted=True) gpu_time_range[event_type], time_range, is_sorted=True)
for event_type, time_range in gpu_time_range.items(): for event_type, time_range in gpu_time_range.items():
gpu_type_time[event_type] = sum_ranges(time_range) gpu_type_time[event_type] = sum_ranges(time_range)
if statistic_data.distributed_summary.gpu_communication_range:
gpu_type_time[TracerEventType.Communication] = sum_ranges(
statistic_data.distributed_summary.gpu_communication_range)
sorted_items = sorted( sorted_items = sorted(
cpu_type_time.items(), key=lambda x: x[1], reverse=True) cpu_type_time.items(), key=lambda x: x[1], reverse=True)
...@@ -735,7 +815,7 @@ def _build_table(statistic_data, ...@@ -735,7 +815,7 @@ def _build_table(statistic_data,
append('') append('')
###### Print Distribution Summary Report ###### ###### Print Distribution Summary Report ######
if TracerEventType.Communication in statistic_data.time_range_summary.CPUTimeRange: if statistic_data.distributed_summary.communication_range:
headers = [ headers = [
'Name', 'Name',
'Total Time', 'Total Time',
...@@ -759,33 +839,12 @@ def _build_table(statistic_data, ...@@ -759,33 +839,12 @@ def _build_table(statistic_data,
append(header_sep) append(header_sep)
append(row_format.format(*headers)) append(row_format.format(*headers))
append(header_sep) append(header_sep)
cpu_communication_time_range = [] communication_time = sum_ranges(
gpu_communication_time_range = [] statistic_data.distributed_summary.communication_range)
cpu_communication_time_range = merge_ranges( computation_time = sum_ranges(
statistic_data.time_range_summary.CPUTimeRange[ statistic_data.distributed_summary.computation_range)
TracerEventType.Communication], cpu_communication_time_range) overlap_time = sum_ranges(
kernel_time_range = [] statistic_data.distributed_summary.overlap_range)
for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
):
kernel_time_range = merge_ranges(
device_time_ranges[TracerEventType.Kernel],
kernel_time_range,
is_sorted=True)
gpu_communication_time_range = merge_ranges(
device_time_ranges[TracerEventType.Communication],
gpu_communication_time_range,
is_sorted=True)
communication_time_range = merge_ranges(
cpu_communication_time_range,
gpu_communication_time_range,
is_sorted=True)
computation_time_range = subtract_ranges(kernel_time_range,
gpu_communication_time_range)
overlap_time_range = intersection_ranges(communication_time_range,
computation_time_range)
communication_time = sum_ranges(communication_time_range)
computation_time = sum_ranges(computation_time_range)
overlap_time = sum_ranges(overlap_time_range)
row_values = [ row_values = [
'Communication', format_time( 'Communication', format_time(
communication_time, unit=time_unit), communication_time, unit=time_unit),
...@@ -808,9 +867,9 @@ def _build_table(statistic_data, ...@@ -808,9 +867,9 @@ def _build_table(statistic_data,
append(row_format.format(*row_values)) append(row_format.format(*row_values))
append(header_sep) append(header_sep)
append( append(
"Note:\nCommunication time: Communication Op time and its kernel time on gpu.\n" "Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n"
"Computation time: Kernel time, substract kernels belong to communication op.\n" "Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n"
"Overlap time: Communication time intersect with computation time.\n" "Overlap time: Communication time intersects with computation time.\n"
"Example:\n" "Example:\n"
"Communication:\n" "Communication:\n"
" CPU: |_________________|\n" " CPU: |_________________|\n"
...@@ -1001,10 +1060,14 @@ def _build_table(statistic_data, ...@@ -1001,10 +1060,14 @@ def _build_table(statistic_data,
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)', 'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)' 'GPU Total / Avg / Max / Min / Ratio(%)'
] ]
name_column_width = 0
for name, item in statistic_data.event_summary.memory_manipulation_items.items(
):
name_column_width = max(len(name), name_column_width)
row_format_list = [""] row_format_list = [""]
header_sep_list = [""] header_sep_list = [""]
line_length_list = [-SPACING_SIZE] line_length_list = [-SPACING_SIZE]
name_column_width = 30 name_column_width = min(name_column_width, 45)
add_column(name_column_width) add_column(name_column_width)
add_column(6) add_column(6)
add_column(40) add_column(40)
...@@ -1059,7 +1122,7 @@ def _build_table(statistic_data, ...@@ -1059,7 +1122,7 @@ def _build_table(statistic_data,
row_format_list = [""] row_format_list = [""]
header_sep_list = [""] header_sep_list = [""]
line_length_list = [-SPACING_SIZE] line_length_list = [-SPACING_SIZE]
name_column_width = 30 name_column_width = 35
add_column(name_column_width) add_column(name_column_width)
add_column(6) add_column(6)
add_column(40) add_column(40)
......
...@@ -96,6 +96,7 @@ def merge_ranges(range_list1, range_list2, is_sorted=False): ...@@ -96,6 +96,7 @@ def merge_ranges(range_list1, range_list2, is_sorted=False):
else: else:
indx1 += 1 indx1 += 1
while indx2 < len2: while indx2 < len2:
range2 = range_list2[indx2]
if range2[1] > merged_ranges[-1][1]: if range2[1] > merged_ranges[-1][1]:
if range2[0] <= merged_ranges[-1][1]: if range2[0] <= merged_ranges[-1][1]:
merged_ranges[-1] = (merged_ranges[-1][0], range2[1]) merged_ranges[-1] = (merged_ranges[-1][0], range2[1])
......
...@@ -17,8 +17,8 @@ from warnings import warn ...@@ -17,8 +17,8 @@ from warnings import warn
import functools import functools
from contextlib import ContextDecorator from contextlib import ContextDecorator
from paddle.fluid import core
from paddle.fluid.core import (_RecordEvent, TracerEventType) from paddle.fluid.core import (_RecordEvent, TracerEventType)
import paddle.fluid.core as core
_AllowedEventTypeList = [ _AllowedEventTypeList = [
TracerEventType.Dataloader, TracerEventType.ProfileStep, TracerEventType.Dataloader, TracerEventType.ProfileStep,
...@@ -30,13 +30,15 @@ _AllowedEventTypeList = [ ...@@ -30,13 +30,15 @@ _AllowedEventTypeList = [
class RecordEvent(ContextDecorator): class RecordEvent(ContextDecorator):
r""" r"""
Interface for recording a time range. Interface for recording a time range by user defined.
Parameters: Args:
name(str): Name of the record event name(str): Name of the record event
event_type(TracerEventType, optional): Optional, default value is TracerEventType.UserDefined. It is reserved for internal purpose, and it is better not to specify this parameter.
Examples: Examples:
.. code-block:: python .. code-block:: python
:name: code-example1
import paddle import paddle
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -53,8 +55,8 @@ class RecordEvent(ContextDecorator): ...@@ -53,8 +55,8 @@ class RecordEvent(ContextDecorator):
result = data1 + data2 result = data1 + data2
record_event.end() record_event.end()
Note: **Note**:
RecordEvent will take effect only when profiler is on and at the state of RECORD. RecordEvent will take effect only when :ref:`Profiler <api_paddle_profiler_Profiler>` is on and at the state of RECORD.
""" """
def __init__(self, def __init__(self,
...@@ -76,6 +78,7 @@ class RecordEvent(ContextDecorator): ...@@ -76,6 +78,7 @@ class RecordEvent(ContextDecorator):
Record the time of begining. Record the time of begining.
.. code-block:: python .. code-block:: python
:name: code-example2
import paddle import paddle
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -100,6 +103,7 @@ class RecordEvent(ContextDecorator): ...@@ -100,6 +103,7 @@ class RecordEvent(ContextDecorator):
Record the time of ending. Record the time of ending.
.. code-block:: python .. code-block:: python
:name: code-example3
import paddle import paddle
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -118,14 +122,15 @@ def load_profiler_result(filename: str): ...@@ -118,14 +122,15 @@ def load_profiler_result(filename: str):
r""" r"""
Load dumped profiler data back to memory. Load dumped profiler data back to memory.
Parameters: Args:
filename(str): Name of the exported protobuf file of profiler data. filename(str): Name of the exported protobuf file of profiler data.
Returns: Returns:
ProfilerResult object. ProfilerResult object, which stores profiling data.
Examples: Examples:
.. code-block:: python .. code-block:: python
:name: code-example1
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册