diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 940fc98d3b32021ae8b278305c54d8819292daaf..9427702cbcfdcafd545e30f0f558633739d44487 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -162,7 +162,7 @@ void RecordEvent::OriginalConstruct(const std::string &name, void RecordEvent::End() { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA - if (g_enable_nvprof_hook && is_pushed_) { + if (g_enable_nvprof_hook && is_pushed_ && is_enabled_) { dynload::nvtxRangePop(); } #endif diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index afd1c2b3012d46100dbead81792108cffb52e9a3..b7eb53331b793a74ff265947e381ef6d5d03d3da 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -20,7 +20,7 @@ // Used to filter events, works like glog VLOG(level). // RecordEvent will works if host_trace_level >= level. -PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 2, +PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1, "RecordEvent will works " "if host_trace_level >= level."); diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py index 73b501c9c7eade28e94281b6d07ce21140b72c53..4769a81e7936c5d879eebc4746bbafa6507afd6a 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py +++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py @@ -207,6 +207,228 @@ class TestProfilerStatistic(unittest.TestCase): thread_sep=False, time_unit='ms')) + def test_statistic_case2(self): + root_node = HostPythonNode('Root Node', + profiler.TracerEventType.UserDefined, 0, + float('inf'), 1000, 1001) + profilerstep_node = HostPythonNode('ProfileStep#1', + profiler.TracerEventType.ProfileStep, + 0, 400, 1000, 1001) + + dataloader_node = HostPythonNode( + 'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001) + + mobilenet_node = HostPythonNode( + 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) + yolonet_node = HostPythonNode( + 'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001) + + userdefined_node = HostPythonNode('Communication Time', + profiler.TracerEventType.UserDefined, + 100, 110, 1000, 1001) + reduce_all_launchkernel0 = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104, + 1000, 1001) + + nccl_reduce_all_kernel0 = DevicePythonNode( + 'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 105, 120, + 0, 0, 2) + + communication_node = HostPythonNode( + 'Communication', profiler.TracerEventType.Communication, 105, 110, + 1000, 1001) + + reduce_all_op1 = HostPythonNode('cudalaunchkernel', + profiler.TracerEventType.Operator, 105, + 108, 1000, 1001) + + reduce_all_launchkernel1 = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 106, 107, + 1000, 1001) + + nccl_reduce_all_kernel1 = DevicePythonNode( + 'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 130, 150, + 0, 0, 2) + + backward_node = HostPythonNode('Gradient Backward', + profiler.TracerEventType.Backward, 120, + 200, 1000, 1001) + optimization_node = HostPythonNode( + 'Optimization', profiler.TracerEventType.Optimization, 220, 300, + 1000, 1001) + conv2d_node = HostPythonNode( + 'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001) + sync_batch_norm_node = HostPythonNode('sync_batch_norm', + profiler.TracerEventType.Operator, + 60, 100, 1000, 1001) + conv2d_infer_shape = HostPythonNode( + 'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25, + 30, 1000, 1001) + conv2d_compute = HostPythonNode('conv2d::compute', + profiler.TracerEventType.OperatorInner, + 30, 40, 1000, 1001) + conv2d_launchkernel = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35, + 1000, 1001) + conv2d_MemCpy = HostPythonNode('AsyncMemcpy', + profiler.TracerEventType.UserDefined, 35, + 40, 1000, 1001) + conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy', + profiler.TracerEventType.CudaRuntime, + 35, 40, 1000, 1001) + conv2d_kernel = DevicePythonNode( + 'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0) + conv2d_memcpy = DevicePythonNode( + 'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0) + sync_batch_norm_infer_shape = HostPythonNode( + 'sync_batch_norm::infer_shape', + profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001) + sync_batch_norm_compute = HostPythonNode( + 'sync_batch_norm::compute', profiler.TracerEventType.OperatorInner, + 80, 100, 1000, 1001) + sync_batch_norm_launchkernel = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 80, 90, + 1000, 1001) + sync_batch_norm_MemCpy = HostPythonNode( + 'AsyncMemcpy', profiler.TracerEventType.UserDefined, 90, 100, 1000, + 1001) + sync_batch_norm_cudaMemCpy = HostPythonNode( + 'cudaMemcpy', profiler.TracerEventType.CudaRuntime, 90, 100, 1000, + 1001) + sync_batch_norm_kernel = DevicePythonNode( + 'sync_batch_norm_kernel', profiler.TracerEventType.Kernel, 95, 300, + 0, 0, 0) + sync_batch_norm_memcpy = DevicePythonNode( + 'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150, 200, + 0, 0, 1) + + reduce_all_node2 = HostPythonNode('reduce_all', + profiler.TracerEventType.Operator, + 230, 250, 1000, 1001) + + reduce_all_launchkernel2 = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 235, 240, + 1000, 1001) + + nccl_reduce_all_kernel2 = DevicePythonNode( + 'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 250, 280, + 0, 0, 2) + + root_node.children_node.append(profilerstep_node) + profilerstep_node.children_node.extend([ + dataloader_node, mobilenet_node, yolonet_node, backward_node, + optimization_node + ]) + mobilenet_node.children_node.append(conv2d_node) + yolonet_node.children_node.extend( + [sync_batch_norm_node, userdefined_node]) + userdefined_node.children_node.append(communication_node) + userdefined_node.runtime_node.append(reduce_all_launchkernel0) + reduce_all_launchkernel0.device_node.append(nccl_reduce_all_kernel0) + communication_node.children_node.append(reduce_all_op1) + reduce_all_op1.runtime_node.append(reduce_all_launchkernel1) + reduce_all_launchkernel1.device_node.append(nccl_reduce_all_kernel1) + conv2d_node.children_node.extend( + [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy]) + conv2d_compute.runtime_node.append(conv2d_launchkernel) + conv2d_MemCpy.runtime_node.append(conv2d_cudaMemCpy) + conv2d_launchkernel.device_node.append(conv2d_kernel) + conv2d_cudaMemCpy.device_node.append(conv2d_memcpy) + sync_batch_norm_node.children_node.extend([ + sync_batch_norm_infer_shape, sync_batch_norm_compute, + sync_batch_norm_MemCpy + ]) + sync_batch_norm_compute.runtime_node.append( + sync_batch_norm_launchkernel) + sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy) + sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel) + sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy) + optimization_node.children_node.append(reduce_all_node2) + reduce_all_node2.runtime_node.append(reduce_all_launchkernel2) + reduce_all_launchkernel2.device_node.append(nccl_reduce_all_kernel2) + thread_tree = {'thread1001': root_node} + extra_info = { + 'Process Cpu Utilization': '1.02', + 'System Cpu Utilization': '0.68' + } + statistic_data = profiler.profiler_statistic.StatisticData(thread_tree, + extra_info) + time_range_summary = statistic_data.time_range_summary + event_summary = statistic_data.event_summary + distributed_summary = statistic_data.distributed_summary + + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.ProfileStep), 400) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Forward), 100) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Backward), 80) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Optimization), 80) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Operator), 78) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.OperatorInner), 45) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.CudaRuntime), 38) + self.assertEqual( + time_range_summary.get_gpu_range_sum( + 0, profiler.TracerEventType.Kernel), 220) + self.assertEqual( + time_range_summary.get_gpu_range_sum( + 0, profiler.TracerEventType.Memcpy), 60) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.UserDefined), 25) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Communication), 5) + self.assertEqual( + profiler.statistic_helper.sum_ranges( + distributed_summary.cpu_communication_range), 25) + self.assertEqual( + profiler.statistic_helper.sum_ranges( + distributed_summary.gpu_communication_range), 65) + self.assertEqual( + profiler.statistic_helper.sum_ranges( + distributed_summary.communication_range), 85) + self.assertEqual( + profiler.statistic_helper.sum_ranges( + distributed_summary.computation_range), 220) + self.assertEqual( + profiler.statistic_helper.sum_ranges( + distributed_summary.overlap_range), 85) + self.assertEqual(len(event_summary.items), 4) + self.assertEqual(len(event_summary.userdefined_items), 1) + self.assertEqual(len(event_summary.model_perspective_items), 3) + self.assertEqual(len(event_summary.memory_manipulation_items), 1) + self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) + self.assertEqual(event_summary.items['conv2d'].gpu_time, 25) + self.assertEqual( + event_summary.model_perspective_items['Forward'].cpu_time, 100) + self.assertEqual( + event_summary.model_perspective_items['Forward'].gpu_time, 315) + self.assertEqual( + event_summary.model_perspective_items['Backward'].gpu_time, 0) + self.assertEqual( + event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15) + self.assertEqual( + event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60) + print( + profiler.profiler_statistic._build_table( + statistic_data, + sorted_by=profiler.SortedKeys.CPUTotal, + op_detail=True, + thread_sep=False, + time_unit='ms')) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index efbe88583b776d623b757628998e583ac65f6179..d978e594399f4030550edcd6ac26330dfe30d104 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -29,17 +29,17 @@ from .profiler_statistic import StatisticData, _build_table, SortedKeys class ProfilerState(Enum): r""" - Profiler state that can be specified to control profiler action. + ProfilerState is used to present the state of :ref:`Profiler ` . - CLOSED: The profilers are closed. + The meaning of each ProfilerState is as following - READY: The profilers are open, but the data will not be recorded. - This state is used for reducing overhead influence when profilers start. + - **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded. - RECORD: The profilers are open, and the data will be recorded. + - **ProfilerState.READY** : The profiler is open, but the data will not be recorded. This state is used for reducing overhead influence when profiler starts. - RECORD_AND_RETURN: The profilers are open, and at the last batch of current profiler period, - the collected data will be returned. + - **ProfilerState.RECORD** : The profiler is open, and the data will be recorded. + + - **ProfilerState.RECORD_AND_RETURN** : The profiler is open, and this state stands for the last batch of "RECORD" state in current profiling period. The collected data will be returned in this state. """ CLOSED = 0 READY = 1 @@ -49,11 +49,13 @@ class ProfilerState(Enum): class ProfilerTarget(Enum): r""" - Target device for profiling. + ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU and GPU are supported currently. - CPU: Profile events on CPU. - - GPU: Profile events on GPU. + The meaning of each ProfilerState is as following + + - **ProfilerTarget.CPU** : Profile events on CPU. + + - **ProfilerTarget.GPU** : Profile events on GPU. """ CPU = 0 GPU = 1 @@ -66,7 +68,7 @@ def make_scheduler(*, repeat: int=0, skip_first: int=0) -> Callable: r""" - Return a scheduler function, which scheduler the state according to the setting. + Return a scheduler function, which scheduler the :ref:`state ` according to the setting. The state transform confirms to: .. code-block:: text @@ -78,22 +80,23 @@ def make_scheduler(*, - - - - - - - - - - - - Note that repeat <= 0 means the cycle will continue until the profiler exits. - Parameters: + Args: closed(int): The number of steps in state ProfilerState.CLOSED. ready(int): The number of steps in state ProfilerState.READY. - record(int): The number of steps in state ProfilerState.RECORD. - repeat(int): The number of cycles to repeat above state transform. - skip_first(int): The number of first steps to drop, not participate in the state transform. + record(int): The number of steps in state ProfilerState.RECORD, and the state in last step will be set as ProfilerState.RECORD_AND_RETURN. + repeat(int, optional): The number of cycles to repeat above state transform. Default value is 0, which means it will repeat this cycle until profiler exits. + skip_first(int, optional): The number of first steps to drop, not participate in the state transform, and at ProfilerState.CLOSED state. Default value is 0. Returns: - A scheduler function, conforms to above state transform setting. + A scheduler function, conforms to above state transform setting. The function will takes one parameter step_num, and returns corresponding ProfilerState. Examples: 1. profiling range [2, 5] - batch 0: closed, batch 1: ready, batch [2, 5] record + Assume batch 0: closed, batch 1: ready, batch [2, 5] record .. code-block:: python + :name: code-example1 import paddle.profiler as profiler profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1) @@ -101,9 +104,10 @@ def make_scheduler(*, 2. profiling range [3,6], [9,12], [15,18]... - batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat + Assume batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat .. code-block:: python + :name: code-example2 import paddle.profiler as profiler profiler.make_scheduler(closed=1, ready=1, record=4, skip_first=1) @@ -148,15 +152,21 @@ def export_chrome_tracing(dir_name: str, worker_name: Optional[str]=None) -> Callable: r""" Return a callable, used for outputing tracing data to chrome tracing format file. - The output file will be saved in directory 'dir_name', and file name will be set as worker_name. + The output file will be saved in directory ``dir_name``, and file name will be set as worker_name. if worker_name is not set, the default name is [hostname]_[pid]. - Parameters: + Args: dir_name(str): Directory to save profiling data. - worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. + worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid]. + + Returns: + A callable, which takes a Profiler object as parameter and calls its export method to save data to chrome tracing format file. Examples: + The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . + .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -192,15 +202,21 @@ def export_chrome_tracing(dir_name: str, def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: r""" Return a callable, used for outputing tracing data to protobuf file. - The output file will be saved in directory 'dir_name', and file name will be set as worker_name. + The output file will be saved in directory ``dir_name``, and file name will be set as worker_name. if worker_name is not set, the default name is [hostname]_[pid]. - Parameters: + Args: dir_name(str): Directory to save profiling data. - worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. + worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid]. + + Returns: + A callable, which takes a Profiler object as parameter and calls its export method to save data to protobuf file. Examples: + The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . + .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -244,20 +260,21 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: class Profiler: r""" - Profiler context manager, user interface to manage profile process. + Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table. - Parameters: - targets (iterable): list of tracing targets, currently supported values, ``ProfilerTarget.CPU``, ``ProfilerTarget.GPU`` . - scheduler (callable or tuple): If it is a callable object, it takes a step number as parameter and return the corresponding ``ProfilerState``. - If not provided, the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, + Args: + targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` and :ref:`ProfilerTarget.GPU ` . + scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState `. This callable object can be generated by :ref:`make_scheduler ` function. + If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, which means profiling range [start_batch, end_batch). - on_trace_ready (callable): callable object, takes the Profiler object as parameter, which provides a way for users to do post-processing. - This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. + on_trace_ready (Callable, optional): Callable object, serves as callback function, and takes the Profiler object as parameter, which provides a way for users to do post-processing. + This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing ` (./profiler_log/). Examples: - 1. profiling range [2, 5) + 1. profiling range [2, 5). .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -272,6 +289,7 @@ class Profiler: 2. profiling range [2,4], [7, 9], [11,13] .. code-block:: python + :name: code-example2 # required: gpu import paddle.profiler as profiler @@ -286,6 +304,7 @@ class Profiler: 3. Use profiler without context manager, and use default parameters .. code-block:: python + :name: code-example3 # required: gpu import paddle.profiler as profiler @@ -367,6 +386,7 @@ class Profiler: Examples: .. code-block:: python + :name: code-example4 # required: gpu import paddle.profiler as profiler @@ -401,6 +421,7 @@ class Profiler: Examples: .. code-block:: python + :name: code-example5 # required: gpu import paddle.profiler as profiler @@ -437,6 +458,7 @@ class Profiler: Examples: .. code-block:: python + :name: code-example6 # required: gpu import paddle.profiler as profiler @@ -522,10 +544,16 @@ class Profiler: def export(self, path="", format="json"): r""" - Exports the tracing data in Chrome tracing data format. + Exports the tracing data to file. + + Args: + path(str): file path of the output. + format(str, optional): output format, can be chosen from ['json', 'pb], 'json' for chrome tracing and 'pb' for protobuf, default value is "json". + Examples: .. code-block:: python + :name: code-example7 # required: gpu import paddle.profiler as profiler @@ -548,16 +576,17 @@ class Profiler: thread_sep=False, time_unit='ms'): r""" - Print the Summary table. + Print the Summary table. Currently support overview, model, distributed, operator, memory manipulation and userdefined summary. - Parameters: - sorted_by(SortedKeys): how to rank the op table items. - op_detail(bool): expand each operator detail information. - thread_sep(bool): print op table each thread. - time_unit(str): can be chosen form ['s', 'ms', 'us', 'ns'] + Args: + sorted_by( :ref:`SortedKeys ` , optional): how to rank the op table items, default value is SortedKeys.CPUTotal. + op_detail(bool, optional): expand each operator detail information, default value is True. + thread_sep(bool, optional): print op table each thread, default value is False. + time_unit(str, optional): time unit for display, can be chosen form ['s', 'ms', 'us', 'ns'], default value is 'ms'. Examples: .. code-block:: python + :name: code-example8 # required: gpu import paddle.profiler as profiler diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index a0bbd6b633ef017dc983c8458eb5551494425989..8bb30d111c804874a0f1b0e0528589145e922e7d 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -33,23 +33,25 @@ _CommunicationOpName = ['reduce', 'broadcast', 'rpc'] class SortedKeys(Enum): r""" - Sorted keys for printing summary table. + SortedKeys is used to specify how to sort items when printing :ref:`summary ` table. - CPUTotal: Sorted by CPU total time. + The meaning of each SortedKeys is as following - CPUAvg: Sorted by CPU average time. + - **SortedKeys.CPUTotal** : Sorted by CPU total time. - CPUMax: Sorted by CPU max time. + - **SortedKeys.CPUAvg** : Sorted by CPU average time. - CPUMin: Sorted by CPU min time. + - **SortedKeys.CPUMax** : Sorted by CPU max time. - GPUTotal: Sorted by GPU total time. + - **SortedKeys.CPUMin** : Sorted by CPU min time. - GPUAvg: Sorted by GPU average time. + - **SortedKeys.GPUTotal** : Sorted by GPU total time. - GPUMax: Sorted by GPU max time. + - **SortedKeys.GPUAvg** : Sorted by GPU average time. - GPUMin: Sorted by GPU min time. + - **SortedKeys.GPUMax** : Sorted by GPU max time. + + - **SortedKeys.GPUMin** : Sorted by GPU min time. """ CPUTotal = 0 CPUAvg = 1 @@ -119,6 +121,23 @@ def traverse_tree(nodetrees): return results +def get_device_nodes(hostnode): + ''' + Get all device nodes called in the time range of hostnode. + ''' + stack = [] + device_nodes = [] + stack.append(hostnode) + while stack: + current_node = stack.pop() + for childnode in current_node.children_node: + stack.append(childnode) + for runtimenode in current_node.runtime_node: + for devicenode in runtimenode.device_node: + device_nodes.append(devicenode) + return device_nodes + + def wrap_tree(nodetrees): ''' Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics. @@ -186,16 +205,6 @@ class TimeRangeSummary: CPUTimeRange[hostnode.type].append( (hostnode.start_ns, hostnode.end_ns)) self.call_times[hostnode.type] += 1 - if hostnode.type == TracerEventType.Operator and any([ - name in hostnode.name for name in _CommunicationOpName - ]): # special case, communication op - CPUTimeRange[TracerEventType.Communication].append( - (hostnode.start_ns, hostnode.end_ns)) - self.call_times[TracerEventType.Communication] += 1 - is_communication_node = ( - hostnode.type == TracerEventType.Communication - ) or (hostnode.type == TracerEventType.Operator and any( - [name in hostnode.name for name in _CommunicationOpName])) for runtimenode in hostnode.runtime_node: CPUTimeRange[runtimenode.type].append( (runtimenode.start_ns, runtimenode.end_ns)) @@ -205,12 +214,6 @@ class TimeRangeSummary: devicenode.stream_id].append( (devicenode.start_ns, devicenode.end_ns)) self.call_times[devicenode.type] += 1 - if is_communication_node: # gpu activity for communication node - GPUTimeRange[devicenode.device_id][ - TracerEventType.Communication][ - devicenode.stream_id].append(( - devicenode.start_ns, devicenode.end_ns)) - self.call_times[TracerEventType.Communication] += 1 for event_type, time_ranges in CPUTimeRange.items(): time_ranges = merge_self_ranges(time_ranges, is_sorted=False) @@ -243,6 +246,74 @@ class TimeRangeSummary: return self.CPUTimeRangeSum[event_type] +class DistributedSummary: + r""" + Analysis communication and computation time range, and their overlap. + The computation time is all kernel except kernels for communication like nccl. + """ + + def __init__(self): + self.cpu_communication_range = [] + self.gpu_communication_range = [] + self.communication_range = [] + self.computation_range = [] + self.overlap_range = [] + + def parse(self, nodetrees): + ''' + Collect all communication and computation time ranges. + ''' + thread2hostnodes = traverse_tree(nodetrees) + for threadid, hostnodes in thread2hostnodes.items(): + for hostnode in hostnodes[1:]: #skip root node + # case 1: TracerEventType is Communication + if hostnode.type == TracerEventType.Communication: + self.cpu_communication_range.append( + (hostnode.start_ns, hostnode.end_ns)) + device_nodes = get_device_nodes(hostnode) + for device_node in device_nodes: + if device_node.type == TracerEventType.Kernel: + self.gpu_communication_range.append( + (device_node.start_ns, device_node.end_ns)) + + #case 2: TracerEventType is Operator but is communication op + elif hostnode.type == TracerEventType.Operator and any([ + name in hostnode.name.lower() + for name in _CommunicationOpName + ]): + self.cpu_communication_range.append( + (hostnode.start_ns, hostnode.end_ns)) + device_nodes = get_device_nodes(hostnode) + for device_node in device_nodes: + if device_node.type == TracerEventType.Kernel: + self.gpu_communication_range.append( + (device_node.start_ns, device_node.end_ns)) + + #case 3: Others, filter kernels named with nccl + else: + for runtimenode in hostnode.runtime_node: + for devicenode in runtimenode.device_node: + if devicenode.type == TracerEventType.Kernel: + if 'nccl' in devicenode.name.lower(): + self.gpu_communication_range.append(( + devicenode.start_ns, devicenode.end_ns)) + else: + self.computation_range.append(( + devicenode.start_ns, devicenode.end_ns)) + self.cpu_communication_range = merge_self_ranges( + self.cpu_communication_range, is_sorted=False) + self.gpu_communication_range = merge_self_ranges( + self.gpu_communication_range, is_sorted=False) + self.communication_range = merge_ranges( + self.cpu_communication_range, + self.gpu_communication_range, + is_sorted=True) + self.computation_range = merge_self_ranges( + self.computation_range, is_sorted=False) + self.overlap_range = intersection_ranges( + self.communication_range, self.computation_range, is_sorted=True) + + class EventSummary: r""" Analyse operator event in profiling data, correlate with its device event. @@ -473,8 +544,10 @@ class StatisticData: self.extra_info = extra_info self.time_range_summary = TimeRangeSummary() self.event_summary = EventSummary() + self.distributed_summary = DistributedSummary() self.time_range_summary.parse(node_trees) self.event_summary.parse(node_trees) + self.distributed_summary.parse(node_trees) def _build_table(statistic_data, @@ -610,7 +683,11 @@ def _build_table(statistic_data, gpu_type_time = collections.defaultdict(int) for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items( ): - cpu_type_time[event_type] = value + if event_type != TracerEventType.Communication: + cpu_type_time[event_type] = value + if statistic_data.distributed_summary.cpu_communication_range: + cpu_type_time[TracerEventType.Communication] = sum_ranges( + statistic_data.distributed_summary.cpu_communication_range) gpu_time_range = collections.defaultdict(list) for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( @@ -620,6 +697,9 @@ def _build_table(statistic_data, gpu_time_range[event_type], time_range, is_sorted=True) for event_type, time_range in gpu_time_range.items(): gpu_type_time[event_type] = sum_ranges(time_range) + if statistic_data.distributed_summary.gpu_communication_range: + gpu_type_time[TracerEventType.Communication] = sum_ranges( + statistic_data.distributed_summary.gpu_communication_range) sorted_items = sorted( cpu_type_time.items(), key=lambda x: x[1], reverse=True) @@ -735,7 +815,7 @@ def _build_table(statistic_data, append('') ###### Print Distribution Summary Report ###### - if TracerEventType.Communication in statistic_data.time_range_summary.CPUTimeRange: + if statistic_data.distributed_summary.communication_range: headers = [ 'Name', 'Total Time', @@ -759,33 +839,12 @@ def _build_table(statistic_data, append(header_sep) append(row_format.format(*headers)) append(header_sep) - cpu_communication_time_range = [] - gpu_communication_time_range = [] - cpu_communication_time_range = merge_ranges( - statistic_data.time_range_summary.CPUTimeRange[ - TracerEventType.Communication], cpu_communication_time_range) - kernel_time_range = [] - for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( - ): - kernel_time_range = merge_ranges( - device_time_ranges[TracerEventType.Kernel], - kernel_time_range, - is_sorted=True) - gpu_communication_time_range = merge_ranges( - device_time_ranges[TracerEventType.Communication], - gpu_communication_time_range, - is_sorted=True) - communication_time_range = merge_ranges( - cpu_communication_time_range, - gpu_communication_time_range, - is_sorted=True) - computation_time_range = subtract_ranges(kernel_time_range, - gpu_communication_time_range) - overlap_time_range = intersection_ranges(communication_time_range, - computation_time_range) - communication_time = sum_ranges(communication_time_range) - computation_time = sum_ranges(computation_time_range) - overlap_time = sum_ranges(overlap_time_range) + communication_time = sum_ranges( + statistic_data.distributed_summary.communication_range) + computation_time = sum_ranges( + statistic_data.distributed_summary.computation_range) + overlap_time = sum_ranges( + statistic_data.distributed_summary.overlap_range) row_values = [ 'Communication', format_time( communication_time, unit=time_unit), @@ -808,9 +867,9 @@ def _build_table(statistic_data, append(row_format.format(*row_values)) append(header_sep) append( - "Note:\nCommunication time: Communication Op time and its kernel time on gpu.\n" - "Computation time: Kernel time, substract kernels belong to communication op.\n" - "Overlap time: Communication time intersect with computation time.\n" + "Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n" + "Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n" + "Overlap time: Communication time intersects with computation time.\n" "Example:\n" "Communication:\n" " CPU: |_________________|\n" @@ -976,7 +1035,7 @@ def _build_table(statistic_data, - 5] device_node_name += "..." row_values = [ - ' {}'.format(device_node_name), devicenode.call, + ' {}'.format(device_node_name), devicenode.call, '- / - / - / - / -', '{} / {} / {} / {} / {}'.format( format_time( @@ -1001,10 +1060,14 @@ def _build_table(statistic_data, 'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)', 'GPU Total / Avg / Max / Min / Ratio(%)' ] + name_column_width = 0 + for name, item in statistic_data.event_summary.memory_manipulation_items.items( + ): + name_column_width = max(len(name), name_column_width) row_format_list = [""] header_sep_list = [""] line_length_list = [-SPACING_SIZE] - name_column_width = 30 + name_column_width = min(name_column_width, 45) add_column(name_column_width) add_column(6) add_column(40) @@ -1059,7 +1122,7 @@ def _build_table(statistic_data, row_format_list = [""] header_sep_list = [""] line_length_list = [-SPACING_SIZE] - name_column_width = 30 + name_column_width = 35 add_column(name_column_width) add_column(6) add_column(40) diff --git a/python/paddle/profiler/statistic_helper.py b/python/paddle/profiler/statistic_helper.py index 1f11649928a7fe72b0a4ae6f368c3c92debf060c..76dd1f0a6439d14dd69182b747cd9de6a8444d59 100644 --- a/python/paddle/profiler/statistic_helper.py +++ b/python/paddle/profiler/statistic_helper.py @@ -96,6 +96,7 @@ def merge_ranges(range_list1, range_list2, is_sorted=False): else: indx1 += 1 while indx2 < len2: + range2 = range_list2[indx2] if range2[1] > merged_ranges[-1][1]: if range2[0] <= merged_ranges[-1][1]: merged_ranges[-1] = (merged_ranges[-1][0], range2[1]) diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index 7fa7a27bad7bf5ffbefdddb28d67e2d65e319e6d..1acd583ee0f1a98f4e830385b3f8340449512e81 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -17,8 +17,8 @@ from warnings import warn import functools from contextlib import ContextDecorator +from paddle.fluid import core from paddle.fluid.core import (_RecordEvent, TracerEventType) -import paddle.fluid.core as core _AllowedEventTypeList = [ TracerEventType.Dataloader, TracerEventType.ProfileStep, @@ -30,13 +30,15 @@ _AllowedEventTypeList = [ class RecordEvent(ContextDecorator): r""" - Interface for recording a time range. + Interface for recording a time range by user defined. - Parameters: + Args: name(str): Name of the record event + event_type(TracerEventType, optional): Optional, default value is TracerEventType.UserDefined. It is reserved for internal purpose, and it is better not to specify this parameter. Examples: .. code-block:: python + :name: code-example1 import paddle import paddle.profiler as profiler @@ -53,8 +55,8 @@ class RecordEvent(ContextDecorator): result = data1 + data2 record_event.end() - Note: - RecordEvent will take effect only when profiler is on and at the state of RECORD. + **Note**: + RecordEvent will take effect only when :ref:`Profiler ` is on and at the state of RECORD. """ def __init__(self, @@ -76,6 +78,7 @@ class RecordEvent(ContextDecorator): Record the time of begining. .. code-block:: python + :name: code-example2 import paddle import paddle.profiler as profiler @@ -100,6 +103,7 @@ class RecordEvent(ContextDecorator): Record the time of ending. .. code-block:: python + :name: code-example3 import paddle import paddle.profiler as profiler @@ -118,14 +122,15 @@ def load_profiler_result(filename: str): r""" Load dumped profiler data back to memory. - Parameters: + Args: filename(str): Name of the exported protobuf file of profiler data. Returns: - ProfilerResult object. + ProfilerResult object, which stores profiling data. Examples: .. code-block:: python + :name: code-example1 # required: gpu import paddle.profiler as profiler