diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 8148c613a2d6aee0f11be7ac465cdd8641c23b74..c0decac1780f8f7bde43d320624aecca3dd4fbbb 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -1125,11 +1125,11 @@ paddle.fluid.dygraph_grad_clip.GradClipByNorm ('paddle.fluid.dygraph_grad_clip.G paddle.fluid.dygraph_grad_clip.GradClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm ('paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm', ('document', 'd1872377e7d7a5fe0dd2e8c42e4c9656')) paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'max_global_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '4053b45953807a24e28027dc86829d6c')) +paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6ae5833bd2490c6a3bdcae0d31ce5ec5')) paddle.fluid.profiler.reset_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'fd1f25a7a06516ca9a1f4ab0783a4d70')) -paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'a2be24e028dffa06ab28cc55a27c59e4')) -paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '4c192ea399e6e80b1ab47a8265b022a5')) -paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'bc8628b859b04242200e48a458c971c4')) +paddle.fluid.profiler.profiler 
(ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', '8e8d777eb0127876d7bdb6c421db7f5c')) +paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '9494b48e79a0e07b49017ba5a97800b6')) +paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', '10406b144bd8b5e01ea44301219f7fef')) paddle.fluid.unique_name.generate (ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None), ('document', '4d68cde4c4df8f1b8018620b4dc19b42')) paddle.fluid.unique_name.switch (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '695a6e91afbcdbafac69a069038811be')) paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ead717d6d440a1eb11971695cd1727f4')) diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index b0e168929b46a1dd1410d126e093883d79b99895..82b29e25fde11ed488440fb1486b954144ea7cc7 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -37,25 +37,27 @@ NVPROF_CONFIG = [ @signature_safe_contextmanager def cuda_profiler(output_file, output_mode=None, config=None): - """The CUDA profiler. + """ + The CUDA profiler. + This fuctions is used to profile CUDA program by CUDA runtime application programming interface. The profiling result will be written into - `output_file` with Key-Value pair format or Comma separated values format. - The user can set the output mode by `output_mode` argument and set the - counters/options for profiling by `config` argument. The default config - is ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d', - 'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace']. 
- Then users can use NVIDIA Visual Profiler - (https://developer.nvidia.com/nvidia-visual-profiler) tools to load this - this output file to visualize results. + `output_file`. The users can set the output mode by `output_mode` argument + and set the nvidia profiling config by `config` argument. + + After getting the profiling result file, users can use + `NVIDIA Visual Profiler <https://developer.nvidia.com/nvidia-visual-profiler>`_ + to load this output file to visualize results. Args: - output_file (string) : The output file name, the result will be + output_file (str) : The output file name, the result will be written into this file. - output_mode (string) : The output mode has Key-Value pair format and - Comma separated values format. It should be 'kvp' or 'csv'. - config (list of string) : The profiler options and counters can refer - to "Compute Command Line Profiler User Guide". + output_mode (str, optional) : The output mode has Key-Value pair format ('kvp') + and Comma separated values format ('csv', default). + config (list, optional) : Nvidia profile config. Default config is + ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d', 'threadblocksize', + 'streamid', 'enableonstart 0', 'conckerneltrace']. For more details, please + refer to `Compute Command Line Profiler User Guide `_ . Raises: ValueError: If `output_mode` is not in ['kvp', 'csv']. @@ -70,7 +72,7 @@ def cuda_profiler(output_file, output_mode=None, config=None): epoc = 8 dshape = [4, 3, 28, 28] - data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32') + data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32') conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) place = fluid.CUDAPlace(0) @@ -127,13 +129,14 @@ def reset_profiler(): def start_profiler(state): """ Enable the profiler. Uers can use `fluid.profiler.start_profiler` and - `fluid.profiler.stop_profiler` to insert the code, except the usage of - `fluid.profiler.profiler` interface. 
+ `fluid.profiler.stop_profiler` to profile, which is equal to the usage + of `fluid.profiler.profiler` interface. Args: - state (string) : The profiling state, which should be 'CPU', 'GPU' - or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling - GPU as well. 'All' also generates timeline. + state (str) : The profiling state, which should be one of 'CPU', 'GPU' + or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling + both CPU and GPU; 'All' means profiling both CPU and GPU, and + generates timeline as well. Raises: ValueError: If `state` is not in ['CPU', 'GPU', 'All']. @@ -168,21 +171,21 @@ def start_profiler(state): def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): """ Stop the profiler. Uers can use `fluid.profiler.start_profiler` and - `fluid.profiler.stop_profiler` to insert the code, except the usage of - `fluid.profiler.profiler` interface. + `fluid.profiler.stop_profiler` to profile, which is equal to the usage + of `fluid.profiler.profiler` interface. Args: - sorted_key (string) : If None, the profiling results will be printed - in the order of first end time of events. Otherwise, the profiling - results will be sorted by the this flag. This flag should be one - of 'calls', 'total', 'max', 'min' or 'ave'. + sorted_key (str, optional) : The order of profiling results, which + should be one of None, 'calls', 'total', 'max', 'min' or 'ave'. + Default is None, means the profiling results will be printed + in the order of first end time of events. The `calls` means sorting by the number of calls. The `total` means sorting by the total execution time. The `max` means sorting by the maximum execution time. The `min` means sorting by the minimum execution time. The `ave` means sorting by the average execution time. - profile_path (string) : If state == 'All', it will write a profile - proto output file. + profile_path (str, optional) : If state == 'All', it will generate timeline, + and write it into `profile_path`. 
The default profile_path is `/tmp/profile`. Raises: ValueError: If `sorted_key` is not in @@ -223,34 +226,26 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): @signature_safe_contextmanager def profiler(state, sorted_key=None, profile_path='/tmp/profile'): - """The profiler interface. - Different from cuda_profiler, this profiler can be used to profile both CPU - and GPU program. By default, it records the CPU and GPU operator kernels, - if you want to profile other program, you can refer the profiling tutorial - to add more records in C++ code. - - If the state == 'All', a profile proto file will be written to - `profile_path`. This file records timeline information during the execution. - Then users can visualize this file to see the timeline, please refer - https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md + """ + The profiler interface. Different from `fluid.profiler.cuda_profiler`, + this profiler can be used to profile both CPU and GPU program. Args: - state (string) : The profiling state, which should be 'CPU' or 'GPU', - telling the profiler to use CPU timer or GPU timer for profiling. - Although users may have already specified the execution place - (CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler - would not inherit this place. - sorted_key (string) : If None, the profiling results will be printed - in the order of first end time of events. Otherwise, the profiling - results will be sorted by the this flag. This flag should be one - of 'calls', 'total', 'max', 'min' or 'ave'. + state (str) : The profiling state, which should be one of 'CPU', 'GPU' + or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling + both CPU and GPU; 'All' means profiling both CPU and GPU, and + generates timeline as well. + sorted_key (str, optional) : The order of profiling results, which + should be one of None, 'calls', 'total', 'max', 'min' or 'ave'. 
+ Default is None, means the profiling results will be printed + in the order of first end time of events. The `calls` means sorting by the number of calls. The `total` means sorting by the total execution time. The `max` means sorting by the maximum execution time. The `min` means sorting by the minimum execution time. The `ave` means sorting by the average execution time. - profile_path (string) : If state == 'All', it will write a profile - proto output file. + profile_path (str, optional) : If state == 'All', it will generate timeline, + and write it into `profile_path`. The default profile_path is `/tmp/profile`. Raises: ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is @@ -266,7 +261,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): epoc = 8 dshape = [4, 3, 28, 28] - data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32') + data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32') conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) place = fluid.CPUPlace() @@ -277,6 +272,44 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): for i in range(epoc): input = np.random.random(dshape).astype('float32') exe.run(fluid.default_main_program(), feed={'data': input}) + + Examples Results: + + .. code-block:: text + + #### Examples Results #### + #### 1) sorted_key = 'total', 'calls', 'max', 'min', 'ave' #### + # The only difference in 5 sorted_key results is the following sentence: + # "Sorted by number of xxx in descending order in the same thread." + # The reason is that in this example, above 5 columns are already sorted. 
+ -------------------------> Profiling Report <------------------------- + + Place: CPU + Time unit: ms + Sorted by total time in descending order in the same thread + #Sorted by number of calls in descending order in the same thread + #Sorted by number of max in descending order in the same thread + #Sorted by number of min in descending order in the same thread + #Sorted by number of avg in descending order in the same thread + + Event Calls Total Min. Max. Ave. Ratio. + thread0::conv2d 8 129.406 0.304303 127.076 16.1758 0.983319 + thread0::elementwise_add 8 2.11865 0.193486 0.525592 0.264832 0.016099 + thread0::feed 8 0.076649 0.006834 0.024616 0.00958112 0.000582432 + + #### 2) sorted_key = None #### + # Since the profiling results are printed in the order of first end time of Ops, + # the printed order is feed->conv2d->elementwise_add + -------------------------> Profiling Report <------------------------- + + Place: CPU + Time unit: ms + Sorted by event first end time in descending order in the same thread + + Event Calls Total Min. Max. Ave. Ratio. + thread0::feed 8 0.077419 0.006608 0.023349 0.00967738 0.00775934 + thread0::conv2d 8 7.93456 0.291385 5.63342 0.99182 0.795243 + thread0::elementwise_add 8 1.96555 0.191884 0.518004 0.245693 0.196998 """ start_profiler(state) yield