未验证 提交 234b530d 编写于 作者: T Tao Luo 提交者: GitHub

refine profiler, name_scope document (#20431)

* refine profiler document (#20326)

* refine profiler document

test=develop test=document_fix

* update profiler document

test=develop test=document_fix

* refine profiler, name_scope document

test=develop test=document_fix
上级 dfde0eaa
...@@ -10,7 +10,7 @@ paddle.fluid.Program.to_string (ArgSpec(args=['self', 'throw_on_error', 'with_de ...@@ -10,7 +10,7 @@ paddle.fluid.Program.to_string (ArgSpec(args=['self', 'throw_on_error', 'with_de
paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f53890b2fb8c0642b6047e4fee2d6d58')) paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f53890b2fb8c0642b6047e4fee2d6d58'))
paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '853718df675e59aea7104f3d61bbf11d')) paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '853718df675e59aea7104f3d61bbf11d'))
paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', '78fb5c7f70ef76bcf4a1862c3f6b8191')) paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', '78fb5c7f70ef76bcf4a1862c3f6b8191'))
paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '917d313881ff990de5fb18d98a9c7b42')) paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '907a5f877206079d8e67ae69b06bb3ba'))
paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ab9bd2079536114aa7c1488a489ee87f')) paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ab9bd2079536114aa7c1488a489ee87f'))
paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a7352a3dd39308fde4fbbf6421a4193d')) paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a7352a3dd39308fde4fbbf6421a4193d'))
paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '567ac29567716fd8e7432b533337d529')) paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '567ac29567716fd8e7432b533337d529'))
...@@ -1088,11 +1088,11 @@ paddle.fluid.dygraph_grad_clip.GradClipByNorm ('paddle.fluid.dygraph_grad_clip.G ...@@ -1088,11 +1088,11 @@ paddle.fluid.dygraph_grad_clip.GradClipByNorm ('paddle.fluid.dygraph_grad_clip.G
paddle.fluid.dygraph_grad_clip.GradClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph_grad_clip.GradClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm ('paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm', ('document', 'd1872377e7d7a5fe0dd2e8c42e4c9656')) paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm ('paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm', ('document', 'd1872377e7d7a5fe0dd2e8c42e4c9656'))
paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'max_global_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'max_global_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '4053b45953807a24e28027dc86829d6c')) paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6ae5833bd2490c6a3bdcae0d31ce5ec5'))
paddle.fluid.profiler.reset_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'fd1f25a7a06516ca9a1f4ab0783a4d70')) paddle.fluid.profiler.reset_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'fd1f25a7a06516ca9a1f4ab0783a4d70'))
paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'a2be24e028dffa06ab28cc55a27c59e4')) paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', '8e8d777eb0127876d7bdb6c421db7f5c'))
paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '4c192ea399e6e80b1ab47a8265b022a5')) paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '9494b48e79a0e07b49017ba5a97800b6'))
paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'bc8628b859b04242200e48a458c971c4')) paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', '10406b144bd8b5e01ea44301219f7fef'))
paddle.fluid.unique_name.generate (ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None), ('document', '4d68cde4c4df8f1b8018620b4dc19b42')) paddle.fluid.unique_name.generate (ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None), ('document', '4d68cde4c4df8f1b8018620b4dc19b42'))
paddle.fluid.unique_name.switch (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '695a6e91afbcdbafac69a069038811be')) paddle.fluid.unique_name.switch (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '695a6e91afbcdbafac69a069038811be'))
paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ead717d6d440a1eb11971695cd1727f4')) paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ead717d6d440a1eb11971695cd1727f4'))
......
...@@ -285,18 +285,19 @@ def name_scope(prefix=None): ...@@ -285,18 +285,19 @@ def name_scope(prefix=None):
""" """
Generate hierarchical name prefix for the operators. Generate hierarchical name prefix for the operators.
Note: This should only be used for debugging and visualization purposes. Note:
This should only be used for debugging and visualization purposes.
Don't use it for serious analysis such as graph/program transformations. Don't use it for serious analysis such as graph/program transformations.
Args: Args:
prefix(str): prefix. prefix(str, optional): prefix. Default is None.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle.fluid as fluid
with fluid.name_scope("s1"): with fluid.name_scope("s1"):
a = fluid.layers.data(name='data', shape=[1], dtype='int32') a = fluid.data(name='data', shape=[None, 1], dtype='int32')
b = a + 1 b = a + 1
with fluid.name_scope("s2"): with fluid.name_scope("s2"):
c = b * 1 c = b * 1
...@@ -306,6 +307,24 @@ def name_scope(prefix=None): ...@@ -306,6 +307,24 @@ def name_scope(prefix=None):
f = fluid.layers.pow(d, 2.0) f = fluid.layers.pow(d, 2.0)
with fluid.name_scope("s4"): with fluid.name_scope("s4"):
g = f - 1 g = f - 1
# Op are created in the default main program.
for op in fluid.default_main_program().block(0).ops:
# elementwise_add is created in /s1/
if op.type == 'elementwise_add':
assert op.desc.attr("op_namescope") == '/s1/'
# elementwise_mul is created in '/s1/s2'
elif op.type == 'elementwise_mul':
assert op.desc.attr("op_namescope") == '/s1/s2/'
# elementwise_div is created in '/s1/s3'
elif op.type == 'elementwise_div':
assert op.desc.attr("op_namescope") == '/s1/s3/'
# elementwise_sub is created in '/s4/'
elif op.type == 'elementwise_sub':
assert op.desc.attr("op_namescope") == '/s4/'
# pow is created in /s1_1/
elif op.type == 'pow':
assert op.desc.attr("op_namescope") == '/s1_1/'
""" """
# TODO(panyx0718): Only [0-9a-z]. # TODO(panyx0718): Only [0-9a-z].
# in dygraph we don't need namescope since it will cause mem leak # in dygraph we don't need namescope since it will cause mem leak
......
...@@ -37,25 +37,27 @@ NVPROF_CONFIG = [ ...@@ -37,25 +37,27 @@ NVPROF_CONFIG = [
@signature_safe_contextmanager @signature_safe_contextmanager
def cuda_profiler(output_file, output_mode=None, config=None): def cuda_profiler(output_file, output_mode=None, config=None):
"""The CUDA profiler. """
The CUDA profiler.
This function is used to profile CUDA program by CUDA runtime application This function is used to profile CUDA program by CUDA runtime application
programming interface. The profiling result will be written into programming interface. The profiling result will be written into
`output_file` with Key-Value pair format or Comma separated values format. `output_file`. The users can set the output mode by `output_mode` argument
The user can set the output mode by `output_mode` argument and set the and set the nvidia profiling config by `config` argument.
counters/options for profiling by `config` argument. The default config
is ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d', After getting the profiling result file, users can use
'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace']. `NVIDIA Visual Profiler <https://developer.nvidia.com/nvidia-visual-profiler>`_
Then users can use NVIDIA Visual Profiler to load this output file to visualize results.
(https://developer.nvidia.com/nvidia-visual-profiler) tools to load this
this output file to visualize results.
Args: Args:
output_file (string) : The output file name, the result will be output_file (str) : The output file name, the result will be
written into this file. written into this file.
output_mode (string) : The output mode has Key-Value pair format and output_mode (str, optional) : The output mode has Key-Value pair format ('kvp')
Comma separated values format. It should be 'kvp' or 'csv'. and Comma separated values format ('csv', default).
config (list of string) : The profiler options and counters can refer config (list<str>, optional) : Nvidia profile config. Default config is
to "Compute Command Line Profiler User Guide". ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d', 'threadblocksize',
'streamid', 'enableonstart 0', 'conckerneltrace']. For more details, please
refer to `Compute Command Line Profiler User Guide <https://developer.download.nvidia.cn/compute/DevZone/docs/html/C/doc/Compute_Command_Line_Profiler_User_Guide.pdf>`_ .
Raises: Raises:
ValueError: If `output_mode` is not in ['kvp', 'csv']. ValueError: If `output_mode` is not in ['kvp', 'csv'].
...@@ -70,7 +72,7 @@ def cuda_profiler(output_file, output_mode=None, config=None): ...@@ -70,7 +72,7 @@ def cuda_profiler(output_file, output_mode=None, config=None):
epoc = 8 epoc = 8
dshape = [4, 3, 28, 28] dshape = [4, 3, 28, 28]
data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32') data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32')
conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
place = fluid.CUDAPlace(0) place = fluid.CUDAPlace(0)
...@@ -127,13 +129,14 @@ def reset_profiler(): ...@@ -127,13 +129,14 @@ def reset_profiler():
def start_profiler(state): def start_profiler(state):
""" """
Enable the profiler. Users can use `fluid.profiler.start_profiler` and Enable the profiler. Users can use `fluid.profiler.start_profiler` and
`fluid.profiler.stop_profiler` to insert the code, except the usage of `fluid.profiler.stop_profiler` to profile, which is equal to the usage
`fluid.profiler.profiler` interface. of `fluid.profiler.profiler` interface.
Args: Args:
state (string) : The profiling state, which should be 'CPU', 'GPU' state (str) : The profiling state, which should be one of 'CPU', 'GPU'
or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling
GPU as well. 'All' also generates timeline. both CPU and GPU; 'All' means profiling both CPU and GPU, and
generates timeline as well.
Raises: Raises:
ValueError: If `state` is not in ['CPU', 'GPU', 'All']. ValueError: If `state` is not in ['CPU', 'GPU', 'All'].
...@@ -168,21 +171,21 @@ def start_profiler(state): ...@@ -168,21 +171,21 @@ def start_profiler(state):
def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
""" """
Stop the profiler. Users can use `fluid.profiler.start_profiler` and Stop the profiler. Users can use `fluid.profiler.start_profiler` and
`fluid.profiler.stop_profiler` to insert the code, except the usage of `fluid.profiler.stop_profiler` to profile, which is equal to the usage
`fluid.profiler.profiler` interface. of `fluid.profiler.profiler` interface.
Args: Args:
sorted_key (string) : If None, the profiling results will be printed sorted_key (str, optional) : The order of profiling results, which
in the order of first end time of events. Otherwise, the profiling should be one of None, 'calls', 'total', 'max', 'min' or 'ave'.
results will be sorted by the this flag. This flag should be one Default is None, means the profiling results will be printed
of 'calls', 'total', 'max', 'min' or 'ave'. in the order of first end time of events.
The `calls` means sorting by the number of calls. The `calls` means sorting by the number of calls.
The `total` means sorting by the total execution time. The `total` means sorting by the total execution time.
The `max` means sorting by the maximum execution time. The `max` means sorting by the maximum execution time.
The `min` means sorting by the minimum execution time. The `min` means sorting by the minimum execution time.
The `ave` means sorting by the average execution time. The `ave` means sorting by the average execution time.
profile_path (string) : If state == 'All', it will write a profile profile_path (str, optional) : If state == 'All', it will generate timeline,
proto output file. and write it into `profile_path`. The default profile_path is `/tmp/profile`.
Raises: Raises:
ValueError: If `sorted_key` is not in ValueError: If `sorted_key` is not in
...@@ -223,34 +226,26 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): ...@@ -223,34 +226,26 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
@signature_safe_contextmanager @signature_safe_contextmanager
def profiler(state, sorted_key=None, profile_path='/tmp/profile'): def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
"""The profiler interface. """
Different from cuda_profiler, this profiler can be used to profile both CPU The profiler interface. Different from `fluid.profiler.cuda_profiler`,
and GPU program. By default, it records the CPU and GPU operator kernels, this profiler can be used to profile both CPU and GPU program.
if you want to profile other program, you can refer the profiling tutorial
to add more records in C++ code.
If the state == 'All', a profile proto file will be written to
`profile_path`. This file records timeline information during the execution.
Then users can visualize this file to see the timeline, please refer
https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md
Args: Args:
state (string) : The profiling state, which should be 'CPU' or 'GPU', state (str) : The profiling state, which should be one of 'CPU', 'GPU'
telling the profiler to use CPU timer or GPU timer for profiling. or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling
Although users may have already specified the execution place both CPU and GPU; 'All' means profiling both CPU and GPU, and
(CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler generates timeline as well.
would not inherit this place. sorted_key (str, optional) : The order of profiling results, which
sorted_key (string) : If None, the profiling results will be printed should be one of None, 'calls', 'total', 'max', 'min' or 'ave'.
in the order of first end time of events. Otherwise, the profiling Default is None, means the profiling results will be printed
results will be sorted by the this flag. This flag should be one in the order of first end time of events.
of 'calls', 'total', 'max', 'min' or 'ave'.
The `calls` means sorting by the number of calls. The `calls` means sorting by the number of calls.
The `total` means sorting by the total execution time. The `total` means sorting by the total execution time.
The `max` means sorting by the maximum execution time. The `max` means sorting by the maximum execution time.
The `min` means sorting by the minimum execution time. The `min` means sorting by the minimum execution time.
The `ave` means sorting by the average execution time. The `ave` means sorting by the average execution time.
profile_path (string) : If state == 'All', it will write a profile profile_path (str, optional) : If state == 'All', it will generate timeline,
proto output file. and write it into `profile_path`. The default profile_path is `/tmp/profile`.
Raises: Raises:
ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is
...@@ -266,7 +261,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): ...@@ -266,7 +261,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
epoc = 8 epoc = 8
dshape = [4, 3, 28, 28] dshape = [4, 3, 28, 28]
data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32') data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32')
conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
place = fluid.CPUPlace() place = fluid.CPUPlace()
...@@ -277,6 +272,44 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): ...@@ -277,6 +272,44 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
for i in range(epoc): for i in range(epoc):
input = np.random.random(dshape).astype('float32') input = np.random.random(dshape).astype('float32')
exe.run(fluid.default_main_program(), feed={'data': input}) exe.run(fluid.default_main_program(), feed={'data': input})
Examples Results:
.. code-block:: text
#### Examples Results ####
#### 1) sorted_key = 'total', 'calls', 'max', 'min', 'ave' ####
# The only difference in 5 sorted_key results is the following sentence:
# "Sorted by number of xxx in descending order in the same thread."
# The reason is that in this example, above 5 columns are already sorted.
-------------------------> Profiling Report <-------------------------
Place: CPU
Time unit: ms
Sorted by total time in descending order in the same thread
#Sorted by number of calls in descending order in the same thread
#Sorted by number of max in descending order in the same thread
#Sorted by number of min in descending order in the same thread
#Sorted by number of avg in descending order in the same thread
Event Calls Total Min. Max. Ave. Ratio.
thread0::conv2d 8 129.406 0.304303 127.076 16.1758 0.983319
thread0::elementwise_add 8 2.11865 0.193486 0.525592 0.264832 0.016099
thread0::feed 8 0.076649 0.006834 0.024616 0.00958112 0.000582432
#### 2) sorted_key = None ####
# Since the profiling results are printed in the order of first end time of Ops,
# the printed order is feed->conv2d->elementwise_add
-------------------------> Profiling Report <-------------------------
Place: CPU
Time unit: ms
Sorted by event first end time in descending order in the same thread
Event Calls Total Min. Max. Ave. Ratio.
thread0::feed 8 0.077419 0.006608 0.023349 0.00967738 0.00775934
thread0::conv2d 8 7.93456 0.291385 5.63342 0.99182 0.795243
thread0::elementwise_add 8 1.96555 0.191884 0.518004 0.245693 0.196998
""" """
start_profiler(state) start_profiler(state)
yield yield
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册