From 234b530d5274906d0f2966e1ec7907195f318a68 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Thu, 10 Oct 2019 22:23:11 +0800 Subject: [PATCH] refine profiler, name_scope document (#20431) * refine profiler document (#20326) * refine profiler document test=develop test=document_fix * update profiler document test=develop test=document_fix * refine profiler, name_scope document test=develop test=document_fix --- paddle/fluid/API.spec | 10 +-- python/paddle/fluid/framework.py | 41 +++++++--- python/paddle/fluid/profiler.py | 133 +++++++++++++++++++------------ 3 files changed, 118 insertions(+), 66 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index ebce930fe7c..e1049bd3603 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -10,7 +10,7 @@ paddle.fluid.Program.to_string (ArgSpec(args=['self', 'throw_on_error', 'with_de paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f53890b2fb8c0642b6047e4fee2d6d58')) paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '853718df675e59aea7104f3d61bbf11d')) paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', '78fb5c7f70ef76bcf4a1862c3f6b8191')) -paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '917d313881ff990de5fb18d98a9c7b42')) +paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '907a5f877206079d8e67ae69b06bb3ba')) paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ab9bd2079536114aa7c1488a489ee87f')) paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a7352a3dd39308fde4fbbf6421a4193d')) paddle.fluid.cuda_pinned_places 
(ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '567ac29567716fd8e7432b533337d529')) @@ -1088,11 +1088,11 @@ paddle.fluid.dygraph_grad_clip.GradClipByNorm ('paddle.fluid.dygraph_grad_clip.G paddle.fluid.dygraph_grad_clip.GradClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm ('paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm', ('document', 'd1872377e7d7a5fe0dd2e8c42e4c9656')) paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'max_global_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '4053b45953807a24e28027dc86829d6c')) +paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6ae5833bd2490c6a3bdcae0d31ce5ec5')) paddle.fluid.profiler.reset_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'fd1f25a7a06516ca9a1f4ab0783a4d70')) -paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'a2be24e028dffa06ab28cc55a27c59e4')) -paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '4c192ea399e6e80b1ab47a8265b022a5')) -paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'bc8628b859b04242200e48a458c971c4')) +paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, 
defaults=(None, '/tmp/profile')), ('document', '8e8d777eb0127876d7bdb6c421db7f5c')) +paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '9494b48e79a0e07b49017ba5a97800b6')) +paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', '10406b144bd8b5e01ea44301219f7fef')) paddle.fluid.unique_name.generate (ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None), ('document', '4d68cde4c4df8f1b8018620b4dc19b42')) paddle.fluid.unique_name.switch (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '695a6e91afbcdbafac69a069038811be')) paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ead717d6d440a1eb11971695cd1727f4')) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index f174f6e3fbf..622eb247c7a 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -285,27 +285,46 @@ def name_scope(prefix=None): """ Generate hierarchical name prefix for the operators. - Note: This should only used for debugging and visualization purpose. - Don't use it for serious analysis such as graph/program transformations. + Note: + This should only be used for debugging and visualization purpose. + Don't use it for serious analysis such as graph/program transformations. Args: - prefix(str): prefix. + prefix(str, optional): prefix. Default is None. Examples: .. 
code-block:: python import paddle.fluid as fluid with fluid.name_scope("s1"): - a = fluid.layers.data(name='data', shape=[1], dtype='int32') - b = a + 1 - with fluid.name_scope("s2"): - c = b * 1 - with fluid.name_scope("s3"): - d = c / 1 + a = fluid.data(name='data', shape=[None, 1], dtype='int32') + b = a + 1 + with fluid.name_scope("s2"): + c = b * 1 + with fluid.name_scope("s3"): + d = c / 1 with fluid.name_scope("s1"): - f = fluid.layers.pow(d, 2.0) + f = fluid.layers.pow(d, 2.0) with fluid.name_scope("s4"): - g = f - 1 + g = f - 1 + + # Ops are created in the default main program. + for op in fluid.default_main_program().block(0).ops: + # elementwise_add is created in /s1/ + if op.type == 'elementwise_add': + assert op.desc.attr("op_namescope") == '/s1/' + # elementwise_mul is created in '/s1/s2/' + elif op.type == 'elementwise_mul': + assert op.desc.attr("op_namescope") == '/s1/s2/' + # elementwise_div is created in '/s1/s3/' + elif op.type == 'elementwise_div': + assert op.desc.attr("op_namescope") == '/s1/s3/' + # elementwise_sub is created in '/s4/' + elif op.type == 'elementwise_sub': + assert op.desc.attr("op_namescope") == '/s4/' + # pow is created in /s1_1/ + elif op.type == 'pow': + assert op.desc.attr("op_namescope") == '/s1_1/' """ # TODO(panyx0718): Only [0-9a-z]. # in dygraph we don't need namescope since it will cause mem leak diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index b0e168929b4..82b29e25fde 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -37,25 +37,27 @@ NVPROF_CONFIG = [ @signature_safe_contextmanager def cuda_profiler(output_file, output_mode=None, config=None): - """The CUDA profiler. + """ + The CUDA profiler. + This fuctions is used to profile CUDA program by CUDA runtime application programming interface. The profiling result will be written into - `output_file` with Key-Value pair format or Comma separated values format. 
- The user can set the output mode by `output_mode` argument and set the - counters/options for profiling by `config` argument. The default config - is ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d', - 'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace']. - Then users can use NVIDIA Visual Profiler - (https://developer.nvidia.com/nvidia-visual-profiler) tools to load this - this output file to visualize results. + `output_file`. The users can set the output mode by `output_mode` argument + and set the nvidia profiling config by `config` argument. + + After getting the profiling result file, users can use + `NVIDIA Visual Profiler `_ + to load this output file to visualize results. Args: - output_file (string) : The output file name, the result will be + output_file (str) : The output file name, the result will be written into this file. - output_mode (string) : The output mode has Key-Value pair format and - Comma separated values format. It should be 'kvp' or 'csv'. - config (list of string) : The profiler options and counters can refer - to "Compute Command Line Profiler User Guide". + output_mode (str, optional) : The output mode has Key-Value pair format ('kvp') + and Comma separated values format ('csv', default). + config (list, optional) : Nvidia profile config. Default config is + ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d', 'threadblocksize', + 'streamid', 'enableonstart 0', 'conckerneltrace']. For more details, please + refer to `Compute Command Line Profiler User Guide `_ . Raises: ValueError: If `output_mode` is not in ['kvp', 'csv']. 
@@ -70,7 +72,7 @@ def cuda_profiler(output_file, output_mode=None, config=None): epoc = 8 dshape = [4, 3, 28, 28] - data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32') + data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32') conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) place = fluid.CUDAPlace(0) @@ -127,13 +129,14 @@ def reset_profiler(): def start_profiler(state): """ Enable the profiler. Uers can use `fluid.profiler.start_profiler` and - `fluid.profiler.stop_profiler` to insert the code, except the usage of - `fluid.profiler.profiler` interface. + `fluid.profiler.stop_profiler` to profile, which is equal to the usage + of `fluid.profiler.profiler` interface. Args: - state (string) : The profiling state, which should be 'CPU', 'GPU' - or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling - GPU as well. 'All' also generates timeline. + state (str) : The profiling state, which should be one of 'CPU', 'GPU' + or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling + both CPU and GPU; 'All' means profiling both CPU and GPU, and + generates timeline as well. Raises: ValueError: If `state` is not in ['CPU', 'GPU', 'All']. @@ -168,21 +171,21 @@ def start_profiler(state): def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): """ Stop the profiler. Uers can use `fluid.profiler.start_profiler` and - `fluid.profiler.stop_profiler` to insert the code, except the usage of - `fluid.profiler.profiler` interface. + `fluid.profiler.stop_profiler` to profile, which is equal to the usage + of `fluid.profiler.profiler` interface. Args: - sorted_key (string) : If None, the profiling results will be printed - in the order of first end time of events. Otherwise, the profiling - results will be sorted by the this flag. This flag should be one - of 'calls', 'total', 'max', 'min' or 'ave'. 
+ sorted_key (str, optional) : The order of profiling results, which + should be one of None, 'calls', 'total', 'max', 'min' or 'ave'. + Default is None, means the profiling results will be printed + in the order of first end time of events. The `calls` means sorting by the number of calls. The `total` means sorting by the total execution time. The `max` means sorting by the maximum execution time. The `min` means sorting by the minimum execution time. The `ave` means sorting by the average execution time. - profile_path (string) : If state == 'All', it will write a profile - proto output file. + profile_path (str, optional) : If state == 'All', it will generate timeline, + and write it into `profile_path`. The default profile_path is `/tmp/profile`. Raises: ValueError: If `sorted_key` is not in @@ -223,34 +226,26 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): @signature_safe_contextmanager def profiler(state, sorted_key=None, profile_path='/tmp/profile'): - """The profiler interface. - Different from cuda_profiler, this profiler can be used to profile both CPU - and GPU program. By default, it records the CPU and GPU operator kernels, - if you want to profile other program, you can refer the profiling tutorial - to add more records in C++ code. - - If the state == 'All', a profile proto file will be written to - `profile_path`. This file records timeline information during the execution. - Then users can visualize this file to see the timeline, please refer - https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md + """ + The profiler interface. Different from `fluid.profiler.cuda_profiler`, + this profiler can be used to profile both CPU and GPU program. Args: - state (string) : The profiling state, which should be 'CPU' or 'GPU', - telling the profiler to use CPU timer or GPU timer for profiling. 
- Although users may have already specified the execution place - (CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler - would not inherit this place. - sorted_key (string) : If None, the profiling results will be printed - in the order of first end time of events. Otherwise, the profiling - results will be sorted by the this flag. This flag should be one - of 'calls', 'total', 'max', 'min' or 'ave'. + state (str) : The profiling state, which should be one of 'CPU', 'GPU' + or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling + both CPU and GPU; 'All' means profiling both CPU and GPU, and + generates timeline as well. + sorted_key (str, optional) : The order of profiling results, which + should be one of None, 'calls', 'total', 'max', 'min' or 'ave'. + Default is None, means the profiling results will be printed + in the order of first end time of events. The `calls` means sorting by the number of calls. The `total` means sorting by the total execution time. The `max` means sorting by the maximum execution time. The `min` means sorting by the minimum execution time. The `ave` means sorting by the average execution time. - profile_path (string) : If state == 'All', it will write a profile - proto output file. + profile_path (str, optional) : If state == 'All', it will generate timeline, + and write it into `profile_path`. The default profile_path is `/tmp/profile`. Raises: ValueError: If `state` is not in ['CPU', 'GPU', 'All']. 
If `sorted_key` is @@ -266,7 +261,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): epoc = 8 dshape = [4, 3, 28, 28] - data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32') + data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32') conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) place = fluid.CPUPlace() @@ -277,6 +272,44 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): for i in range(epoc): input = np.random.random(dshape).astype('float32') exe.run(fluid.default_main_program(), feed={'data': input}) + + Examples Results: + + .. code-block:: text + + #### Examples Results #### + #### 1) sorted_key = 'total', 'calls', 'max', 'min', 'ave' #### + # The only difference in 5 sorted_key results is the following sentence: + # "Sorted by number of xxx in descending order in the same thread." + # The reason is that in this example, above 5 columns are already sorted. + -------------------------> Profiling Report <------------------------- + + Place: CPU + Time unit: ms + Sorted by total time in descending order in the same thread + #Sorted by number of calls in descending order in the same thread + #Sorted by number of max in descending order in the same thread + #Sorted by number of min in descending order in the same thread + #Sorted by number of avg in descending order in the same thread + + Event Calls Total Min. Max. Ave. Ratio. 
+ thread0::conv2d 8 129.406 0.304303 127.076 16.1758 0.983319 + thread0::elementwise_add 8 2.11865 0.193486 0.525592 0.264832 0.016099 + thread0::feed 8 0.076649 0.006834 0.024616 0.00958112 0.000582432 + + #### 2) sorted_key = None #### + # Since the profiling results are printed in the order of first end time of Ops, + # the printed order is feed->conv2d->elementwise_add + -------------------------> Profiling Report <------------------------- + + Place: CPU + Time unit: ms + Sorted by event first end time in descending order in the same thread + + Event Calls Total Min. Max. Ave. Ratio. + thread0::feed 8 0.077419 0.006608 0.023349 0.00967738 0.00775934 + thread0::conv2d 8 7.93456 0.291385 5.63342 0.99182 0.795243 + thread0::elementwise_add 8 1.96555 0.191884 0.518004 0.245693 0.196998 """ start_profiler(state) yield -- GitLab