diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 8148c613a2d6aee0f11be7ac465cdd8641c23b74..c0decac1780f8f7bde43d320624aecca3dd4fbbb 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -1125,11 +1125,11 @@ paddle.fluid.dygraph_grad_clip.GradClipByNorm ('paddle.fluid.dygraph_grad_clip.G
 paddle.fluid.dygraph_grad_clip.GradClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm ('paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm', ('document', 'd1872377e7d7a5fe0dd2e8c42e4c9656'))
 paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'max_global_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '4053b45953807a24e28027dc86829d6c'))
+paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6ae5833bd2490c6a3bdcae0d31ce5ec5'))
 paddle.fluid.profiler.reset_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'fd1f25a7a06516ca9a1f4ab0783a4d70'))
-paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'a2be24e028dffa06ab28cc55a27c59e4'))
-paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '4c192ea399e6e80b1ab47a8265b022a5'))
-paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'bc8628b859b04242200e48a458c971c4'))
+paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', '8e8d777eb0127876d7bdb6c421db7f5c'))
+paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '9494b48e79a0e07b49017ba5a97800b6'))
+paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', '10406b144bd8b5e01ea44301219f7fef'))
 paddle.fluid.unique_name.generate (ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None), ('document', '4d68cde4c4df8f1b8018620b4dc19b42'))
 paddle.fluid.unique_name.switch (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '695a6e91afbcdbafac69a069038811be'))
 paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ead717d6d440a1eb11971695cd1727f4'))
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index b0e168929b46a1dd1410d126e093883d79b99895..82b29e25fde11ed488440fb1486b954144ea7cc7 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -37,25 +37,27 @@ NVPROF_CONFIG = [
 
 @signature_safe_contextmanager
 def cuda_profiler(output_file, output_mode=None, config=None):
-    """The CUDA profiler.
+    """
+    The CUDA profiler.
+    
     This fuctions is used to profile CUDA program by CUDA runtime application
     programming interface. The profiling result will be written into
-    `output_file` with Key-Value pair format or Comma separated values format.
-    The user can set the output mode by `output_mode` argument and set the
-    counters/options for profiling by `config` argument. The default config
-    is ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d',
-    'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'].
-    Then users can use NVIDIA Visual Profiler
-    (https://developer.nvidia.com/nvidia-visual-profiler) tools to load this
-    this output file to visualize results.
+    `output_file`. Users can set the output mode with the `output_mode` argument
+    and the NVIDIA profiling configuration with the `config` argument.
+    
+    After getting the profiling result file, users can use 
+    `NVIDIA Visual Profiler <https://developer.nvidia.com/nvidia-visual-profiler>`_ 
+    to load this output file to visualize results.
 
     Args:
-        output_file (string) : The output file name, the result will be
+        output_file (str) : The output file name, the result will be
             written into this file.
-        output_mode (string) : The output mode has Key-Value pair format and
-            Comma separated values format. It should be 'kvp' or 'csv'.
-        config (list of string) : The profiler options and counters can refer
-            to "Compute Command Line Profiler User Guide".
+        output_mode (str, optional) : The output mode has Key-Value pair format ('kvp') 
+            and Comma separated values format ('csv', default).
+        config (list<str>, optional) : Nvidia profile config. Default config is 
+            ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d', 'threadblocksize', 
+            'streamid', 'enableonstart 0', 'conckerneltrace']. For more details, please
+            refer to `Compute Command Line Profiler User Guide <https://developer.download.nvidia.cn/compute/DevZone/docs/html/C/doc/Compute_Command_Line_Profiler_User_Guide.pdf>`_ .
 
     Raises:
         ValueError: If `output_mode` is not in ['kvp', 'csv'].
@@ -70,7 +72,7 @@ def cuda_profiler(output_file, output_mode=None, config=None):
 
             epoc = 8
             dshape = [4, 3, 28, 28]
-            data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+            data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32')
             conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
 
             place = fluid.CUDAPlace(0)
@@ -127,13 +129,14 @@ def reset_profiler():
 def start_profiler(state):
     """
     Enable the profiler. Uers can use `fluid.profiler.start_profiler` and
-    `fluid.profiler.stop_profiler` to insert the code, except the usage of
-    `fluid.profiler.profiler` interface.
+    `fluid.profiler.stop_profiler` to profile, which is equivalent to using
+    the `fluid.profiler.profiler` interface.
 
     Args:
-        state (string) : The profiling state, which should be 'CPU', 'GPU'
-            or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling
-            GPU as well. 'All' also generates timeline.
+        state (str) : The profiling state, which should be one of 'CPU', 'GPU'
+            or 'All'. 'CPU' means profiling CPU only; 'GPU' means profiling
+            both CPU and GPU; 'All' means profiling both CPU and GPU and
+            generating a timeline as well.
 
     Raises:
         ValueError: If `state` is not in ['CPU', 'GPU', 'All'].
@@ -168,21 +171,21 @@ def start_profiler(state):
 def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
     """
     Stop the profiler. Uers can use `fluid.profiler.start_profiler` and
-    `fluid.profiler.stop_profiler` to insert the code, except the usage of
-    `fluid.profiler.profiler` interface.
+    `fluid.profiler.stop_profiler` to profile, which is equivalent to using
+    the `fluid.profiler.profiler` interface.
 
     Args:
-        sorted_key (string) : If None, the profiling results will be printed
-            in the order of first end time of events. Otherwise, the profiling
-            results will be sorted by the this flag. This flag should be one
-            of 'calls', 'total', 'max', 'min' or 'ave'.
+        sorted_key (str, optional) : The order of profiling results, which
+            should be one of None, 'calls', 'total', 'max', 'min' or 'ave'.
+            Default is None, which means the profiling results will be printed
+            in the order of first end time of events.
             The `calls` means sorting by the number of calls.
             The `total` means sorting by the total execution time.
             The `max` means sorting by the maximum execution time.
             The `min` means sorting by the minimum execution time.
             The `ave` means sorting by the average execution time.
-        profile_path (string) : If state == 'All', it will write a profile
-            proto output file.
+        profile_path (str, optional) : If state == 'All', a timeline will be generated
+            and written to `profile_path`. The default profile_path is `/tmp/profile`.
 
     Raises:
         ValueError: If `sorted_key` is not in
@@ -223,34 +226,26 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
 
 @signature_safe_contextmanager
 def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
-    """The profiler interface.
-    Different from cuda_profiler, this profiler can be used to profile both CPU
-    and GPU program. By default, it records the CPU and GPU operator kernels,
-    if you want to profile other program, you can refer the profiling tutorial
-    to add more records in C++ code.
-
-    If the state == 'All', a profile proto file will be written to
-    `profile_path`. This file records timeline information during the execution.
-    Then users can visualize this file to see the timeline, please refer
-    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md
+    """
+    The profiler interface. Different from `fluid.profiler.cuda_profiler`, 
+    this profiler can be used to profile both CPU and GPU program.
 
     Args:
-        state (string) : The profiling state, which should be 'CPU' or 'GPU',
-            telling the profiler to use CPU timer or GPU timer for profiling.
-            Although users may have already specified the execution place
-            (CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler
-            would not inherit this place.
-        sorted_key (string) : If None, the profiling results will be printed
-            in the order of first end time of events. Otherwise, the profiling
-            results will be sorted by the this flag. This flag should be one
-            of 'calls', 'total', 'max', 'min' or 'ave'.
+        state (str) : The profiling state, which should be one of 'CPU', 'GPU'
+            or 'All'. 'CPU' means profiling CPU only; 'GPU' means profiling
+            both CPU and GPU; 'All' means profiling both CPU and GPU and
+            generating a timeline as well.
+        sorted_key (str, optional) : The order of profiling results, which
+            should be one of None, 'calls', 'total', 'max', 'min' or 'ave'.
+            Default is None, which means the profiling results will be printed
+            in the order of first end time of events.
             The `calls` means sorting by the number of calls.
             The `total` means sorting by the total execution time.
             The `max` means sorting by the maximum execution time.
             The `min` means sorting by the minimum execution time.
             The `ave` means sorting by the average execution time.
-        profile_path (string) : If state == 'All', it will write a profile
-            proto output file.
+        profile_path (str, optional) : If state == 'All', a timeline will be generated
+            and written to `profile_path`. The default profile_path is `/tmp/profile`.
 
     Raises:
         ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is
@@ -266,7 +261,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
 
             epoc = 8
             dshape = [4, 3, 28, 28]
-            data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+            data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32')
             conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
 
             place = fluid.CPUPlace()
@@ -277,6 +272,44 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
                 for i in range(epoc):
                     input = np.random.random(dshape).astype('float32')
                     exe.run(fluid.default_main_program(), feed={'data': input})
+
+    Examples Results:
+
+        .. code-block:: text
+
+            #### Examples Results ####
+            #### 1) sorted_key = 'total', 'calls', 'max', 'min', 'ave' ####
+            # The only difference among the 5 sorted_key results is the following sentence:
+            # "Sorted by number of xxx in descending order in the same thread."
+            # The reason is that in this example, the above 5 columns are already sorted.
+            ------------------------->     Profiling Report     <-------------------------
+
+            Place: CPU
+            Time unit: ms
+            Sorted by total time in descending order in the same thread
+            #Sorted by number of calls in descending order in the same thread
+            #Sorted by number of max in descending order in the same thread
+            #Sorted by number of min in descending order in the same thread
+            #Sorted by number of avg in descending order in the same thread
+
+            Event                       Calls       Total       Min.        Max.        Ave.        Ratio.
+            thread0::conv2d             8           129.406     0.304303    127.076     16.1758     0.983319
+            thread0::elementwise_add    8           2.11865     0.193486    0.525592    0.264832    0.016099
+            thread0::feed               8           0.076649    0.006834    0.024616    0.00958112  0.000582432
+
+            #### 2) sorted_key = None  ####
+            # Since the profiling results are printed in the order of first end time of Ops,
+            # the printed order is feed->conv2d->elementwise_add 
+            ------------------------->     Profiling Report     <-------------------------
+
+            Place: CPU
+            Time unit: ms
+            Sorted by event first end time in descending order in the same thread
+
+            Event                       Calls       Total       Min.        Max.        Ave.        Ratio.
+            thread0::feed               8           0.077419    0.006608    0.023349    0.00967738  0.00775934
+            thread0::conv2d             8           7.93456     0.291385    5.63342     0.99182     0.795243
+            thread0::elementwise_add    8           1.96555     0.191884    0.518004    0.245693    0.196998
     """
     start_profiler(state)
     yield