From 418cc35da83605a68ab424b46145ed672c844097 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Thu, 3 Aug 2023 10:54:19 +0800 Subject: [PATCH] [xdoctest] reformat example code with google style in No.86-90 (#55812) * norm, test=docs_preview * test=docs_preview * test=docs_preview * test=docs_preview --- python/paddle/nn/layer/norm.py | 281 +++++++++------ python/paddle/nn/layer/pooling.py | 482 ++++++++++++++------------ python/paddle/nn/layer/rnn.py | 197 ++++++----- python/paddle/nn/layer/transformer.py | 198 ++++++----- python/paddle/nn/layer/vision.py | 68 ++-- 5 files changed, 682 insertions(+), 544 deletions(-) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index c85fa4f60ce..e01e426a75f 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -165,14 +165,18 @@ class InstanceNorm1D(_InstanceNormBase): .. code-block:: python - import paddle - - x = paddle.rand((2, 2, 3)) - instance_norm = paddle.nn.InstanceNorm1D(2) - instance_norm_out = instance_norm(x) - - print(instance_norm_out) - + >>> import paddle + >>> paddle.seed(100) + >>> x = paddle.rand((2, 2, 3)) + >>> instance_norm = paddle.nn.InstanceNorm1D(2) + >>> instance_norm_out = instance_norm(x) + + >>> print(instance_norm_out) + Tensor(shape=[2, 2, 3], dtype=float32, place=Place(cpu), stop_gradient=False, + [[[ 1.32132232, -0.22444785, -1.09687424], + [ 1.29506636, -0.15688568, -1.13818073]], + [[-0.27764025, 1.33961368, -1.06197333], + [ 0.44484580, -1.38489723, 0.94005162]]]) """ def __init__( @@ -255,13 +259,22 @@ class InstanceNorm2D(_InstanceNormBase): .. code-block:: python - import paddle - - x = paddle.rand((2, 2, 2, 3)) - instance_norm = paddle.nn.InstanceNorm2D(2) - instance_norm_out = instance_norm(x) - - print(instance_norm_out) + >>> import paddle + >>> paddle.seed(100) + >>> x = paddle.rand((2, 2, 2, 3)) + >>> instance_norm = paddle.nn.InstanceNorm2D(2) + >>> instance_norm_out = instance_norm(x) + + >>> print(instance_norm_out) + Tensor(shape=[2, 2, 2, 3], dtype=float32, place=Place(cpu), stop_gradient=False, + [[[[ 1.26652932, -0.60229748, -1.65705574], + [ 1.06272733, 0.24229208, -0.31219524]], + [[-0.85414171, 0.31684181, -1.42204332], + [ 1.00412714, -0.43966094, 1.39487720]]], + [[[ 0.83324969, 1.25046813, -0.79470295], + [-1.38446140, 0.81851846, -0.72307163]], + [[-0.33560610, 0.95346332, 0.45585334], + [-0.53483474, 1.20336461, -1.74224067]]]]) """ def __init__( @@ -342,13 +355,30 @@ class InstanceNorm3D(_InstanceNormBase): .. 
code-block:: python - import paddle - - x = paddle.rand((2, 2, 2, 2, 3)) - instance_norm = paddle.nn.InstanceNorm3D(2) - instance_norm_out = instance_norm(x) - - print(instance_norm_out.numpy) + >>> import paddle + >>> paddle.seed(100) + >>> x = paddle.rand((2, 2, 2, 2, 3)) + >>> instance_norm = paddle.nn.InstanceNorm3D(2) + >>> instance_norm_out = instance_norm(x) + + >>> print(instance_norm_out) + Tensor(shape=[2, 2, 2, 2, 3], dtype=float32, place=Place(cpu), stop_gradient=False, + [[[[[ 0.60520107, -0.67670596, -1.40020907], + [ 0.46540472, -0.09736639, -0.47771260]], + [[-0.74365318, 0.63718963, -1.41333199], + [ 1.44764769, -0.25489071, 1.90842640]]], + [[[ 1.09773374, 1.49568439, -0.45503727], + [-1.01755965, 1.08368278, -0.38671401]], + [[-0.62252384, 0.60490805, 0.13109155], + [-0.81222630, 0.84286022, -1.96189928]]]], + [[[[ 0.28014541, 0.91674680, 1.71797717], + [-0.52062720, -0.74274176, -0.86439967]], + [[ 0.25707796, -1.23866379, 1.64422870], + [-1.48577297, -0.13187379, 0.16790220]]], + [[[-1.49266160, 1.57909954, 0.46455818], + [-0.14981404, 1.46959865, 0.24957968]], + [[ 0.25134835, -0.03276967, -0.30318922], + [ 0.76263177, -1.11345232, -1.68492818]]]]]) """ def __init__( @@ -410,13 +440,38 @@ class GroupNorm(Layer): Examples: .. code-block:: python - import paddle - - x = paddle.arange(48, dtype="float32").reshape((2, 6, 2, 2)) - group_norm = paddle.nn.GroupNorm(num_channels=6, num_groups=6) - group_norm_out = group_norm(x) - - print(group_norm_out) + >>> import paddle + >>> paddle.seed(100) + >>> x = paddle.arange(48, dtype="float32").reshape((2, 6, 2, 2)) + >>> group_norm = paddle.nn.GroupNorm(num_channels=6, num_groups=6) + >>> group_norm_out = group_norm(x) + + >>> print(group_norm_out) + Tensor(shape=[2, 6, 2, 2], dtype=float32, place=Place(cpu), stop_gradient=False, + [[[[-1.34163547, -0.44721183], + [ 0.44721183, 1.34163547]], + [[-1.34163547, -0.44721183], + [ 0.44721183, 1.34163547]], + [[-1.34163547, -0.44721183], + [ 0.44721183, 1.34163547]], + [[-1.34163547, -0.44721183], + [ 0.44721183, 1.34163547]], + [[-1.34163547, -0.44721183], + [ 0.44721183, 1.34163547]], + [[-1.34163547, -0.44721183], + [ 0.44721183, 1.34163547]]], + [[[-1.34163547, -0.44721183], + [ 0.44721183, 1.34163547]], + [[-1.34163547, -0.44721183], + [ 0.44721183, 1.34163547]], + [[-1.34163547, -0.44721183], + [ 0.44721183, 1.34163547]], + [[-1.34163547, -0.44721183], + [ 0.44721183, 1.34163547]], + [[-1.34163547, -0.44721183], + [ 0.44721183, 1.34163547]], + [[-1.34163547, -0.44721183], + [ 0.44721183, 1.34163547]]]]) """ def __init__( @@ -575,13 +630,22 @@ class LayerNorm(Layer): .. code-block:: python - import paddle - - x = paddle.rand((2, 2, 2, 3)) - layer_norm = paddle.nn.LayerNorm(x.shape[1:]) - layer_norm_out = layer_norm(x) - - print(layer_norm_out) + >>> import paddle + >>> paddle.seed(100) + >>> x = paddle.rand((2, 2, 2, 3)) + >>> layer_norm = paddle.nn.LayerNorm(x.shape[1:]) + >>> layer_norm_out = layer_norm(x) + + >>> print(layer_norm_out) + Tensor(shape=[2, 2, 2, 3], dtype=float32, place=Place(cpu), stop_gradient=False, + [[[[ 0.60520101, -0.67670590, -1.40020895], + [ 0.46540466, -0.09736638, -0.47771254]], + [[-0.74365306, 0.63718957, -1.41333175], + [ 1.44764745, -0.25489068, 1.90842617]]], + [[[ 1.09773350, 1.49568415, -0.45503747], + [-1.01755989, 1.08368254, -0.38671425]], + [[-0.62252408, 0.60490781, 0.13109133], + [-0.81222653, 0.84285998, -1.96189952]]]]) """ def __init__( @@ -891,17 +955,17 @@ class BatchNorm(Layer): Examples: .. 
code-block:: python - import paddle.fluid as fluid - import paddle.nn as nn - from paddle.fluid.dygraph.base import to_variable - import numpy as np + >>> import paddle.fluid as fluid + >>> import paddle.nn as nn + >>> from paddle.fluid.dygraph.base import to_variable + >>> import numpy as np - x = np.random.random(size=(3, 10, 3, 7)).astype('float32') - with fluid.dygraph.guard(): - x = to_variable(x) - batch_norm = nn.layer.norm.BatchNorm(10) - hidden1 = batch_norm(x) + >>> x = np.random.random(size=(3, 10, 3, 7)).astype('float32') + >>> with fluid.dygraph.guard(): + ... x = to_variable(x) + ... batch_norm = nn.layer.norm.BatchNorm(10) + ... hidden1 = batch_norm(x) """ def __init__( @@ -1165,13 +1229,16 @@ class BatchNorm1D(_BatchNormBase): Examples: .. code-block:: python - import paddle - - x = paddle.rand((2, 1, 3)) - batch_norm = paddle.nn.BatchNorm1D(1) - batch_norm_out = batch_norm(x) + >>> import paddle + >>> paddle.seed(100) + >>> x = paddle.rand((2, 1, 3)) + >>> batch_norm = paddle.nn.BatchNorm1D(1) + >>> batch_norm_out = batch_norm(x) - print(batch_norm_out) + >>> print(batch_norm_out) + Tensor(shape=[2, 1, 3], dtype=float32, place=Place(cpu), stop_gradient=False, + [[[ 1.26652932, -0.60229754, -1.65705597]], + [[ 1.06272745, 0.24229205, -0.31219530]]]) """ def __init__( @@ -1277,13 +1344,18 @@ class BatchNorm2D(_BatchNormBase): Examples: .. code-block:: python - import paddle - - x = paddle.rand((2, 1, 2, 3)) - batch_norm = paddle.nn.BatchNorm2D(1) - batch_norm_out = batch_norm(x) - - print(batch_norm_out) + >>> import paddle + >>> paddle.seed(100) + >>> x = paddle.rand((2, 1, 2, 3)) + >>> batch_norm = paddle.nn.BatchNorm2D(1) + >>> batch_norm_out = batch_norm(x) + + >>> print(batch_norm_out) + Tensor(shape=[2, 1, 2, 3], dtype=float32, place=Place(cpu), stop_gradient=False, + [[[[ 0.60520101, -0.67670590, -1.40020895], + [ 0.46540475, -0.09736633, -0.47771257]]], + [[[-0.74365312, 0.63718963, -1.41333187], + [ 1.44764757, -0.25489068, 1.90842628]]]]) """ def _check_data_format(self, input): @@ -1363,13 +1435,22 @@ class BatchNorm3D(_BatchNormBase): Examples: .. code-block:: python - import paddle - - x = paddle.rand((2, 1, 2, 2, 3)) - batch_norm = paddle.nn.BatchNorm3D(1) - batch_norm_out = batch_norm(x) - - print(batch_norm_out) + >>> import paddle + >>> paddle.seed(100) + >>> x = paddle.rand((2, 1, 2, 2, 3)) + >>> batch_norm = paddle.nn.BatchNorm3D(1) + >>> batch_norm_out = batch_norm(x) + + >>> print(batch_norm_out) + Tensor(shape=[2, 1, 2, 2, 3], dtype=float32, place=Place(cpu), stop_gradient=False, + [[[[[ 0.28011751, -0.95211101, -1.64757574], + [ 0.14573872, -0.39522290, -0.76082933]], + [[-1.01646376, 0.31086648, -1.66019011], + [ 1.08991623, -0.54664266, 1.53283834]]]], + [[[[ 1.33958006, 1.71585774, -0.12862551], + [-0.66051245, 1.32629418, -0.06402326]], + [[-0.28699064, 0.87359405, 0.42558217], + [-0.46636176, 1.09858704, -1.55342245]]]]]) """ def __init__( @@ -1485,23 +1566,22 @@ class SyncBatchNorm(_BatchNormBase): Examples: .. 
code-block:: python - # required: gpu - - import paddle - import paddle.nn as nn - - x = paddle.to_tensor([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') + >>> # doctest: +REQUIRES(env:GPU) - if paddle.is_compiled_with_cuda(): - sync_batch_norm = nn.SyncBatchNorm(2) - hidden1 = sync_batch_norm(x) - print(hidden1) - # Tensor(shape=[1, 2, 2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[[[ 0.26824948, 1.09363246], - # [ 0.26824948, -1.63013160]], + >>> import paddle + >>> import paddle.nn as nn + >>> paddle.device.set_device('gpu') + >>> x = paddle.to_tensor([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') - # [[ 0.80956620, -0.66528702], - # [-1.27446556, 1.13018656]]]]) + >>> if paddle.is_compiled_with_cuda(): + ... sync_batch_norm = nn.SyncBatchNorm(2) + ... hidden1 = sync_batch_norm(x) + ... print(hidden1) + Tensor(shape=[1, 2, 2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + [[[[ 0.26824948, 1.09363246], + [ 0.26824948, -1.63013160]], + [[ 0.80956620, -0.66528702], + [-1.27446556, 1.13018656]]]]) """ @@ -1625,11 +1705,16 @@ class SyncBatchNorm(_BatchNormBase): Examples: .. code-block:: python - import paddle - import paddle.nn as nn + >>> import paddle + >>> import paddle.nn as nn - model = nn.Sequential(nn.Conv2D(3, 5, 3), nn.BatchNorm2D(5)) - sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + >>> model = nn.Sequential(nn.Conv2D(3, 5, 3), nn.BatchNorm2D(5)) + >>> sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + >>> print(sync_model) + Sequential( + (0): Conv2D(3, 5, kernel_size=[3, 3], data_format=NCHW) + (1): SyncBatchNorm(num_features=5, momentum=0.9, epsilon=1e-05) + ) """ layer_output = layer @@ -1704,14 +1789,15 @@ class LocalResponseNorm(Layer): Examples: - .. code-block:: python + .. code-block:: python - import paddle + >>> import paddle - x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32") - m = paddle.nn.LocalResponseNorm(size=5) - y = m(x) - print(y.shape) # [3, 3, 112, 112] + >>> x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32") + >>> m = paddle.nn.LocalResponseNorm(size=5) + >>> y = m(x) + >>> print(y.shape) + [3, 3, 112, 112] """ def __init__( @@ -1801,15 +1887,14 @@ class SpectralNorm(Layer): None Examples: - .. code-block:: python - - import paddle - x = paddle.rand((2,8,32,32)) - - spectral_norm = paddle.nn.SpectralNorm(x.shape, dim=1, power_iters=2) - spectral_norm_out = spectral_norm(x) + .. code-block:: python - print(spectral_norm_out.shape) # [2, 8, 32, 32] + >>> import paddle + >>> x = paddle.rand((2,8,32,32)) + >>> spectral_norm = paddle.nn.SpectralNorm(x.shape, dim=1, power_iters=2) + >>> spectral_norm_out = spectral_norm(x) + >>> print(spectral_norm_out.shape) + [2, 8, 32, 32] """ diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 1a3e53095b5..5a872577100 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -65,13 +65,14 @@ class AvgPool1D(Layer): .. 
code-block:: python - import paddle - import paddle.nn as nn - data = paddle.uniform([1, 3, 32], dtype="float32", min=-1, max=1) - AvgPool1D = nn.AvgPool1D(kernel_size=2, stride=2, padding=0) - pool_out = AvgPool1D(data) - # pool_out shape: [1, 3, 16] + >>> import paddle + >>> import paddle.nn as nn + >>> data = paddle.uniform([1, 3, 32], dtype="float32", min=-1, max=1) + >>> AvgPool1D = nn.AvgPool1D(kernel_size=2, stride=2, padding=0) + >>> pool_out = AvgPool1D(data) + >>> print(pool_out.shape) + [1, 3, 16] """ @@ -169,15 +170,15 @@ class AvgPool2D(Layer): Examples: .. code-block:: python - import paddle - import paddle.nn as nn + >>> import paddle + >>> import paddle.nn as nn - # max pool2d - input = paddle.uniform([1, 3, 32, 32], dtype="float32", min=-1, max=1) - AvgPool2D = nn.AvgPool2D(kernel_size=2, - stride=2, padding=0) - output = AvgPool2D(input) - # output.shape [1, 3, 16, 16] + >>> # avg pool2d + >>> input = paddle.uniform([1, 3, 32, 32], dtype="float32", min=-1, max=1) + >>> AvgPool2D = nn.AvgPool2D(kernel_size=2, stride=2, padding=0) + >>> output = AvgPool2D(input) + >>> print(output.shape) + [1, 3, 16, 16] """ @@ -268,15 +269,15 @@ class AvgPool3D(Layer): Examples: .. code-block:: python - import paddle - import paddle.nn as nn + >>> import paddle + >>> import paddle.nn as nn - # avg pool3d - input = paddle.uniform([1, 2, 3, 32, 32], dtype="float32", min=-1, max=1) - AvgPool3D = nn.AvgPool3D(kernel_size=2, - stride=2, padding=0) - output = AvgPool3D(input) - # output.shape [1, 2, 3, 16, 16] + >>> # avg pool3d + >>> input = paddle.uniform([1, 2, 3, 32, 32], dtype="float32", min=-1, max=1) + >>> AvgPool3D = nn.AvgPool3D(kernel_size=2, stride=2, padding=0) + >>> output = AvgPool3D(input) + >>> print(output.shape) + [1, 2, 1, 16, 16] """ @@ -366,17 +367,21 @@ class MaxPool1D(Layer): .. code-block:: python - import paddle - import paddle.nn as nn + >>> import paddle + >>> import paddle.nn as nn - data = paddle.uniform([1, 3, 32], dtype="float32", min=-1, max=1) - MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0) - pool_out = MaxPool1D(data) - # pool_out shape: [1, 3, 16] + >>> data = paddle.uniform([1, 3, 32], dtype="float32", min=-1, max=1) + >>> MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0) + >>> pool_out = MaxPool1D(data) + >>> print(pool_out.shape) + [1, 3, 16] - MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0, return_mask=True) - pool_out, indices = MaxPool1D(data) - # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] + >>> MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0, return_mask=True) + >>> pool_out, indices = MaxPool1D(data) + >>> print(pool_out.shape) + [1, 3, 16] + >>> print(indices.shape) + [1, 3, 16] """ @@ -471,20 +476,23 @@ class MaxPool2D(Layer): Examples: ..
code-block:: python - import paddle - import paddle.nn as nn - - # max pool2d - input = paddle.uniform([1, 3, 32, 32], dtype="float32", min=-1, max=1) - MaxPool2D = nn.MaxPool2D(kernel_size=2, - stride=2, padding=0) - output = MaxPool2D(input) - # output.shape [1, 3, 16, 16] - - # for return_mask=True - MaxPool2D = nn.MaxPool2D(kernel_size=2, stride=2, padding=0, return_mask=True) - output, max_indices = MaxPool2D(input) - # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], + >>> import paddle + >>> import paddle.nn as nn + + >>> # max pool2d + >>> input = paddle.uniform([1, 3, 32, 32], dtype="float32", min=-1, max=1) + >>> MaxPool2D = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + >>> output = MaxPool2D(input) + >>> print(output.shape) + [1, 3, 16, 16] + + >>> # for return_mask=True + >>> MaxPool2D = nn.MaxPool2D(kernel_size=2, stride=2, padding=0, return_mask=True) + >>> output, max_indices = MaxPool2D(input) + >>> print(output.shape) + [1, 3, 16, 16] + >>> print(max_indices.shape) + [1, 3, 16, 16] """ def __init__( @@ -568,20 +576,23 @@ class MaxPool3D(Layer): Examples: .. code-block:: python - import paddle - import paddle.nn as nn - - # max pool3d - input = paddle.uniform([1, 2, 3, 32, 32], dtype="float32", min=-1, max=1) - MaxPool3D = nn.MaxPool3D(kernel_size=2, - stride=2, padding=0) - output = MaxPool3D(input) - # output.shape [1, 2, 3, 16, 16] - - # for return_mask=True - MaxPool3D = nn.MaxPool3D(kernel_size=2, stride=2, padding=0, return_mask=True) - output, max_indices = MaxPool3D(input) - # output.shape [1, 2, 3, 16, 16], max_indices.shape [1, 2, 3, 16, 16], + >>> import paddle + >>> import paddle.nn as nn + + >>> # max pool3d + >>> input = paddle.uniform([1, 2, 3, 32, 32], dtype="float32", min=-1, max=1) + >>> MaxPool3D = nn.MaxPool3D(kernel_size=2, stride=2, padding=0) + >>> output = MaxPool3D(input) + >>> print(output.shape) + [1, 2, 1, 16, 16] + + >>> # for return_mask=True + >>> MaxPool3D = nn.MaxPool3D(kernel_size=2, stride=2, padding=0, return_mask=True) + >>> output, max_indices = MaxPool3D(input) + >>> print(output.shape) + [1, 2, 1, 16, 16] + >>> print(max_indices.shape) + [1, 2, 1, 16, 16] """ def __init__( @@ -650,25 +661,26 @@ class AdaptiveAvgPool1D(Layer): Examples: .. code-block:: python - # average adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = sum(input[:, :, lstart: lend])/(lend - lstart) - # - import paddle - import paddle.nn as nn - - data = paddle.uniform([1, 3, 32], dtype="float32", min=-1, max=1) - AdaptiveAvgPool1D = nn.AdaptiveAvgPool1D(output_size=16) - pool_out = AdaptiveAvgPool1D(data) - # pool_out shape: [1, 3, 16] + >>> # average adaptive pool1d + >>> # suppose input data in shape of [N, C, L], `output_size` is m or [m], + >>> # output shape is [N, C, m], adaptive pool divide L dimension + >>> # of input data into m grids averagely and performs poolings in each + >>> # grid to get output. 
+ >>> # adaptive average pool performs calculations as follows: + >>> # + >>> # for i in range(m): + >>> # lstart = floor(i * L / m) + >>> # lend = ceil((i + 1) * L / m) + >>> # output[:, :, i] = sum(input[:, :, lstart: lend])/(lend - lstart) + >>> # + >>> import paddle + >>> import paddle.nn as nn + + >>> data = paddle.uniform([1, 3, 32], dtype="float32", min=-1, max=1) + >>> AdaptiveAvgPool1D = nn.AdaptiveAvgPool1D(output_size=16) + >>> pool_out = AdaptiveAvgPool1D(data) + >>> print(pool_out.shape) + [1, 3, 16] """ def __init__(self, output_size, name=None): @@ -726,28 +738,29 @@ class AdaptiveAvgPool2D(Layer): Examples: .. code-block:: python - # adaptive avg pool2d - # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], - # output shape is [N, C, m, n], adaptive pool divide H and W dimensions - # of input data into m * n grids averagely and performs poolings in each - # grid to get output. - # adaptive avg pool performs calculations as follow: - # - # for i in range(m): - # for j in range(n): - # hstart = floor(i * H / m) - # hend = ceil((i + 1) * H / m) - # wstart = floor(i * W / n) - # wend = ceil((i + 1) * W / n) - # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) - # - import paddle - - x = paddle.rand([2, 3, 32, 32]) - - adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=3) - pool_out = adaptive_avg_pool(x = x) - # pool_out.shape is [2, 3, 3, 3] + >>> # adaptive avg pool2d + >>> # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], + >>> # output shape is [N, C, m, n], adaptive pool divide H and W dimensions + >>> # of input data into m * n grids averagely and performs poolings in each + >>> # grid to get output. + >>> # adaptive avg pool performs calculations as follows: + >>> # + >>> # for i in range(m): + >>> # for j in range(n): + >>> # hstart = floor(i * H / m) + >>> # hend = ceil((i + 1) * H / m) + >>> # wstart = floor(j * W / n) + >>> # wend = ceil((j + 1) * W / n) + >>> # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) + >>> # + >>> import paddle + + >>> x = paddle.rand([2, 3, 32, 32]) + + >>> adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=3) + >>> pool_out = adaptive_avg_pool(x = x) + >>> print(pool_out.shape) + [2, 3, 3, 3] """ def __init__(self, output_size, data_format="NCHW", name=None): @@ -815,31 +828,32 @@ class AdaptiveAvgPool3D(Layer): Examples: .. code-block:: python - # adaptive avg pool3d - # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], - # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions - # of input data into l * m * n grids averagely and performs poolings in each - # grid to get output.
- # adaptive avg pool performs calculations as follow: - # - # for i in range(l): - # for j in range(m): - # for k in range(n): - # dstart = floor(i * D / l) - # dend = ceil((i + 1) * D / l) - # hstart = floor(j * H / m) - # hend = ceil((j + 1) * H / m) - # wstart = floor(k * W / n) - # wend = ceil((k + 1) * W / n) - # output[:, :, i, j, k] = - # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) - import paddle - - x = paddle.rand([2, 3, 8, 32, 32]) - - adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3D(output_size=3) - pool_out = adaptive_avg_pool(x = x) - # pool_out = [2, 3, 3, 3, 3] + >>> # adaptive avg pool3d + >>> # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], + >>> # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions + >>> # of input data into l * m * n grids averagely and performs poolings in each + >>> # grid to get output. + >>> # adaptive avg pool performs calculations as follows: + >>> # + >>> # for i in range(l): + >>> # for j in range(m): + >>> # for k in range(n): + >>> # dstart = floor(i * D / l) + >>> # dend = ceil((i + 1) * D / l) + >>> # hstart = floor(j * H / m) + >>> # hend = ceil((j + 1) * H / m) + >>> # wstart = floor(k * W / n) + >>> # wend = ceil((k + 1) * W / n) + >>> # output[:, :, i, j, k] = + >>> # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) + >>> import paddle + + >>> x = paddle.rand([2, 3, 8, 32, 32]) + + >>> adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3D(output_size=3) + >>> pool_out = adaptive_avg_pool(x = x) + >>> print(pool_out.shape) + [2, 3, 3, 3, 3] """ def __init__(self, output_size, data_format="NCDHW", name=None): @@ -898,30 +912,34 @@ class AdaptiveMaxPool1D(Layer): Examples: .. code-block:: python - # max adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = max(input[:, :, lstart: lend]) - # - import paddle - import paddle.nn as nn - - data = paddle.uniform([1, 3, 32], dtype="float32", min=-1, max=1) - AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16) - pool_out = AdaptiveMaxPool1D(data) - # pool_out shape: [1, 3, 16] - - # for return_mask = true - AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16, return_mask=True) - pool_out, indices = AdaptiveMaxPool1D(data) - # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] + >>> # max adaptive pool1d + >>> # suppose input data in shape of [N, C, L], `output_size` is m or [m], + >>> # output shape is [N, C, m], adaptive pool divide L dimension + >>> # of input data into m grids averagely and performs poolings in each + >>> # grid to get output.
+ >>> # adaptive max pool performs calculations as follows: + >>> # + >>> # for i in range(m): + >>> # lstart = floor(i * L / m) + >>> # lend = ceil((i + 1) * L / m) + >>> # output[:, :, i] = max(input[:, :, lstart: lend]) + >>> # + >>> import paddle + >>> import paddle.nn as nn + + >>> data = paddle.uniform([1, 3, 32], dtype="float32", min=-1, max=1) + >>> AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16) + >>> pool_out = AdaptiveMaxPool1D(data) + >>> print(pool_out.shape) + [1, 3, 16] + + >>> # for return_mask = true + >>> AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16, return_mask=True) + >>> pool_out, indices = AdaptiveMaxPool1D(data) + >>> print(pool_out.shape) + [1, 3, 16] + >>> print(indices.shape) + [1, 3, 16] """ @@ -981,27 +999,31 @@ class AdaptiveMaxPool2D(Layer): Examples: .. code-block:: python - # adaptive max pool2d - # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], - # output shape is [N, C, m, n], adaptive pool divide H and W dimensions - # of input data into m * n grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # for j in range(n): - # hstart = floor(i * H / m) - # hend = ceil((i + 1) * H / m) - # wstart = floor(i * W / n) - # wend = ceil((i + 1) * W / n) - # output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend]) - # - import paddle - - x = paddle.rand([2, 3, 32, 32]) - - adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(output_size=3, return_mask=True) - pool_out, indices = adaptive_max_pool(x = x) + >>> # adaptive max pool2d + >>> # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], + >>> # output shape is [N, C, m, n], adaptive pool divide H and W dimensions + >>> # of input data into m * n grids averagely and performs poolings in each + >>> # grid to get output. + >>> # adaptive max pool performs calculations as follows: + >>> # + >>> # for i in range(m): + >>> # for j in range(n): + >>> # hstart = floor(i * H / m) + >>> # hend = ceil((i + 1) * H / m) + >>> # wstart = floor(j * W / n) + >>> # wend = ceil((j + 1) * W / n) + >>> # output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend]) + >>> # + >>> import paddle + + >>> x = paddle.rand([2, 3, 32, 32]) + + >>> adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(output_size=3, return_mask=True) + >>> pool_out, indices = adaptive_max_pool(x = x) + >>> print(pool_out.shape) + [2, 3, 3, 3] + >>> print(indices.shape) + [2, 3, 3, 3] """ def __init__(self, output_size, return_mask=False, name=None): @@ -1067,33 +1089,37 @@ class AdaptiveMaxPool3D(Layer): Examples: .. code-block:: python - # adaptive max pool3d - # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], - # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions - # of input data into l * m * n grids averagely and performs poolings in each - # grid to get output.
- # adaptive max pool performs calculations as follow: - # - # for i in range(l): - # for j in range(m): - # for k in range(n): - # dstart = floor(i * D / l) - # dend = ceil((i + 1) * D / l) - # hstart = floor(j * H / m) - # hend = ceil((j + 1) * H / m) - # wstart = floor(k * W / n) - # wend = ceil((k + 1) * W / n) - # output[:, :, i, j, k] = - # max(input[:, :, dstart:dend, hstart: hend, wstart: wend]) - import paddle - - x = paddle.rand([2, 3, 8, 32, 32]) - pool = paddle.nn.AdaptiveMaxPool3D(output_size=4) - out = pool(x) - # out shape: [2, 3, 4, 4, 4] - pool = paddle.nn.AdaptiveMaxPool3D(output_size=3, return_mask=True) - out, indices = pool(x) - # out shape: [2, 3, 4, 4, 4], indices shape: [2, 3, 4, 4, 4] + >>> # adaptive max pool3d + >>> # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], + >>> # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions + >>> # of input data into l * m * n grids averagely and performs poolings in each + >>> # grid to get output. + >>> # adaptive max pool performs calculations as follows: + >>> # + >>> # for i in range(l): + >>> # for j in range(m): + >>> # for k in range(n): + >>> # dstart = floor(i * D / l) + >>> # dend = ceil((i + 1) * D / l) + >>> # hstart = floor(j * H / m) + >>> # hend = ceil((j + 1) * H / m) + >>> # wstart = floor(k * W / n) + >>> # wend = ceil((k + 1) * W / n) + >>> # output[:, :, i, j, k] = + >>> # max(input[:, :, dstart:dend, hstart: hend, wstart: wend]) + >>> import paddle + + >>> x = paddle.rand([2, 3, 8, 32, 32]) + >>> pool = paddle.nn.AdaptiveMaxPool3D(output_size=4) + >>> out = pool(x) + >>> print(out.shape) + [2, 3, 4, 4, 4] + >>> pool = paddle.nn.AdaptiveMaxPool3D(output_size=3, return_mask=True) + >>> out, indices = pool(x) + >>> print(out.shape) + [2, 3, 3, 3, 3] + >>> print(indices.shape) + [2, 3, 3, 3, 3] """ @@ -1156,15 +1182,19 @@ class MaxUnPool1D(Layer): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - data = paddle.rand(shape=[1, 3, 16]) - pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_mask=True) - # pool_out shape: [1, 3, 8], indices shape: [1, 3, 8] - Unpool1D = paddle.nn.MaxUnPool1D(kernel_size=2, padding=0) - unpool_out = Unpool1D(pool_out, indices) - # unpool_out shape: [1, 3, 16] + >>> data = paddle.rand(shape=[1, 3, 16]) + >>> pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_mask=True) + >>> print(pool_out.shape) + [1, 3, 8] + >>> print(indices.shape) + [1, 3, 8] + >>> Unpool1D = paddle.nn.MaxUnPool1D(kernel_size=2, padding=0) + >>> unpool_out = Unpool1D(pool_out, indices) + >>> print(unpool_out.shape) + [1, 3, 16] """ @@ -1244,15 +1274,19 @@ class MaxUnPool2D(Layer): Examples: ..
code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - data = paddle.rand(shape=[1,1,6,6]) - pool_out, indices = F.max_pool2d(data, kernel_size=2, stride=2, padding=0, return_mask=True) - # pool_out shape: [1, 1, 3, 3], indices shape: [1, 1, 3, 3] - Unpool2D = paddle.nn.MaxUnPool2D(kernel_size=2, padding=0) - unpool_out = Unpool2D(pool_out, indices) - # unpool_out shape: [1, 1, 6, 6] + >>> data = paddle.rand(shape=[1, 1, 6, 6]) + >>> pool_out, indices = F.max_pool2d(data, kernel_size=2, stride=2, padding=0, return_mask=True) + >>> print(pool_out.shape) + [1, 1, 3, 3] + >>> print(indices.shape) + [1, 1, 3, 3] + >>> Unpool2D = paddle.nn.MaxUnPool2D(kernel_size=2, padding=0) + >>> unpool_out = Unpool2D(pool_out, indices) + >>> print(unpool_out.shape) + [1, 1, 6, 6] """ @@ -1335,15 +1369,19 @@ class MaxUnPool3D(Layer): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - - data = paddle.rand(shape=[1, 1, 4, 4, 6]) - pool_out, indices = F.max_pool3d(data, kernel_size=2, stride=2, padding=0, return_mask=True) - # pool_out shape: [1, 1, 2, 2, 3], indices shape: [1, 1, 2, 2, 3] - Unpool3D = paddle.nn.MaxUnPool3D(kernel_size=2, padding=0) - unpool_out = Unpool3D(pool_out, indices) - # unpool_out shape: [1, 1, 4, 4, 6] + >>> import paddle + >>> import paddle.nn.functional as F + + >>> data = paddle.rand(shape=[1, 1, 4, 4, 6]) + >>> pool_out, indices = F.max_pool3d(data, kernel_size=2, stride=2, padding=0, return_mask=True) + >>> print(pool_out.shape) + [1, 1, 2, 2, 3] + >>> print(indices.shape) + [1, 1, 2, 2, 3] + >>> Unpool3D = paddle.nn.MaxUnPool3D(kernel_size=2, padding=0) + >>> unpool_out = Unpool3D(pool_out, indices) + >>> print(unpool_out.shape) + [1, 1, 4, 4, 6] """ diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 991df623d96..a2122c2dab3 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -89,14 +89,18 @@ def rnn( .. code-block:: python - import paddle - paddle.disable_static() + >>> import paddle - cell = paddle.nn.SimpleRNNCell(16, 32) + >>> inputs = paddle.rand((4, 23, 16)) + >>> prev_h = paddle.randn((4, 32)) - inputs = paddle.rand((4, 23, 16)) - prev_h = paddle.randn((4, 32)) - outputs, final_states = paddle.nn.layer.rnn(cell, inputs, prev_h) + >>> cell = paddle.nn.SimpleRNNCell(16, 32) + >>> rnn = paddle.nn.RNN(cell) + >>> outputs, final_states = rnn(inputs, prev_h) + >>> print(outputs.shape) + [4, 23, 32] + >>> print(final_states.shape) + [4, 32] """ @@ -397,18 +401,17 @@ def birnn( .. code-block:: python - import paddle - paddle.disable_static() + >>> import paddle - cell_fw = paddle.nn.LSTMCell(16, 32) - cell_bw = paddle.nn.LSTMCell(16, 32) - - inputs = paddle.rand((4, 23, 16)) - hf, cf = paddle.rand((4, 32)), paddle.rand((4, 32)) - hb, cb = paddle.rand((4, 32)), paddle.rand((4, 32)) - initial_states = ((hf, cf), (hb, cb)) - outputs, final_states = paddle.nn.layer.birnn( - cell_fw, cell_bw, inputs, initial_states) + >>> cell_fw = paddle.nn.LSTMCell(16, 32) + >>> cell_bw = paddle.nn.LSTMCell(16, 32) + >>> rnn = paddle.nn.BiRNN(cell_fw, cell_bw) + >>> inputs = paddle.rand((2, 23, 16)) + >>> outputs, final_states = rnn(inputs) + >>> print(outputs.shape) + [2, 23, 64] + >>> print(final_states[0][0].shape) + [2, 32] """ @@ -743,16 +746,15 @@ class SimpleRNNCell(RNNCellBase): .. 
code-block:: python - import paddle - - x = paddle.randn((4, 16)) - prev_h = paddle.randn((4, 32)) + >>> import paddle - cell = paddle.nn.SimpleRNNCell(16, 32) - y, h = cell(x, prev_h) - print(y.shape) + >>> x = paddle.randn((4, 16)) + >>> prev_h = paddle.randn((4, 32)) - #[4,32] + >>> cell = paddle.nn.SimpleRNNCell(16, 32) + >>> y, h = cell(x, prev_h) + >>> print(y.shape) + [4, 32] """ @@ -897,22 +899,21 @@ class LSTMCell(RNNCellBase): .. code-block:: python - import paddle - - x = paddle.randn((4, 16)) - prev_h = paddle.randn((4, 32)) - prev_c = paddle.randn((4, 32)) + >>> import paddle - cell = paddle.nn.LSTMCell(16, 32) - y, (h, c) = cell(x, (prev_h, prev_c)) + >>> x = paddle.randn((4, 16)) + >>> prev_h = paddle.randn((4, 32)) + >>> prev_c = paddle.randn((4, 32)) - print(y.shape) - print(h.shape) - print(c.shape) + >>> cell = paddle.nn.LSTMCell(16, 32) + >>> y, (h, c) = cell(x, (prev_h, prev_c)) - #[4,32] - #[4,32] - #[4,32] + >>> print(y.shape) + [4, 32] + >>> print(h.shape) + [4, 32] + >>> print(c.shape) + [4, 32] """ @@ -1059,19 +1060,19 @@ class GRUCell(RNNCellBase): .. code-block:: python - import paddle + >>> import paddle - x = paddle.randn((4, 16)) - prev_h = paddle.randn((4, 32)) + >>> x = paddle.randn((4, 16)) + >>> prev_h = paddle.randn((4, 32)) - cell = paddle.nn.GRUCell(16, 32) - y, h = cell(x, prev_h) + >>> cell = paddle.nn.GRUCell(16, 32) + >>> y, h = cell(x, prev_h) - print(y.shape) - print(h.shape) + >>> print(y.shape) + [4, 32] + >>> print(h.shape) + [4, 32] - #[4,32] - #[4,32] """ @@ -1189,20 +1190,19 @@ class RNN(Layer): .. code-block:: python - import paddle + >>> import paddle - inputs = paddle.rand((4, 23, 16)) - prev_h = paddle.randn((4, 32)) + >>> inputs = paddle.rand((4, 23, 16)) + >>> prev_h = paddle.randn((4, 32)) - cell = paddle.nn.SimpleRNNCell(16, 32) - rnn = paddle.nn.RNN(cell) - outputs, final_states = rnn(inputs, prev_h) + >>> cell = paddle.nn.SimpleRNNCell(16, 32) + >>> rnn = paddle.nn.RNN(cell) + >>> outputs, final_states = rnn(inputs, prev_h) - print(outputs.shape) - print(final_states.shape) - - #[4,23,32] - #[4,32] + >>> print(outputs.shape) + [4, 23, 32] + >>> print(final_states.shape) + [4, 32] """ @@ -1263,20 +1263,19 @@ class BiRNN(Layer): .. code-block:: python - import paddle - - cell_fw = paddle.nn.LSTMCell(16, 32) - cell_bw = paddle.nn.LSTMCell(16, 32) - rnn = paddle.nn.BiRNN(cell_fw, cell_bw) + >>> import paddle - inputs = paddle.rand((2, 23, 16)) - outputs, final_states = rnn(inputs) + >>> cell_fw = paddle.nn.LSTMCell(16, 32) + >>> cell_bw = paddle.nn.LSTMCell(16, 32) + >>> rnn = paddle.nn.BiRNN(cell_fw, cell_bw) - print(outputs.shape) - print(final_states[0][0].shape,len(final_states),len(final_states[0])) + >>> inputs = paddle.rand((2, 23, 16)) + >>> outputs, final_states = rnn(inputs) - #[4,23,64] - #[2,32] 2 2 + >>> print(outputs.shape) + [2, 23, 64] + >>> print(final_states[0][0].shape,len(final_states),len(final_states[0])) + [2, 32] 2 2 """ @@ -1702,19 +1701,19 @@ class SimpleRNN(RNNBase): .. code-block:: python - import paddle + >>> import paddle - rnn = paddle.nn.SimpleRNN(16, 32, 2) + >>> rnn = paddle.nn.SimpleRNN(16, 32, 2) - x = paddle.randn((4, 23, 16)) - prev_h = paddle.randn((2, 4, 32)) - y, h = rnn(x, prev_h) + >>> x = paddle.randn((4, 23, 16)) + >>> prev_h = paddle.randn((2, 4, 32)) + >>> y, h = rnn(x, prev_h) - print(y.shape) - print(h.shape) + >>> print(y.shape) + [4, 23, 32] + >>> print(h.shape) + [2, 4, 32] - #[4,23,32] - #[2,4,32] """ @@ -1833,22 +1832,22 @@ class LSTM(RNNBase): .. 
code-block:: python - import paddle + >>> import paddle - rnn = paddle.nn.LSTM(16, 32, 2) + >>> rnn = paddle.nn.LSTM(16, 32, 2) - x = paddle.randn((4, 23, 16)) - prev_h = paddle.randn((2, 4, 32)) - prev_c = paddle.randn((2, 4, 32)) - y, (h, c) = rnn(x, (prev_h, prev_c)) + >>> x = paddle.randn((4, 23, 16)) + >>> prev_h = paddle.randn((2, 4, 32)) + >>> prev_c = paddle.randn((2, 4, 32)) + >>> y, (h, c) = rnn(x, (prev_h, prev_c)) - print(y.shape) - print(h.shape) - print(c.shape) + >>> print(y.shape) + [4, 23, 32] + >>> print(h.shape) + [2, 4, 32] + >>> print(c.shape) + [2, 4, 32] - #[4,23,32] - #[2,4,32] - #[2,4,32] """ @@ -1955,19 +1954,19 @@ class GRU(RNNBase): .. code-block:: python - import paddle + >>> import paddle - rnn = paddle.nn.GRU(16, 32, 2) + >>> rnn = paddle.nn.GRU(16, 32, 2) - x = paddle.randn((4, 23, 16)) - prev_h = paddle.randn((2, 4, 32)) - y, h = rnn(x, prev_h) + >>> x = paddle.randn((4, 23, 16)) + >>> prev_h = paddle.randn((2, 4, 32)) + >>> y, h = rnn(x, prev_h) - print(y.shape) - print(h.shape) + >>> print(y.shape) + [4, 23, 32] + >>> print(h.shape) + [2, 4, 32] - #[4,23,32] - #[2,4,32] """ diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 335b47d2599..e2e3f052240 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -141,14 +141,16 @@ class MultiHeadAttention(Layer): .. code-block:: python - import paddle - - # encoder input: [batch_size, sequence_length, d_model] - query = paddle.rand((2, 4, 128)) - # self attention mask: [batch_size, num_heads, query_len, query_len] - attn_mask = paddle.rand((2, 2, 4, 4)) - multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) - output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] + >>> import paddle + + >>> # encoder input: [batch_size, sequence_length, d_model] + >>> query = paddle.rand((2, 4, 128)) + >>> # self attention mask: [batch_size, num_heads, query_len, query_len] + >>> attn_mask = paddle.rand((2, 2, 4, 4)) + >>> multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) + >>> output = multi_head_attn(query, None, None, attn_mask=attn_mask) + >>> print(output.shape) + [2, 4, 128] """ Cache = collections.namedtuple("Cache", ["k", "v"]) @@ -490,15 +492,17 @@ class TransformerEncoderLayer(Layer): .. code-block:: python - import paddle - from paddle.nn import TransformerEncoderLayer - - # encoder input: [batch_size, src_len, d_model] - enc_input = paddle.rand((2, 4, 128)) - # self attention mask: [batch_size, n_head, src_len, src_len] - attn_mask = paddle.rand((2, 2, 4, 4)) - encoder_layer = TransformerEncoderLayer(128, 2, 512) - enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] + >>> import paddle + >>> from paddle.nn import TransformerEncoderLayer + + >>> # encoder input: [batch_size, src_len, d_model] + >>> enc_input = paddle.rand((2, 4, 128)) + >>> # self attention mask: [batch_size, n_head, src_len, src_len] + >>> attn_mask = paddle.rand((2, 2, 4, 4)) + >>> encoder_layer = TransformerEncoderLayer(128, 2, 512) + >>> enc_output = encoder_layer(enc_input, attn_mask) + >>> print(enc_output.shape) + [2, 4, 128] """ def __init__( @@ -659,16 +663,18 @@ class TransformerEncoder(Layer): .. 
code-block:: python - import paddle - from paddle.nn import TransformerEncoderLayer, TransformerEncoder - - # encoder input: [batch_size, src_len, d_model] - enc_input = paddle.rand((2, 4, 128)) - # self attention mask: [batch_size, n_head, src_len, src_len] - attn_mask = paddle.rand((2, 2, 4, 4)) - encoder_layer = TransformerEncoderLayer(128, 2, 512) - encoder = TransformerEncoder(encoder_layer, 2) - enc_output = encoder(enc_input, attn_mask) # [2, 4, 128] + >>> import paddle + >>> from paddle.nn import TransformerEncoderLayer, TransformerEncoder + + >>> # encoder input: [batch_size, src_len, d_model] + >>> enc_input = paddle.rand((2, 4, 128)) + >>> # self attention mask: [batch_size, n_head, src_len, src_len] + >>> attn_mask = paddle.rand((2, 2, 4, 4)) + >>> encoder_layer = TransformerEncoderLayer(128, 2, 512) + >>> encoder = TransformerEncoder(encoder_layer, 2) + >>> enc_output = encoder(enc_input, attn_mask) + >>> print(enc_output.shape) + [2, 4, 128] """ def __init__(self, encoder_layer, num_layers, norm=None): @@ -809,22 +815,24 @@ class TransformerDecoderLayer(Layer): .. code-block:: python - import paddle - from paddle.nn import TransformerDecoderLayer - - # decoder input: [batch_size, tgt_len, d_model] - dec_input = paddle.rand((2, 4, 128)) - # encoder output: [batch_size, src_len, d_model] - enc_output = paddle.rand((2, 6, 128)) - # self attention mask: [batch_size, n_head, tgt_len, tgt_len] - self_attn_mask = paddle.rand((2, 2, 4, 4)) - # cross attention mask: [batch_size, n_head, tgt_len, src_len] - cross_attn_mask = paddle.rand((2, 2, 4, 6)) - decoder_layer = TransformerDecoderLayer(128, 2, 512) - output = decoder_layer(dec_input, - enc_output, - self_attn_mask, - cross_attn_mask) # [2, 4, 128] + >>> import paddle + >>> from paddle.nn import TransformerDecoderLayer + + >>> # decoder input: [batch_size, tgt_len, d_model] + >>> dec_input = paddle.rand((2, 4, 128)) + >>> # encoder output: [batch_size, src_len, d_model] + >>> enc_output = paddle.rand((2, 6, 128)) + >>> # self attention mask: [batch_size, n_head, tgt_len, tgt_len] + >>> self_attn_mask = paddle.rand((2, 2, 4, 4)) + >>> # cross attention mask: [batch_size, n_head, tgt_len, src_len] + >>> cross_attn_mask = paddle.rand((2, 2, 4, 6)) + >>> decoder_layer = TransformerDecoderLayer(128, 2, 512) + >>> output = decoder_layer(dec_input, + ... enc_output, + ... self_attn_mask, + ... cross_attn_mask) + >>> print(output.shape) + [2, 4, 128] """ def __init__( @@ -1031,23 +1039,25 @@ class TransformerDecoder(Layer): .. 
code-block:: python - import paddle - from paddle.nn import TransformerDecoderLayer, TransformerDecoder - - # decoder input: [batch_size, tgt_len, d_model] - dec_input = paddle.rand((2, 4, 128)) - # encoder output: [batch_size, src_len, d_model] - enc_output = paddle.rand((2, 6, 128)) - # self attention mask: [batch_size, n_head, tgt_len, tgt_len] - self_attn_mask = paddle.rand((2, 2, 4, 4)) - # cross attention mask: [batch_size, n_head, tgt_len, src_len] - cross_attn_mask = paddle.rand((2, 2, 4, 6)) - decoder_layer = TransformerDecoderLayer(128, 2, 512) - decoder = TransformerDecoder(decoder_layer, 2) - output = decoder(dec_input, - enc_output, - self_attn_mask, - cross_attn_mask) # [2, 4, 128] + >>> import paddle + >>> from paddle.nn import TransformerDecoderLayer, TransformerDecoder + + >>> # decoder input: [batch_size, tgt_len, d_model] + >>> dec_input = paddle.rand((2, 4, 128)) + >>> # encoder output: [batch_size, src_len, d_model] + >>> enc_output = paddle.rand((2, 6, 128)) + >>> # self attention mask: [batch_size, n_head, tgt_len, tgt_len] + >>> self_attn_mask = paddle.rand((2, 2, 4, 4)) + >>> # cross attention mask: [batch_size, n_head, tgt_len, src_len] + >>> cross_attn_mask = paddle.rand((2, 2, 4, 6)) + >>> decoder_layer = TransformerDecoderLayer(128, 2, 512) + >>> decoder = TransformerDecoder(decoder_layer, 2) + >>> output = decoder(dec_input, - ... enc_output, + ... self_attn_mask, + ... cross_attn_mask) + >>> print(output.shape) + [2, 4, 128] """ def __init__(self, decoder_layer, num_layers, norm=None): @@ -1242,25 +1252,27 @@ class Transformer(Layer): .. code-block:: python - import paddle - from paddle.nn import Transformer - - # src: [batch_size, tgt_len, d_model] - enc_input = paddle.rand((2, 4, 128)) - # tgt: [batch_size, src_len, d_model] - dec_input = paddle.rand((2, 6, 128)) - # src_mask: [batch_size, n_head, src_len, src_len] - enc_self_attn_mask = paddle.rand((2, 2, 4, 4)) - # tgt_mask: [batch_size, n_head, tgt_len, tgt_len] - dec_self_attn_mask = paddle.rand((2, 2, 6, 6)) - # memory_mask: [batch_size, n_head, tgt_len, src_len] - cross_attn_mask = paddle.rand((2, 2, 6, 4)) - transformer = Transformer(128, 2, 4, 4, 512) - output = transformer(enc_input, - dec_input, - enc_self_attn_mask, - dec_self_attn_mask, - cross_attn_mask) # [2, 6, 128] + >>> import paddle + >>> from paddle.nn import Transformer + + >>> # src: [batch_size, src_len, d_model] + >>> enc_input = paddle.rand((2, 4, 128)) + >>> # tgt: [batch_size, tgt_len, d_model] + >>> dec_input = paddle.rand((2, 6, 128)) + >>> # src_mask: [batch_size, n_head, src_len, src_len] + >>> enc_self_attn_mask = paddle.rand((2, 2, 4, 4)) + >>> # tgt_mask: [batch_size, n_head, tgt_len, tgt_len] + >>> dec_self_attn_mask = paddle.rand((2, 2, 6, 6)) + >>> # memory_mask: [batch_size, n_head, tgt_len, src_len] + >>> cross_attn_mask = paddle.rand((2, 2, 6, 4)) + >>> transformer = Transformer(128, 2, 4, 4, 512) + >>> output = transformer(enc_input, + ... dec_input, + ... enc_self_attn_mask, + ... dec_self_attn_mask, + ... cross_attn_mask) + >>> print(output.shape) + [2, 6, 128] """ def __init__( @@ -1454,20 +1466,20 @@ class Transformer(Layer): Examples: .. code-block:: python - import paddle - from paddle.nn.layer.transformer import Transformer - length = 5 - d_model, n_head, dim_feedforward = 8, 4, 64 - transformer_paddle = Transformer( - d_model, n_head, dim_feedforward=dim_feedforward) - mask = transformer_paddle.generate_square_subsequent_mask(length) - print(mask) - - # [[ 0. -inf -inf -inf -inf] - # [ 0. 0.
-inf -inf -inf] - # [ 0. 0. 0. -inf -inf] - # [ 0. 0. 0. 0. -inf] - # [ 0. 0. 0. 0. 0.]] + >>> import paddle + >>> from paddle.nn.layer.transformer import Transformer + >>> length = 5 + >>> d_model, n_head, dim_feedforward = 8, 4, 64 + >>> transformer_paddle = Transformer( + ... d_model, n_head, dim_feedforward=dim_feedforward) + >>> mask = transformer_paddle.generate_square_subsequent_mask(length) + >>> print(mask) + Tensor(shape=[5, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[ 0. , -inf., -inf., -inf., -inf.], + [ 0. , 0. , -inf., -inf., -inf.], + [ 0. , 0. , 0. , -inf., -inf.], + [ 0. , 0. , 0. , 0. , -inf.], + [ 0. , 0. , 0. , 0. , 0. ]]) """ return paddle.tensor.triu( diff --git a/python/paddle/nn/layer/vision.py b/python/paddle/nn/layer/vision.py index a48be90e74c..996699e513c 100644 --- a/python/paddle/nn/layer/vision.py +++ b/python/paddle/nn/layer/vision.py @@ -46,14 +46,14 @@ class PixelShuffle(Layer): Examples: .. code-block:: python - import paddle - import paddle.nn as nn + >>> import paddle + >>> import paddle.nn as nn - x = paddle.randn(shape=[2,9,4,4]) - pixel_shuffle = nn.PixelShuffle(3) - out = pixel_shuffle(x) - print(out.shape) - # [2, 1, 12, 12] + >>> x = paddle.randn(shape=[2, 9, 4, 4]) + >>> pixel_shuffle = nn.PixelShuffle(3) + >>> out = pixel_shuffle(x) + >>> print(out.shape) + [2, 1, 12, 12] """ @@ -109,14 +109,14 @@ class PixelUnshuffle(Layer): Examples: .. code-block:: python - import paddle - import paddle.nn as nn + >>> import paddle + >>> import paddle.nn as nn - x = paddle.randn([2, 1, 12, 12]) - pixel_unshuffle = nn.PixelUnshuffle(3) - out = pixel_unshuffle(x) - print(out.shape) - # [2, 9, 4, 4] + >>> x = paddle.randn([2, 1, 12, 12]) + >>> pixel_unshuffle = nn.PixelUnshuffle(3) + >>> out = pixel_unshuffle(x) + >>> print(out.shape) + [2, 9, 4, 4] """ @@ -175,24 +175,28 @@ class ChannelShuffle(Layer): Examples: .. code-block:: python - import paddle - import paddle.nn as nn - x = paddle.arange(0, 0.6, 0.1, 'float32') - x = paddle.reshape(x, [1, 6, 1, 1]) - # [[[[0. ]], - # [[0.10000000]], - # [[0.20000000]], - # [[0.30000001]], - # [[0.40000001]], - # [[0.50000000]]]] - channel_shuffle = nn.ChannelShuffle(3) - y = channel_shuffle(x) - # [[[[0. ]], - # [[0.20000000]], - # [[0.40000001]], - # [[0.10000000]], - # [[0.30000001]], - # [[0.50000000]]]] + >>> import paddle + >>> import paddle.nn as nn + >>> x = paddle.arange(0, 0.6, 0.1, 'float32') + >>> x = paddle.reshape(x, [1, 6, 1, 1]) + >>> print(x) + Tensor(shape=[1, 6, 1, 1], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[0. ]], + [[0.10000000]], + [[0.20000000]], + [[0.30000001]], + [[0.40000001]], + [[0.50000000]]]]) + >>> channel_shuffle = nn.ChannelShuffle(3) + >>> y = channel_shuffle(x) + >>> print(y) + Tensor(shape=[1, 6, 1, 1], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[0. ]], + [[0.20000000]], + [[0.40000001]], + [[0.10000000]], + [[0.30000001]], + [[0.50000000]]]]) """ def __init__(self, groups, data_format="NCHW", name=None): -- GitLab
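The reformatted `>>> ` blocks above follow the Google/xdoctest sample-code style that this PR series targets. As a minimal local sanity check (a sketch, assuming the `xdoctest` package and `paddle` are both installed; `paddle.nn.layer.norm` is just one of the five modules touched by this patch):

import xdoctest

# Run every doctest-style example in one converted module. Blocks guarded by
# `# doctest: +REQUIRES(env:GPU)` are skipped unless the GPU environment
# variable is set, so this is safe on a CPU-only machine.
xdoctest.doctest_module("paddle.nn.layer.norm", command="all")

The same check can be run from a shell with `python -m xdoctest paddle.nn.layer.norm all`.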