From 4ff6999aa5e63b54c00c82eddb10d22a70c8ca59 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Thu, 3 Aug 2023 10:53:45 +0800 Subject: [PATCH] [xdoctest] reformat example code with google style No.80-85 (#55806) * [Doctest]fix No.80-85, test=docs_preview * test=docs_preview * test=docs_preview * test=docs_preview * test=docs_preview * test=docs_preview * test=docs_preview * fix * Apply suggestions from code review * Apply suggestions from code review * Apply suggestions from code review * test=docs_preview * test=docs_preview * test=docs_preview * test=docs_preview --------- Co-authored-by: Nyakku Shigure --- python/paddle/nn/functional/pooling.py | 468 +++++---- .../paddle/nn/functional/sparse_attention.py | 89 +- python/paddle/nn/layer/activation.py | 493 ++++----- python/paddle/nn/layer/common.py | 684 ++++++------- python/paddle/nn/layer/layers.py | 950 ++++++++++-------- 5 files changed, 1451 insertions(+), 1233 deletions(-) diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 143f37ddc4d..955d63469d3 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -219,13 +219,14 @@ def avg_pool1d( Examples: .. code-block:: python - import paddle - import paddle.nn as nn - - data = paddle.uniform([1, 3, 32], paddle.float32) - AvgPool1D = nn.AvgPool1D(kernel_size=2, stride=2, padding=0) - pool_out = AvgPool1D(data) - # pool_out shape: [1, 3, 16] + >>> import paddle + >>> import paddle.nn as nn + + >>> data = paddle.uniform([1, 3, 32], paddle.float32) + >>> AvgPool1D = nn.AvgPool1D(kernel_size=2, stride=2, padding=0) + >>> pool_out = AvgPool1D(data) + >>> print(pool_out.shape) + [1, 3, 16] """ """NCL to NCHW""" data_format = "NCHW" @@ -350,15 +351,16 @@ def avg_pool2d( Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - # avg pool2d - x = paddle.uniform([1, 3, 32, 32], paddle.float32) - out = F.avg_pool2d(x, - kernel_size=2, - stride=2, padding=0) - # out.shape [1, 3, 16, 16] + >>> # avg pool2d + >>> x = paddle.uniform([1, 3, 32, 32], paddle.float32) + >>> out = F.avg_pool2d(x, + ... kernel_size=2, + ... stride=2, padding=0) + >>> print(out.shape) + [1, 3, 16, 16] """ kernel_size = convert_to_list(kernel_size, 2, 'pool_size') if stride is None: @@ -480,16 +482,16 @@ def avg_pool3d( Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.uniform([1, 3, 32, 32, 32], paddle.float32) - # avg pool3d - out = paddle.nn.functional.avg_pool3d( - x, - kernel_size = 2, - stride = 2, - padding=0) - # out.shape: [1, 3, 16, 16, 16] + >>> x = paddle.uniform([1, 3, 32, 32, 32], paddle.float32) + >>> # avg pool3d + >>> out = paddle.nn.functional.avg_pool3d(x, + ... kernel_size = 2, + ... stride = 2, + ... padding=0) + >>> print(out.shape) + [1, 3, 16, 16, 16] """ kernel_size = convert_to_list(kernel_size, 3, 'pool_size') if stride is None: @@ -599,14 +601,18 @@ def max_pool1d( Examples: .. 
code-block:: python - import paddle - import paddle.nn.functional as F - - data = paddle.uniform([1, 3, 32], paddle.float32) - pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0) - # pool_out shape: [1, 3, 16] - pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_mask=True) - # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] + >>> import paddle + >>> import paddle.nn.functional as F + + >>> data = paddle.uniform([1, 3, 32], paddle.float32) + >>> pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0) + >>> print(pool_out.shape) + [1, 3, 16] + >>> pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_mask=True) + >>> print(pool_out.shape) + [1, 3, 16] + >>> print(indices.shape) + [1, 3, 16] """ """NCL to NCHW""" data_format = "NCHW" @@ -789,14 +795,18 @@ def max_unpool1d( Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - data = paddle.rand(shape=[1, 3, 16]) - pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_mask=True) - # pool_out shape: [1, 3, 8], indices shape: [1, 3, 8] - unpool_out = F.max_unpool1d(pool_out, indices, kernel_size=2, padding=0) - # unpool_out shape: [1, 3, 16] + >>> data = paddle.rand(shape=[1, 3, 16]) + >>> pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_mask=True) + >>> print(pool_out.shape) + [1, 3, 8] + >>> print(indices.shape) + [1, 3, 8] + >>> unpool_out = F.max_unpool1d(pool_out, indices, kernel_size=2, padding=0) + >>> print(unpool_out.shape) + [1, 3, 16] """ """NCL to NCHW""" @@ -926,18 +936,23 @@ def max_unpool2d( Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - data = paddle.rand(shape=[1,1,6,6]) - pool_out, indices = F.max_pool2d(data, kernel_size=2, stride=2, padding=0, return_mask=True) - # pool_out shape: [1, 1, 3, 3], indices shape: [1, 1, 3, 3] - unpool_out = F.max_unpool2d(pool_out, indices, kernel_size=2, padding=0) - # unpool_out shape: [1, 1, 6, 6] + >>> data = paddle.rand(shape=[1, 1, 6, 6]) + >>> pool_out, indices = F.max_pool2d(data, kernel_size=2, stride=2, padding=0, return_mask=True) + >>> print(pool_out.shape) + [1, 1, 3, 3] + >>> print(indices.shape) + [1, 1, 3, 3] + >>> unpool_out = F.max_unpool2d(pool_out, indices, kernel_size=2, padding=0) + >>> print(unpool_out.shape) + [1, 1, 6, 6] - # specify a different output size than input size - unpool_out = F.max_unpool2d(pool_out, indices, kernel_size=2, padding=0, output_size=[7,7]) - # unpool_out shape: [1, 1, 7, 7] + >>> # specify a different output size than input size + >>> unpool_out = F.max_unpool2d(pool_out, indices, kernel_size=2, padding=0, output_size=[7, 7]) + >>> print(unpool_out.shape) + [1, 1, 7, 7] """ if x.ndim != 4: @@ -1073,14 +1088,18 @@ def max_unpool3d( Examples: .. 
code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - data = paddle.rand(shape=[1, 1, 4, 4, 6]) - pool_out, indices = F.max_pool3d(data, kernel_size=2, stride=2, padding=0, return_mask=True) - # pool_out shape: [1, 1, 2, 2, 3], indices shape: [1, 1, 2, 2, 3] - unpool_out = F.max_unpool3d(pool_out, indices, kernel_size=2, padding=0) - # unpool_out shape: [1, 1, 4, 4, 6] + >>> data = paddle.rand(shape=[1, 1, 4, 4, 6]) + >>> pool_out, indices = F.max_pool3d(data, kernel_size=2, stride=2, padding=0, return_mask=True) + >>> print(pool_out.shape) + [1, 1, 2, 2, 3] + >>> print(indices.shape) + [1, 1, 2, 2, 3] + >>> unpool_out = F.max_unpool3d(pool_out, indices, kernel_size=2, padding=0) + >>> print(unpool_out.shape) + [1, 1, 4, 4, 6] """ if x.ndim != 5: @@ -1200,16 +1219,20 @@ def max_pool2d( Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - - # max pool2d - x = paddle.uniform([1, 3, 32, 32], paddle.float32) - out = F.max_pool2d(x, kernel_size=2, stride=2, padding=0) - # output.shape [1, 3, 16, 16] - # for return_mask=True - out, max_indices = F.max_pool2d(x, kernel_size=2, stride=2, padding=0, return_mask=True) - # out.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], + >>> import paddle + >>> import paddle.nn.functional as F + + >>> # max pool2d + >>> x = paddle.uniform([1, 3, 32, 32], paddle.float32) + >>> out = F.max_pool2d(x, kernel_size=2, stride=2, padding=0) + >>> print(out.shape) + [1, 3, 16, 16] + >>> # for return_mask=True + >>> out, max_indices = F.max_pool2d(x, kernel_size=2, stride=2, padding=0, return_mask=True) + >>> print(out.shape) + [1, 3, 16, 16] + >>> print(max_indices.shape) + [1, 3, 16, 16] """ kernel_size = convert_to_list(kernel_size, 2, 'pool_size') @@ -1359,24 +1382,30 @@ def max_pool3d( Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - - # max pool3d - x = paddle.uniform([1, 3, 32, 32, 32]) - output = F.max_pool3d(x, - kernel_size=2, - stride=2, padding=0) - # output.shape [1, 3, 16, 16, 16] - # for return_mask=True - x = paddle.uniform([1, 3, 32, 32, 32]) - output, max_indices = paddle.nn.functional.max_pool3d(x, - kernel_size=2, - stride=2, - padding=0, - return_mask=True) - - # output.shape [1, 3, 16, 16, 16], max_indices.shape [1, 3, 16, 16, 16] + >>> import paddle + >>> import paddle.nn.functional as F + + >>> # max pool3d + >>> x = paddle.uniform([1, 3, 32, 32, 32]) + >>> output = F.max_pool3d(x, + ... kernel_size=2, + ... stride=2, + ... padding=0) + >>> print(output.shape) + [1, 3, 16, 16, 16] + + >>> # for return_mask=True + >>> x = paddle.uniform([1, 3, 32, 32, 32]) + >>> output, max_indices = paddle.nn.functional.max_pool3d(x, + ... kernel_size=2, + ... stride=2, + ... padding=0, + ... return_mask=True) + ... + >>> print(output.shape) + [1, 3, 16, 16, 16] + >>> print(max_indices.shape) + [1, 3, 16, 16, 16] """ kernel_size = convert_to_list(kernel_size, 3, 'pool_size') @@ -1468,24 +1497,25 @@ def adaptive_avg_pool1d(x, output_size, name=None): Examples: .. code-block:: python - # average adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. 
- # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend) - # - import paddle - import paddle.nn.functional as F - - data = paddle.uniform([1, 3, 32]) - pool_out = F.adaptive_avg_pool1d(data, output_size=16) - # pool_out shape: [1, 3, 16]) + >>> # average adaptive pool1d + >>> # suppose input data in shape of [N, C, L], `output_size` is m or [m], + >>> # output shape is [N, C, m], adaptive pool divide L dimension + >>> # of input data into m grids averagely and performs poolings in each + >>> # grid to get output. + >>> # adaptive max pool performs calculations as follow: + >>> # + >>> # for i in range(m): + >>> # lstart = floor(i * L / m) + >>> # lend = ceil((i + 1) * L / m) + >>> # output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend) + >>> # + >>> import paddle + >>> import paddle.nn.functional as F + + >>> data = paddle.uniform([1, 3, 32]) + >>> pool_out = F.adaptive_avg_pool1d(data, output_size=16) + >>> print(pool_out.shape) + [1, 3, 16] """ pool_type = 'avg' _check_input(x, 3) @@ -1567,29 +1597,29 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): Examples: .. code-block:: python - # adaptive avg pool2d - # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], - # output shape is [N, C, m, n], adaptive pool divide H and W dimensions - # of input data into m * n grids averagely and performs poolings in each - # grid to get output. - # adaptive avg pool performs calculations as follow: - # - # for i in range(m): - # for j in range(n): - # hstart = floor(i * H / m) - # hend = ceil((i + 1) * H / m) - # wstart = floor(i * W / n) - # wend = ceil((i + 1) * W / n) - # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) - # - import paddle - - x = paddle.rand([2, 3, 32, 32]) - # x.shape is [2, 3, 32, 32] - out = paddle.nn.functional.adaptive_avg_pool2d( - x = x, - output_size=[3, 3]) - # out.shape is [2, 3, 3, 3] + >>> # adaptive avg pool2d + >>> # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], + >>> # output shape is [N, C, m, n], adaptive pool divide H and W dimensions + >>> # of input data into m * n grids averagely and performs poolings in each + >>> # grid to get output. + >>> # adaptive avg pool performs calculations as follow: + >>> # + >>> # for i in range(m): + >>> # for j in range(n): + >>> # hstart = floor(i * H / m) + >>> # hend = ceil((i + 1) * H / m) + >>> # wstart = floor(i * W / n) + >>> # wend = ceil((i + 1) * W / n) + >>> # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) + >>> # + >>> import paddle + + >>> x = paddle.rand([2, 3, 32, 32]) + >>> # x.shape is [2, 3, 32, 32] + >>> out = paddle.nn.functional.adaptive_avg_pool2d(x = x, + ... output_size=[3, 3]) + >>> print(out.shape) + [2, 3, 3, 3] """ if data_format not in ["NCHW", "NHWC"]: @@ -1700,31 +1730,31 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): Examples: .. code-block:: python - # adaptive avg pool3d - # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], - # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions - # of input data into l * m * n grids averagely and performs poolings in each - # grid to get output. 
- # adaptive avg pool performs calculations as follow: - # - # for i in range(l): - # for j in range(m): - # for k in range(n): - # dstart = floor(i * D / l) - # dend = ceil((i + 1) * D / l) - # hstart = floor(j * H / m) - # hend = ceil((j + 1) * H / m) - # wstart = floor(k * W / n) - # wend = ceil((k + 1) * W / n) - # output[:, :, i, j, k] = - # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) - import paddle - - input_data = paddle.randn(shape=(2, 3, 8, 32, 32)) - out = paddle.nn.functional.adaptive_avg_pool3d( - x = input_data, - output_size=[3, 3, 3]) - # out.shape is [2, 3, 3, 3, 3] + >>> # adaptive avg pool3d + >>> # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], + >>> # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions + >>> # of input data into l * m * n grids averagely and performs poolings in each + >>> # grid to get output. + >>> # adaptive avg pool performs calculations as follow: + >>> # + >>> # for i in range(l): + >>> # for j in range(m): + >>> # for k in range(n): + >>> # dstart = floor(i * D / l) + >>> # dend = ceil((i + 1) * D / l) + >>> # hstart = floor(j * H / m) + >>> # hend = ceil((j + 1) * H / m) + >>> # wstart = floor(k * W / n) + >>> # wend = ceil((k + 1) * W / n) + >>> # output[:, :, i, j, k] = + >>> # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) + >>> import paddle + + >>> input_data = paddle.randn(shape=(2, 3, 8, 32, 32)) + >>> out = paddle.nn.functional.adaptive_avg_pool3d(x = input_data, + ... output_size=[3, 3, 3]) + >>> print(out.shape) + [2, 3, 3, 3, 3] """ if data_format not in ["NCDHW", "NDHWC"]: @@ -1815,26 +1845,30 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None): Examples: .. code-block:: python - # max adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = max(input[:, :, lstart: lend]) - # - import paddle - import paddle.nn.functional as F - - data = paddle.uniform([1, 3, 32], paddle.float32) - pool_out = F.adaptive_max_pool1d(data, output_size=16) - # pool_out shape: [1, 3, 16]) - pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_mask=True) - # pool_out shape: [1, 3, 16] indices shape: [1, 3, 16] + >>> # max adaptive pool1d + >>> # suppose input data in shape of [N, C, L], `output_size` is m or [m], + >>> # output shape is [N, C, m], adaptive pool divide L dimension + >>> # of input data into m grids averagely and performs poolings in each + >>> # grid to get output. 
+ >>> # adaptive max pool performs calculations as follow: + >>> # + >>> # for i in range(m): + >>> # lstart = floor(i * L / m) + >>> # lend = ceil((i + 1) * L / m) + >>> # output[:, :, i] = max(input[:, :, lstart: lend]) + >>> # + >>> import paddle + >>> import paddle.nn.functional as F + + >>> data = paddle.uniform([1, 3, 32], paddle.float32) + >>> pool_out = F.adaptive_max_pool1d(data, output_size=16) + >>> print(pool_out.shape) + [1, 3, 16] + >>> pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_mask=True) + >>> print(pool_out.shape) + [1, 3, 16] + >>> print(indices.shape) + [1, 3, 16] """ _check_input(x, 3) @@ -1901,28 +1935,28 @@ def adaptive_max_pool2d(x, output_size, return_mask=False, name=None): Examples: .. code-block:: python - # max adaptive pool2d - # suppose input data in the shape of [N, C, H, W], `output_size` is [m, n] - # output shape is [N, C, m, n], adaptive pool divide H and W dimensions - # of input data into m*n grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # for j in range(n): - # hstart = floor(i * H / m) - # hend = ceil((i + 1) * H / m) - # wstart = floor(i * W / n) - # wend = ceil((i + 1) * W / n) - # output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend]) - # - import paddle - - input_data = paddle.randn(shape=(2, 3, 32, 32)) - out = paddle.nn.functional.adaptive_max_pool2d( - x = input_data, - output_size=[3, 3]) - # out.shape is [2, 3, 3, 3] + >>> # max adaptive pool2d + >>> # suppose input data in the shape of [N, C, H, W], `output_size` is [m, n] + >>> # output shape is [N, C, m, n], adaptive pool divide H and W dimensions + >>> # of input data into m*n grids averagely and performs poolings in each + >>> # grid to get output. + >>> # adaptive max pool performs calculations as follow: + >>> # + >>> # for i in range(m): + >>> # for j in range(n): + >>> # hstart = floor(i * H / m) + >>> # hend = ceil((i + 1) * H / m) + >>> # wstart = floor(i * W / n) + >>> # wend = ceil((i + 1) * W / n) + >>> # output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend]) + >>> # + >>> import paddle + + >>> input_data = paddle.randn(shape=(2, 3, 32, 32)) + >>> out = paddle.nn.functional.adaptive_max_pool2d(x = input_data, + ... output_size=[3, 3]) + >>> print(out.shape) + [2, 3, 3, 3] """ _check_input(x, 4) @@ -1987,31 +2021,31 @@ def adaptive_max_pool3d(x, output_size, return_mask=False, name=None): Examples: .. code-block:: python - # adaptive max pool3d - # suppose input data in the shape of [N, C, D, H, W], `output_size` is [l, m, n] - # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions - # of input data into m*n grids averagely and performs poolings in each - # grid to get output. 
- # adaptive max pool performs calculations as follow: - # - # for i in range(l): - # for j in range(m): - # for k in range(n): - # dstart = floor(i * D / l) - # dend = ceil((i + 1) * D / l) - # hstart = floor(i * H / m) - # hend = ceil((i + 1) * H / m) - # wstart = floor(i * W / n) - # wend = ceil((i + 1) * W / n) - # output[:, :, i, j, k] = max(input[:, :, dstart: dend, hstart: hend, wstart: wend]) - # - import paddle - - input_data = paddle.randn(shape=(2, 3, 8, 32, 32)) - out = paddle.nn.functional.adaptive_max_pool3d( - x = input_data, - output_size=[3, 3, 3]) - # out.shape is [2, 3, 3, 3, 3] + >>> # adaptive max pool3d + >>> # suppose input data in the shape of [N, C, D, H, W], `output_size` is [l, m, n] + >>> # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions + >>> # of input data into m*n grids averagely and performs poolings in each + >>> # grid to get output. + >>> # adaptive max pool performs calculations as follow: + >>> # + >>> # for i in range(l): + >>> # for j in range(m): + >>> # for k in range(n): + >>> # dstart = floor(i * D / l) + >>> # dend = ceil((i + 1) * D / l) + >>> # hstart = floor(i * H / m) + >>> # hend = ceil((i + 1) * H / m) + >>> # wstart = floor(i * W / n) + >>> # wend = ceil((i + 1) * W / n) + >>> # output[:, :, i, j, k] = max(input[:, :, dstart: dend, hstart: hend, wstart: wend]) + >>> # + >>> import paddle + + >>> input_data = paddle.randn(shape=(2, 3, 8, 32, 32)) + >>> out = paddle.nn.functional.adaptive_max_pool3d(x = input_data, + ... output_size=[3, 3, 3]) + >>> print(out.shape) + [2, 3, 3, 3, 3] """ _check_input(x, 5) diff --git a/python/paddle/nn/functional/sparse_attention.py b/python/paddle/nn/functional/sparse_attention.py index df95efb1705..bef511a3fa8 100644 --- a/python/paddle/nn/functional/sparse_attention.py +++ b/python/paddle/nn/functional/sparse_attention.py @@ -88,50 +88,51 @@ def sparse_attention( Examples: .. code-block:: python - # required: skiptest - import paddle - - paddle.disable_static() - - # `query`, `key` and `value` all have shape [1, 1, 4, 2] - query = paddle.to_tensor([[[[0, 1, ], [2, 3], - [0, 1], [2, 3]]]], dtype="float32") - key = paddle.to_tensor([[[[0, 1], [2, 3], - [0, 1], [2, 3]]]], dtype="float32") - value = paddle.to_tensor([[[[0, 1], [2, 3], - [0, 1], [2, 3]]]], dtype="float32") - - - offset = paddle.to_tensor([[[0, 2, 4, 6, 8]]], dtype="int32") - columns = paddle.to_tensor([[[0, 1, 0, 1, 2, 3, 2, 3]]], dtype="int32") - - print(offset.shape) # (1, 1, 5) - print(columns.shape) # (1, 1, 8) - - key_padding_mask = paddle.to_tensor([[1, 1, 1, 0]], dtype="float32") - attention_mask = paddle.to_tensor([[1, 0, 1, 1], - [1, 1, 1, 1], - [1, 1, 1, 1], - [1, 1, 1, 1]], dtype="float32") - output_mask = paddle.nn.functional.sparse_attention(query, key, - value, offset, columns, - key_padding_mask=key_padding_mask, - attn_mask=attention_mask) - print(output_mask) - # Tensor(shape=[1, 1, 4, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[[[0. , 1. ], - # [1.99830270, 2.99830270], - # [0. , 1. ], - # [0. , 1. 
]]]]) - - output = paddle.nn.functional.sparse_attention(query, key, - value, offset, columns) - print(output) - # Tensor(shape=[1, 1, 4, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[[[1.60885942, 2.60885954], - # [1.99830270, 2.99830270], - # [1.60885942, 2.60885954], - # [1.99830270, 2.99830270]]]]) + >>> # doctest: +SKIP('This API is only used in CUDA11.3 and above.') + >>> import paddle + + >>> paddle.disable_static() + + >>> # `query`, `key` and `value` all have shape [1, 1, 4, 2] + >>> query = paddle.to_tensor([[[[0, 1, ], [2, 3], + ... [0, 1], [2, 3]]]], dtype="float32") + >>> key = paddle.to_tensor([[[[0, 1], [2, 3], + ... [0, 1], [2, 3]]]], dtype="float32") + >>> value = paddle.to_tensor([[[[0, 1], [2, 3], + ... [0, 1], [2, 3]]]], dtype="float32") + ... + >>> offset = paddle.to_tensor([[[0, 2, 4, 6, 8]]], dtype="int32") + >>> columns = paddle.to_tensor([[[0, 1, 0, 1, 2, 3, 2, 3]]], dtype="int32") + ... + >>> print(offset.shape) + [1, 1, 5] + >>> print(columns.shape) + [1, 1, 8] + ... + >>> key_padding_mask = paddle.to_tensor([[1, 1, 1, 0]], dtype="float32") + >>> attention_mask = paddle.to_tensor([[1, 0, 1, 1], + ... [1, 1, 1, 1], + ... [1, 1, 1, 1], + ... [1, 1, 1, 1]], dtype="float32") + >>> output_mask = paddle.nn.functional.sparse_attention(query, key, + ... value, offset, columns, + ... key_padding_mask=key_padding_mask, + ... attn_mask=attention_mask) + >>> print(output_mask) + Tensor(shape=[1, 1, 4, 2], dtype=float32, place=Place(cpu), stop_gradient=False, + [[[[0. , 1. ], + [1.99830270, 2.99830270], + [0. , 1. ], + [0. , 1. ]]]]) + + >>> output = paddle.nn.functional.sparse_attention(query, key, + ... value, offset, columns) + >>> print(output) + Tensor(shape=[1, 1, 4, 2], dtype=float32, place=Place(cpu), stop_gradient=False, + [[[[1.60885942, 2.60885954], + [1.99830270, 2.99830270], + [1.60885942, 2.60885954], + [1.99830270, 2.99830270]]]]) """ if in_dynamic_mode(): ( diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 3a28e63c203..4bcb19ea95c 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -43,13 +43,15 @@ class CELU(Layer): Examples: .. code-block:: python - import paddle - - x = paddle.to_tensor([[-1. ,6.], [1., 15.6]]) - m = paddle.nn.CELU(0.2) - out = m(x) - # [[-0.19865242, 6. ], - # [ 1. , 15.60000038]] + >>> import paddle + + >>> x = paddle.to_tensor([[-1. ,6.], [1., 15.6]]) + >>> m = paddle.nn.CELU(0.2) + >>> out = m(x) + >>> print(out) + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.19865242, 6. ], + [ 1. , 15.60000038]]) """ def __init__(self, alpha=1.0, name=None): @@ -91,13 +93,15 @@ class ELU(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([[-1. ,6.], [1., 15.6]]) - m = paddle.nn.ELU(0.2) - out = m(x) - # [[-0.12642411 6. ] - # [ 1. 15.6 ]] + >>> x = paddle.to_tensor([[-1. ,6.], [1., 15.6]]) + >>> m = paddle.nn.ELU(0.2) + >>> out = m(x) + >>> print(out) + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.12642412, 6. ], + [ 1. , 15.60000038]]) """ def __init__(self, alpha=1.0, name=None): @@ -141,15 +145,20 @@ class GELU(Layer): Examples: .. 
code-block:: python - import paddle - - x = paddle.to_tensor([[-1, 0.5],[1, 1.5]]) - - m = paddle.nn.GELU() - out = m(x) # [-0.158655 0.345731 0.841345 1.39979] - - m = paddle.nn.GELU(True) - out = m(x) # [-0.158808 0.345714 0.841192 1.39957] + >>> import paddle + >>> x = paddle.to_tensor([[-1, 0.5],[1, 1.5]]) + >>> m = paddle.nn.GELU() + >>> out = m(x) + >>> print(out) + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.15865529, 0.34573123], + [ 0.84134471, 1.39978933]]) + >>> m = paddle.nn.GELU(True) + >>> out = m(x) + >>> print(out) + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.15880796, 0.34571400], + [ 0.84119201, 1.39957154]]) """ def __init__(self, approximate=False, name=None): @@ -193,11 +202,14 @@ class Hardshrink(Layer): .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([-1, 0.3, 2.5]) - m = paddle.nn.Hardshrink() - out = m(x) # [-1., 0., 2.5] + >>> x = paddle.to_tensor([-1, 0.3, 2.5]) + >>> m = paddle.nn.Hardshrink() + >>> out = m(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [-1. , 0. , 2.50000000]) """ def __init__(self, threshold=0.5, name=None): @@ -244,11 +256,14 @@ class Hardswish(Layer): .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([-4., 5., 1.]) - m = paddle.nn.Hardswish() - out = m(x) # [0., 5., 0.666667] + >>> x = paddle.to_tensor([-4., 5., 1.]) + >>> m = paddle.nn.Hardswish() + >>> out = m(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0. , 5. , 0.66666669]) """ def __init__(self, name=None): @@ -282,14 +297,14 @@ class Tanh(Layer): .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - m = paddle.nn.Tanh() - out = m(x) - print(out) - # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [-0.37994894, -0.19737533, 0.09966800, 0.29131261]) + >>> x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + >>> m = paddle.nn.Tanh() + >>> out = m(x) + >>> print(out) + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0.37994900, -0.19737528, 0.09966799, 0.29131261]) """ def __init__(self, name=None): @@ -333,11 +348,14 @@ class Hardtanh(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([-1.5, 0.3, 2.5]) - m = paddle.nn.Hardtanh() - out = m(x) # [-1., 0.3, 1.] + >>> x = paddle.to_tensor([-1.5, 0.3, 2.5]) + >>> m = paddle.nn.Hardtanh() + >>> out = m(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [-1. , 0.30000001, 1. ]) """ def __init__(self, min=-1.0, max=1.0, name=None): @@ -386,25 +404,25 @@ class PReLU(Layer): Examples: .. code-block:: python - import paddle - paddle.set_default_dtype("float64") - - data = paddle.to_tensor([[[[-2.0, 3.0, -4.0, 5.0], - [ 3.0, -4.0, 5.0, -6.0], - [-7.0, -8.0, 8.0, 9.0]], - [[ 1.0, -2.0, -3.0, 4.0], - [-5.0, 6.0, 7.0, -8.0], - [ 6.0, 7.0, 8.0, 9.0]]]]) - - m = paddle.nn.PReLU(1, 0.25) - out = m(data) - print(out) - # [[[[-0.5 , 3. , -1. , 5. ], - # [ 3. , -1. , 5. , -1.5 ], - # [-1.75, -2. , 8. , 9. ]], - # [[ 1. , -0.5 , -0.75, 4. ], - # [-1.25, 6. , 7. , -2. ], - # [ 6. , 7. , 8. , 9. ]]]] + >>> import paddle + + >>> data = paddle.to_tensor([[[[-2.0, 3.0, -4.0, 5.0], + ... [ 3.0, -4.0, 5.0, -6.0], + ... [-7.0, -8.0, 8.0, 9.0]], + ... [[ 1.0, -2.0, -3.0, 4.0], + ... [-5.0, 6.0, 7.0, -8.0], + ... 
[ 6.0, 7.0, 8.0, 9.0]]]]) + ... + >>> m = paddle.nn.PReLU(1, 0.25) + >>> out = m(data) + >>> print(out) + Tensor(shape=[1, 2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=False, + [[[[-0.50000000, 3. , -1. , 5. ], + [ 3. , -1. , 5. , -1.50000000], + [-1.75000000, -2. , 8. , 9. ]], + [[ 1. , -0.50000000, -0.75000000, 4. ], + [-1.25000000, 6. , 7. , -2. ], + [ 6. , 7. , 8. , 9. ]]]]) """ def __init__( @@ -495,24 +513,26 @@ class RReLU(Layer): Examples: .. code-block:: python - import paddle - - input_tensor = paddle.to_tensor([[[[-2.0, 3.0, -4.0, 5.0], - [ 3.0, -4.0, 5.0, -6.0], - [-7.0, -8.0, 8.0, 9.0]], - [[ 1.0, -2.0, -3.0, 4.0], - [-5.0, 6.0, 7.0, -8.0], - [ 6.0, 7.0, 8.0, 9.0]]]], dtype='float32') - - rrelu_layer = paddle.nn.RReLU(0.1, 0.3) - out = rrelu_layer(input_tensor) - print(out) - #[[[[-0.20000899 3. -0.88108218 5. ] - # [ 3. -0.55175185 5. -1.07761011] - # [-1.06806871 -1.98962009 8. 9. ]] - # [[ 1. -0.52382672 -0.65515128 4. ] - # [-1.37663394 6. 7. -2.34657836] - # [ 6. 7. 8. 9. ]]]] + >>> import paddle + >>> paddle.seed(2023) + + >>> input_tensor = paddle.to_tensor([[[[-2.0, 3.0, -4.0, 5.0], + ... [ 3.0, -4.0, 5.0, -6.0], + ... [-7.0, -8.0, 8.0, 9.0]], + ... [[ 1.0, -2.0, -3.0, 4.0], + ... [-5.0, 6.0, 7.0, -8.0], + ... [ 6.0, 7.0, 8.0, 9.0]]]], dtype='float32') + ... + >>> rrelu_layer = paddle.nn.RReLU(0.1, 0.3) + >>> out = rrelu_layer(input_tensor) + >>> print(out) + Tensor(shape=[1, 2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[-0.54633451, 3. , -0.81611776, 5. ], + [ 3. , -0.60768753, 5. , -1.68630385], + [-1.29360127, -1.45026064, 8. , 9. ]], + [[ 1. , -0.58808362, -0.74662417, 4. ], + [-1.01785135, 6. , 7. , -1.97268605], + [ 6. , 7. , 8. , 9. ]]]]) """ def __init__(self, lower=1.0 / 8.0, upper=1.0 / 3.0, name=None): @@ -554,13 +574,14 @@ class ReLU(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([-2., 0., 1.]) - m = paddle.nn.ReLU() - out = m(x) - print(out) - # [0., 0., 1.] + >>> x = paddle.to_tensor([-2., 0., 1.]) + >>> m = paddle.nn.ReLU() + >>> out = m(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [0., 0., 1.]) """ def __init__(self, name=None): @@ -596,13 +617,14 @@ class ReLU6(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([-1., 0.3, 6.5]) - m = paddle.nn.ReLU6() - out = m(x) - print(out) - # [0, 0.3, 6] + >>> x = paddle.to_tensor([-1., 0.3, 6.5]) + >>> m = paddle.nn.ReLU6() + >>> out = m(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [0. , 0.30000000, 6. ]) """ def __init__(self, name=None): @@ -644,13 +666,15 @@ class SELU(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([[0.0, 1.0],[2.0, 3.0]]) - m = paddle.nn.SELU() - out = m(x) - print(out) - # [[0, 1.050701],[2.101402, 3.152103]] + >>> x = paddle.to_tensor([[0.0, 1.0],[2.0, 3.0]]) + >>> m = paddle.nn.SELU() + >>> out = m(x) + >>> print(out) + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0. , 1.05070102], + [2.10140204, 3.15210295]]) """ def __init__( @@ -703,11 +727,14 @@ class LeakyReLU(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle - m = paddle.nn.LeakyReLU() - x = paddle.to_tensor([-2.0, 0, 1]) - out = m(x) # [-0.02, 0., 1.] 
+ >>> m = paddle.nn.LeakyReLU() + >>> x = paddle.to_tensor([-2.0, 0, 1]) + >>> out = m(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0.02000000, 0. , 1. ]) """ def __init__(self, negative_slope=0.01, name=None): @@ -744,11 +771,14 @@ class Sigmoid(Layer): .. code-block:: python - import paddle + >>> import paddle - m = paddle.nn.Sigmoid() - x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) - out = m(x) # [0.7310586, 0.880797, 0.95257413, 0.98201376] + >>> m = paddle.nn.Sigmoid() + >>> x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + >>> out = m(x) + >>> print(out) + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.73105860, 0.88079703, 0.95257413, 0.98201376]) """ def __init__(self, name=None): @@ -795,11 +825,14 @@ class Hardsigmoid(Layer): .. code-block:: python - import paddle + >>> import paddle - m = paddle.nn.Hardsigmoid() - x = paddle.to_tensor([-4., 5., 1.]) - out = m(x) # [0., 1, 0.666667] + >>> m = paddle.nn.Hardsigmoid() + >>> x = paddle.to_tensor([-4., 5., 1.]) + >>> out = m(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [0. , 1. , 0.66666669]) """ def __init__(self, name=None): @@ -836,11 +869,14 @@ class Softplus(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3], dtype='float32') - m = paddle.nn.Softplus() - out = m(x) # [0.513015, 0.598139, 0.744397, 0.854355] + >>> x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3], dtype='float32') + >>> m = paddle.nn.Softplus() + >>> out = m(x) + >>> print(out) + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.51301527, 0.59813893, 0.74439669, 0.85435522]) """ def __init__(self, beta=1, threshold=20, name=None): @@ -887,14 +923,14 @@ class Softshrink(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([-0.9, -0.2, 0.1, 0.8]) - m = paddle.nn.Softshrink() - out = m(x) - print(out) - # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [-0.39999998, 0. , 0. , 0.30000001]) + >>> x = paddle.to_tensor([-0.9, -0.2, 0.1, 0.8]) + >>> m = paddle.nn.Softshrink() + >>> out = m(x) + >>> print(out) + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0.39999998, 0. , 0. , 0.30000001]) """ def __init__(self, threshold=0.5, name=None): @@ -929,14 +965,14 @@ class Softsign(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - m = paddle.nn.Softsign() - out = m(x) - print(out) - # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [-0.28571430, -0.16666666, 0.09090909, 0.23076925]) + >>> x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + >>> m = paddle.nn.Softsign() + >>> out = m(x) + >>> print(out) + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0.28571430, -0.16666666, 0.09090909, 0.23076925]) """ def __init__(self, name=None): @@ -970,14 +1006,14 @@ class Swish(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([-2., 0., 1.]) - m = paddle.nn.Swish() - out = m(x) - print(out) - # Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [-0.23840584, 0. , 0.73105854]) + >>> x = paddle.to_tensor([-2., 0., 1.]) + >>> m = paddle.nn.Swish() + >>> out = m(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0.23840584, 0. 
, 0.73105860]) """ def __init__(self, name=None): @@ -1017,11 +1053,14 @@ class Mish(Layer): .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([-5., 0., 5.]) - m = paddle.nn.Mish() - out = m(x) # [-0.03357624, 0., 4.99955208] + >>> x = paddle.to_tensor([-5., 0., 5.]) + >>> m = paddle.nn.Mish() + >>> out = m(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0.03357624, 0. , 4.99955177]) """ @@ -1056,14 +1095,14 @@ class Tanhshrink(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - m = paddle.nn.Tanhshrink() - out = m(x) - print(out) - # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [-0.02005106, -0.00262468, 0.00033200, 0.00868741]) + >>> x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + >>> m = paddle.nn.Tanhshrink() + >>> out = m(x) + >>> print(out) + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0.02005100, -0.00262472, 0.00033201, 0.00868741]) """ def __init__(self, name=None): @@ -1105,14 +1144,14 @@ class ThresholdedReLU(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([2., 0., 1.]) - m = paddle.nn.ThresholdedReLU() - out = m(x) - print(out) - # Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [2., 0., 0.]) + >>> x = paddle.to_tensor([2., 0., 1.]) + >>> m = paddle.nn.ThresholdedReLU() + >>> out = m(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [2., 0., 0.]) """ def __init__(self, threshold=1.0, name=None): @@ -1148,11 +1187,14 @@ class Silu(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) - m = paddle.nn.Silu() - out = m(x) # [ 0.731059, 1.761594, 2.857722, 3.928055 ] + >>> x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + >>> m = paddle.nn.Silu() + >>> out = m(x) + >>> print(out) + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.73105860, 1.76159406, 2.85772228, 3.92805505]) """ def __init__(self, name=None): @@ -1187,11 +1229,14 @@ class LogSigmoid(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle - x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) - m = paddle.nn.LogSigmoid() - out = m(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499] + >>> x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + >>> m = paddle.nn.LogSigmoid() + >>> out = m(x) + >>> print(out) + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0.31326166, -0.12692805, -0.04858733, -0.01814996]) """ def __init__(self, name=None): @@ -1299,22 +1344,25 @@ class Softmax(Layer): Examples: .. code-block:: python - import paddle - - x = paddle.to_tensor([[[2.0, 3.0, 4.0, 5.0], - [3.0, 4.0, 5.0, 6.0], - [7.0, 8.0, 8.0, 9.0]], - [[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [6.0, 7.0, 8.0, 9.0]]], dtype='float32') - m = paddle.nn.Softmax() - out = m(x) - # [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426], - # [0.0320586 , 0.08714432, 0.23688282, 0.64391426], - # [0.07232949, 0.19661193, 0.19661193, 0.53444665]], - # [[0.0320586 , 0.08714432, 0.23688282, 0.64391426], - # [0.0320586 , 0.08714432, 0.23688282, 0.64391426], - # [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]] + >>> import paddle + + >>> x = paddle.to_tensor([[[2.0, 3.0, 4.0, 5.0], + ... [3.0, 4.0, 5.0, 6.0], + ... [7.0, 8.0, 8.0, 9.0]], + ... [[1.0, 2.0, 3.0, 4.0], + ... 
[5.0, 6.0, 7.0, 8.0], + ... [6.0, 7.0, 8.0, 9.0]]], dtype='float32') + >>> m = paddle.nn.Softmax() + >>> out = m(x) + >>> print(out) + Tensor(shape=[2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[0.03205860, 0.08714432, 0.23688284, 0.64391428], + [0.03205860, 0.08714432, 0.23688284, 0.64391428], + [0.07232949, 0.19661194, 0.19661194, 0.53444666]], + [[0.03205860, 0.08714432, 0.23688284, 0.64391428], + [0.03205860, 0.08714432, 0.23688284, 0.64391428], + [0.03205860, 0.08714432, 0.23688284, 0.64391428]]]) + """ def __init__(self, axis=-1, name=None): @@ -1357,23 +1405,26 @@ class LogSoftmax(Layer): Examples: .. code-block:: python - import paddle - - x = [[[-2.0, 3.0, -4.0, 5.0], - [3.0, -4.0, 5.0, -6.0], - [-7.0, -8.0, 8.0, 9.0]], - [[1.0, -2.0, -3.0, 4.0], - [-5.0, 6.0, 7.0, -8.0], - [6.0, 7.0, 8.0, 9.0]]] - m = paddle.nn.LogSoftmax() - x = paddle.to_tensor(x) - out = m(x) - # [[[ -7.1278396 -2.1278396 -9.127839 -0.12783948] - # [ -2.1270514 -9.127051 -0.12705144 -11.127051 ] - # [-16.313261 -17.313261 -1.3132617 -0.31326184]] - # [[ -3.0518122 -6.051812 -7.051812 -0.051812 ] - # [-12.313267 -1.3132664 -0.3132665 -15.313267 ] - # [ -3.4401896 -2.4401896 -1.4401896 -0.44018966]]] + >>> import paddle + + >>> x = [[[-2.0, 3.0, -4.0, 5.0], + ... [ 3.0, -4.0, 5.0, -6.0], + ... [-7.0, -8.0, 8.0, 9.0]], + ... [[ 1.0, -2.0, -3.0, 4.0], + ... [-5.0, 6.0, 7.0, -8.0], + ... [ 6.0, 7.0, 8.0, 9.0]]] + >>> m = paddle.nn.LogSoftmax() + >>> x = paddle.to_tensor(x) + >>> out = m(x) + >>> print(out) + Tensor(shape=[2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[-7.12783957 , -2.12783957 , -9.12783909 , -0.12783945 ], + [-2.12705135 , -9.12705135 , -0.12705141 , -11.12705135], + [-16.31326103, -17.31326103, -1.31326187 , -0.31326184 ]], + [[-3.05181193 , -6.05181217 , -7.05181217 , -0.05181199 ], + [-12.31326675, -1.31326652 , -0.31326646 , -15.31326675], + [-3.44018984 , -2.44018984 , -1.44018972 , -0.44018975 ]]]) + """ def __init__(self, axis=-1, name=None): @@ -1426,20 +1477,17 @@ class Maxout(Layer): Examples: .. code-block:: python - import paddle - - x = paddle.rand([1, 2, 3, 4]) - # [[[[0.5002636 0.22272532 0.17402348 0.2874594 ] - # [0.95313174 0.6228939 0.7129065 0.7087491 ] - # [0.02879342 0.88725346 0.61093384 0.38833922]] - # [[0.5231306 0.03807496 0.91661984 0.15602879] - # [0.666127 0.616567 0.30741522 0.24044901] - # [0.7142536 0.7351477 0.31588817 0.23782359]]]] - m = paddle.nn.Maxout(groups=2) - out = m(x) - # [[[[0.5231306 0.22272532 0.91661984 0.2874594 ] - # [0.95313174 0.6228939 0.7129065 0.7087491 ] - # [0.7142536 0.88725346 0.61093384 0.38833922]]]] + >>> import paddle + >>> paddle.seed(100) + + >>> x = paddle.rand([1, 2, 3, 4]) + >>> m = paddle.nn.Maxout(groups=2) + >>> out = m(x) + >>> print(out) + Tensor(shape=[1, 1, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[0.85139430, 0.95717543, 0.43864486, 0.51577556], + [0.84765935, 0.45680618, 0.39412445, 0.72039396], + [0.59444654, 0.78120756, 0.78364515, 0.90572405]]]]) """ def __init__(self, groups, axis=1, name=None): @@ -1473,25 +1521,20 @@ class Softmax2D(Layer): Examples: .. 
code-block:: python - import paddle - - x = paddle.rand([1, 2, 3, 4]) - # [[[[0.42496058 0.1172187 0.14664008 0.8151267 ] - # [0.24430142 0.42052492 0.60372984 0.79307914] - # [0.4539401 0.90458065 0.10235776 0.62009853]] - - # [[0.11731581 0.16053623 0.05667042 0.91876775] - # [0.9413854 0.30770817 0.6788164 0.9543593 ] - # [0.4145064 0.75909156 0.11598814 0.73599935]]]] - m = paddle.nn.Softmax2D() - out = m(x) - # [[[[0.5763103 0.48917228 0.5224772 0.4741129 ] - # [0.3324591 0.5281743 0.48123717 0.45976716] - # [0.5098571 0.5363083 0.49659243 0.4710572 ]] - - # [[0.42368975 0.51082766 0.47752273 0.5258871 ] - # [0.66754097 0.47182566 0.5187628 0.5402329 ] - # [0.49014282 0.46369177 0.50340754 0.5289428 ]]]] + >>> import paddle + >>> paddle.seed(100) + + >>> x = paddle.rand([1, 2, 3, 4]) + >>> m = paddle.nn.Softmax2D() + >>> out = m(x) + >>> print(out) + Tensor(shape=[1, 2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[0.42608523, 0.32081410, 0.39483935, 0.55642301], + [0.38131708, 0.45118359, 0.44891062, 0.46053308], + [0.35746980, 0.60766530, 0.38638926, 0.70425135]], + [[0.57391477, 0.67918587, 0.60516071, 0.44357699], + [0.61868292, 0.54881644, 0.55108935, 0.53946698], + [0.64253020, 0.39233473, 0.61361068, 0.29574865]]]]) """ diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 64caff4c169..539a030ad21 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -50,18 +50,22 @@ class Identity(Layer): Examples: .. code-block:: python - import paddle - - input_tensor = paddle.randn(shape=[3, 2]) - layer = paddle.nn.Identity() - out = layer(input_tensor) - # input_tensor: [[-0.32342386 -1.200079 ] - # [ 0.7979031 -0.90978354] - # [ 0.40597573 1.8095392 ]] - # out: [[-0.32342386 -1.200079 ] - # [ 0.7979031 -0.90978354] - # [ 0.40597573 1.8095392 ]] - + >>> import paddle + >>> paddle.seed(100) + + >>> input_tensor = paddle.randn(shape=[3, 2]) + >>> layer = paddle.nn.Identity() + >>> out = layer(input_tensor) + >>> print(input_tensor) + Tensor(shape=[3, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-1.41661501, 0.25904641], + [ 0.00979547, -0.30324230], + [-1.34256756, -0.76540256]]) + >>> print(out) + Tensor(shape=[3, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-1.41661501, 0.25904641], + [ 0.00979547, -0.30324230], + [-1.34256756, -0.76540256]]) """ @@ -120,28 +124,35 @@ class Linear(Layer): Examples: .. code-block:: python - import paddle - - # Define the linear layer. - weight_attr = paddle.ParamAttr( - name="weight", - initializer=paddle.nn.initializer.Constant(value=0.5)) - bias_attr = paddle.ParamAttr( - name="bias", - initializer=paddle.nn.initializer.Constant(value=1.0)) - linear = paddle.nn.Linear(2, 4, weight_attr=weight_attr, bias_attr=bias_attr) - # linear.weight: [[0.5 0.5 0.5 0.5] - # [0.5 0.5 0.5 0.5]] - # linear.bias: [1. 1. 1. 1.] - - x = paddle.randn((3, 2), dtype="float32") - # x: [[-0.32342386 -1.200079 ] - # [ 0.7979031 -0.90978354] - # [ 0.40597573 1.8095392 ]] - y = linear(x) - # y: [[0.23824859 0.23824859 0.23824859 0.23824859] - # [0.9440598 0.9440598 0.9440598 0.9440598 ] - # [2.1077576 2.1077576 2.1077576 2.1077576 ]] + >>> import paddle + >>> paddle.seed(100) + + >>> # Define the linear layer. + >>> weight_attr = paddle.ParamAttr( + ... name="weight", + ... initializer=paddle.nn.initializer.Constant(value=0.5)) + >>> bias_attr = paddle.ParamAttr( + ... name="bias", + ... 
initializer=paddle.nn.initializer.Constant(value=1.0)) + >>> linear = paddle.nn.Linear(2, 4, weight_attr=weight_attr, bias_attr=bias_attr) + >>> print(linear.weight) + Parameter containing: + Tensor(shape=[2, 4], dtype=float32, place=Place(cpu), stop_gradient=False, + [[0.50000000, 0.50000000, 0.50000000, 0.50000000], + [0.50000000, 0.50000000, 0.50000000, 0.50000000]]) + + >>> print(linear.bias) + Parameter containing: + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=False, + [1., 1., 1., 1.]) + + >>> x = paddle.randn((3, 2), dtype="float32") + >>> y = linear(x) + >>> print(y) + Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=False, + [[ 0.42121571, 0.42121571, 0.42121571, 0.42121571], + [ 0.85327661, 0.85327661, 0.85327661, 0.85327661], + [-0.05398512, -0.05398512, -0.05398512, -0.05398512]]) """ def __init__( @@ -237,19 +248,22 @@ class LinearCompress(Layer): Examples: .. code-block:: python - import paddle - - # Define the linear layer. - paddle.set_default_dtype('float16') - weight_attr = paddle.ParamAttr( - name="weight", - initializer=paddle.nn.initializer.Constant(value=0.5)) - bias_attr = paddle.ParamAttr( - name="bias", - initializer=paddle.nn.initializer.Constant(value=1.0)) - linear = paddle.nn.LinearCompress(128, 64, weight_attr=weight_attr, bias_attr=bias_attr, bits=8, algo='weight_only') - x = paddle.randn((3, 128), dtype="float16") - y = linear(x) + >>> import paddle + >>> paddle.seed(100) + + >>> # Define the linear layer. + >>> paddle.set_default_dtype('float16') + >>> weight_attr = paddle.ParamAttr( + ... name="weight", + ... initializer=paddle.nn.initializer.Constant(value=0.5)) + + >>> bias_attr = paddle.ParamAttr( + ... name="bias", + ... initializer=paddle.nn.initializer.Constant(value=1.0)) + + >>> linear = paddle.nn.LinearCompress(128, 64, weight_attr=weight_attr, bias_attr=bias_attr, bits=8, algo='weight_only') + >>> x = paddle.randn((3, 128), dtype="float16") + >>> y = linear(x) """ def __init__( @@ -527,14 +541,14 @@ class Upsample(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle - input = paddle.rand([2,3,6,10], dtype="float32") - upsample_out = paddle.nn.Upsample(size=[12,12]) + >>> input = paddle.rand([2, 3, 6, 10], dtype="float32") + >>> upsample_out = paddle.nn.Upsample(size=[12, 12]) - output = upsample_out(x=input) - print(output.shape) - # [2, 3, 12, 12] + >>> output = upsample_out(x=input) + >>> print(output.shape) + [2, 3, 12, 12] """ @@ -627,15 +641,15 @@ class UpsamplingNearest2D(Layer): Examples: .. code-block:: python - import paddle - import paddle.nn as nn + >>> import paddle + >>> import paddle.nn as nn - input_data = paddle.rand(shape=(2,3,6,10)).astype("float32") - upsample_out = paddle.nn.UpsamplingNearest2D(size=[12,12]) - input = paddle.to_tensor(input_data) - output = upsample_out(x=input) - print(output.shape) - # [2L, 3L, 12L, 12L] + >>> input_data = paddle.rand(shape=(2, 3, 6, 10)).astype("float32") + >>> upsample_out = paddle.nn.UpsamplingNearest2D(size=[12, 12]) + >>> input = paddle.to_tensor(input_data) + >>> output = upsample_out(x=input) + >>> print(output.shape) + [2, 3, 12, 12] """ def __init__( @@ -713,15 +727,15 @@ class UpsamplingBilinear2D(Layer): Examples: .. 
code-block:: python - import paddle - import paddle.nn as nn + >>> import paddle + >>> import paddle.nn as nn - input_data = paddle.rand(shape=(2,3,6,10)).astype("float32") - upsample_out = paddle.nn.UpsamplingBilinear2D(size=[12,12]) - input = paddle.to_tensor(input_data) - output = upsample_out(x=input) - print(output.shape) - # [2L, 3L, 12L, 12L] + >>> input_data = paddle.rand(shape=(2, 3, 6, 10)).astype("float32") + >>> upsample_out = paddle.nn.UpsamplingBilinear2D(size=[12, 12]) + >>> input = paddle.to_tensor(input_data) + >>> output = upsample_out(x=input) + >>> print(output.shape) + [2, 3, 12, 12] """ def __init__( @@ -798,15 +812,19 @@ class Bilinear(Layer): Tensor: A 2-D Tensor of shape [batch_size, out_features]. Examples: - .. code-block:: python + .. code-block:: python + + >>> import paddle - import paddle + >>> layer1 = paddle.rand((5, 5)).astype('float32') + >>> layer2 = paddle.rand((5, 4)).astype('float32') + >>> bilinear = paddle.nn.Bilinear(in1_features=5, + ... in2_features=4, + ... out_features=1000) - layer1 = paddle.rand((5, 5)).astype('float32') - layer2 = paddle.rand((5, 4)).astype('float32') - bilinear = paddle.nn.Bilinear( - in1_features=5, in2_features=4, out_features=1000) - result = bilinear(layer1,layer2) # result shape [5, 1000] + >>> result = bilinear(layer1,layer2) + >>> print(result.shape) + [5, 1000] """ @@ -897,23 +915,24 @@ class Dropout(Layer): Examples: .. code-block:: python - import paddle + >>> import paddle + >>> paddle.seed(2023) - x = paddle.to_tensor([[1,2,3], [4,5,6]], dtype="float32") - m = paddle.nn.Dropout(p=0.5) + >>> x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype="float32") + >>> m = paddle.nn.Dropout(p=0.5) - y_train = m(x) - print(y_train) - # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[2., 0., 6.], - # [0., 0., 0.]]) + >>> y_train = m(x) + >>> print(y_train) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[2., 4., 0.], + [8., 0., 0.]]) - m.eval() # switch the model to test phase - y_test = m(x) - print(y_test) - # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[1., 2., 3.], - # [4., 5., 6.]]) + >>> m.eval() # switch the model to test phase + >>> y_test = m(x) + >>> print(y_test) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[1., 2., 3.], + [4., 5., 6.]]) """ def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None): @@ -967,36 +986,33 @@ class Dropout2D(Layer): Examples: .. code-block:: python - import paddle - - x = paddle.rand([2, 2, 1, 3], dtype="float32") - print(x) - # Tensor(shape=[2, 2, 1, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[[[0.10052059, 0.93890846, 0.45351565]], - # [[0.47507706, 0.45021373, 0.11331241]]], - - # [[[0.53358698, 0.97375143, 0.34997326]], - # [[0.24758087, 0.52628899, 0.17970420]]]]) - - m = paddle.nn.Dropout2D(p=0.5) - y_train = m(x) - print(y_train) - # Tensor(shape=[2, 2, 1, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[[[0. , 0. , 0. ]], - # [[0.95015413, 0.90042746, 0.22662482]]], - - # [[[1.06717396, 1.94750285, 0.69994652]], - # [[0. , 0. , 0. 
]]]]) - - m.eval() # switch the model to test phase - y_test = m(x) - print(y_test) - # Tensor(shape=[2, 2, 1, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[[[0.10052059, 0.93890846, 0.45351565]], - # [[0.47507706, 0.45021373, 0.11331241]]], - - # [[[0.53358698, 0.97375143, 0.34997326]], - # [[0.24758087, 0.52628899, 0.17970420]]]]) + >>> import paddle + >>> paddle.seed(100) + >>> x = paddle.rand([2, 2, 1, 3], dtype="float32") + >>> print(x) + Tensor(shape=[2, 2, 1, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[0.55355281, 0.20714243, 0.01162981]], + [[0.51577556, 0.36369765, 0.26091650]]], + [[[0.18905126, 0.56219709, 0.00808361]], + [[0.78120756, 0.32112977, 0.90572405]]]]) + + >>> m = paddle.nn.Dropout2D(p=0.5) + >>> y_train = m(x) + >>> print(y_train) + Tensor(shape=[2, 2, 1, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[1.10710561, 0.41428486, 0.02325963]], + [[1.03155112, 0.72739530, 0.52183300]]], + [[[0. , 0. , 0. ]], + [[0. , 0. , 0. ]]]]) + + >>> m.eval() # switch the model to test phase + >>> y_test = m(x) + >>> print(y_test) + Tensor(shape=[2, 2, 1, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[0.55355281, 0.20714243, 0.01162981]], + [[0.51577556, 0.36369765, 0.26091650]]], + [[[0.18905126, 0.56219709, 0.00808361]], + [[0.78120756, 0.32112977, 0.90572405]]]]) """ def __init__(self, p=0.5, data_format='NCHW', name=None): @@ -1048,48 +1064,35 @@ class Dropout3D(Layer): Examples: .. code-block:: python - import paddle - - x = paddle.arange(24, dtype="float32").reshape((1, 2, 2, 2, 3)) - print(x) - # Tensor(shape=[1, 2, 2, 2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[[[[0. , 1. , 2. ], - # [3. , 4. , 5. ]], - # [[6. , 7. , 8. ], - # [9. , 10., 11.]]], - - # [[[12., 13., 14.], - # [15., 16., 17.]], - # [[18., 19., 20.], - # [21., 22., 23.]]]]]) - - m = paddle.nn.Dropout3D(p=0.5) - y_train = m(x) - print(y_train) - # Tensor(shape=[1, 2, 2, 2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[[[[0. , 2. , 4. ], - # [6. , 8. , 10.]], - # [[12., 14., 16.], - # [18., 20., 22.]]], - - # [[[0. , 0. , 0. ], - # [0. , 0. , 0. ]], - # [[0. , 0. , 0. ], - # [0. , 0. , 0. ]]]]]) - - m.eval() # switch the model to test phase - y_test = m(x) - print(y_test) - # Tensor(shape=[1, 2, 2, 2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[[[[0. , 1. , 2. ], - # [3. , 4. , 5. ]], - # [[6. , 7. , 8. ], - # [9. , 10., 11.]]], - - # [[[12., 13., 14.], - # [15., 16., 17.]], - # [[18., 19., 20.], - # [21., 22., 23.]]]]]) + >>> import paddle + + >>> x = paddle.arange(24, dtype="float32").reshape((1, 2, 2, 2, 3)) + >>> print(x) + Tensor(shape=[1, 2, 2, 2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[[0. , 1. , 2. ], + [3. , 4. , 5. ]], + [[6. , 7. , 8. ], + [9. , 10., 11.]]], + [[[12., 13., 14.], + [15., 16., 17.]], + [[18., 19., 20.], + [21., 22., 23.]]]]]) + + >>> m = paddle.nn.Dropout3D(p=0.5) + >>> y_train = m(x) + + >>> m.eval() # switch the model to test phase + >>> y_test = m(x) + >>> print(y_test) + Tensor(shape=[1, 2, 2, 2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[[0. , 1. , 2. ], + [3. , 4. , 5. ]], + [[6. , 7. , 8. ], + [9. , 10., 11.]]], + [[[12., 13., 14.], + [15., 16., 17.]], + [[18., 19., 20.], + [21., 22., 23.]]]]]) """ def __init__(self, p=0.5, data_format='NCDHW', name=None): @@ -1139,22 +1142,23 @@ class AlphaDropout(Layer): Examples: .. 
code-block:: python - import paddle - - x = paddle.to_tensor([[-1, 1], [-1, 1]], dtype="float32") - m = paddle.nn.AlphaDropout(p=0.5) - y_train = m(x) - print(y_train) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[-0.77919382, 1.66559887], - # [-0.77919382, -0.77919382]]) - - m.eval() # switch the model to test phase - y_test = m(x) - print(y_test) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[-1., 1.], - # [-1., 1.]]) + >>> import paddle + >>> paddle.seed(2023) + + >>> x = paddle.to_tensor([[-1, 1], [-1, 1]], dtype="float32") + >>> m = paddle.nn.AlphaDropout(p=0.5) + >>> y_train = m(x) + >>> print(y_train) + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.10721093, 1.66559887], + [-0.77919382, 1.66559887]]) + + >>> m.eval() # switch the model to test phase + >>> y_test = m(x) + >>> print(y_test) + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-1., 1.], + [-1., 1.]]) """ def __init__(self, p=0.5, name=None): @@ -1201,18 +1205,19 @@ class Pad1D(Layer): Examples: .. code-block:: python - import paddle - import paddle.nn as nn - - input_shape = (1, 2, 3) - pad = [1, 2] - mode = "constant" - data = paddle.arange(paddle.prod(paddle.to_tensor(input_shape)), dtype="float32").reshape(input_shape) + 1 - my_pad = nn.Pad1D(padding=pad, mode=mode) - result = my_pad(data) - print(result) - # [[[0. 1. 2. 3. 0. 0.] - # [0. 4. 5. 6. 0. 0.]]] + >>> import paddle + >>> import paddle.nn as nn + + >>> input_shape = (1, 2, 3) + >>> pad = [1, 2] + >>> mode = "constant" + >>> data = paddle.arange(paddle.prod(paddle.to_tensor(input_shape)), dtype="float32").reshape(input_shape) + 1 + >>> my_pad = nn.Pad1D(padding=pad, mode=mode) + >>> result = my_pad(data) + >>> print(result) + Tensor(shape=[1, 2, 6], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[0., 1., 2., 3., 0., 0.], + [0., 4., 5., 6., 0., 0.]]]) """ def __init__( @@ -1271,21 +1276,22 @@ class Pad2D(Layer): Examples: .. code-block:: python - import paddle - import paddle.nn as nn - - input_shape = (1, 1, 2, 3) - pad = [1, 0, 1, 2] - mode = "constant" - data = paddle.arange(paddle.prod(paddle.to_tensor(input_shape)), dtype="float32").reshape(input_shape) + 1 - my_pad = nn.Pad2D(padding=pad, mode=mode) - result = my_pad(data) - print(result) - # [[[[0. 0. 0. 0.] - # [0. 1. 2. 3.] - # [0. 4. 5. 6.] - # [0. 0. 0. 0.] - # [0. 0. 0. 0.]]]] + >>> import paddle + >>> import paddle.nn as nn + + >>> input_shape = (1, 1, 2, 3) + >>> pad = [1, 0, 1, 2] + >>> mode = "constant" + >>> data = paddle.arange(paddle.prod(paddle.to_tensor(input_shape)), dtype="float32").reshape(input_shape) + 1 + >>> my_pad = nn.Pad2D(padding=pad, mode=mode) + >>> result = my_pad(data) + >>> print(result) + Tensor(shape=[1, 1, 5, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[0., 0., 0., 0.], + [0., 1., 2., 3.], + [0., 4., 5., 6.], + [0., 0., 0., 0.], + [0., 0., 0., 0.]]]]) """ def __init__( @@ -1336,26 +1342,24 @@ class ZeroPad2D(Layer): The data type is same as input x. Examples: - Examples are as follows. .. code-block:: python - import paddle - import paddle.nn as nn - - input_shape = paddle.to_tensor([1, 1, 2, 3]) - pad = [1, 0, 1, 2] - data = paddle.arange(paddle.prod(input_shape), dtype="float32").reshape(input_shape) + 1 - - my_pad = nn.ZeroPad2D(padding=pad) - result = my_pad(data) - - print(result) - # [[[[0. 0. 0. 0.] - # [0. 1. 2. 3.] - # [0. 4. 5. 6.] - # [0. 0. 0. 0.] - # [0. 0. 0. 
0.]]]] + >>> import paddle + >>> import paddle.nn as nn + + >>> input_shape = paddle.to_tensor([1, 1, 2, 3]) + >>> pad = [1, 0, 1, 2] + >>> data = paddle.arange(paddle.prod(input_shape), dtype="float32").reshape(input_shape) + 1 + >>> my_pad = nn.ZeroPad2D(padding=pad) + >>> result = my_pad(data) + >>> print(result) + Tensor(shape=[1, 1, 5, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[0., 0., 0., 0.], + [0., 1., 2., 3.], + [0., 4., 5., 6.], + [0., 0., 0., 0.], + [0., 0., 0., 0.]]]]) """ def __init__(self, padding, data_format="NCHW", name=None): @@ -1412,21 +1416,22 @@ class Pad3D(Layer): Examples: .. code-block:: python - import paddle - import paddle.nn as nn - - input_shape = (1, 1, 1, 2, 3) - pad = [1, 0, 1, 2, 0, 0] - mode = "constant" - data = paddle.arange(paddle.prod(paddle.to_tensor(input_shape)), dtype="float32").reshape(input_shape) + 1 - my_pad = nn.Pad3D(padding=pad, mode=mode) - result = my_pad(data) - print(result) - # [[[[[0. 0. 0. 0.] - # [0. 1. 2. 3.] - # [0. 4. 5. 6.] - # [0. 0. 0. 0.] - # [0. 0. 0. 0.]]]]] + >>> import paddle + >>> import paddle.nn as nn + + >>> input_shape = (1, 1, 1, 2, 3) + >>> pad = [1, 0, 1, 2, 0, 0] + >>> mode = "constant" + >>> data = paddle.arange(paddle.prod(paddle.to_tensor(input_shape)), dtype="float32").reshape(input_shape) + 1 + >>> my_pad = nn.Pad3D(padding=pad, mode=mode) + >>> result = my_pad(data) + >>> print(result) + Tensor(shape=[1, 1, 1, 5, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[[0., 0., 0., 0.], + [0., 1., 2., 3.], + [0., 4., 5., 6.], + [0., 0., 0., 0.], + [0., 0., 0., 0.]]]]]) """ def __init__( @@ -1476,13 +1481,13 @@ class CosineSimilarity(Layer): Case 0: x1 = [[0.8024077 0.9927354 0.27238318 0.8344984 ] - [0.48949873 0.5797396 0.65444374 0.66510963] - [0.1031398 0.9614342 0.08365563 0.6796464 ] - [0.10760343 0.7461209 0.7726148 0.5801006 ]] + [0.48949873 0.5797396 0.65444374 0.66510963] + [0.1031398 0.9614342 0.08365563 0.6796464 ] + [0.10760343 0.7461209 0.7726148 0.5801006 ]] x2 = [[0.62913156 0.1536727 0.9847992 0.04591406] - [0.9098952 0.15715368 0.8671125 0.3156102 ] - [0.4427798 0.54136837 0.5276275 0.32394758] - [0.3769419 0.8535014 0.48041078 0.9256797 ]] + [0.9098952 0.15715368 0.8671125 0.3156102 ] + [0.4427798 0.54136837 0.5276275 0.32394758] + [0.3769419 0.8535014 0.48041078 0.9256797 ]] axis = 1 eps = 1e-8 Out: [0.5275037 0.8368967 0.75037485 0.9245899] @@ -1490,19 +1495,19 @@ class CosineSimilarity(Layer): Code Examples: .. code-block:: python - import paddle - import paddle.nn as nn + >>> import paddle + >>> import paddle.nn as nn - x1 = paddle.to_tensor([[1., 2., 3.], - [2., 3., 4.]], dtype="float32") - x2 = paddle.to_tensor([[8., 3., 3.], - [2., 3., 4.]], dtype="float32") + >>> x1 = paddle.to_tensor([[1., 2., 3.], + ... [2., 3., 4.]], dtype="float32") + >>> x2 = paddle.to_tensor([[8., 3., 3.], + ... [2., 3., 4.]], dtype="float32") - cos_sim_func = nn.CosineSimilarity(axis=0) - result = cos_sim_func(x1, x2) - print(result) - # Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [0.65079135, 0.98058069, 1. ]) + >>> cos_sim_func = nn.CosineSimilarity(axis=0) + >>> result = cos_sim_func(x1, x2) + >>> print(result) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.65079135, 0.98058069, 1. 
])
    """

    def __init__(self, axis=1, eps=1e-8):
@@ -1544,19 +1549,16 @@ class Embedding(Layer):
            output is a Tensor:
                out.shape = [3, 2, 16]
                out.data = [[[0.129435295, 0.244512452, ..., 0.436322452],
-                            [0.345421456, 0.524563927, ..., 0.144534654]],
-
+                            [0.345421456, 0.524563927, ..., 0.144534654]],
                            [[0.345249859, 0.124939536, ..., 0.194353745],
-                            [0.945345345, 0.435394634, ..., 0.435345365]],
-
+                            [0.945345345, 0.435394634, ..., 0.435345365]],
                            [[0.945345345, 0.435394634, ..., 0.435345365],
-                            [0.0, 0.0, ..., 0.0 ]]] # padding data
+                            [0.0, 0.0, ..., 0.0 ]]] # padding data
        The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127
        It will pad all-zero data when ids is 127.

    Parameters:
-        num_embeddings (int): Just one element which indicate the size
-            of the dictionary of embeddings.
+        num_embeddings (int): Just one element which indicates the size of the dictionary of embeddings.
        embedding_dim (int): Just one element which indicate the size of each embedding vector respectively.
        padding_idx(int|long|None, optional): padding_idx needs to be in the interval [-num_embeddings, num_embeddings).
            If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted
@@ -1574,9 +1576,8 @@ class Embedding(Layer):
            The local word vector needs to be transformed into numpy format, and the shape of local word
            vector should be consistent with :attr:`num_embeddings` . Then :ref:`api_initializer_NumpyArrayInitializer`
            is used to load custom or pre-trained word vectors. See code example for details.
-        name(str|None, optional): For detailed information, please refer
-            to :ref:`api_guide_Name`. Usually name is no need to set and
-            None by default.
+        name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name does not need to be set and is
+            None by default.

    Attribute:
        **weight** (Parameter): the learnable weights of this layer.
@@ -1588,36 +1589,36 @@ class Embedding(Layer):

        .. code-block:: python

-            import paddle
-
-            x = paddle.to_tensor([[0], [1], [3]], dtype="int64", stop_gradient=False)
-            embedding = paddle.nn.Embedding(4, 3, sparse=True)
-
-            w0 = paddle.to_tensor([[0., 0., 0.],
-                                   [1., 1., 1.],
-                                   [2., 2., 2.],
-                                   [3., 3., 3.]], dtype="float32")
-            embedding.weight.set_value(w0)
-            print(embedding.weight)
-            # Tensor(shape=[4, 3], dtype=float32, place=Place(gpu:0), stop_gradient=False,
-            #        [[0., 0., 0.],
-            #         [1., 1., 1.],
-            #         [2., 2., 2.],
-            #         [3., 3., 3.]])
-
-            adam = paddle.optimizer.Adam(parameters=[embedding.weight], learning_rate=0.01)
-            adam.clear_grad()
-
-
-            out = embedding(x)
-            print(out)
-            # Tensor(shape=[3, 1, 3], dtype=float32, place=Place(gpu:0), stop_gradient=False,
-            #        [[[0., 0., 0.]],
-            #         [[1., 1., 1.]],
-            #         [[3., 3., 3.]]])
-
-            out.backward()
-            adam.step()
+            >>> import paddle
+
+            >>> x = paddle.to_tensor([[0], [1], [3]], dtype="int64", stop_gradient=False)
+            >>> embedding = paddle.nn.Embedding(4, 3, sparse=True)
+
+            >>> w0 = paddle.to_tensor([[0., 0., 0.],
+            ...                        [1., 1., 1.],
+            ...                        [2., 2., 2.],
+            ...                        [3., 3., 3.]], dtype="float32")
+            >>> embedding.weight.set_value(w0)
+            >>> print(embedding.weight)
+            Parameter containing:
+            Tensor(shape=[4, 3], dtype=float32, place=Place(cpu), stop_gradient=False,
+            [[0., 0., 0.],
+             [1., 1., 1.],
+             [2., 2., 2.],
+             [3., 3., 3.]])
+
+            >>> adam = paddle.optimizer.Adam(parameters=[embedding.weight], learning_rate=0.01)
+            >>> adam.clear_grad()
+
+            >>> out = embedding(x)
+            >>> print(out)
+            Tensor(shape=[3, 1, 3], dtype=float32, place=Place(cpu), stop_gradient=False,
+            [[[0., 0., 0.]],
+             [[1., 1., 1.]],
+             [[3., 3., 3.]]])
+
+            >>> out.backward()
+            >>> adam.step()

    """

@@ -1708,36 +1709,35 @@ class Unfold(Layer):

    Parameters:
-        kernel_sizes(int|list):   The size of convolution kernel, should be [k_h, k_w]
-                                  or an integer k treated as [k, k].
-        strides(int|list):        The strides, should be [stride_h, stride_w]
-                                  or an integer stride treated as [sride, stride].
-                                  For default, strides will be [1, 1].
-        paddings(int|list):       The paddings of each dimension, should be
-                                  [padding_top, padding_left, padding_bottom, padding_right]
-                                  or [padding_h, padding_w] or an integer padding.
-                                  If [padding_h, padding_w] was given, it will expanded to
-                                  [padding_h, padding_w, padding_h, padding_w]. If an integer
-                                  padding was given, [padding, padding, padding, padding] will
-                                  be used. For default, paddings will be [0, 0, 0, 0]
-        dilations(int|list):      the dilations of convolution kernel, should be
-                                  [dilation_h, dilation_w], or an integer dilation treated as
-                                  [dilation, dilation]. For default, it will be [1, 1].
-        name(str, optional): The default value is None.
-                             Normally there is no need for user to set this property.
-                             For more information, please refer to :ref:`api_guide_Name`
+        kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w]
+            or an integer k treated as [k, k].
+        strides(int|list, optional): The strides, should be [stride_h, stride_w]
+            or an integer stride treated as [stride, stride]. For default, strides will be [1, 1].
+        paddings(int|list, optional): The paddings of each dimension, should be
+            [padding_top, padding_left, padding_bottom, padding_right] or [padding_h, padding_w]
+            or an integer padding. If [padding_h, padding_w] was given, it will be expanded to
+            [padding_h, padding_w, padding_h, padding_w]. If an integer padding was given,
+            [padding, padding, padding, padding] will be used. For default,
+            paddings will be [0, 0, 0, 0].
+        dilations(int|list, optional): The dilations of convolution kernel, should be
+            [dilation_h, dilation_w], or an integer dilation treated as [dilation, dilation].
+            For default, it will be [1, 1].
+        name(str, optional): The default value is None. Normally there is no need for the user to
+            set this property. For more information, please refer to :ref:`api_guide_Name`

    Examples:
        .. code-block:: python

-            import paddle
-            import paddle.nn as nn
+            >>> import paddle
+            >>> import paddle.nn as nn
+
+            >>> x = paddle.randn((100, 3, 224, 224))
+            >>> unfold = nn.Unfold(kernel_sizes=[3, 3])
+            >>> result = unfold(x)
+            >>> print(result.shape)
+            [100, 27, 49284]

-            x = paddle.randn((100,3,224,224))
-            unfold = nn.Unfold(kernel_sizes=[3, 3])
-            result = unfold(x)
-            print(result)
    """

    def __init__(
@@ -1790,21 +1790,21 @@ class Fold(Layer):

        C_{out} &= \frac{C_{in}}{kernel\_sizes[0]\times kernel\_sizes[1]} \\

    Parameters:
-        output_sizes(list):       The size of output size, should be [output_size_h, output_size_w]
+        output_sizes(list): The size of output size, should be [output_size_h, output_size_w]
                                  or an interger o treated as [o, o].
kernel_sizes(int|list|tuple): The size of convolution kernel, should be [k_h, k_w] or an integer k treated as [k, k]. - strides(int|list|tuple, optional): The strides, should be [stride_h, stride_w] + strides(int|list|tuple, optional): The strides, should be [stride_h, stride_w] or an integer stride treated as [sride, stride]. For default, strides will be [1, 1]. - paddings(int|list|tuple, optional): The paddings of each dimension, should be + paddings(int|list|tuple, optional): The paddings of each dimension, should be [padding_top, padding_left, padding_bottom, padding_right] or [padding_h, padding_w] or an integer padding. If [padding_h, padding_w] was given, it will expanded to [padding_h, padding_w, padding_h, padding_w]. If an integer padding was given, [padding, padding, padding, padding] will be used. For default, paddings will be [0, 0, 0, 0] - dilations(int|list|tuple, optional): the dilations of convolution kernel, should be + dilations(int|list|tuple, optional): The dilations of convolution kernel, should be [dilation_h, dilation_w], or an integer dilation treated as [dilation, dilation]. For default, it will be [1, 1]. name(str, optional): The default value is None. @@ -1820,13 +1820,14 @@ class Fold(Layer): .. code-block:: python - import paddle - import paddle.nn as nn + >>> import paddle + >>> import paddle.nn as nn - x = paddle.randn([2,3*2*2,12]) - fold = nn.Fold(output_sizes=[4, 5], kernel_sizes=2) - y = fold(x) - # y.shape = [2,3,4,5] + >>> x = paddle.randn([2, 3*2*2, 12]) + >>> fold = nn.Fold(output_sizes=[4, 5], kernel_sizes=2) + >>> y = fold(x) + >>> print(y.shape) + [2, 3, 4, 5] """ def __init__( @@ -1886,12 +1887,13 @@ class Flatten(Layer): .. code-block:: python - import paddle + >>> import paddle - inp = paddle.ones([5, 2, 3, 4]).astype('float32') - flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2) - y = flatten(inp) - # y.shape = [5, 6, 4] + >>> inp = paddle.ones([5, 2, 3, 4]).astype('float32') + >>> flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2) + >>> y = flatten(inp) + >>> print(y.shape) + [5, 6, 4] """ @@ -1928,15 +1930,15 @@ class Unflatten(Layer): .. code-block:: python - import paddle + >>> import paddle - x = paddle.randn(shape=[4, 6, 8]) - shape = [2, 3] - axis = 1 - unflatten = paddle.nn.Unflatten(axis, shape) - res = unflatten(x) - print(res.shape) - # [4, 2, 3, 8] + >>> x = paddle.randn(shape=[4, 6, 8]) + >>> shape = [2, 3] + >>> axis = 1 + >>> unflatten = paddle.nn.Unflatten(axis, shape) + >>> res = unflatten(x) + >>> print(res.shape) + [4, 2, 3, 8] """ diff --git a/python/paddle/nn/layer/layers.py b/python/paddle/nn/layer/layers.py index abd15b00424..2e3bd180129 100644 --- a/python/paddle/nn/layer/layers.py +++ b/python/paddle/nn/layer/layers.py @@ -357,22 +357,38 @@ class Layer: Examples: .. code-block:: python - import paddle - class MyLayer(paddle.nn.Layer): - def __init__(self): - super().__init__() - self._linear = paddle.nn.Linear(1, 1) - self._dropout = paddle.nn.Dropout(p=0.5) - def forward(self, input): - temp = self._linear(input) - temp = self._dropout(temp) - return temp - x = paddle.randn([10, 1], 'float32') - mylayer = MyLayer() - mylayer.eval() # set mylayer._dropout to eval mode - out = mylayer(x) - mylayer.train() # set mylayer._dropout to train mode - out = mylayer(x) + >>> import paddle + >>> paddle.seed(100) + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self._linear = paddle.nn.Linear(1, 1) + ... self._dropout = paddle.nn.Dropout(p=0.5) + ... + ... 
def forward(self, input): + ... temp = self._linear(input) + ... temp = self._dropout(temp) + ... return temp + ... + >>> x = paddle.randn([10, 1], 'float32') + >>> mylayer = MyLayer() + >>> mylayer.eval() # set mylayer._dropout to eval mode + >>> out = mylayer(x) + >>> mylayer.train() # set mylayer._dropout to train mode + >>> out = mylayer(x) + >>> print(out) + Tensor(shape=[10, 1], dtype=float32, place=Place(cpu), stop_gradient=False, + [[-3.44879317], + [ 0. ], + [ 0. ], + [-0.73825276], + [ 0. ], + [ 0. ], + [ 0.64444798], + [-3.22185946], + [ 0. ], + [-0.68077987]]) """ def __init__(self, name_scope=None, dtype="float32"): @@ -419,25 +435,38 @@ class Layer: Examples: .. code-block:: python - import paddle - - class MyLayer(paddle.nn.Layer): - def __init__(self): - super().__init__() - self._linear = paddle.nn.Linear(1, 1) - self._dropout = paddle.nn.Dropout(p=0.5) - - def forward(self, input): - temp = self._linear(input) - temp = self._dropout(temp) - return temp - - x = paddle.randn([10, 1], 'float32') - mylayer = MyLayer() - mylayer.eval() # set mylayer._dropout to eval mode - out = mylayer(x) - mylayer.train() # set mylayer._dropout to train mode - out = mylayer(x) + >>> import paddle + >>> paddle.seed(100) + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self._linear = paddle.nn.Linear(1, 1) + ... self._dropout = paddle.nn.Dropout(p=0.5) + ... + ... def forward(self, input): + ... temp = self._linear(input) + ... temp = self._dropout(temp) + ... return temp + ... + >>> x = paddle.randn([10, 1], 'float32') + >>> mylayer = MyLayer() + >>> mylayer.eval() # set mylayer._dropout to eval mode + >>> out = mylayer(x) + >>> mylayer.train() # set mylayer._dropout to train mode + >>> out = mylayer(x) + >>> print(out) + Tensor(shape=[10, 1], dtype=float32, place=Place(cpu), stop_gradient=False, + [[-3.44879317], + [ 0. ], + [ 0. ], + [-0.73825276], + [ 0. ], + [ 0. ], + [ 0.64444798], + [-3.22185946], + [ 0. ], + [-0.68077987]]) """ # global setting in dygraph @@ -461,24 +490,35 @@ class Layer: Example:: .. code-block:: python - import paddle - - class MyLayer(paddle.nn.Layer): - def __init__(self): - super().__init__() - self._linear = paddle.nn.Linear(1, 1) - self._dropout = paddle.nn.Dropout(p=0.5) - - def forward(self, input): - temp = self._linear(input) - temp = self._dropout(temp) - return temp - - x = paddle.randn([10, 1], 'float32') - mylayer = MyLayer() - mylayer.eval() # set mylayer._dropout to eval mode - out = mylayer(x) - print(out) + >>> import paddle + >>> paddle.seed(100) + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self._linear = paddle.nn.Linear(1, 1) + ... self._dropout = paddle.nn.Dropout(p=0.5) + ... + ... def forward(self, input): + ... temp = self._linear(input) + ... temp = self._dropout(temp) + ... return temp + ... + >>> x = paddle.randn([10, 1], 'float32') + >>> mylayer = MyLayer() + >>> mylayer.eval() # set mylayer._dropout to eval mode + >>> out = mylayer(x) + >>> print(out) + Tensor(shape=[10, 1], dtype=float32, place=Place(cpu), stop_gradient=False, + [[-1.72439659], + [ 0.31532824], + [ 0.01192369], + [-0.36912638], + [-1.63426113], + [-0.93169814], + [ 0.32222399], + [-1.61092973], + [ 0.77209264], + [-0.34038994]]) """ # global setting in dygraph @@ -506,22 +546,41 @@ class Layer: Example:: .. 
code-block:: python

-                import paddle
-                import paddle.nn as nn
-
-                net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
-
-                def init_weights(layer):
-                    if type(layer) == nn.Linear:
-                        print('before init weight:', layer.weight.numpy())
-                        new_weight = paddle.full(shape=layer.weight.shape, dtype=layer.weight.dtype, fill_value=0.9)
-                        layer.weight.set_value(new_weight)
-                        print('after init weight:', layer.weight.numpy())
-
-                net.apply(init_weights)
-
-                print(net.state_dict())
-
+                >>> import paddle
+                >>> import paddle.nn as nn
+                >>> paddle.seed(2023)
+
+                >>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
+
+                >>> def init_weights(layer):
+                ...     if type(layer) == nn.Linear:
+                ...         print('before init weight:', layer.weight.numpy())
+                ...         new_weight = paddle.full(shape=layer.weight.shape, dtype=layer.weight.dtype, fill_value=0.9)
+                ...         layer.weight.set_value(new_weight)
+                ...         print('after init weight:', layer.weight.numpy())
+                ...
+                >>> net.apply(init_weights)
+
+                >>> print(net.state_dict())
+                before init weight: [[ 0.89611185  0.04935038]
+                [-0.5888344   0.99266374]]
+                after init weight: [[0.9 0.9]
+                [0.9 0.9]]
+                before init weight: [[-0.18615901 -0.22924072]
+                [ 1.1517721   0.59859073]]
+                after init weight: [[0.9 0.9]
+                [0.9 0.9]]
+                OrderedDict([('0.weight', Parameter containing:
+                Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=False,
+                [[0.89999998, 0.89999998],
+                [0.89999998, 0.89999998]])), ('0.bias', Parameter containing:
+                Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=False,
+                [0., 0.])), ('1.weight', Parameter containing:
+                Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=False,
+                [[0.89999998, 0.89999998],
+                [0.89999998, 0.89999998]])), ('1.bias', Parameter containing:
+                Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=False,
+                [0., 0.]))])
        """
        for layer in self.children():
            layer.apply(fn)
@@ -541,18 +600,19 @@ class Layer:
        Example::
            .. code-block:: python

-                import paddle
-
-                class LinearNet(paddle.nn.Layer):
-                    def __init__(self):
-                        super().__init__(name_scope = "demo_linear_net")
-                        self._linear = paddle.nn.Linear(1, 1)
+                >>> import paddle

-                    def forward(self, x):
-                        return self._linear(x)
-
-                linear_net = LinearNet()
-                print(linear_net.full_name())  # demo_linear_net_0
+                >>> class LinearNet(paddle.nn.Layer):
+                ...     def __init__(self):
+                ...         super().__init__(name_scope = "demo_linear_net")
+                ...         self._linear = paddle.nn.Linear(1, 1)
+                ...
+                ...     def forward(self, x):
+                ...         return self._linear(x)
+                ...
+                >>> linear_net = LinearNet()
+                >>> print(linear_net.full_name())
+                demo_linear_net_0
        """
        return self._full_name
@@ -576,33 +636,33 @@ class Layer:
        Examples:
            .. code-block:: python

-                import paddle
-                import numpy as np
-
-                # the forward_post_hook change the output of the layer: output = output * 2
-                def forward_post_hook(layer, input, output):
-                    # user can use layer, input and output for information statistis tasks
+                >>> import paddle
+                >>> import numpy as np

-                    # change the output
-                    return output * 2
+                >>> # the forward_post_hook changes the output of the layer: output = output * 2
+                >>> def forward_post_hook(layer, input, output):
+                ...     # user can use layer, input and output for information statistics tasks
+                ...
+                ...     # change the output
+                ...     return output * 2
+                ...
+                >>> linear = paddle.nn.Linear(13, 5)

-                linear = paddle.nn.Linear(13, 5)
+                >>> # register the hook
+                >>> forward_post_hook_handle = linear.register_forward_post_hook(forward_post_hook)

-                # register the hook
-                forward_post_hook_handle = linear.register_forward_post_hook(forward_post_hook)
+                >>> value1 = np.arange(26).reshape(2, 13).astype("float32")
+                >>> in1 = paddle.to_tensor(value1)

-                value1 = np.arange(26).reshape(2, 13).astype("float32")
-                in1 = paddle.to_tensor(value1)
+                >>> out0 = linear(in1)

-                out0 = linear(in1)
+                >>> # remove the hook
+                >>> forward_post_hook_handle.remove()

-                # remove the hook
-                forward_post_hook_handle.remove()
+                >>> out1 = linear(in1)

-                out1 = linear(in1)
-
-                # hook change the linear's output to output * 2, so out0 is equal to out1 * 2.
-                assert (out0.numpy() == (out1.numpy()) * 2).any()
+                >>> # hook changes the linear's output to output * 2, so out0 is equal to out1 * 2.
+                >>> assert (out0.numpy() == (out1.numpy()) * 2).any()
        """
        hook_remove_helper = HookRemoveHelper(self._forward_post_hooks)
@@ -630,35 +690,35 @@ class Layer:
        Examples:
            .. code-block:: python

-                import paddle
-                import numpy as np
-
-                # the forward_pre_hook change the input of the layer: input = input * 2
-                def forward_pre_hook(layer, input):
-                    # user can use layer and input for information statistis tasks
+                >>> import paddle
+                >>> import numpy as np

-                    # change the input
-                    input_return = (input[0] * 2)
-                    return input_return
+                >>> # the forward_pre_hook changes the input of the layer: input = input * 2
+                >>> def forward_pre_hook(layer, input):
+                ...     # user can use layer and input for information statistics tasks
+                ...
+                ...     # change the input
+                ...     input_return = (input[0] * 2)
+                ...     return input_return
+                ...
+                >>> linear = paddle.nn.Linear(13, 5)

-                linear = paddle.nn.Linear(13, 5)
+                >>> # register the hook
+                >>> forward_pre_hook_handle = linear.register_forward_pre_hook(forward_pre_hook)

-                # register the hook
-                forward_pre_hook_handle = linear.register_forward_pre_hook(forward_pre_hook)
+                >>> value0 = np.arange(26).reshape(2, 13).astype("float32")
+                >>> in0 = paddle.to_tensor(value0)
+                >>> out0 = linear(in0)

-                value0 = np.arange(26).reshape(2, 13).astype("float32")
-                in0 = paddle.to_tensor(value0)
-                out0 = linear(in0)
+                >>> # remove the hook
+                >>> forward_pre_hook_handle.remove()

-                # remove the hook
-                forward_pre_hook_handle.remove()
+                >>> value1 = value0 * 2
+                >>> in1 = paddle.to_tensor(value1)
+                >>> out1 = linear(in1)

-                value1 = value0 * 2
-                in1 = paddle.to_tensor(value1)
-                out1 = linear(in1)
-
-                # hook change the linear's input to input * 2, so out0 is equal to out1.
-                assert (out0.numpy() == out1.numpy()).any()
+                >>> # hook changes the linear's input to input * 2, so out0 is equal to out1.
+                >>> assert (out0.numpy() == out1.numpy()).any()
        """
        hook_remove_helper = HookRemoveHelper(self._forward_pre_hooks)
        self._forward_pre_hooks[hook_remove_helper._hook_id] = hook
@@ -691,22 +751,31 @@ class Layer:
        Examples:
            .. code-block:: python

-                import paddle
-
-                class MyLayer(paddle.nn.Layer):
-                    def __init__(self):
-                        super().__init__()
-                        self._linear = paddle.nn.Linear(1, 1)
-                        w_tmp = self.create_parameter([1,1])
-                        self.add_parameter("w_tmp", w_tmp)
-
-                    def forward(self, input):
-                        return self._linear(input)
-
-                mylayer = MyLayer()
-                for name, param in mylayer.named_parameters():
-                    print(name, param) # will print w_tmp,_linear.weight,_linear.bias
-
+                >>> import paddle
+                >>> paddle.seed(2023)
+
+                >>> class MyLayer(paddle.nn.Layer):
+                ...     def __init__(self):
+                ...         super().__init__()
+                ...
self._linear = paddle.nn.Linear(1, 1) + ... w_tmp = self.create_parameter([1,1]) + ... self.add_parameter("w_tmp", w_tmp) + ... + ... def forward(self, input): + ... return self._linear(input) + ... + >>> mylayer = MyLayer() + >>> for name, param in mylayer.named_parameters(): + ... print(name, param) # will print w_tmp,_linear.weight,_linear.bias + w_tmp Parameter containing: + Tensor(shape=[1, 1], dtype=float32, place=Place(cpu), stop_gradient=False, + [[0.06979191]]) + _linear.weight Parameter containing: + Tensor(shape=[1, 1], dtype=float32, place=Place(cpu), stop_gradient=False, + [[1.26729357]]) + _linear.bias Parameter containing: + Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=False, + [0.]) """ temp_attr = copy.deepcopy(attr) if isinstance(temp_attr, str) and temp_attr == "": @@ -738,22 +807,22 @@ class Layer: Examples: .. code-block:: python - import paddle - - class MyLinear(paddle.nn.Layer): - def __init__(self, - in_features, - out_features): - super().__init__() - self.linear = paddle.nn.Linear( 10, 10) - - self.back_var = self.create_variable(name = "linear_tmp_0", dtype=self._dtype) - - def forward(self, input): - out = self.linear(input) - paddle.assign( out, self.back_var) - - return out + >>> import paddle + + >>> class MyLinear(paddle.nn.Layer): + ... def __init__(self, + ... in_features, + ... out_features): + ... super().__init__() + ... self.linear = paddle.nn.Linear( 10, 10) + ... + ... self.back_var = self.create_variable(name = "linear_tmp_0", dtype=self._dtype) + ... + ... def forward(self, input): + ... out = self.linear(input) + ... paddle.assign( out, self.back_var) + ... + ... return out """ if name is not None: @@ -790,22 +859,22 @@ class Layer: Examples: .. code-block:: python - import paddle - - class MyLinear(paddle.nn.Layer): - def __init__(self, - in_features, - out_features): - super().__init__() - self.linear = paddle.nn.Linear( 10, 10) - - self.back_var = self.create_tensor(name = "linear_tmp_0", dtype=self._dtype) - - def forward(self, input): - out = self.linear(input) - paddle.assign( out, self.back_var) - - return out + >>> import paddle + + >>> class MyLinear(paddle.nn.Layer): + ... def __init__(self, + ... in_features, + ... out_features): + ... super().__init__() + ... self.linear = paddle.nn.Linear(10, 10) + ... + ... self.back_var = self.create_tensor(name = "linear_tmp_0", dtype=self._dtype) + ... + ... def forward(self, input): + ... out = self.linear(input) + ... paddle.assign(out, self.back_var) + ... + ... return out """ if name is not None: @@ -833,10 +902,16 @@ class Layer: Examples: .. code-block:: python - import paddle + >>> import paddle + >>> paddle.seed(100) - linear = paddle.nn.Linear(1,1) - print(linear.parameters()) # print linear_0.w_0 and linear_0.b_0 + >>> linear = paddle.nn.Linear(1, 1) + >>> print(linear.parameters()) + [Parameter containing: + Tensor(shape=[1, 1], dtype=float32, place=Place(cpu), stop_gradient=False, + [[0.18551230]]), Parameter containing: + Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=False, + [0.])] """ ret = [ @@ -858,15 +933,16 @@ class Layer: Examples: .. 
code-block:: python - import paddle + >>> import paddle - linear1 = paddle.nn.Linear(10, 3) - linear2 = paddle.nn.Linear(3, 10, bias_attr=False) - model = paddle.nn.Sequential(linear1, linear2) + >>> linear1 = paddle.nn.Linear(10, 3) + >>> linear2 = paddle.nn.Linear(3, 10, bias_attr=False) + >>> model = paddle.nn.Sequential(linear1, linear2) - layer_list = list(model.children()) + >>> layer_list = list(model.children()) - print(layer_list) # [, ] + >>> print(layer_list) + [Linear(in_features=10, out_features=3, dtype=float32), Linear(in_features=3, out_features=10, dtype=float32)] """ for _, layer in self.named_children(): @@ -882,16 +958,15 @@ class Layer: Examples: .. code-block:: python - import paddle - - linear1 = paddle.nn.Linear(10, 3) - linear2 = paddle.nn.Linear(3, 10, bias_attr=False) - model = paddle.nn.Sequential(linear1, linear2) - for prefix, layer in model.named_children(): - print(prefix, layer) - # ('0', ) - # ('1', ) + >>> import paddle + >>> linear1 = paddle.nn.Linear(10, 3) + >>> linear2 = paddle.nn.Linear(3, 10, bias_attr=False) + >>> model = paddle.nn.Sequential(linear1, linear2) + >>> for prefix, layer in model.named_children(): + ... print(prefix, layer) + 0 Linear(in_features=10, out_features=3, dtype=float32) + 1 Linear(in_features=3, out_features=10, dtype=float32) """ memo = set() for name, layer in self._sub_layers.items(): @@ -913,21 +988,22 @@ class Layer: Examples: .. code-block:: python - import paddle - - class MyLayer(paddle.nn.Layer): - def __init__(self): - super().__init__() - self._linear = paddle.nn.Linear(1, 1) - self._dropout = paddle.nn.Dropout(p=0.5) - - def forward(self, input): - temp = self._linear(input) - temp = self._dropout(temp) - return temp - - mylayer = MyLayer() - print(mylayer.sublayers()) # [, ] + >>> import paddle + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self._linear = paddle.nn.Linear(1, 1) + ... self._dropout = paddle.nn.Dropout(p=0.5) + ... + ... def forward(self, input): + ... temp = self._linear(input) + ... temp = self._dropout(temp) + ... return temp + ... + >>> mylayer = MyLayer() + >>> print(mylayer.sublayers()) + [Linear(in_features=1, out_features=1, dtype=float32), Dropout(p=0.5, axis=None, mode=upscale_in_train)] """ ret = [ @@ -951,14 +1027,37 @@ class Layer: Examples: .. code-block:: python - import paddle - - fc1 = paddle.nn.Linear(10, 3) - fc2 = paddle.nn.Linear(3, 10, bias_attr=False) - model = paddle.nn.Sequential(fc1, fc2) - for name, param in model.named_parameters(): - print(name, param) - + >>> import paddle + >>> paddle.seed(100) + + >>> fc1 = paddle.nn.Linear(10, 3) + >>> fc2 = paddle.nn.Linear(3, 10, bias_attr=False) + >>> model = paddle.nn.Sequential(fc1, fc2) + >>> for name, param in model.named_parameters(): + ... 
print(name, param) + 0.weight Parameter containing: + Tensor(shape=[10, 3], dtype=float32, place=Place(cpu), stop_gradient=False, + [[ 0.07276392, -0.39791510, -0.66356444], + [ 0.02143478, -0.18519843, -0.32485050], + [-0.42249614, 0.08450919, -0.66838276], + [ 0.38208580, -0.24303678, 0.55127048], + [ 0.47745085, 0.62117910, -0.08336520], + [-0.28653207, 0.47237599, -0.05868882], + [-0.14385653, 0.29945642, 0.12832761], + [-0.21237159, 0.38539791, -0.62760031], + [ 0.02637231, 0.20621127, 0.43255770], + [-0.19984481, -0.26259184, -0.29696006]]) + 0.bias Parameter containing: + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=False, + [0., 0., 0.]) + 1.weight Parameter containing: + Tensor(shape=[3, 10], dtype=float32, place=Place(cpu), stop_gradient=False, + [[ 0.01985580, -0.40268910, 0.41172385, -0.47249708, -0.09002256, + -0.00533628, -0.52048630, 0.62360322, 0.20848787, -0.02033746], + [ 0.58281910, 0.12841827, 0.12907702, 0.02325618, -0.07746267, + 0.31950659, -0.37924835, -0.59209681, -0.11732036, -0.58378261], + [-0.62100595, 0.22293305, 0.28229684, -0.03687060, -0.59323978, + 0.08411229, 0.53275704, 0.40431368, 0.03171402, -0.17922515]]) """ params_set = set() named_sublayers = ( @@ -991,14 +1090,15 @@ class Layer: Examples: .. code-block:: python - import paddle - - fc1 = paddle.nn.Linear(10, 3) - fc2 = paddle.nn.Linear(3, 10, bias_attr=False) - model = paddle.nn.Sequential(fc1, fc2) - for prefix, layer in model.named_sublayers(): - print(prefix, layer) + >>> import paddle + >>> fc1 = paddle.nn.Linear(10, 3) + >>> fc2 = paddle.nn.Linear(3, 10, bias_attr=False) + >>> model = paddle.nn.Sequential(fc1, fc2) + >>> for prefix, layer in model.named_sublayers(): + ... print(prefix, layer) + 0 Linear(in_features=10, out_features=3, dtype=float32) + 1 Linear(in_features=3, out_features=10, dtype=float32) """ if layers_set is None: layers_set = set() @@ -1039,16 +1139,18 @@ class Layer: Examples: .. code-block:: python - import numpy as np - import paddle + >>> import numpy as np + >>> import paddle - linear = paddle.nn.Linear(10, 3) - value = np.array([0]).astype("float32") - buffer = paddle.to_tensor(value) - linear.register_buffer("buf_name", buffer, persistable=True) + >>> linear = paddle.nn.Linear(10, 3) + >>> value = np.array([0]).astype("float32") + >>> buffer = paddle.to_tensor(value) + >>> linear.register_buffer("buf_name", buffer, persistable=True) - # get the buffer by attribute. - print(linear.buf_name) + >>> # get the buffer by attribute. + >>> print(linear.buf_name) + Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.]) """ @@ -1097,15 +1199,17 @@ class Layer: Examples: .. code-block:: python - import numpy as np - import paddle + >>> import numpy as np + >>> import paddle - linear = paddle.nn.Linear(10, 3) - value = np.array([0]).astype("float32") - buffer = paddle.to_tensor(value) - linear.register_buffer("buf_name", buffer, persistable=True) + >>> linear = paddle.nn.Linear(10, 3) + >>> value = np.array([0]).astype("float32") + >>> buffer = paddle.to_tensor(value) + >>> linear.register_buffer("buf_name", buffer, persistable=True) - print(linear.buffers()) # == print([linear.buf_name]) + >>> print(linear.buffers()) + [Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.])] """ ret = [ @@ -1131,26 +1235,29 @@ class Layer: Examples: .. 
code-block:: python - import numpy as np - import paddle - - fc1 = paddle.nn.Linear(10, 3) - buffer1 = paddle.to_tensor(np.array([0]).astype("float32")) - # register a tensor as buffer by specific `persistable` - fc1.register_buffer("buf_name_1", buffer1, persistable=True) - - fc2 = paddle.nn.Linear(3, 10) - buffer2 = paddle.to_tensor(np.array([1]).astype("float32")) - # register a buffer by assigning an attribute with Tensor. - # The `persistable` can only be False by this way. - fc2.buf_name_2 = buffer2 - - model = paddle.nn.Sequential(fc1, fc2) - - # get all named buffers - for name, buffer in model.named_buffers(): - print(name, buffer) - + >>> import numpy as np + >>> import paddle + + >>> fc1 = paddle.nn.Linear(10, 3) + >>> buffer1 = paddle.to_tensor(np.array([0]).astype("float32")) + >>> # register a tensor as buffer by specific `persistable` + >>> fc1.register_buffer("buf_name_1", buffer1, persistable=True) + + >>> fc2 = paddle.nn.Linear(3, 10) + >>> buffer2 = paddle.to_tensor(np.array([1]).astype("float32")) + >>> # register a buffer by assigning an attribute with Tensor. + >>> # The `persistable` can only be False by this way. + >>> fc2.buf_name_2 = buffer2 + + >>> model = paddle.nn.Sequential(fc1, fc2) + + >>> # get all named buffers + >>> for name, buffer in model.named_buffers(): + ... print(name, buffer) + 0.buf_name_1 Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.]) + 1.buf_name_2 Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + [1.]) """ buffers_set = set() named_sublayers = ( @@ -1177,18 +1284,18 @@ class Layer: Examples: .. code-block:: python - import paddle - import numpy as np + >>> import paddle + >>> import numpy as np - value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.to_tensor(value) - linear = paddle.nn.Linear(13, 5) - adam = paddle.optimizer.Adam(learning_rate=0.01, - parameters=linear.parameters()) - out = linear(a) - out.backward() - adam.step() - linear.clear_gradients() + >>> value = np.arange(26).reshape(2, 13).astype("float32") + >>> a = paddle.to_tensor(value) + >>> linear = paddle.nn.Linear(13, 5) + >>> adam = paddle.optimizer.Adam(learning_rate=0.01, + ... parameters=linear.parameters()) + >>> out = linear(a) + >>> out.backward() + >>> adam.step() + >>> linear.clear_gradients() """ for p in self.parameters(): @@ -1271,29 +1378,30 @@ class Layer: Examples: .. code-block:: python - import paddle - - class MySequential(paddle.nn.Layer): - def __init__(self, *layers): - super().__init__() - if len(layers) > 0 and isinstance(layers[0], tuple): - for name, layer in layers: - self.add_sublayer(name, layer) - else: - for idx, layer in enumerate(layers): - self.add_sublayer(str(idx), layer) - - def forward(self, input): - for layer in self._sub_layers.values(): - input = layer(input) - return input - - fc1 = paddle.nn.Linear(10, 3) - fc2 = paddle.nn.Linear(3, 10, bias_attr=False) - model = MySequential(fc1, fc2) - for prefix, layer in model.named_sublayers(): - print(prefix, layer) - + >>> import paddle + + >>> class MySequential(paddle.nn.Layer): + ... def __init__(self, *layers): + ... super().__init__() + ... if len(layers) > 0 and isinstance(layers[0], tuple): + ... for name, layer in layers: + ... self.add_sublayer(name, layer) + ... else: + ... for idx, layer in enumerate(layers): + ... self.add_sublayer(str(idx), layer) + ... + ... def forward(self, input): + ... for layer in self._sub_layers.values(): + ... input = layer(input) + ... return input + ... 
+ >>> fc1 = paddle.nn.Linear(10, 3) + >>> fc2 = paddle.nn.Linear(3, 10, bias_attr=False) + >>> model = MySequential(fc1, fc2) + >>> for prefix, layer in model.named_sublayers(): + ... print(prefix, layer) + 0 Linear(in_features=10, out_features=3, dtype=float32) + 1 Linear(in_features=3, out_features=10, dtype=float32) """ assert isinstance(sublayer, Layer) or sublayer is None @@ -1313,22 +1421,31 @@ class Layer: Examples: .. code-block:: python - import paddle - - class MyLayer(paddle.nn.Layer): - def __init__(self): - super().__init__() - self._linear = paddle.nn.Linear(1, 1) - w_tmp = self.create_parameter([1,1]) - self.add_parameter("w_tmp", w_tmp) - - def forward(self, input): - return self._linear(input) - - mylayer = MyLayer() - for name, param in mylayer.named_parameters(): - print(name, param) # will print w_tmp,_linear.weight,_linear.bias - + >>> import paddle + >>> paddle.seed(100) + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self._linear = paddle.nn.Linear(1, 1) + ... w_tmp = self.create_parameter([1,1]) + ... self.add_parameter("w_tmp", w_tmp) + ... + ... def forward(self, input): + ... return self._linear(input) + ... + >>> mylayer = MyLayer() + >>> for name, param in mylayer.named_parameters(): + ... print(name, param) + w_tmp Parameter containing: + Tensor(shape=[1, 1], dtype=float32, place=Place(cpu), stop_gradient=False, + [[-1.01448846]]) + _linear.weight Parameter containing: + Tensor(shape=[1, 1], dtype=float32, place=Place(cpu), stop_gradient=False, + [[0.18551230]]) + _linear.bias Parameter containing: + Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=False, + [0.]) """ if '_parameters' not in self.__dict__: raise RuntimeError("super().__init__() should be called firstly.") @@ -1580,23 +1697,21 @@ class Layer: Examples: .. code-block:: python - import paddle - import numpy as np - - class Mylayer(paddle.nn.Layer): - def __init__(self): - super().__init__() - self.linear1 = paddle.nn.Linear(10, 10) - self.linear2 = paddle.nn.Linear(5, 5) - self.conv2d = paddle.nn.Conv2D(3, 2, 3) - self.embedding = paddle.nn.Embedding(128, 16) - self.h_0 = paddle.to_tensor(np.zeros([10, 10]).astype('float32')) - - mylayer = Mylayer() - print(dir(mylayer)) - # only parts are shown, because of list have too much content - # ['__call__', '__class__', ... , 'conv2d', 'embedding', 'h_0', 'linear1', 'linear2', ... , 'sublayers', 'train'] - + >>> import paddle + >>> import numpy as np + + >>> class Mylayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.linear1 = paddle.nn.Linear(10, 10) + ... self.linear2 = paddle.nn.Linear(5, 5) + ... self.conv2d = paddle.nn.Conv2D(3, 2, 3) + ... self.embedding = paddle.nn.Embedding(128, 16) + ... self.h_0 = paddle.to_tensor(np.zeros([10, 10]).astype('float32')) + ... + >>> mylayer = Mylayer() + >>> print(dir(mylayer)) + ['__call__', '__class__', '__delattr__', '__dict__', ..., 'training'] """ method = dir(self.__class__) attrs = list(self.__dict__.keys()) @@ -1756,12 +1871,12 @@ class Layer: Examples: .. code-block:: python - import paddle + >>> import paddle - emb = paddle.nn.Embedding(10, 10) + >>> emb = paddle.nn.Embedding(10, 10) - state_dict = emb.to_static_state_dict() - paddle.save( state_dict, "paddle_dy.pdparams") + >>> state_dict = emb.to_static_state_dict() + >>> paddle.save( state_dict, "paddle_dy.pdparams") ''' return self._state_dict_impl( @@ -1793,12 +1908,12 @@ class Layer: Examples: .. 
code-block:: python - import paddle + >>> import paddle - emb = paddle.nn.Embedding(10, 10) + >>> emb = paddle.nn.Embedding(10, 10) - state_dict = emb.state_dict() - paddle.save( state_dict, "paddle_dy.pdparams") + >>> state_dict = emb.state_dict() + >>> paddle.save( state_dict, "paddle_dy.pdparams") ''' return self._state_dict_impl( @@ -1825,14 +1940,14 @@ class Layer: Examples: .. code-block:: python - import paddle + >>> import paddle - emb = paddle.nn.Embedding(10, 10) + >>> emb = paddle.nn.Embedding(10, 10) - state_dict = emb.state_dict() - paddle.save(state_dict, "paddle_dy.pdparams") - para_state_dict = paddle.load("paddle_dy.pdparams") - emb.set_state_dict(para_state_dict) + >>> state_dict = emb.state_dict() + >>> paddle.save(state_dict, "paddle_dy.pdparams") + >>> para_state_dict = paddle.load("paddle_dy.pdparams") + >>> emb.set_state_dict(para_state_dict) ''' missing_keys = [] @@ -1950,32 +2065,40 @@ class Layer: Examples: .. code-block:: python - # required: skip - import paddle - - linear=paddle.nn.Linear(2, 2) - linear.weight - #Parameter containing: - #Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [[-0.32770029, 0.38653070], - # [ 0.46030545, 0.08158520]]) - - linear.to(dtype='float64') - linear.weight - #Tenor(shape=[2, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=False, - # [[-0.32770029, 0.38653070], - # [ 0.46030545, 0.08158520]]) - - linear.to(device='cpu') - linear.weight - #Tensor(shape=[2, 2], dtype=float64, place=CPUPlace, stop_gradient=False, - # [[-0.32770029, 0.38653070], - # [ 0.46030545, 0.08158520]]) - linear.to(device=paddle.CUDAPinnedPlace(), blocking=False) - linear.weight - #Tensor(shape=[2, 2], dtype=float64, place=CUDAPinnedPlace, stop_gradient=False, - # [[-0.04989364, -0.56889004], - # [ 0.33960250, 0.96878713]]) + >>> import paddle + >>> paddle.seed(2023) + + >>> linear=paddle.nn.Linear(2, 2) + >>> linear.weight + >>> print(linear.weight) + Parameter containing: + Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + [[ 0.89611185, 0.04935038], + [-0.58883440, 0.99266374]]) + + >>> linear.to(dtype='float64') + >>> linear.weight + >>> print(linear.weight) + Parameter containing: + Tensor(shape=[2, 2], dtype=float64, place=Place(gpu:0), stop_gradient=False, + [[ 0.89611185, 0.04935038], + [-0.58883440, 0.99266374]]) + + >>> linear.to(device='cpu') + >>> linear.weight + >>> print(linear.weight) + Parameter containing: + Tensor(shape=[2, 2], dtype=float64, place=Place(cpu), stop_gradient=False, + [[ 0.89611185, 0.04935038], + [-0.58883440, 0.99266374]]) + + >>> # doctest: +REQUIRES(env:GPU) + >>> linear.to(device=paddle.CUDAPinnedPlace(), blocking=False) + >>> linear.weight + >>> print(linear.weight) + Tensor(shape=[2, 2], dtype=float64, place=Place(gpu_pinned), stop_gradient=False, + [[ 0.89611185, 0.04935038], + [-0.58883440, 0.99266374]]) ''' return self._to_impl( @@ -2161,21 +2284,25 @@ class Layer: Examples: .. code-block:: python - import paddle - - class Model(paddle.nn.Layer): - def __init__(self): - super().__init__() - self.linear = paddle.nn.Linear(1, 1) - self.dropout = paddle.nn.Dropout(p=0.5) - - def forward(self, input): - out = self.linear(input) - out = self.dropout(out) - return out - - model = Model() - model.float() + >>> import paddle + + >>> class Model(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.linear = paddle.nn.Linear(1, 1) + ... self.dropout = paddle.nn.Dropout(p=0.5) + ... + ... def forward(self, input): + ... 
out = self.linear(input) + ... out = self.dropout(out) + ... return out + ... + >>> model = Model() + >>> model.float() + Model( + (linear): Linear(in_features=1, out_features=1, dtype=paddle.float32) + (dropout): Dropout(p=0.5, axis=None, mode=upscale_in_train) + ) ''' excluded_layers = [] if excluded_layers is None else excluded_layers @@ -2213,21 +2340,26 @@ class Layer: Examples: .. code-block:: python - import paddle - - class Model(paddle.nn.Layer): - def __init__(self): - super().__init__() - self.linear = paddle.nn.Linear(1, 1) - self.dropout = paddle.nn.Dropout(p=0.5) - - def forward(self, input): - out = self.linear(input) - out = self.dropout(out) - return out - - model = Model() - model.float16() + >>> # doctest: +SKIP('Paddle compiled by the user does not support float16, so keep original data type.') + >>> import paddle + + >>> class Model(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.linear = paddle.nn.Linear(1, 1) + ... self.dropout = paddle.nn.Dropout(p=0.5) + ... + ... def forward(self, input): + ... out = self.linear(input) + ... out = self.dropout(out) + ... return out + ... + >>> model = Model() + >>> model.float16() + Model( + (linear): Linear(in_features=1, out_features=1, dtype=float32) + (dropout): Dropout(p=0.5, axis=None, mode=upscale_in_train) + ) ''' if paddle.amp.is_float16_supported() is False: @@ -2273,21 +2405,27 @@ class Layer: Examples: .. code-block:: python - import paddle - - class Model(paddle.nn.Layer): - def __init__(self): - super().__init__() - self.linear = paddle.nn.Linear(1, 1) - self.dropout = paddle.nn.Dropout(p=0.5) - - def forward(self, input): - out = self.linear(input) - out = self.dropout(out) - return out - - model = Model() - model.bfloat16() + >>> # doctest: +SKIP('bfloat need V100 compile') + >>> import paddle + + >>> class Model(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.linear = paddle.nn.Linear(1, 1) + ... self.dropout = paddle.nn.Dropout(p=0.5) + ... + ... def forward(self, input): + ... out = self.linear(input) + ... out = self.dropout(out) + ... return out + ... + >>> model = Model() + >>> model.bfloat16() + >>> #UserWarning: Paddle compiled by the user does not support bfloat16, so keep original data type. + Model( + (linear): Linear(in_features=1, out_features=1, dtype=float32) + (dropout): Dropout(p=0.5, axis=None, mode=upscale_in_train) + ) ''' if paddle.amp.is_bfloat16_supported() is False: -- GitLab