Unverified commit 4dbe441c authored by cyberslack_lee and committed by GitHub

[xdoctest] reformat example code with google style in No. 250-260 (#56541)

* test=docs_preview

* test=docs_preview

* test=docs_preview

* test=docs_preview

* test=docs_preview

* test=docs_preview

* fix

* test=docs_preview

* test=docs_preview

* fix

* move stmts under imports

---------
Co-authored-by: SigureMo <sigure.qaq@gmail.com>
Parent 7314cf69
......@@ -22,7 +22,7 @@ from paddle.nn.clip import ClipGradBase, _squared_l2_norm
class ClipGradForMOEByGlobalNorm(ClipGradBase):
r"""
The Algrithm is the same as paddle.nn.ClipGradByGlobalNorm
The Algorithm is the same as paddle.nn.ClipGradByGlobalNorm
Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
:math:`t\_list` , and limit it to ``clip_norm`` .
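For reference, the clipping rule described above follows the usual global-norm formula: the squared L2 norms of all tensors are summed, the square root gives the global norm, and every tensor is scaled by clip_norm / max(global_norm, clip_norm). Below is a minimal illustrative sketch of that rule only; it is not the MoE-aware implementation in this class, which additionally separates expert parameters and handles communication groups.
>>> import paddle
>>> def clip_by_global_norm(t_list, clip_norm):
...     # illustrative only; the real class also distinguishes expert / non-expert params
...     global_norm = float(paddle.sqrt(sum((t * t).sum() for t in t_list)))
...     # shrink all tensors by the same factor only when the norm exceeds clip_norm
...     scale = clip_norm / max(global_norm, clip_norm)
...     return [t * scale for t in t_list]
>>> grads = [paddle.ones([2, 2]) * 3.0, paddle.ones([3]) * 4.0]
>>> clipped = clip_by_global_norm(grads, clip_norm=1.0)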
......@@ -50,7 +50,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
Note:
``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
Reference:
https://github.com/laekov/fastmoe/blob/master/examples/megatron/clip-grad-v2.2.patch
......@@ -64,22 +64,22 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
group_name (str, optional): The group name for this clip. Default value is ``default_moe_group``.
Examples:
.. code-block:: python
import paddle
>>> import paddle
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
bias_attr=paddle.ParamAttr(need_clip=False))
out = linear(x)
loss = paddle.mean(out)
loss.backward()
>>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
>>> linear = paddle.nn.Linear(in_features=10, out_features=10,
... weight_attr=paddle.ParamAttr(need_clip=True),
... bias_attr=paddle.ParamAttr(need_clip=False))
>>> out = linear(x)
>>> loss = paddle.mean(out)
>>> loss.backward()
is_expert_func = lambda param: "expert_" in param.name
clip = paddle.nn.ClipGradForMOEByGlobalNorm(clip_norm=1.0,is_expert_func, None)
sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
sdg.step()
>>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)  # Because paddle.nn does not provide ClipGradForMOEByGlobalNorm, ClipGradByGlobalNorm is used here.
>>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
>>> sdg.step()
"""
def __init__(
......@@ -124,7 +124,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
else:
sum_square_list.append(sum_square)
# all parameters have been filterd out
# all parameters have been filtered out
if (
len(sum_square_list)
+ len(sum_square_list_fp16)
......
......@@ -263,63 +263,68 @@ def prepare_forward(gate, num_expert, world_size, moe_group):
class MoELayer(nn.Layer):
"""MoE Layer
Args:
d_model: (int) model dimention
experts: (nn.LayerList) expert networks list
gate: (dict|NaiveGate|SwitchGate|NaiveGate):
if gate is a dict:
gate is a gate network config, containing 2 keys:
`type`(str) value can be: "naive", "gshard", "switch" or None, default is "gshard"
`top_k`(int) default value is 2
else gate is an instance of NaiveGate|SwitchGate|NaiveGate:
moe_group: moe group for experts communication
mp_group: mp group for mp commutication
recompute_interval(int, optional): whether to use recompute, default 0, means to disable recompute.
recompute_ctx(dict, optional): the context for recompute, if recompute_interval > 1, recompute_ctx must be given.
d_model (int): Model dimension.
experts (nn.LayerList): Expert networks list.
gate (dict|NaiveGate|SwitchGate|NaiveGate):
- If gate is a dict:
gate is a gate network config, containing 2 keys:
`type` (str) value can be: "naive", "gshard", "switch" or None, default is "gshard".
`top_k` (int) Default value is 2.
- Else, gate is an instance of NaiveGate|SwitchGate|NaiveGate.
moe_group: moe group for experts communication.
mp_group: mp group for mp communication.
recompute_interval (int, optional): Whether to use recompute. Default is 0, which disables recompute.
recompute_ctx (dict, optional): The context for recompute. If recompute_interval > 1, recompute_ctx must be given.
Examples:
.. code-block:: python
from paddle.nn import layer, LayerList
from paddle.distributed.moe import MoElayer
from paddle.distributed.collective import Group
from paddle.distributed import fleet
moe_group = Group(fleet.worker_index(),
0,
list(range(fleet.worker_num())))
mp_group = None
num_experts=8
dim_feedforward=512
d_model=8
top_k=2
class ExpertLayer(Layer):
def __init__(self, d_model, d_hidden, name=None,rank=0, windex = 0, num_expert=1):
super().__init__()
self.htoh4 = nn.Linear(d_model, d_hidden)
self.h4toh = nn.Linear(d_hidden, d_model)
def forward(self, x):
x = self.htoh4(x)
x = self.h4toh(x)
return x
gate_config = {
"type": "gshard",
"top_k": top_k,
}
experts_list = LayerList()
for expi in range(num_experts):
exp_layer = ExpertLayer(d_model, dim_feedforward // top_k, windex=expi, num_expert=num_experts)
experts_list.append(exp_layer)
moeLayer = MoELayer(d_model = d_model,
experts=experts_list,
gate=gate_config,
moe_group=moe_group,
mp_group=mp_group,
recompute_interval=0)
>>> # doctest: +SKIP('Until Distributed move successfully, just skip it')
>>> import paddle.nn as nn
>>> from paddle.nn import Layer, LayerList
>>> from paddle.distributed.moe import MoELayer
>>> from paddle.distributed.collective import Group
>>> from paddle.distributed import fleet
>>> moe_group = Group(fleet.worker_index(),
... 0,
... list(range(fleet.worker_num())))
>>> mp_group = None
>>> num_experts=8
>>> dim_feedforward=512
>>> d_model=8
>>> top_k=2
>>> class ExpertLayer(Layer):
... def __init__(self, d_model, d_hidden, name=None,rank=0, windex = 0, num_expert=1):
... super().__init__()
... self.htoh4 = nn.Linear(d_model, d_hidden)
... self.h4toh = nn.Linear(d_hidden, d_model)
... def forward(self, x):
... x = self.htoh4(x)
... x = self.h4toh(x)
... return x
>>> gate_config = {
... "type": "gshard",
... "top_k": top_k,
... }
>>> experts_list = LayerList()
>>> for expi in range(num_experts):
... exp_layer = ExpertLayer(d_model, dim_feedforward // top_k, windex=expi, num_expert=num_experts)
... experts_list.append(exp_layer)
>>> moeLayer = MoELayer(d_model = d_model,
... experts=experts_list,
... gate=gate_config,
... moe_group=moe_group,
... mp_group=mp_group,
... recompute_interval=0)
"""
......
This diff has been collapsed.
......@@ -51,15 +51,27 @@ def fused_dropout_add(
Examples:
.. code-block:: python
# required: gpu
import paddle
from paddle.incubate.nn.functional import fused_dropout_add
x = paddle.randn([4, 10], dtype='float16')
y = paddle.randn([4, 10], dtype='float16')
out = fused_dropout_add(x, y, p=0.5)
.. code-block:: python
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> from paddle.incubate.nn.functional import fused_dropout_add
>>> paddle.set_device('gpu')
>>> paddle.seed(2023)
>>> x = paddle.randn([4, 10], dtype="float32")
>>> y = paddle.randn([4, 10], dtype="float32")
>>> out = fused_dropout_add(x, y, p=0.5)
>>> print(out)
Tensor(shape=[4, 10], dtype=float32, place=Place(gpu:0), stop_gradient=True,
[[-0.49133155, 0.53819323, -2.58393312, 0.06336236, -1.09908366,
0.22085167, 2.19751787, 0.05034769, 0.53417486, 0.84864247],
[ 0.78248203, -1.59652555, -0.14399840, -0.77985179, -0.17006736,
-0.30991879, -0.36593807, -0.51025450, 1.46401680, 0.61627960],
[ 4.50472546, -0.48472026, 0.60729283, 0.33509624, -0.25593102,
-1.45173049, 1.06727099, 0.00440830, -0.77340341, 0.67393088],
[ 1.29453969, 0.07568165, 0.71947742, -0.71768606, -2.57172823,
1.89179027, 3.26482797, 1.10493207, -1.04569530, -1.04862499]])
"""
if isinstance(p, (int, float)):
# fast return for p == 0
......
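For intuition, the fused example above is expected to match an unfused reference along the lines below. This is only a sketch: it assumes dropout is applied to x before the residual add of y, and the result will not be bitwise identical because the fused kernel draws its own random mask.
>>> import paddle
>>> import paddle.nn.functional as F
>>> x = paddle.randn([4, 10])
>>> y = paddle.randn([4, 10])
>>> # assumption: dropout on x first, then elementwise add of y
>>> ref = F.dropout(x, p=0.5) + y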
......@@ -37,25 +37,20 @@ def fused_ec_moe(
Examples:
.. code-block:: python
# required: gpu
import paddle
from paddle.incubate.nn.functional import fused_ec_moe
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> from paddle.incubate.nn.functional import fused_ec_moe
batch = 10
seq_len = 128
d_model = 1024
d_feed_forward = d_model * 4
num_expert = 8
x = paddle.randn([batch, seq_len, d_model])
gate = paddle.randn([batch, seq_len, num_expert])
bmm0_weight = paddle.randn([num_expert, d_model, d_feed_forward])
bmm0_bias = paddle.randn([num_expert, d_model, d_feed_forward])
bmm1_weight = paddle.randn([num_expert, d_model, d_feed_forward])
bmm1_bias = paddle.randn([num_expert, d_model, d_feed_forward])
out = fused_ec_moe(x, gate, bmm0_weight, bmm0_bias, bmm1_weight, bmm1_bias, act_type="gelu")
print(out.shape) # [batch, seq_len, num_expert]
>>> paddle.set_device('gpu')
>>> x = paddle.randn([10, 128, 1024])
>>> gate = paddle.randn([10, 128, 8])
>>> bmm0_weight = paddle.randn([8, 1024, 4096])
>>> bmm0_bias = paddle.randn([8, 1024, 4096])
>>> bmm1_weight = paddle.randn([8, 1024, 4096])
>>> bmm1_bias = paddle.randn([8, 1024, 4096])
>>> out = fused_ec_moe(x, gate, bmm0_weight, bmm0_bias, bmm1_weight, bmm1_bias, act_type="gelu")
>>> print(out.shape)
[10, 128, 1024]
"""
helper = LayerHelper('fused_moe', **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype)
......
......@@ -39,7 +39,7 @@ def fused_gate_attention(
to information from different representation subspaces. This API only
supports self_attention. The pseudo code is as follows:
.. code-block:: python
.. code-block:: text
c = c ** (-0.5)
q = paddle.einsum('nbqa,ahc->nbqhc', q_data, query_w) * c
......@@ -64,20 +64,20 @@ def fused_gate_attention(
Args:
query (Tensor): The input query tensor. The shape is [batch_size, msa_len, res_len, q_dim].
key (Tensor, optional): The input key tensor, which can be set when
merge_qkv is False. The shape is [batch_size, msa_len, m_size, kv_dim].
query_weight (Tensor, optional): The weight of query linear, which
should be set when input key is not None. The shape is [q_dim, num_heads, head_dim].
key_weight (Tensor, optional): The weight of key linear, which should
be set when input key is not None. The shape is [kv_dim, num_heads, head_dim].
value_weight (Tensor, optional): The weight of value linear, which should
be set when input key is not None. The shape is [kv_dim, num_heads, head_dim].
qkv_weight (Tensor, optional): The weight of qkv linear, which should
be set when merge_qkv is True. The shape is [3, num_heads, head_dim, q_dim].
gate_linear_weight (Tensor, optional): The weight of gating linear,
which should be set when has_gating is True. The shape is [q_dim, num_heads, head_dim].
gate_linear_bias (Tensor, optional): The bias of gating linear, which
should be set when has_gating is True. The shape is [num_heads, head_dim]. Default None.
out_linear_weight (Tensor, optional): The weight of output linear. The shape is [num_heads, head_dim, q_dim].
merge_qkv is False. The shape is [batch_size, msa_len, m_size, kv_dim]. Default None.
query_weight (Tensor, optional): The weight of query linear, which should be set when input
key is not None. The shape is [q_dim, num_heads, head_dim]. Default None.
key_weight (Tensor, optional): The weight of key linear, which should be set when input key
is not None. The shape is [kv_dim, num_heads, head_dim]. Default None.
value_weight (Tensor, optional): The weight of value linear, which should be set when input
key is not None. The shape is [kv_dim, num_heads, head_dim]. Default None.
qkv_weight (Tensor, optional): The weight of qkv linear, which should be set when merge_qkv
is True. The shape is [3, num_heads, head_dim, q_dim]. Default None.
gate_linear_weight (Tensor, optional): The weight of gating linear, which should be set when
has_gating is True. The shape is [q_dim, num_heads, head_dim]. Default None.
gate_linear_bias (Tensor, optional): The bias of gating linear, which should be set when
has_gating is True. The shape is [num_heads, head_dim]. Default None.
out_linear_weight (Tensor, optional): The weight of output linear. The shape is [num_heads, head_dim, q_dim]. Default None.
out_linear_bias (Tensor): The bias of output linear, the shape is [q_dim]. Default None.
nonbatched_bias (Tensor, optional): The extra bias. The shape is [batch_size, 1, num_heads, res_len, m_size]. Default None.
attn_mask (Tensor, optional): The attention mask. The shape is [batch_size, msa_len, 1, 1, res_len]. Default None.
......@@ -92,54 +92,54 @@ def fused_gate_attention(
.. code-block:: python
# required: gpu
import paddle
import paddle.incubate.nn.functional as F
# batch_size = 2
# msa_len = 4
# res_len = 2
# q_dim = 4
# num_heads = 8
# head_dim = 4
# m_size = res_len (when merge_qkv is True)
# query: [batch_size, msa_len, res_len, q_dim]
query = paddle.rand(shape=[2, 4, 2, 4], dtype="float32")
# qkv_weight: [3, n_heads, head_dim, q_dim]
qkv_weight = paddle.rand(shape=[3, 8, 4, 4], dtype="float32")
# nonbatched_bias: [batch_size, 1, num_heads, res_len, m_size]
nonbatched_bias = paddle.rand(shape=[2, 1, 8, 2, 2], dtype="float32")
# attn_mask: [batch_size, msa_len, 1, 1, m_size]
attn_mask = paddle.rand(shape=[2, 4, 1, 1, 2], dtype="float32")
# gate_linear_weight: [q_dim, num_heads, head_dim]
gate_linear_weight = paddle.rand(shape=[4, 8, 4], dtype="float32")
# gate_bias: [num_heads, head_dim]
gate_linear_bias = paddle.rand(shape=[8, 4], dtype="float32")
# out_linear_weight: [num_heads, head_dim, q_dim]
out_linear_weight = paddle.rand(shape=[8, 4, 4], dtype="float32")
# out_linear_bias: [q_dim]
out_linear_bias = paddle.rand(shape=[4], dtype="float32")
# output: [batch_size, msa_len, res_len, q_dim]
output = F.fused_gate_attention(
query=query,
qkv_weight=qkv_weight,
gate_linear_weight=gate_linear_weight,
gate_linear_bias=gate_linear_bias,
out_linear_weight=out_linear_weight,
out_linear_bias=out_linear_bias,
nonbatched_bias=nonbatched_bias,
attn_mask=attn_mask,
has_gating=True,
merge_qkv=True)
print(output.shape)
# [2, 4, 2, 4]
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> import paddle.incubate.nn.functional as F
>>> # batch_size = 2
>>> # msa_len = 4
>>> # res_len = 2
>>> # q_dim = 4
>>> # num_heads = 8
>>> # head_dim = 4
>>> # m_size = res_len (when merge_qkv is True)
>>> # query: [batch_size, msa_len, res_len, q_dim]
>>> query = paddle.rand(shape=[2, 4, 2, 4], dtype="float32")
>>> # qkv_weight: [3, n_heads, head_dim, q_dim]
>>> qkv_weight = paddle.rand(shape=[3, 8, 4, 4], dtype="float32")
>>> # nonbatched_bias: [batch_size, 1, num_heads, res_len, m_size]
>>> nonbatched_bias = paddle.rand(shape=[2, 1, 8, 2, 2], dtype="float32")
>>> # attn_mask: [batch_size, msa_len, 1, 1, m_size]
>>> attn_mask = paddle.rand(shape=[2, 4, 1, 1, 2], dtype="float32")
>>> # gate_linear_weight: [q_dim, num_heads, head_dim]
>>> gate_linear_weight = paddle.rand(shape=[4, 8, 4], dtype="float32")
>>> # gate_bias: [num_heads, head_dim]
>>> gate_linear_bias = paddle.rand(shape=[8, 4], dtype="float32")
>>> # out_linear_weight: [num_heads, head_dim, q_dim]
>>> out_linear_weight = paddle.rand(shape=[8, 4, 4], dtype="float32")
>>> # out_linear_bias: [q_dim]
>>> out_linear_bias = paddle.rand(shape=[4], dtype="float32")
>>> # output: [batch_size, msa_len, res_len, q_dim]
>>> output = F.fused_gate_attention(
... query=query,
... qkv_weight=qkv_weight,
... gate_linear_weight=gate_linear_weight,
... gate_linear_bias=gate_linear_bias,
... out_linear_weight=out_linear_weight,
... out_linear_bias=out_linear_bias,
... nonbatched_bias=nonbatched_bias,
... attn_mask=attn_mask,
... has_gating=True,
... merge_qkv=True)
>>> print(output.shape)
[2, 4, 2, 4]
"""
if in_dynamic_mode():
......
......@@ -28,11 +28,11 @@ def fused_matmul_bias(
Args:
x (Tensor): the first input Tensor to be multiplied.
y (Tensor): the second input Tensor to be multiplied. Its rank must be 2.
bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would
be performed. Otherwise, the bias is added to the matrix multiplication result.
transpose_x (bool): Whether to transpose :math:`x` before multiplication.
transpose_y (bool): Whether to transpose :math:`y` before multiplication.
name(str|None): For detailed information, please refer to
bias (Tensor, optional): the input bias Tensor. If it is None, no bias addition would
be performed. Otherwise, the bias is added to the matrix multiplication result. Default: None.
transpose_x (bool, optional): Whether to transpose :math:`x` before multiplication. Default: False.
transpose_y (bool, optional): Whether to transpose :math:`y` before multiplication. Default: False.
name (str, optional): For detailed information, please refer to
:ref:`api_guide_Name` . Usually, name does not need to be set and is None by default.
Returns:
......@@ -41,15 +41,18 @@ def fused_matmul_bias(
Examples:
.. code-block:: python
# required: gpu
import paddle
from paddle.incubate.nn.functional import fused_matmul_bias
x = paddle.randn([3, 4])
y = paddle.randn([4, 5])
bias = paddle.randn([5])
out = fused_matmul_bias(x, y, bias)
print(out.shape) # [3, 5]
>>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> from paddle.incubate.nn.functional import fused_matmul_bias
>>> paddle.set_device('gpu')
>>> x = paddle.randn([3, 4])
>>> y = paddle.randn([4, 5])
>>> bias = paddle.randn([5])
>>> out = fused_matmul_bias(x, y, bias)
>>> print(out.shape)
[3, 5]
"""
if bias is None:
return matmul(x, y, transpose_x, transpose_y, name)
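As the argument descriptions above state, when bias is given the op is the matrix multiplication of x and y followed by a bias add. An unfused reference looks like this (a sketch that ignores the fused GEMM epilogue and its performance benefit):
>>> import paddle
>>> x = paddle.randn([3, 4])
>>> y = paddle.randn([4, 5])
>>> bias = paddle.randn([5])
>>> # unfused reference: matmul, then broadcasted bias add; shape [3, 5]
>>> ref = paddle.matmul(x, y, transpose_x=False, transpose_y=False) + bias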
......@@ -76,10 +79,10 @@ def fused_linear(x, weight, bias=None, transpose_weight=False, name=None):
Args:
x (Tensor): the input Tensor to be multiplied.
weight (Tensor): the weight Tensor to be multiplied. Its rank must be 2.
bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would
be performed. Otherwise, the bias is added to the matrix multiplication result.
transpose_weight (bool): Whether to transpose :math:`weight` before multiplication.
name(str|None): For detailed information, please refer to
bias (Tensor, optional): the input bias Tensor. If it is None, no bias addition would
be performed. Otherwise, the bias is added to the matrix multiplication result. Default: None.
transpose_weight (bool, optional): Whether to transpose :math:`weight` before multiplication. Default: False.
name (str, optional): For detailed information, please refer to
:ref:`api_guide_Name` . Usually, name does not need to be set and is None by default.
Returns:
......@@ -88,15 +91,18 @@ def fused_linear(x, weight, bias=None, transpose_weight=False, name=None):
Examples:
.. code-block:: python
# required: gpu
import paddle
from paddle.incubate.nn.functional import fused_linear
x = paddle.randn([3, 4])
weight = paddle.randn([4, 5])
bias = paddle.randn([5])
out = fused_linear(x, weight, bias)
print(out.shape) # [3, 5]
>>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> from paddle.incubate.nn.functional import fused_linear
>>> paddle.set_device('gpu')
>>> x = paddle.randn([3, 4])
>>> weight = paddle.randn([4, 5])
>>> bias = paddle.randn([5])
>>> out = fused_linear(x, weight, bias)
>>> print(out.shape)
[3, 5]
"""
return fused_matmul_bias(x, weight, bias, False, transpose_weight, name)
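Since fused_linear delegates to fused_matmul_bias as shown above, with transpose_weight left False it computes the same result as the standard linear layer. A plain-Paddle reference (sketch):
>>> import paddle
>>> import paddle.nn.functional as F
>>> x = paddle.randn([3, 4])
>>> weight = paddle.randn([4, 5])
>>> bias = paddle.randn([5])
>>> # unfused reference: x @ weight + bias, shape [3, 5]
>>> ref = F.linear(x, weight, bias)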
......@@ -109,25 +115,32 @@ def fused_linear_activation(
Args:
x (Tensor): the input Tensor to be multiplied.
weight (Tensor): the weight Tensor to be multiplied. Its rank must be 2.
y (Tensor): the weight Tensor to be multiplied. Its rank must be 2.
bias (Tensor): the input bias Tensor, the bias is added to the matrix multiplication result.
transpose_weight (bool): Whether to transpose :math:`weight` before multiplication.
activation(str|None): Activation function, Currently, the available activation functions are limited to "gelu" (Gaussian Error Linear Unit) and "relu" (Rectified Linear Unit). These activation functions are applied to the output of the bias add.
trans_x (bool, optional): Whether to transpose :math:`x` before multiplication.
trans_y (bool, optional): Whether to transpose :math:`y` before multiplication.
activation (str, optional): Activation function. Currently, the available activation functions are
limited to "gelu" (Gaussian Error Linear Unit) and "relu" (Rectified Linear Unit).
These activation functions are applied to the output of the bias add. Default: None.
Returns:
Tensor: the output Tensor.
Examples:
.. code-block:: python
# required: gpu
import paddle
from paddle.incubate.nn.functional import fused_linear_activation
x = paddle.randn([3, 4])
weight = paddle.randn([4, 5])
bias = paddle.randn([5])
out = fused_linear_activation(x, weight, bias)
print(out.shape) # [3, 5]
>>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> from paddle.incubate.nn.functional import fused_linear_activation
>>> paddle.set_device('gpu')
>>> x = paddle.randn([3, 4])
>>> weight = paddle.randn([4, 5])
>>> bias = paddle.randn([5])
>>> out = fused_linear_activation(x, weight, bias)
>>> print(out.shape)
[3, 5]
"""
if activation is None:
activation = "none"
......
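Per the argument descriptions above, the activation is applied to the output of the bias add. A plain-Paddle reference (sketch) of the same computation:
>>> import paddle
>>> import paddle.nn.functional as F
>>> x = paddle.randn([3, 4])
>>> weight = paddle.randn([4, 5])
>>> bias = paddle.randn([5])
>>> # unfused reference; use F.gelu for the "gelu" option
>>> ref = F.relu(paddle.matmul(x, weight) + bias)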
......@@ -44,14 +44,13 @@ def fused_rotary_position_embedding(
Examples:
.. code-block:: python
.. code-block:: python
>>> # required: gpu
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> from paddle.incubate.nn.functional import fused_rotary_position_embedding
>>> paddle.device.set_device('gpu')
>>> paddle.set_device('gpu')
>>> # batch_size = 2
>>> # seq_len = 2
......
This diff has been collapsed.
This diff has been collapsed.