[fix-doc-bug] Fix fused_attention_op english doc test=document_fix (#36803) (#36829)

* Fix fused_attention english doc test=document_fix

[fix-doc-bug] Fix fused_attention_op english doc test=document_fix (#36803) (#36829)
* Fix fused_attention english doc test=document_fix
9a964901 · Li Min · GitHub · 5fb28500 · 9a964901 · 9a964901
2 changed files
--- a/python/paddle/incubate/nn/functional/fused_transformer.py
+++ b/python/paddle/incubate/nn/functional/fused_transformer.py
@@ -194,24 +194,27 @@ def fused_multi_head_attention(x,
    Multi-Head Attention performs multiple parallel attention to jointly attend
    to information from different representation subspaces. This API only
    supports self_attention. The pseudo code is as follows:
+
+    .. code-block:: python
+
    	if pre_layer_norm:
-    	out = layer_norm(x);
-        out = linear(out) + qkv)bias
+    	    out = layer_norm(x)
+            out = linear(out) + qkv_bias
    	else:
-	out = linear(x) + bias;
-    out = transpose(out, perm=[2, 0, 3, 1, 4]);
+	    out = linear(x) + bias
+    	out = transpose(out, perm=[2, 0, 3, 1, 4])
    	# extract q, k and v from out.
    	q = out[0:1,::]
    	k = out[1:2,::]
    	v = out[2:3,::]
-    out = q * k^t;
-    out = attn_mask + out;
-    out = softmax(out);
-    out = dropout(out);
-    out = out * v;
-    out = transpose(out, perm=[0, 2, 1, 3]);
-    out = out_linear(out);
-    out = layer_norm(x + dropout(linear_bias + out));
+    	out = q * k^t
+    	out = attn_mask + out
+    	out = softmax(out)
+    	out = dropout(out)
+    	out = out * v
+    	out = transpose(out, perm=[0, 2, 1, 3])
+    	out = out_linear(out)
+    	out = layer_norm(x + dropout(linear_bias + out))

    Parameters:
        x (Tensor): The input tensor of fused_multi_head_attention. The shape is
@@ -245,6 +248,9 @@ def fused_multi_head_attention(x,
        ln_epsilon (float, optional): Small float value added to denominator of layer_norm
            to avoid dividing by zero. Default is 1e-5.

+    Returns:
+        Tensor: The output Tensor. Its data type and shape are the same as `x`.
+
    Examples:

        .. code-block:: python

--- a/python/paddle/incubate/nn/layer/fused_transformer.py
+++ b/python/paddle/incubate/nn/layer/fused_transformer.py
@@ -29,6 +29,7 @@ class FusedMultiHeadAttention(Layer):
    to information from different representation subspaces.
    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
    for more details.
+
    Parameters:
        embed_dim (int): The expected feature size in the input and output.
        num_heads (int): The number of heads in multi-head attention.
@@ -42,17 +43,18 @@ class FusedMultiHeadAttention(Layer):
            `embed_dim`. Default None.
        vdim (int, optional): The feature size in value. If None, assumed equal to
            `embed_dim`. Default None.
-        normalize_before (bool, optional): Indicate  whether it is pre_layer_norm (True)
-            or post_layer_norm architecture (False). Default False.
+        normalize_before (bool, optional): Indicates whether it is pre_layer_norm
+            (True) or post_layer_norm architecture (False). Default False.
        need_weights (bool, optional): Indicate whether to return the attention
            weights. Now, only False is supported. Default False.
        weight_attr(ParamAttr, optional):  To specify the weight parameter property.
            Default: None, which means the default weight parameter property is used.
-            See usage for details in :code:`ParamAttr` .
+            See usage for details in :code:`ParamAttr`.
        bias_attr (ParamAttr|bool, optional): To specify the bias parameter property.
            Default: None, which means the default bias parameter property is used.
            If it is set to False, this layer will not have trainable bias parameter.
-            See usage for details in :code:`ParamAttr` .
+            See usage for details in :code:`ParamAttr`.
+
    Examples:

        .. code-block:: python
@@ -139,6 +141,7 @@ class FusedMultiHeadAttention(Layer):
        """
        Applies multi-head attention to map queries and a set of key-value pairs
        to outputs.
+
        Parameters:
            query (Tensor): The queries for multi-head attention. It is a
                tensor with shape `[batch_size, query_length, embed_dim]`. The
@@ -163,6 +166,7 @@ class FusedMultiHeadAttention(Layer):
                nothing wanted or needed to be prevented attention to. Default None.
            cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):
                Now, only None is supported. Default None.
+
        Returns:
            Tensor|tuple: It is a tensor that has the same shape and data type \
                as `query`, representing attention output.