From 9a9649017db25347a627137aabb56e10251ec0d8 Mon Sep 17 00:00:00 2001
From: Li Min <11663212+limin2021@users.noreply.github.com>
Date: Thu, 28 Oct 2021 15:32:10 +0800
Subject: [PATCH] [fix-doc-bug] Fix fused_attention_op english doc test=document_fix (#36803) (#36829)

* Fix fused_attention english doc test=document_fix
---
 .../nn/functional/fused_transformer.py       | 42 +++++++++++--------
 .../incubate/nn/layer/fused_transformer.py   | 14 ++++---
 2 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py
index f692283841..6c447a73c5 100644
--- a/python/paddle/incubate/nn/functional/fused_transformer.py
+++ b/python/paddle/incubate/nn/functional/fused_transformer.py
@@ -194,24 +194,27 @@ def fused_multi_head_attention(x,
     Multi-Head Attention performs multiple parallel attention to jointly attending
     to information from different representation subspaces. This API only
     support self_attention. The pseudo code is as follows:
-    if pre_layer_norm:
-        out = layer_norm(x);
-        out = linear(out) + qkv)bias
-    else:
-        out = linear(x) + bias;
-    out = transpose(out, perm=[2, 0, 3, 1, 4]);
-    # extract q, k and v from out.
-    q = out[0:1,::]
-    k = out[1:2,::]
-    v = out[2:3,::]
-    out = q * k^t;
-    out = attn_mask + out;
-    out = softmax(out);
-    out = dropout(out);
-    out = out * v;
-    out = transpose(out, perm=[0, 2, 1, 3]);
-    out = out_linear(out);
-    out = layer_norm(x + dropout(linear_bias + out));
+
+    .. code-block:: python
+
+        if pre_layer_norm:
+            out = layer_norm(x)
+            out = linear(out) + qkv_bias
+        else:
+            out = linear(x) + bias
+        out = transpose(out, perm=[2, 0, 3, 1, 4])
+        # extract q, k and v from out.
+        q = out[0:1,::]
+        k = out[1:2,::]
+        v = out[2:3,::]
+        out = q * k^t
+        out = attn_mask + out
+        out = softmax(out)
+        out = dropout(out)
+        out = out * v
+        out = transpose(out, perm=[0, 2, 1, 3])
+        out = out_linear(out)
+        out = layer_norm(x + dropout(linear_bias + out))
 
     Parameters:
         x (Tensor): The input tensor of fused_multi_head_attention. The shape is
@@ -245,6 +248,9 @@ def fused_multi_head_attention(x,
         ln_epsilon (float, optional): Small float value added to denominator of layer_norm
             to avoid dividing by zero. Default is 1e-5.
 
+    Returns:
+        Tensor: The output Tensor, whose data type and shape are the same as `x`.
+
     Examples:
 
         .. code-block:: python
diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py
index bc887875c7..a3d8a74844 100644
--- a/python/paddle/incubate/nn/layer/fused_transformer.py
+++ b/python/paddle/incubate/nn/layer/fused_transformer.py
@@ -24,11 +24,12 @@ import collections
 
 class FusedMultiHeadAttention(Layer):
     """
-    Attention mapps queries and a set of key-value pairs to outputs, and
+    Attention maps queries and a set of key-value pairs to outputs, and
     Multi-Head Attention performs multiple parallel attention to jointly
     attending to information from different representation subspaces.
     Please refer to `Attention Is All You Need
     <https://arxiv.org/pdf/1706.03762.pdf>`_ for more details.
+
     Parameters:
         embed_dim (int): The expected feature size in the input and output.
         num_heads (int): The number of heads in multi-head attention.
@@ -42,17 +43,18 @@ class FusedMultiHeadAttention(Layer):
             `embed_dim`. Default None.
         vdim (int, optional): The feature size in value. If None, assumed equal to
             `embed_dim`. Default None.
-        normalize_before (bool, optional): Indicate whether it is pre_layer_norm (True)
-            or post_layer_norm architecture (False). Default False.
+        normalize_before (bool, optional): Indicate whether it is pre_layer_norm
+            (True) or post_layer_norm architecture (False). Default False.
         need_weights (bool, optional): Indicate whether to return the attention
             weights. Now, only False is supported. Default False.
         weight_attr(ParamAttr, optional): To specify the weight parameter property.
             Default: None, which means the default weight parameter property is used.
-            See usage for details in :code:`ParamAttr` .
+            See usage for details in :code:`ParamAttr`.
         bias_attr (ParamAttr|bool, optional): To specify the bias parameter property.
             Default: None, which means the default bias parameter property is used.
             If it is set to False, this layer will not have trainable bias parameter.
-            See usage for details in :code:`ParamAttr` .
+            See usage for details in :code:`ParamAttr`.
+
     Examples:
 
         .. code-block:: python
@@ -139,6 +141,7 @@ class FusedMultiHeadAttention(Layer):
         """
         Applies multi-head attention to map queries and a set of key-value pairs
         to outputs.
+
         Parameters:
             query (Tensor): The queries for multi-head attention. It is a tensor
                 with shape `[batch_size, query_length, embed_dim]`. The
@@ -163,6 +166,7 @@ class FusedMultiHeadAttention(Layer):
                 nothing wanted or needed to be prevented attention to. Default None.
             cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):
                 Now, only None is supported. Default None.
+
         Returns:
             Tensor|tuple: It is a tensor that has the same shape and data type \
                 as `query`, representing attention output.
-- 
GitLab
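
For reference, a minimal usage sketch of the `paddle.incubate.nn.FusedMultiHeadAttention` layer documented by this patch (the sketch is not part of the patch itself). It assumes a CUDA-enabled build of PaddlePaddle, since the fused attention op only provides a GPU kernel; the tensor shapes and mask values are illustrative.

.. code-block:: python

    # Minimal sketch, assuming a CUDA-enabled PaddlePaddle build; shapes are illustrative.
    import paddle
    from paddle.incubate.nn import FusedMultiHeadAttention

    # Input of shape [batch_size, sequence_length, embed_dim].
    x = paddle.rand((2, 4, 128))
    # Additive self-attention mask broadcastable to
    # [batch_size, num_heads, seq_len, seq_len]; zeros mean nothing is masked out.
    attn_mask = paddle.zeros((2, 2, 4, 4))

    # normalize_before=True selects the pre_layer_norm architecture described above.
    attn = FusedMultiHeadAttention(embed_dim=128, num_heads=2, normalize_before=True)
    out = attn(x, attn_mask=attn_mask)  # same shape and dtype as x: [2, 4, 128]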