Unverified commit 9d183662, authored by 小飞猪, committed by GitHub


[xdoctest][task 268] reformat example code with google style in `/incubate/nn/layer/fused_transformer.py` (#56965)

* [Doctest]fix No.268, test=docs_preview

* Apply suggestions from code review

---------
Co-authored-by: Nyakku Shigure <sigure.qaq@gmail.com>
Parent 875a4ea9
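Note on the conversion: the Google-style/xdoctest format used in the hunks below prefixes every statement with `>>>` (continuation lines with `...`), writes the expected output unprefixed on the following line, and gates GPU-only snippets with a `# doctest: +REQUIRES(env:GPU)` directive. As a minimal sketch (not part of this commit, and assuming the xdoctest package is installed alongside a GPU build of Paddle), the converted examples can be collected and run with xdoctest's Python API:

    # Illustrative only; assumes xdoctest is installed. Examples marked with
    # "# doctest: +REQUIRES(env:GPU)" are skipped unless the corresponding
    # environment flag is set, so a GPU build of Paddle is assumed here.
    import xdoctest

    # Run every doctest collected from the module touched by this commit.
    xdoctest.doctest_module('paddle.incubate.nn.layer.fused_transformer', command='all')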
@@ -100,14 +100,17 @@ class FusedBiasDropoutResidualLayerNorm(Layer):
         .. code-block:: python

-            # required: gpu
-            import paddle
-            # input: [batch_size, seq_len, embed_dim]
-            x = paddle.rand((2, 4, 128))
-            # residual: [batch_size, seq_len, embed_dim]
-            residual = paddle.rand((2, 4, 128))
-            fused_bias_dropout_residual_ln = paddle.incubate.nn.FusedBiasDropoutResidualLayerNorm(128)
-            output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128]
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> paddle.device.set_device('gpu')
+            >>> # input: [batch_size, seq_len, embed_dim]
+            >>> x = paddle.rand((2, 4, 128))
+            >>> # residual: [batch_size, seq_len, embed_dim]
+            >>> residual = paddle.rand((2, 4, 128))
+            >>> fused_bias_dropout_residual_ln = paddle.incubate.nn.FusedBiasDropoutResidualLayerNorm(128)
+            >>> output = fused_bias_dropout_residual_ln(x, residual)
+            >>> print(output.shape)
+            [2, 4, 128]
     """

     def __init__(
@@ -259,14 +262,17 @@ class FusedMultiHeadAttention(Layer):
         .. code-block:: python

-            # required: gpu
-            import paddle
-            # input: [batch_size, sequence_length, embed_dim]
-            query = paddle.rand((2, 4, 128))
-            # self attention mask: [batch_size, num_heads, query_len, query_len]
-            attn_mask = paddle.rand((2, 2, 4, 4))
-            multi_head_attn = paddle.incubate.nn.FusedMultiHeadAttention(128, 2)
-            output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128]
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> paddle.device.set_device('gpu')
+            >>> # input: [batch_size, sequence_length, embed_dim]
+            >>> query = paddle.rand((2, 4, 128))
+            >>> # self attention mask: [batch_size, num_heads, query_len, query_len]
+            >>> attn_mask = paddle.rand((2, 2, 4, 4))
+            >>> multi_head_attn = paddle.incubate.nn.FusedMultiHeadAttention(128, 2)
+            >>> output = multi_head_attn(query, None, None, attn_mask=attn_mask)
+            >>> print(output.shape)
+            [2, 4, 128]
     """

     def __init__(
@@ -545,15 +551,16 @@ class FusedFeedForward(Layer):
     Examples:
         .. code-block:: python

-            # required: gpu
-            import paddle
-            from paddle.incubate.nn import FusedFeedForward
-            fused_feedforward_layer = FusedFeedForward(8, 8)
-            x = paddle.rand((1, 8, 8))
-            out = fused_feedforward_layer(x)
-            print(out.shape)
-            # [1, 8, 8]
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn import FusedFeedForward
+            >>> paddle.device.set_device('gpu')
+            >>> fused_feedforward_layer = FusedFeedForward(8, 8)
+            >>> x = paddle.rand((1, 8, 8))
+            >>> out = fused_feedforward_layer(x)
+            >>> print(out.shape)
+            [1, 8, 8]
     """

     def __init__(
@@ -768,16 +775,19 @@ class FusedTransformerEncoderLayer(Layer):
     Examples:
         .. code-block:: python

-            # required: gpu
-            import paddle
-            from paddle.incubate.nn import FusedTransformerEncoderLayer
-            # encoder input: [batch_size, src_len, d_model]
-            enc_input = paddle.rand((2, 4, 128))
-            # self attention mask: [batch_size, n_head, src_len, src_len]
-            attn_mask = paddle.rand((2, 2, 4, 4))
-            encoder_layer = FusedTransformerEncoderLayer(128, 2, 512)
-            enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128]
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn import FusedTransformerEncoderLayer
+            >>> paddle.device.set_device('gpu')
+            >>> # encoder input: [batch_size, src_len, d_model]
+            >>> enc_input = paddle.rand((2, 4, 128))
+            >>> # self attention mask: [batch_size, n_head, src_len, src_len]
+            >>> attn_mask = paddle.rand((2, 2, 4, 4))
+            >>> encoder_layer = FusedTransformerEncoderLayer(128, 2, 512)
+            >>> enc_output = encoder_layer(enc_input, attn_mask)
+            >>> print(enc_output.shape)
+            [2, 4, 128]
     """
@@ -973,25 +983,27 @@ class FusedTransformer(Layer):
         .. code-block:: python

-            import paddle
-            from paddle.nn import Transformer
-            # src: [batch_size, tgt_len, d_model]
-            enc_input = paddle.rand((2, 4, 128))
-            # tgt: [batch_size, src_len, d_model]
-            dec_input = paddle.rand((2, 6, 128))
-            # src_mask: [batch_size, n_head, src_len, src_len]
-            enc_self_attn_mask = paddle.rand((2, 2, 4, 4))
-            # tgt_mask: [batch_size, n_head, tgt_len, tgt_len]
-            dec_self_attn_mask = paddle.rand((2, 2, 6, 6))
-            # memory_mask: [batch_size, n_head, tgt_len, src_len]
-            cross_attn_mask = paddle.rand((2, 2, 6, 4))
-            transformer = Transformer(128, 2, 4, 4, 512)
-            output = transformer(enc_input,
-                                 dec_input,
-                                 enc_self_attn_mask,
-                                 dec_self_attn_mask,
-                                 cross_attn_mask) # [2, 6, 128]
+            >>> import paddle
+            >>> from paddle.nn import Transformer
+            >>> # src: [batch_size, tgt_len, d_model]
+            >>> enc_input = paddle.rand((2, 4, 128))
+            >>> # tgt: [batch_size, src_len, d_model]
+            >>> dec_input = paddle.rand((2, 6, 128))
+            >>> # src_mask: [batch_size, n_head, src_len, src_len]
+            >>> enc_self_attn_mask = paddle.rand((2, 2, 4, 4))
+            >>> # tgt_mask: [batch_size, n_head, tgt_len, tgt_len]
+            >>> dec_self_attn_mask = paddle.rand((2, 2, 6, 6))
+            >>> # memory_mask: [batch_size, n_head, tgt_len, src_len]
+            >>> cross_attn_mask = paddle.rand((2, 2, 6, 4))
+            >>> transformer = Transformer(128, 2, 4, 4, 512)
+            >>> output = transformer(enc_input,
+            ...                      dec_input,
+            ...                      enc_self_attn_mask,
+            ...                      dec_self_attn_mask,
+            ...                      cross_attn_mask)
+            >>> print(output.shape)
+            [2, 6, 128]
     """

     def __init__(
@@ -1026,37 +1038,38 @@ class FusedMultiTransformer(Layer):
         .. code-block:: python

-            if pre_layer_norm:
-                out = layer_norm(x)
-                out = qkv_linear(out) + qkv_bias
-            else:
-                out = qkv_linear(x) + qkv_bias
-            out = transpose(out, perm=[2, 0, 3, 1, 4])
-            # extract q, k and v from out.
-            q = out[0:1, ::]
-            k = out[1:2, ::]
-            v = out[2:3, ::]
-            out = q * k^t
-            out = attn_mask + out
-            out = softmax(out)
-            out = dropout(out)
-            out = out * v
-            out = transpose(out, perm=[0, 2, 1, 3])
-            out = linear(out)
-            if pre_layer_norm:
-                out = x + dropout(out + bias)
-            else:
-                out = layer_norm(x + dropout(out + bias))
-
-            residual = out;
-            if pre_layer_norm:
-                out = ffn_layer_norm(out)
-            out = ffn1_linear(out)
-            out = dropout(activation(out + ffn1_bias))
-            out = ffn2_linear(out)
-            out = residual + dropout(out + ffn2_bias)
-            if not pre_layer_norm:
-                out = ffn_layer_norm(out)
+            >>> # doctest: +SKIP('This is not an example')
+            >>> if pre_layer_norm:
+            ...     out = layer_norm(x)
+            ...     out = qkv_linear(out) + qkv_bias
+            ... else:
+            ...     out = qkv_linear(x) + qkv_bias
+            >>> out = transpose(out, perm=[2, 0, 3, 1, 4])
+            >>> # extract q, k and v from out.
+            >>> q = out[0:1, ::]
+            >>> k = out[1:2, ::]
+            >>> v = out[2:3, ::]
+            >>> out = q * k^t
+            >>> out = attn_mask + out
+            >>> out = softmax(out)
+            >>> out = dropout(out)
+            >>> out = out * v
+            >>> out = transpose(out, perm=[0, 2, 1, 3])
+            >>> out = linear(out)
+            >>> if pre_layer_norm:
+            ...     out = x + dropout(out + bias)
+            ... else:
+            ...     out = layer_norm(x + dropout(out + bias))
+
+            >>> residual = out;
+            >>> if pre_layer_norm:
+            ...     out = ffn_layer_norm(out)
+            >>> out = ffn1_linear(out)
+            >>> out = dropout(activation(out + ffn1_bias))
+            >>> out = ffn2_linear(out)
+            >>> out = residual + dropout(out + ffn2_bias)
+            >>> if not pre_layer_norm:
+            ...     out = ffn_layer_norm(out)

     Parameters:
         embed_dim (int): The expected feature size in the input and output.
@@ -1166,16 +1179,19 @@ class FusedMultiTransformer(Layer):
         .. code-block:: python

-            # required: gpu
-            import paddle
-            from paddle.incubate.nn import FusedMultiTransformer
-            # encoder input: [batch_size, src_len, d_model]
-            enc_input = paddle.rand((2, 4, 128))
-            # self attention mask: [batch_size, 1, src_len, src_len]
-            attn_mask = paddle.rand((2, 1, 4, 4))
-            encoder_layers = FusedMultiTransformer(128, 2, 512, num_layers=1)
-            enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128]
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn import FusedMultiTransformer
+            >>> paddle.device.set_device('gpu')
+            >>> # encoder input: [batch_size, src_len, d_model]
+            >>> enc_input = paddle.rand((2, 4, 128))
+            >>> # self attention mask: [batch_size, 1, src_len, src_len]
+            >>> attn_mask = paddle.rand((2, 1, 4, 4))
+            >>> encoder_layers = FusedMultiTransformer(128, 2, 512, num_layers=1)
+            >>> enc_output = encoder_layers(enc_input, attn_mask)
+            >>> print(enc_output.shape)
+            [2, 4, 128]
     """

     def __init__(