Unverified commit 9d183662, authored by 小飞猪 and committed by GitHub

[xdoctest][task 268] reformat example code with google style in `/incubate/nn/layer/fused_transformer.py` (#56965)

* [Doctest]fix No.268, test=docs_preview

* Apply suggestions from code review

---------
Co-authored-by: Nyakku Shigure <sigure.qaq@gmail.com>
Parent 875a4ea9
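Every hunk below applies the same conversion; condensed from the first hunk (the `layer` variable name is shorthand introduced here, everything else matches the diff), the old comment-style example becomes a google-style doctest that xdoctest can execute on a GPU machine:

>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> paddle.device.set_device('gpu')
>>> x = paddle.rand((2, 4, 128))          # input: [batch_size, seq_len, embed_dim]
>>> residual = paddle.rand((2, 4, 128))   # residual: [batch_size, seq_len, embed_dim]
>>> layer = paddle.incubate.nn.FusedBiasDropoutResidualLayerNorm(128)
>>> print(layer(x, residual).shape)
[2, 4, 128]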
@@ -100,14 +100,17 @@ class FusedBiasDropoutResidualLayerNorm(Layer):
.. code-block:: python
# required: gpu
import paddle
# input: [batch_size, seq_len, embed_dim]
x = paddle.rand((2, 4, 128))
# residual: [batch_size, seq_len, embed_dim]
residual = paddle.rand((2, 4, 128))
fused_bias_dropout_residual_ln = paddle.incubate.nn.FusedBiasDropoutResidualLayerNorm(128)
output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128]
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> paddle.device.set_device('gpu')
>>> # input: [batch_size, seq_len, embed_dim]
>>> x = paddle.rand((2, 4, 128))
>>> # residual: [batch_size, seq_len, embed_dim]
>>> residual = paddle.rand((2, 4, 128))
>>> fused_bias_dropout_residual_ln = paddle.incubate.nn.FusedBiasDropoutResidualLayerNorm(128)
>>> output = fused_bias_dropout_residual_ln(x, residual)
>>> print(output.shape)
[2, 4, 128]
"""
def __init__(
@@ -259,14 +262,17 @@ class FusedMultiHeadAttention(Layer):
.. code-block:: python
# required: gpu
import paddle
# input: [batch_size, sequence_length, embed_dim]
query = paddle.rand((2, 4, 128))
# self attention mask: [batch_size, num_heads, query_len, query_len]
attn_mask = paddle.rand((2, 2, 4, 4))
multi_head_attn = paddle.incubate.nn.FusedMultiHeadAttention(128, 2)
output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128]
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> paddle.device.set_device('gpu')
>>> # input: [batch_size, sequence_length, embed_dim]
>>> query = paddle.rand((2, 4, 128))
>>> # self attention mask: [batch_size, num_heads, query_len, query_len]
>>> attn_mask = paddle.rand((2, 2, 4, 4))
>>> multi_head_attn = paddle.incubate.nn.FusedMultiHeadAttention(128, 2)
>>> output = multi_head_attn(query, None, None, attn_mask=attn_mask)
>>> print(output.shape)
[2, 4, 128]
"""
def __init__(
@@ -545,15 +551,16 @@ class FusedFeedForward(Layer):
Examples:
.. code-block:: python
# required: gpu
import paddle
from paddle.incubate.nn import FusedFeedForward
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> from paddle.incubate.nn import FusedFeedForward
>>> paddle.device.set_device('gpu')
fused_feedforward_layer = FusedFeedForward(8, 8)
x = paddle.rand((1, 8, 8))
out = fused_feedforward_layer(x)
print(out.shape)
# [1, 8, 8]
>>> fused_feedforward_layer = FusedFeedForward(8, 8)
>>> x = paddle.rand((1, 8, 8))
>>> out = fused_feedforward_layer(x)
>>> print(out.shape)
[1, 8, 8]
"""
def __init__(
@@ -768,16 +775,19 @@ class FusedTransformerEncoderLayer(Layer):
Examples:
.. code-block:: python
# required: gpu
import paddle
from paddle.incubate.nn import FusedTransformerEncoderLayer
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> from paddle.incubate.nn import FusedTransformerEncoderLayer
>>> paddle.device.set_device('gpu')
# encoder input: [batch_size, src_len, d_model]
enc_input = paddle.rand((2, 4, 128))
# self attention mask: [batch_size, n_head, src_len, src_len]
attn_mask = paddle.rand((2, 2, 4, 4))
encoder_layer = FusedTransformerEncoderLayer(128, 2, 512)
enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128]
>>> # encoder input: [batch_size, src_len, d_model]
>>> enc_input = paddle.rand((2, 4, 128))
>>> # self attention mask: [batch_size, n_head, src_len, src_len]
>>> attn_mask = paddle.rand((2, 2, 4, 4))
>>> encoder_layer = FusedTransformerEncoderLayer(128, 2, 512)
>>> enc_output = encoder_layer(enc_input, attn_mask)
>>> print(enc_output.shape)
[2, 4, 128]
"""
@@ -973,25 +983,27 @@ class FusedTransformer(Layer):
.. code-block:: python
import paddle
from paddle.nn import Transformer
# src: [batch_size, tgt_len, d_model]
enc_input = paddle.rand((2, 4, 128))
# tgt: [batch_size, src_len, d_model]
dec_input = paddle.rand((2, 6, 128))
# src_mask: [batch_size, n_head, src_len, src_len]
enc_self_attn_mask = paddle.rand((2, 2, 4, 4))
# tgt_mask: [batch_size, n_head, tgt_len, tgt_len]
dec_self_attn_mask = paddle.rand((2, 2, 6, 6))
# memory_mask: [batch_size, n_head, tgt_len, src_len]
cross_attn_mask = paddle.rand((2, 2, 6, 4))
transformer = Transformer(128, 2, 4, 4, 512)
output = transformer(enc_input,
dec_input,
enc_self_attn_mask,
dec_self_attn_mask,
cross_attn_mask) # [2, 6, 128]
>>> import paddle
>>> from paddle.nn import Transformer
>>> # src: [batch_size, tgt_len, d_model]
>>> enc_input = paddle.rand((2, 4, 128))
>>> # tgt: [batch_size, src_len, d_model]
>>> dec_input = paddle.rand((2, 6, 128))
>>> # src_mask: [batch_size, n_head, src_len, src_len]
>>> enc_self_attn_mask = paddle.rand((2, 2, 4, 4))
>>> # tgt_mask: [batch_size, n_head, tgt_len, tgt_len]
>>> dec_self_attn_mask = paddle.rand((2, 2, 6, 6))
>>> # memory_mask: [batch_size, n_head, tgt_len, src_len]
>>> cross_attn_mask = paddle.rand((2, 2, 6, 4))
>>> transformer = Transformer(128, 2, 4, 4, 512)
>>> output = transformer(enc_input,
... dec_input,
... enc_self_attn_mask,
... dec_self_attn_mask,
... cross_attn_mask)
>>> print(output.shape)
[2, 6, 128]
"""
def __init__(
@@ -1026,37 +1038,38 @@ class FusedMultiTransformer(Layer):
.. code-block:: python
if pre_layer_norm:
out = layer_norm(x)
out = qkv_linear(out) + qkv_bias
else:
out = qkv_linear(x) + qkv_bias
out = transpose(out, perm=[2, 0, 3, 1, 4])
# extract q, k and v from out.
q = out[0:1, ::]
k = out[1:2, ::]
v = out[2:3, ::]
out = q * k^t
out = attn_mask + out
out = softmax(out)
out = dropout(out)
out = out * v
out = transpose(out, perm=[0, 2, 1, 3])
out = linear(out)
if pre_layer_norm:
out = x + dropout(out + bias)
else:
out = layer_norm(x + dropout(out + bias))
residual = out;
if pre_layer_norm:
out = ffn_layer_norm(out)
out = ffn1_linear(out)
out = dropout(activation(out + ffn1_bias))
out = ffn2_linear(out)
out = residual + dropout(out + ffn2_bias)
if not pre_layer_norm:
out = ffn_layer_norm(out)
>>> # doctest: +SKIP('This is not an example')
>>> if pre_layer_norm:
... out = layer_norm(x)
... out = qkv_linear(out) + qkv_bias
... else:
... out = qkv_linear(x) + qkv_bias
>>> out = transpose(out, perm=[2, 0, 3, 1, 4])
>>> # extract q, k and v from out.
>>> q = out[0:1, ::]
>>> k = out[1:2, ::]
>>> v = out[2:3, ::]
>>> out = q * k^t
>>> out = attn_mask + out
>>> out = softmax(out)
>>> out = dropout(out)
>>> out = out * v
>>> out = transpose(out, perm=[0, 2, 1, 3])
>>> out = linear(out)
>>> if pre_layer_norm:
... out = x + dropout(out + bias)
... else:
... out = layer_norm(x + dropout(out + bias))
>>> residual = out;
>>> if pre_layer_norm:
... out = ffn_layer_norm(out)
>>> out = ffn1_linear(out)
>>> out = dropout(activation(out + ffn1_bias))
>>> out = ffn2_linear(out)
>>> out = residual + dropout(out + ffn2_bias)
>>> if not pre_layer_norm:
... out = ffn_layer_norm(out)
Parameters:
embed_dim (int): The expected feature size in the input and output.
@@ -1166,16 +1179,19 @@ class FusedMultiTransformer(Layer):
.. code-block:: python
# required: gpu
import paddle
from paddle.incubate.nn import FusedMultiTransformer
# encoder input: [batch_size, src_len, d_model]
enc_input = paddle.rand((2, 4, 128))
# self attention mask: [batch_size, 1, src_len, src_len]
attn_mask = paddle.rand((2, 1, 4, 4))
encoder_layers = FusedMultiTransformer(128, 2, 512, num_layers=1)
enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128]
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> from paddle.incubate.nn import FusedMultiTransformer
>>> paddle.device.set_device('gpu')
>>> # encoder input: [batch_size, src_len, d_model]
>>> enc_input = paddle.rand((2, 4, 128))
>>> # self attention mask: [batch_size, 1, src_len, src_len]
>>> attn_mask = paddle.rand((2, 1, 4, 4))
>>> encoder_layers = FusedMultiTransformer(128, 2, 512, num_layers=1)
>>> enc_output = encoder_layers(enc_input, attn_mask)
>>> print(enc_output.shape)
[2, 4, 128]
"""
def __init__(