From 9d1836625c957c4da2a06e9219882a12764484ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=8F=E9=A3=9E=E7=8C=AA?= <106524776+ooooo-create@users.noreply.github.com>
Date: Wed, 6 Sep 2023 10:30:07 +0800
Subject: [PATCH] [xdoctest][task 268] reformat example code with google style
 in `/incubate/nn/layer/fused_transformer.py` (#56965)

* [Doctest]fix No.268, test=docs_preview

* Apply suggestions from code review

---------

Co-authored-by: Nyakku Shigure
---
 .../incubate/nn/layer/fused_transformer.py | 202 ++++++++++--------
 1 file changed, 109 insertions(+), 93 deletions(-)

diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py
index b70517156fe..a3f3ea1d83a 100644
--- a/python/paddle/incubate/nn/layer/fused_transformer.py
+++ b/python/paddle/incubate/nn/layer/fused_transformer.py
@@ -100,14 +100,17 @@ class FusedBiasDropoutResidualLayerNorm(Layer):

         .. code-block:: python

-            # required: gpu
-            import paddle
-            # input: [batch_size, seq_len, embed_dim]
-            x = paddle.rand((2, 4, 128))
-            # residual: [batch_size, seq_len, embed_dim]
-            residual = paddle.rand((2, 4, 128))
-            fused_bias_dropout_residual_ln = paddle.incubate.nn.FusedBiasDropoutResidualLayerNorm(128)
-            output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128]
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> paddle.device.set_device('gpu')
+            >>> # input: [batch_size, seq_len, embed_dim]
+            >>> x = paddle.rand((2, 4, 128))
+            >>> # residual: [batch_size, seq_len, embed_dim]
+            >>> residual = paddle.rand((2, 4, 128))
+            >>> fused_bias_dropout_residual_ln = paddle.incubate.nn.FusedBiasDropoutResidualLayerNorm(128)
+            >>> output = fused_bias_dropout_residual_ln(x, residual)
+            >>> print(output.shape)
+            [2, 4, 128]
     """

     def __init__(
@@ -259,14 +262,17 @@ class FusedMultiHeadAttention(Layer):

         .. code-block:: python

-            # required: gpu
-            import paddle
-            # input: [batch_size, sequence_length, embed_dim]
-            query = paddle.rand((2, 4, 128))
-            # self attention mask: [batch_size, num_heads, query_len, query_len]
-            attn_mask = paddle.rand((2, 2, 4, 4))
-            multi_head_attn = paddle.incubate.nn.FusedMultiHeadAttention(128, 2)
-            output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128]
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> paddle.device.set_device('gpu')
+            >>> # input: [batch_size, sequence_length, embed_dim]
+            >>> query = paddle.rand((2, 4, 128))
+            >>> # self attention mask: [batch_size, num_heads, query_len, query_len]
+            >>> attn_mask = paddle.rand((2, 2, 4, 4))
+            >>> multi_head_attn = paddle.incubate.nn.FusedMultiHeadAttention(128, 2)
+            >>> output = multi_head_attn(query, None, None, attn_mask=attn_mask)
+            >>> print(output.shape)
+            [2, 4, 128]
     """

     def __init__(
@@ -545,15 +551,16 @@ class FusedFeedForward(Layer):
     Examples:
         .. code-block:: python

-            # required: gpu
-            import paddle
-            from paddle.incubate.nn import FusedFeedForward
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn import FusedFeedForward
+            >>> paddle.device.set_device('gpu')

-            fused_feedforward_layer = FusedFeedForward(8, 8)
-            x = paddle.rand((1, 8, 8))
-            out = fused_feedforward_layer(x)
-            print(out.shape)
-            # [1, 8, 8]
+            >>> fused_feedforward_layer = FusedFeedForward(8, 8)
+            >>> x = paddle.rand((1, 8, 8))
+            >>> out = fused_feedforward_layer(x)
+            >>> print(out.shape)
+            [1, 8, 8]
     """

     def __init__(
@@ -768,16 +775,19 @@ class FusedTransformerEncoderLayer(Layer):
     Examples:
         .. code-block:: python

-            # required: gpu
-            import paddle
-            from paddle.incubate.nn import FusedTransformerEncoderLayer
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn import FusedTransformerEncoderLayer
+            >>> paddle.device.set_device('gpu')

-            # encoder input: [batch_size, src_len, d_model]
-            enc_input = paddle.rand((2, 4, 128))
-            # self attention mask: [batch_size, n_head, src_len, src_len]
-            attn_mask = paddle.rand((2, 2, 4, 4))
-            encoder_layer = FusedTransformerEncoderLayer(128, 2, 512)
-            enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128]
+            >>> # encoder input: [batch_size, src_len, d_model]
+            >>> enc_input = paddle.rand((2, 4, 128))
+            >>> # self attention mask: [batch_size, n_head, src_len, src_len]
+            >>> attn_mask = paddle.rand((2, 2, 4, 4))
+            >>> encoder_layer = FusedTransformerEncoderLayer(128, 2, 512)
+            >>> enc_output = encoder_layer(enc_input, attn_mask)
+            >>> print(enc_output.shape)
+            [2, 4, 128]

     """

@@ -973,25 +983,27 @@ class FusedTransformer(Layer):

         .. code-block:: python

-            import paddle
-            from paddle.nn import Transformer
-
-            # src: [batch_size, tgt_len, d_model]
-            enc_input = paddle.rand((2, 4, 128))
-            # tgt: [batch_size, src_len, d_model]
-            dec_input = paddle.rand((2, 6, 128))
-            # src_mask: [batch_size, n_head, src_len, src_len]
-            enc_self_attn_mask = paddle.rand((2, 2, 4, 4))
-            # tgt_mask: [batch_size, n_head, tgt_len, tgt_len]
-            dec_self_attn_mask = paddle.rand((2, 2, 6, 6))
-            # memory_mask: [batch_size, n_head, tgt_len, src_len]
-            cross_attn_mask = paddle.rand((2, 2, 6, 4))
-            transformer = Transformer(128, 2, 4, 4, 512)
-            output = transformer(enc_input,
-                                 dec_input,
-                                 enc_self_attn_mask,
-                                 dec_self_attn_mask,
-                                 cross_attn_mask) # [2, 6, 128]
+            >>> import paddle
+            >>> from paddle.nn import Transformer
+
+            >>> # src: [batch_size, tgt_len, d_model]
+            >>> enc_input = paddle.rand((2, 4, 128))
+            >>> # tgt: [batch_size, src_len, d_model]
+            >>> dec_input = paddle.rand((2, 6, 128))
+            >>> # src_mask: [batch_size, n_head, src_len, src_len]
+            >>> enc_self_attn_mask = paddle.rand((2, 2, 4, 4))
+            >>> # tgt_mask: [batch_size, n_head, tgt_len, tgt_len]
+            >>> dec_self_attn_mask = paddle.rand((2, 2, 6, 6))
+            >>> # memory_mask: [batch_size, n_head, tgt_len, src_len]
+            >>> cross_attn_mask = paddle.rand((2, 2, 6, 4))
+            >>> transformer = Transformer(128, 2, 4, 4, 512)
+            >>> output = transformer(enc_input,
+            ...                      dec_input,
+            ...                      enc_self_attn_mask,
+            ...                      dec_self_attn_mask,
+            ...                      cross_attn_mask)
+            >>> print(output.shape)
+            [2, 6, 128]
     """

     def __init__(
@@ -1026,37 +1038,38 @@ class FusedMultiTransformer(Layer):

    .. code-block:: python

-        if pre_layer_norm:
-            out = layer_norm(x)
-            out = qkv_linear(out) + qkv_bias
-        else:
-            out = qkv_linear(x) + qkv_bias
-        out = transpose(out, perm=[2, 0, 3, 1, 4])
-        # extract q, k and v from out.
-        q = out[0:1, ::]
-        k = out[1:2, ::]
-        v = out[2:3, ::]
-        out = q * k^t
-        out = attn_mask + out
-        out = softmax(out)
-        out = dropout(out)
-        out = out * v
-        out = transpose(out, perm=[0, 2, 1, 3])
-        out = linear(out)
-        if pre_layer_norm:
-            out = x + dropout(out + bias)
-        else:
-            out = layer_norm(x + dropout(out + bias))
-
-        residual = out;
-        if pre_layer_norm:
-            out = ffn_layer_norm(out)
-        out = ffn1_linear(out)
-        out = dropout(activation(out + ffn1_bias))
-        out = ffn2_linear(out)
-        out = residual + dropout(out + ffn2_bias)
-        if not pre_layer_norm:
-            out = ffn_layer_norm(out)
+        >>> # doctest: +SKIP('This is not an example')
+        >>> if pre_layer_norm:
+        ...     out = layer_norm(x)
+        ...     out = qkv_linear(out) + qkv_bias
+        ... else:
+        ...     out = qkv_linear(x) + qkv_bias
+        >>> out = transpose(out, perm=[2, 0, 3, 1, 4])
+        >>> # extract q, k and v from out.
+        >>> q = out[0:1, ::]
+        >>> k = out[1:2, ::]
+        >>> v = out[2:3, ::]
+        >>> out = q * k^t
+        >>> out = attn_mask + out
+        >>> out = softmax(out)
+        >>> out = dropout(out)
+        >>> out = out * v
+        >>> out = transpose(out, perm=[0, 2, 1, 3])
+        >>> out = linear(out)
+        >>> if pre_layer_norm:
+        ...     out = x + dropout(out + bias)
+        ... else:
+        ...     out = layer_norm(x + dropout(out + bias))
+
+        >>> residual = out;
+        >>> if pre_layer_norm:
+        ...     out = ffn_layer_norm(out)
+        >>> out = ffn1_linear(out)
+        >>> out = dropout(activation(out + ffn1_bias))
+        >>> out = ffn2_linear(out)
+        >>> out = residual + dropout(out + ffn2_bias)
+        >>> if not pre_layer_norm:
+        ...     out = ffn_layer_norm(out)

    Parameters:
        embed_dim (int): The expected feature size in the input and output.
@@ -1166,16 +1179,19 @@ class FusedMultiTransformer(Layer):

         .. code-block:: python

-            # required: gpu
-            import paddle
-            from paddle.incubate.nn import FusedMultiTransformer
-
-            # encoder input: [batch_size, src_len, d_model]
-            enc_input = paddle.rand((2, 4, 128))
-            # self attention mask: [batch_size, 1, src_len, src_len]
-            attn_mask = paddle.rand((2, 1, 4, 4))
-            encoder_layers = FusedMultiTransformer(128, 2, 512, num_layers=1)
-            enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128]
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn import FusedMultiTransformer
+            >>> paddle.device.set_device('gpu')
+
+            >>> # encoder input: [batch_size, src_len, d_model]
+            >>> enc_input = paddle.rand((2, 4, 128))
+            >>> # self attention mask: [batch_size, 1, src_len, src_len]
+            >>> attn_mask = paddle.rand((2, 1, 4, 4))
+            >>> encoder_layers = FusedMultiTransformer(128, 2, 512, num_layers=1)
+            >>> enc_output = encoder_layers(enc_input, attn_mask)
+            >>> print(enc_output.shape)
+            [2, 4, 128]
     """

     def __init__(
--
GitLab