From 48d8a3903318b9447d2522a99562178d78d450c4 Mon Sep 17 00:00:00 2001
From: guosheng
Date: Mon, 11 May 2020 11:42:05 +0800
Subject: [PATCH] Add api docs for TransformerEncoder.

---
 hapi/text/text.py | 110 ++++++++++++++++++++++++++++------------------
 1 file changed, 67 insertions(+), 43 deletions(-)

diff --git a/hapi/text/text.py b/hapi/text/text.py
index b5a849d..d320b32 100644
--- a/hapi/text/text.py
+++ b/hapi/text/text.py
@@ -2367,10 +2367,21 @@ class TransformerCell(Layer):
 
             import paddle
             import paddle.fluid as fluid
+            from paddle.fluid.dygraph import Embedding
             from paddle.incubate.hapi.text import TransformerCell
             from paddle.incubate.hapi.text import TransformerBeamSearchDecoder
 
-            embedder = Embedding(size=[1000, 128])
+            class Embedder(fluid.dygraph.Layer):
+                def __init__(self):
+                    super(Embedder, self).__init__()
+                    self.word_embedder = Embedding(size=[1000, 128])
+                    self.pos_embedder = Embedding(size=[500, 128])
+
+                def forward(self, inputs):
+                    word, position = inputs
+                    return self.word_embedder(word) + self.pos_embedder(position)
+
+            embedder = Embedder()
             output_layer = Linear(128, 1000)
             decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)
             transformer_cell = TransformerCell(decoder, embedder, output_layer)
@@ -2392,7 +2403,7 @@ class TransformerCell(Layer):
             enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
                 enc_output, beam_size=4)
             trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
-                trg_src_attn_bias, self.beam_size)
+                trg_src_attn_bias, beam_size=4)
             static_caches = decoder.prepare_static_cache(enc_output)
             outputs = dynamic_decoder(
                 inits=caches,
@@ -2517,10 +2528,21 @@ class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
 
             import paddle
             import paddle.fluid as fluid
+            from paddle.fluid.dygraph import Embedding
             from paddle.incubate.hapi.text import TransformerCell
             from paddle.incubate.hapi.text import TransformerBeamSearchDecoder
 
-            embedder = Embedding(size=[1000, 128])
+            class Embedder(fluid.dygraph.Layer):
+                def __init__(self):
+                    super(Embedder, self).__init__()
+                    self.word_embedder = Embedding(size=[1000, 128])
+                    self.pos_embedder = Embedding(size=[500, 128])
+
+                def forward(self, inputs):
+                    word, position = inputs
+                    return self.word_embedder(word) + self.pos_embedder(position)
+
+            embedder = Embedder()
             output_layer = Linear(128, 1000)
             decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)
             transformer_cell = TransformerCell(decoder, embedder, output_layer)
@@ -2542,7 +2564,7 @@ class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
             enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
                 enc_output, beam_size=4)
             trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
-                trg_src_attn_bias, self.beam_size)
+                trg_src_attn_bias, beam_size=4)
             static_caches = decoder.prepare_static_cache(enc_output)
             outputs = dynamic_decoder(
                 inits=caches,
@@ -2944,53 +2966,33 @@ class TransformerEncoder(Layer):
     """
     TransformerEncoder is a stack of N encoder layers.
 
-    Applies a stacked multi-layer gated recurrent unit (GRU) RNN to an input
-    sequence.
-
     Parameters:
         n_layer (int): The number of encoder layers to be stacked.
-        n_head (int): The number of heads in the multi-head attention(MHA).
-        d_key (int): The number of heads in the multi-head attention. Mostly .
-        d_value (int): The number of heads in the multiheadattention.
+        n_head (int): The number of heads in multi-head attention (MHA).
+        d_key (int): The feature size to transform queries and keys as in
+            multi-head attention. Usually it equals `d_model // n_head`.
+        d_value (int): The feature size to transform values as in multi-head
+            attention. Usually it equals `d_model // n_head`.
         d_model (int): The expected feature size in the input and output.
         d_inner_hid (int): The hidden layer size in the feedforward network(FFN).
         prepostprocess_dropout (float, optional): The dropout probability used
             in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1
         attention_dropout (float, optional): The dropout probability used
             in MHA to drop some attention target. Default 0.1
-        relu_dropout (float, optional): The dropout probability used in FFN
-            in MHA to drop some attention target. Default 0.1
+        relu_dropout (float, optional): The dropout probability used after FFN
+            activation. Default 0.1
         preprocess_cmd (str, optional): The process applied before each MHA and
-            FFN sub-layer, and it also would be applied. It should be a string
-            that includes `d`, `a`, `n` as , where `d` for dropout, `a` for add
-            residual connection, `n` for layer normalization.
-            network. Default `n`.
+            FFN sub-layer, and it is also applied to the output of the last
+            stacked layer. It should be a string composed of `d`, `a`, `n`,
+            where `d` means dropout, `a` means adding a residual connection,
+            and `n` means layer normalization. Default `n`.
+        postprocess_cmd (str, optional): The process applied after each MHA and
+            FFN sub-layer. It takes the same form as `preprocess_cmd`: a string
+            composed of `d`, `a`, `n`, where `d` means dropout, `a` means adding
+            a residual connection, and `n` means layer normalization. Default `da`.
         ffn_fc1_act (str, optional): The activation function in the feedforward
             network. Default relu.
-        dropout(float|list|tuple, optional): The dropout probability after each
-            GRU. It also can be a list or tuple, including dropout probabilities
-            for the corresponding GRU. Default 0.0
-        is_reverse (bool, optional): Indicate whether to calculate in the reverse
-            order of input sequences. Default: `False`.
-        time_major (bool, optional): Indicate the data layout of Tensor included
-            in `input` and `output` tensors. If `False`, the data layout would
-            be batch major with shape `[batch_size, sequence_length, ...]`. If
-            `True`, the data layout would be time major with shape
-            `[sequence_length, batch_size, ...]`. Default: `False`.
-        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
-            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
-            a list or tuple, it's length must equal to `num_layers`. Otherwise,
-            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
-            Default None.
-        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
-            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
-            a list or tuple, it's length must equal to `num_layers`. Otherwise,
-            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
-            Default None.
-        dtype(string, optional): The data type used in this cell. It can be
-            float32 or float64. Default float32.
-
 
     Examples:
 
         .. code-block:: python
@@ -2999,9 +3001,12 @@ class TransformerEncoder(Layer):
             import paddle
             import paddle.fluid as fluid
             from paddle.incubate.hapi.text import TransformerEncoder
-            inputs = paddle.rand((2, 4, 32))
-            gru = TransformerEncoder(n_layers=2, input_size=32, hidden_size=64,)
-            outputs, _ = gru(inputs) # [2, 4, 32]
+            # encoder input: [batch_size, src_len, d_model]
+            enc_input = paddle.rand((2, 4, 128))
+            # self-attention bias: [batch_size, n_head, src_len, src_len]
+            attn_bias = paddle.rand((2, 2, 4, 4))
+            encoder = TransformerEncoder(2, 2, 64, 64, 128, 512)
+            enc_output = encoder(enc_input, attn_bias) # [2, 4, 128]
     """
 
     def __init__(self,
@@ -3040,7 +3045,26 @@ class TransformerEncoder(Layer):
         self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
                                              prepostprocess_dropout)
 
-    def forward(self, enc_input, attn_bias):
+    def forward(self, enc_input, attn_bias=None):
+        """
+        Applies a stack of N Transformer encoder layers on input sequences.
+
+        Parameters:
+            enc_input (Variable): The input of Transformer encoder. It is a tensor
+                with shape `[batch_size, sequence_length, d_model]`. The data
+                type should be float32 or float64.
+            attn_bias (Variable, optional): A tensor used in encoder self-attention
+                to mask out attention on unwanted positions, usually the paddings. It
+                is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It
+                can be None when nothing is to be masked out, which is common
+                at inference time. Default None
+
+        Returns:
+            Variable: The output of Transformer encoder. It is a tensor that has \
+                the same shape and data type as `enc_input`.
+        """
         for encoder_layer in self.encoder_layers:
             enc_output = encoder_layer(enc_input, attn_bias)
             enc_input = enc_output
--
GitLab
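
As a quick sanity check on the `forward` signature documented above, the sketch below runs the docstring example end to end under dygraph. It is not part of the patch; it assumes the `paddle.incubate.hapi.text` module layout targeted by this change and Paddle 1.8-era dygraph APIs (`fluid.dygraph.guard`, `to_variable`).

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid
    from paddle.incubate.hapi.text import TransformerEncoder

    # Shapes follow the docstring example: batch_size=2, n_head=2,
    # src_len=4, d_model=128.
    with fluid.dygraph.guard():
        encoder = TransformerEncoder(2, 2, 64, 64, 128, 512)
        enc_input = fluid.dygraph.to_variable(
            np.random.rand(2, 4, 128).astype("float32"))
        # A bias of 0 keeps a position attendable; paddings would get -INF
        # (a large negative value). Here nothing is masked out.
        attn_bias = fluid.dygraph.to_variable(
            np.zeros((2, 2, 4, 4), dtype="float32"))
        enc_output = encoder(enc_input, attn_bias)
        print(enc_output.shape)  # [2, 4, 128]

The printed shape matches `enc_input`, as stated in the Returns section of the new docstring.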