diff --git a/hapi/text/text.py b/hapi/text/text.py index e338f0c3ca84869c83b913f88c852900c05a6657..2e9d9b3e4ec864c4c8e3d27bf36b4e12a1af4e5f 100644 --- a/hapi/text/text.py +++ b/hapi/text/text.py @@ -214,8 +214,7 @@ class BasicLSTMCell(RNNCell): h_{t} & = o_{t} act_c (c_{t}) Please refer to `An Empirical Exploration of Recurrent Network Architectures - `_ - for more details. + `_ for more details. Parameters: input_size (int): The input size in the LSTM cell. @@ -547,8 +546,7 @@ class BasicGRUCell(RNNCell): h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} Please refer to `An Empirical Exploration of Recurrent Network Architectures - `_ - for more details. + `_ for more details. Parameters: input_size (int): The input size for the first GRU cell. @@ -2719,23 +2717,26 @@ class TransformerCell(Layer): attention to mask out attention on unwanted target positions. It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, where the unwanted positions have `-INF` values and the others - have 0 values. It can be None for inference. The data type should - be float32 or float64. - trg_src_attn_bias(Variable, optional): A tensor used in decoder encoder + have 0 values. It can be None when nothing needs to + be masked out, as is usually the case for inference. The data type + should be float32 or float64. Default None + trg_src_attn_bias(Variable, optional): A tensor used in decoder-encoder cross attention to mask out unwanted attention on source (encoder output). It is a tensor with shape `[batch_size, n_head, target_length, source_length]`, where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. - static_caches(list): It stores the multi-head attention intermediate - results of encoder output. It is a list of dict where the length - of list is decoder layer number, and each dict has `static_k` and - `static_v` as keys and values are stored results. Default empty list + have 0 values. It can be None when nothing needs to + be masked out. The data type should be float32 or float64. Default None + static_caches(list): It stores the projected results of encoder output + to be used as keys and values in decoder-encoder cross attention. + It is a list of dicts where the length of the list is the decoder + layer number, and each dict has `static_k` and `static_v` as keys + whose values are the stored results. Default empty list Returns: tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` \ is a float32 or float64 3D tensor representing logits shaped \ `[batch_size, sequence_length, vocab_size]`. `new_states` has \ - the same structure and date type with `states` while the length \ + the same structure and data type as `states` while the length \ is one larger since the intermediate results of current step are \ concatenated into it.
""" @@ -2830,7 +2831,7 @@ class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): max_step_num, is_test=True) - enc_output = paddle.rand((2, 4, 64)) + enc_output = paddle.rand((2, 4, 128)) # cross attention bias: [batch_size, n_head, trg_len, src_len] trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) # inputs for beam search on Transformer @@ -3015,7 +3016,37 @@ class PrePostProcessLayer(Layer): class MultiHeadAttention(Layer): """ - Multi-Head Attention + MultiHead Attention mapps queries and a set of key-value pairs to outputs + by jointly attending to information from different representation subspaces, + as what multi-head indicates it performs multiple attention in parallel. + + Please refer to `Attention Is All You Need `_ + for more details. + + Parameters: + d_key (int): The feature size to transformer queries and keys as in + multi-head attention. Mostly it equals to `d_model // n_head`. + d_value (int): The feature size to transformer values as in multi-head + attention. Mostly it equals to `d_model // n_head`. + d_model (int): The expected feature size in the input and output. + n_head (int): The number of heads in multi-head attention(MHA). + dropout_rate (float, optional): The dropout probability used in MHA to + drop some attention target. Default 0.1 + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import MultiHeadAttention + + # encoder input: [batch_size, sequence_length, d_model] + query = paddle.rand((2, 4, 128)) + # self attention bias: [batch_size, n_head, src_len, src_len] + attn_bias = paddle.rand((2, 2, 4, 4)) + multi_head_attn = MultiHeadAttention(64, 64, 2, 128) + output = multi_head_attn(query, attn_bias=attn_bias) # [2, 4, 128] """ def __init__(self, @@ -3062,6 +3093,37 @@ class MultiHeadAttention(Layer): bias_attr=False) def _prepare_qkv(self, queries, keys, values, cache=None): + """ + Prapares linear projected queries, keys and values for usage of subsequnt + multiple attention in parallel. If `cache` is not None, using cached + results to reduce redundant calculations. + + Parameters: + queries (Variable): The queries for multi-head attention. It is a + tensor with shape `[batch_size, sequence_length, d_model]`. The + data type should be float32 or float64. + keys (Variable, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, sequence_length, d_model]`. The + data type should be float32 or float64. + values (Variable, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, sequence_length, d_model]`. + The data type should be float32 or float64. + cache(dict, optional): It is a dict with `k` and `v` as keys, and + values cache the multi-head attention intermediate results of + history decoding steps for decoder self attention; Or a dict + with `static_k` and `statkc_v` as keys, and values stores intermediate + results of encoder output for decoder-encoder cross attention. + If it is for decoder self attention, values for `k` and `v` would + be updated by new tensors concatanating raw tensors with intermediate + results of current step. It is only used for inference and should + be None for training. Default None + + Returns: + tuple: A tuple including linear projected keys and values. These two \ + tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ + and `[batch_size, n_head, sequence_length, d_value]` separately, \ + and their data types are same as inputs. 
+ """ if keys is None: # self-attention keys, values = queries, queries static_kv = False @@ -3097,7 +3159,47 @@ class MultiHeadAttention(Layer): return q, k, v - def forward(self, queries, keys, values, attn_bias, cache=None): + def forward(self, + queries, + keys=None, + values=None, + attn_bias=None, + cache=None): + """ + Applies multi-head attention to map queries and a set of key-value pairs + to outputs. + + Parameters: + queries (Variable): The queries for multi-head attention. It is a + tensor with shape `[batch_size, sequence_length, d_model]`. The + data type should be float32 or float64. + keys (Variable, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, sequence_length, d_model]`. The + data type should be float32 or float64. + values (Variable, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, sequence_length, d_model]`. + The data type should be float32 or float64. + attn_bias (Variable, optional): A tensor used in multi-head attention + to mask out attention on unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + `[batch_size, n_head, sequence_length, sequence_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be masked out. Default None + cache(dict, optional): It is a dict with `k` and `v` as keys, and + values cache the multi-head attention intermediate results of + history decoding steps for decoder self attention; Or a dict + with `static_k` and `statkc_v` as keys, and values stores intermediate + results of encoder output for decoder-encoder cross attention. + If it is for decoder self attention, values for `k` and `v` would + be updated by new tensors concatanating raw tensors with intermediate + results of current step. It is only used for inference and should + be None for training. Default None + + Returns: + Variable: The output of multi-head attention. It is a tensor \ + that has the same shape and data type as `queries`. + """ # compute q ,k ,v q, k, v = self._prepare_qkv(queries, keys, values, cache) @@ -3122,6 +3224,25 @@ class MultiHeadAttention(Layer): return out def cal_kv(self, keys, values): + """ + Applies linear projection on input keys and values, then splits heads + (reshape and transpose) to get keys and values from different representation + subspaces for usage of subsequnt multiple attention in parallel. + + Parameters: + keys (Variable, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, sequence_length, d_model]`. The + data type should be float32 or float64. + values (Variable, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, sequence_length, d_model]`. + The data type should be float32 or float64. + + Returns: + tuple: A tuple including linear projected keys and values. These two \ + tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ + and `[batch_size, n_head, sequence_length, d_value]` separately, \ + and their data types are same as inputs. + """ k = self.k_fc(keys) v = self.v_fc(values) k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) @@ -3133,7 +3254,17 @@ class MultiHeadAttention(Layer): class FFN(Layer): """ - Feed-Forward Network + A fully connected feed-forward network applied to each position separately + and identically. 
This consists of two linear transformations with an activation + and dropout in between. + + Parameters: + d_inner_hid (int): The hidden size in the feedforward network (FFN). + d_model (int): The expected feature size in the input and output. + dropout_rate (float, optional): The dropout probability used after + activation. Default 0.1 + ffn_fc1_act (str, optional): The activation function in the feedforward + network. Default relu. """ def __init__(self, @@ -3156,6 +3287,19 @@ class FFN(Layer): self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model) def forward(self, x): + """ + Applies a fully connected feed-forward network on each position of the + input sequences separately and identically. + + Parameters: + x (Variable): The input of the feed-forward network. It is a tensor + with shape `[batch_size, sequence_length, d_model]`. The data + type should be float32 or float64. + + Returns: + Variable: The output of the feed-forward network. It is a tensor that has \ + the same shape and data type as `x`. + """ hidden = self.fc1(x) if self.dropout_rate: hidden = layers.dropout( @@ -3166,7 +3310,50 @@ class TransformerEncoderLayer(Layer): """ - EncoderLayer + TransformerEncoderLayer is composed of two sub-layers, which are self (multi-head) + attention and a feed-forward network. Before and after each sub-layer, pre-processing + and post-processing are applied on the input and output. + + Parameters: + n_head (int): The number of heads in multi-head attention (MHA). + d_key (int): The feature size of the projected queries and keys in + multi-head attention. Usually it equals `d_model // n_head`. + d_value (int): The feature size of the projected values in multi-head + attention. Usually it equals `d_model // n_head`. + d_model (int): The expected feature size in the input and output. + d_inner_hid (int): The hidden layer size in the feedforward network (FFN). + prepostprocess_dropout (float, optional): The dropout probability used + in pre-processing and post-processing of the MHA and FFN sub-layers. Default 0.1 + attention_dropout (float, optional): The dropout probability used + in MHA to drop some attention targets. Default 0.1 + relu_dropout (float, optional): The dropout probability used after FFN + activation. Default 0.1 + preprocess_cmd (str, optional): The process applied before each MHA and + FFN sub-layer, and it would also be applied on the output of the last + stacked layer. It should be a string composed of `d`, `a`, `n`, + where `d` is for dropout, `a` for adding a residual connection, and `n` + for layer normalization. Default `n`. + postprocess_cmd (str, optional): The process applied after each MHA and + FFN sub-layer. Same as `preprocess_cmd`. It should be a string + composed of `d`, `a`, `n`, where `d` is for dropout, `a` for adding a + residual connection, and `n` for layer normalization. Default `da`. + ffn_fc1_act (str, optional): The activation function in the feedforward + network. Default relu. + + Examples: + + ..
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import TransformerEncoderLayer + + # encoder input: [batch_size, src_len, d_model] + enc_input = paddle.rand((2, 4, 128)) + # self attention bias: [batch_size, n_head, src_len, src_len] + attn_bias = paddle.rand((2, 2, 4, 4)) + encoder_layer = TransformerEncoderLayer(2, 64, 64, 128, 512) + enc_output = encoder_layer(enc_input, attn_bias) # [2, 4, 128] """ def __init__(self, @@ -3175,9 +3362,9 @@ class TransformerEncoderLayer(Layer): d_value, d_model, d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, preprocess_cmd="n", postprocess_cmd="da", ffn_fc1_act="relu", @@ -3226,7 +3413,25 @@ class TransformerEncoderLayer(Layer): prepostprocess_dropout, reused_post_ffn_layernorm) - def forward(self, enc_input, attn_bias): + def forward(self, enc_input, attn_bias=None): + """ + Applies a Transformer encoder layer on the input. + + Parameters: + enc_input (Variable): The input of the Transformer encoder layer. It is + a tensor with shape `[batch_size, sequence_length, d_model]`. + The data type should be float32 or float64. + attn_bias(Variable, optional): A tensor used in encoder self attention + to mask out attention on unwanted positions, usually the paddings. It + is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing needs to be masked out. Default None + + Returns: + Variable: The output of the Transformer encoder layer. It is a tensor that \ + has the same shape and data type as `enc_input`. + """ attn_output = self.self_attn( self.preprocesser1(enc_input), None, None, attn_bias) attn_output = self.postprocesser1(attn_output, enc_input) @@ -3276,11 +3481,11 @@ class TransformerEncoder(Layer): from paddle.incubate.hapi.text import TransformerEncoder # encoder input: [batch_size, src_len, d_model] - enc_input = paddle.rand((2, 4, 32)) + enc_input = paddle.rand((2, 4, 128)) # self attention bias: [batch_size, n_head, src_len, src_len] attn_bias = paddle.rand((2, 2, 4, 4)) encoder = TransformerEncoder(2, 2, 64, 64, 128, 512) - enc_output = encoder(inputs, attn_bias) # [2, 4, 32] + enc_output = encoder(enc_input, attn_bias) # [2, 4, 128] """ def __init__(self, @@ -3331,9 +3536,8 @@ class TransformerEncoder(Layer): to mask out attention on unwanted positions, usually the paddings. It is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`, where the unwanted positions have `-INF` values and the others - have 0 values. It can be None for inference. The data type should - be float32 or float64. It can be None when nothing wanted to be - masked out. Default None + have 0 values. The data type should be float32 or float64. It can + be None when nothing needs to be masked out. Default None Returns: Variable: The output of Transformer encoder. It is a tensor that has \ @@ -3348,7 +3552,58 @@ class TransformerDecoderLayer(Layer): """ - decoder + TransformerDecoderLayer is composed of three sub-layers, which are decoder + self (multi-head) attention, decoder-encoder cross attention and a feed-forward + network. Before and after each sub-layer, pre-processing and post-processing are + applied on the input and output.
+ + Parameters: + n_head (int): The number of heads in multi-head attention (MHA). + d_key (int): The feature size of the projected queries and keys in + multi-head attention. Usually it equals `d_model // n_head`. + d_value (int): The feature size of the projected values in multi-head + attention. Usually it equals `d_model // n_head`. + d_model (int): The expected feature size in the input and output. + d_inner_hid (int): The hidden layer size in the feedforward network (FFN). + prepostprocess_dropout (float, optional): The dropout probability used + in pre-processing and post-processing of the MHA and FFN sub-layers. Default 0.1 + attention_dropout (float, optional): The dropout probability used + in MHA to drop some attention targets. Default 0.1 + relu_dropout (float, optional): The dropout probability used after FFN + activation. Default 0.1 + preprocess_cmd (str, optional): The process applied before each MHA and + FFN sub-layer, and it would also be applied on the output of the last + stacked layer. It should be a string composed of `d`, `a`, `n`, + where `d` is for dropout, `a` for adding a residual connection, and `n` + for layer normalization. Default `n`. + postprocess_cmd (str, optional): The process applied after each MHA and + FFN sub-layer. Same as `preprocess_cmd`. It should be a string + composed of `d`, `a`, `n`, where `d` is for dropout, `a` for adding a + residual connection, and `n` for layer normalization. Default `da`. + ffn_fc1_act (str, optional): The activation function in the feedforward + network. Default relu. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import TransformerDecoderLayer + + # decoder input: [batch_size, trg_len, d_model] + dec_input = paddle.rand((2, 4, 128)) + # encoder output: [batch_size, src_len, d_model] + enc_output = paddle.rand((2, 6, 128)) + # self attention bias: [batch_size, n_head, trg_len, trg_len] + self_attn_bias = paddle.rand((2, 2, 4, 4)) + # cross attention bias: [batch_size, n_head, trg_len, src_len] + cross_attn_bias = paddle.rand((2, 2, 4, 6)) + decoder_layer = TransformerDecoderLayer(2, 64, 64, 128, 512) + output = decoder_layer(dec_input, + enc_output, + self_attn_bias, + cross_attn_bias) # [2, 4, 128] """ def __init__(self, @@ -3438,9 +3693,41 @@ class TransformerDecoderLayer(Layer): def forward(self, dec_input, enc_output, - self_attn_bias, - cross_attn_bias, + self_attn_bias=None, + cross_attn_bias=None, cache=None): + """ + Applies a Transformer decoder layer on the input. + + Parameters: + dec_input (Variable): The input of the Transformer decoder layer. It is a tensor + with shape `[batch_size, target_length, d_model]`. The data type + should be float32 or float64. + enc_output (Variable): The output of the Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + self_attn_bias (Variable, optional): A tensor used in decoder self attention + to mask out attention on unwanted positions, usually the subsequent positions. + It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing needs to be masked out. Default None + cross_attn_bias (Variable, optional): A tensor used in decoder-encoder cross + attention to mask out attention on unwanted positions, usually the paddings.
+ It is a tensor with shape `[batch_size, n_head, target_length, source_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing needs to be masked out. Default None + cache (dict, optional): It caches the multi-head attention intermediate + results of previous decoding steps and encoder output. It is a dict + that has `k`, `v`, `static_k`, `static_v` as keys, and the values are the + cached results. It is only used for inference and should be None for + training. Default None + + Returns: + Variable: The output of the Transformer decoder layer. It is a tensor \ + that has the same shape and data type as `dec_input`. + """ self_attn_output = self.self_attn( self.preprocesser1(dec_input), None, None, self_attn_bias, cache) self_attn_output = self.postprocesser1(self_attn_output, dec_input) @@ -3459,12 +3746,71 @@ class TransformerDecoderLayer(Layer): class TransformerDecoder(Layer): """ - decoder + TransformerDecoder is a stack of N decoder layers. + + Parameters: + n_layer (int): The number of decoder layers to be stacked. + n_head (int): The number of heads in multi-head attention (MHA). + d_key (int): The feature size of the projected queries and keys in + multi-head attention. Usually it equals `d_model // n_head`. + d_value (int): The feature size of the projected values in multi-head + attention. Usually it equals `d_model // n_head`. + d_model (int): The expected feature size in the input and output. + d_inner_hid (int): The hidden layer size in the feedforward network (FFN). + prepostprocess_dropout (float, optional): The dropout probability used + in pre-processing and post-processing of the MHA and FFN sub-layers. Default 0.1 + attention_dropout (float, optional): The dropout probability used + in MHA to drop some attention targets. Default 0.1 + relu_dropout (float, optional): The dropout probability used after FFN + activation. Default 0.1 + preprocess_cmd (str, optional): The process applied before each MHA and + FFN sub-layer, and it would also be applied on the output of the last + stacked layer. It should be a string composed of `d`, `a`, `n`, + where `d` is for dropout, `a` for adding a residual connection, and `n` + for layer normalization. Default `n`. + postprocess_cmd (str, optional): The process applied after each MHA and + FFN sub-layer. Same as `preprocess_cmd`. It should be a string + composed of `d`, `a`, `n`, where `d` is for dropout, `a` for adding a + residual connection, and `n` for layer normalization. Default `da`. + ffn_fc1_act (str, optional): The activation function in the feedforward + network. Default relu. + + Examples: + + ..
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import TransformerDecoder + + # decoder input: [batch_size, trg_len, d_model] + dec_input = paddle.rand((2, 4, 128)) + # encoder output: [batch_size, src_len, d_model] + enc_output = paddle.rand((2, 6, 128)) + # self attention bias: [batch_size, n_head, trg_len, trg_len] + self_attn_bias = paddle.rand((2, 2, 4, 4)) + # cross attention bias: [batch_size, n_head, trg_len, src_len] + cross_attn_bias = paddle.rand((2, 2, 4, 6)) + decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) + dec_output = decoder(dec_input, + enc_output, + self_attn_bias, + cross_attn_bias) # [2, 4, 128] """ - def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd): + def __init__(self, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + ffn_fc1_act="relu"): super(TransformerDecoder, self).__init__() self.n_layer = n_layer @@ -3487,9 +3833,42 @@ class TransformerDecoder(Layer): def forward(self, dec_input, enc_output, - self_attn_bias, - cross_attn_bias, + self_attn_bias=None, + cross_attn_bias=None, caches=None): + """ + Applies a stack of N Transformer decoder layers on inputs. + + Parameters: + dec_input (Variable): The input of the Transformer decoder. It is a tensor + with shape `[batch_size, target_length, d_model]`. The data type + should be float32 or float64. + enc_output (Variable): The output of the Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + self_attn_bias (Variable, optional): A tensor used in decoder self attention + to mask out attention on unwanted positions, usually the subsequent positions. + It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing needs to be masked out. Default None + cross_attn_bias (Variable, optional): A tensor used in decoder-encoder cross + attention to mask out attention on unwanted positions, usually the paddings. + It is a tensor with shape `[batch_size, n_head, target_length, source_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing needs to be masked out. Default None + caches(list, optional): It caches the multi-head attention intermediate results + of previous decoding steps and encoder output. It is a list of dicts + where the length of the list is the decoder layer number, and each dict + has `k`, `v`, `static_k`, `static_v` as keys, and the values are the cached + results. It is only used for inference and should be None for + training. Default None + + Returns: + Variable: The output of Transformer decoder. It is a tensor that has \ + the same shape and data type as `dec_input`. + """ for i, decoder_layer in enumerate(self.decoder_layers): dec_output = decoder_layer(dec_input, enc_output, self_attn_bias, cross_attn_bias, caches[i] @@ -3499,6 +3878,22 @@ class TransformerDecoder(Layer): return self.processer(dec_output) def prepare_static_cache(self, enc_output): + """ + Generates a list of dicts where the length of the list is the decoder layer number.
+ Each dict has `static_k`, `static_v` as keys, and the values are the projected + results of the encoder output to be used as keys and values in decoder-encoder + cross (multi-head) attention. Used in inference. + + Parameters: + enc_output (Variable): The output of the Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + + Returns: + list: A list of dicts. Each dict has `static_k`, `static_v` as keys, \ + and the values are the projected results of the encoder output to be used as \ + keys and values in decoder-encoder cross (multi-head) attention. + """ return [ dict( zip(("static_k", "static_v"), @@ -3507,6 +3902,26 @@ class TransformerDecoder(Layer): def prepare_incremental_cache(self, enc_output): + """ + Generates a list of dicts where the length of the list is the decoder layer number. + Each dict has `k`, `v` as keys, and the values are empty tensors with shape + `[batch_size, n_head, 0, d_key]` and `[batch_size, n_head, 0, d_value]`, + representing the decoder self (multi-head) attention intermediate results, + where 0 is the initial length, which increases as inference decoding + continues. Used in inference. + + Parameters: + enc_output (Variable): The output of the Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. Actually, it is only used to provide the + batch size for the Transformer initial states (caches), thus any tensor + with the wanted batch size can be used here. + + Returns: + list: A list of dicts. Each dict has `k`, `v` as keys, and the values are \ + empty tensors representing the intermediate results of previous decoding \ + steps in decoder self (multi-head) attention at time step 0. + """ return [{ "k": layers.fill_constant_batch_size_like( input=enc_output,
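Taken together, the encoder and decoder documented above compose as in the following minimal sketch. It assumes the constructor signatures and tensor shapes shown in the docstring examples of this patch (batch_size=2, n_head=2, d_model=128, d_inner_hid=512); any setup needed to run hapi layers imperatively is omitted, as in those examples:

.. code-block:: python

    import paddle
    import paddle.fluid as fluid
    from paddle.incubate.hapi.text import TransformerEncoder, TransformerDecoder

    # shapes mirror the docstring examples: src_len=4, trg_len=6
    enc_input = paddle.rand((2, 4, 128))         # [batch_size, src_len, d_model]
    dec_input = paddle.rand((2, 6, 128))         # [batch_size, trg_len, d_model]
    src_attn_bias = paddle.rand((2, 2, 4, 4))    # [batch_size, n_head, src_len, src_len]
    self_attn_bias = paddle.rand((2, 2, 6, 6))   # [batch_size, n_head, trg_len, trg_len]
    cross_attn_bias = paddle.rand((2, 2, 6, 4))  # [batch_size, n_head, trg_len, src_len]

    # n_layer=2, n_head=2, d_key=d_value=64, d_model=128, d_inner_hid=512
    encoder = TransformerEncoder(2, 2, 64, 64, 128, 512)
    decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)

    enc_output = encoder(enc_input, src_attn_bias)             # [2, 4, 128]
    dec_output = decoder(dec_input, enc_output,
                         self_attn_bias, cross_attn_bias)      # [2, 6, 128]

    # For inference, the per-layer caches documented above come from the
    # decoder itself: prepare_static_cache gives the projected
    # `static_k`/`static_v` entries, and prepare_incremental_cache gives the
    # initially empty `k`/`v` entries that grow step by step while decoding.
    static_caches = decoder.prepare_static_cache(enc_output)
    incremental_caches = decoder.prepare_incremental_cache(enc_output)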