未验证 提交 101e20e9 编写于 作者: G Guo Sheng 提交者: GitHub

Update some en api docs (#20496)

* Fix api docs. test=document-fix

* Fix en docs. test=develop

* Fix the doc of dynamic_gru. test=document_fix

* Update API.spec. test=document_fix

* Fix codestyle test=develop, test=document_fix
上级 200cc5e2
......@@ -127,15 +127,15 @@ paddle.fluid.layers.center_loss (ArgSpec(args=['input', 'label', 'num_classes',
paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', 'c51fcac7a4f5786ca41f27fa60bd22c5'))
paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'd4a82e2f5feb20c4a23ced8054e047ed'))
paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b35fe3e0c2ecca15a8be658277e064ec'))
paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '83617c165827e030636c80486d5de6f3'))
paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', '33974b9bfa69f2f1eb85e6f956dff04e'))
paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', 'a3364b36fb3190b9bd75e419aa75573b'))
paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', '0b10a755b469d0b85b3a5cac38b4cf01'))
paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b28bdb43160e9667be2a3457d19d9f5b'))
paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', '708ce0348b74d3e0c7885c2c524b7fa7'))
paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', '48ec1ba2d75c4e2faf8d9a47350462ae'))
paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', 'd1985a930a59c3bd41a7c1d72594f5b9'))
paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ae57e6e5136dade436f0df1f11770afa'))
paddle.fluid.layers.square_error_cost (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', '4ed09e115b50ec7393674c4c09d223a2'))
paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types', 'seq_length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b02844e0ad4bd713c5fe6802aa13219c'))
paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types', 'seq_length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'a8aa2071cae18df1e8dde9183d64bfb1'))
paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'padding_start', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, True, None, None, None, None, None)), ('document', 'ebddcc5a1073ef065d22b4673e36b1d2'))
paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None, 'NCHW')), ('document', 'e91c63b8ac8c35982c0ac518537e44bf'))
paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None, 'NCDHW')), ('document', 'feff9c8ebb4d4d0be5345f9042f57c8e'))
......@@ -149,14 +149,14 @@ paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_t
paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '1400433bae7876d0407ae205be39b7a1'))
paddle.fluid.layers.instance_norm (ArgSpec(args=['input', 'epsilon', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None)), ('document', '23d6fba8ad8495f67a66d8878be5b0be'))
paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', '5ba4cdb4ea5c03382da545335ffc05b7'))
paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '83e08f21af41ac8bac37aeab1f86fdd0'))
paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'eafa177a7fed6178a51c1affa7f46a40'))
paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None, 'NCHW')), ('document', 'ed24c2d0f82cd9a3b40488157285a584'))
paddle.fluid.layers.conv3d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None, 'NCDHW')), ('document', 'efb1e3bc87339cb26faa2edae210e8b0'))
paddle.fluid.layers.sequence_expand (ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '10e122eb755c2bd1f78ef2332b28f1a0'))
paddle.fluid.layers.sequence_expand_as (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '858c432e7cbd8bb952cc2eb555457d50'))
paddle.fluid.layers.sequence_pad (ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'df08b9c499ab3a90f95d08ab5b6c6c62'))
paddle.fluid.layers.sequence_unpad (ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e478180d5bc010a84f35af958cafa62c'))
paddle.fluid.layers.lstm_unit (ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)), ('document', 'fe126c58e4339410e875ab1eba246d21'))
paddle.fluid.layers.lstm_unit (ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)), ('document', 'f5a878b6166f34878376a58d7e6fa95c'))
paddle.fluid.layers.reduce_sum (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'ecb55075fdf89a866bcede85e60aebad'))
paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '968c9b17affaf714e5021c3dc8d68c73'))
paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd37e3a9a05c00e032d4b7876c4f6b414'))
......@@ -181,7 +181,7 @@ paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride',
paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', '38297567127888e01542857839058d52'))
paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0)), ('document', 'd4435a63d34203339831ee6a86ef9242'))
paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', 'b83e7dfa81059b39bb137922dc914f50'))
paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', '1270395ce97a4e1b556104abbb14f096'))
paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', '2b505ddaa309fd7b9be5445e41ca76d5'))
paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'a6477957b44907787b3c74157400b80c'))
paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '2c4d1ae83da6ed35e3b36ba1b3b51d23'))
paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', '678de6d6d0c93da74189990b039daae8'))
......@@ -281,7 +281,7 @@ paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes',
paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'a0b73c21be618cec0281e7903039e5e3'))
paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '90c74742f48c70b103f1fbb9eb129066'))
paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)), ('document', 'ef1701e11d60508fe8f02dd2a8c60bdf'))
paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e399f9436fed5f7ff480d8532e42c937'))
paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bd8b28e6c1640b13a42b0524f86f7800'))
paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '6755168c4b2308e1e4f54cb56fa7dcb2'))
paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2b0e5d5c155ce24bafc38b78cd0b164'))
paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2c568321feb4d16c41a83df43f95089d'))
......@@ -922,7 +922,7 @@ paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ (ArgSpec(args=['self
paddle.fluid.nets.simple_img_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)), ('document', '5e89c978199c4ecce2b26d5fed1ec52b'))
paddle.fluid.nets.sequence_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)), ('document', 'b2d435f782ac8ea3ca480b8d24e7f5b4'))
paddle.fluid.nets.glu (ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)), ('document', '3efe197c8e3e75f84a4c464d8b74e943'))
paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)), ('document', 'b1a07a0000eb9103e3a143ca8c13de5b'))
paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)), ('document', '375898e47266633635f4c2096e1ac296'))
paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)), ('document', 'a59c581d5969266427e841abe69f694a'))
paddle.fluid.optimizer.SGDOptimizer ('paddle.fluid.optimizer.SGDOptimizer', ('document', 'fc09d6e6c1083cec2dce51f6f9f4ecaf'))
paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
......
......@@ -452,6 +452,7 @@ def center_loss(input,
centers_param = helper.create_parameter(
attr=param_attr, shape=centers_shape, dtype=dtype)
centers_param.stop_gradient = True
if isinstance(alpha, Variable):
alpha_param = alpha
else:
......@@ -1215,13 +1216,16 @@ def dynamic_gru(input,
h_0=None,
origin_mode=False):
"""
**Gated Recurrent Unit (GRU) Layer**
**Note: The input type of this must be LoDTensor. If the input type to be
processed is Tensor, use** :ref:`api_fluid_layers_StaticRNN` .
if origin_mode is False, then the equation of a gru step is from paper
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_ .
This operator is used to perform the calculations for a single layer of
Gated Recurrent Unit (GRU) on full sequences step by step. The calculations
in one time step support these two modes:
The formula is as follows:
If ``origin_mode`` is True, then the formula used is from paper
`Learning Phrase Representations using RNN Encoder Decoder for Statistical
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_ .
.. math::
......@@ -1231,12 +1235,12 @@ def dynamic_gru(input,
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
if origin_mode is True then the equation is from paper
Learning Phrase Representations using RNN Encoder-Decoder for Statistical
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
if ``origin_mode`` is False, then the formula used is from paper
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
.. math::
......@@ -1246,59 +1250,56 @@ def dynamic_gru(input,
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
The :math:`\odot` is the element-wise product of the vectors. :math:`act_g`
is the update gate and reset gate activation function and :math:`sigmoid`
is usually used for it. :math:`act_c` is the activation function for
candidate hidden state and :math:`tanh` is usually used for it.
:math:`x_t` is the input of current time step, but it is not from ``input`` .
This operator does not include the calculations :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` ,
**Note** thus a fully-connect layer whose size is 3 times of ``size`` should
be used before this operator, and the output should be used as ``input`` here.
:math:`h_{t-1}` is the hidden state from previous time step.
:math:`u_t` , :math:`r_t` , :math:`\\tilde{h_t}` and :math:`h_t` stand for
update gate, reset gate, candidate hidden and hidden output separately.
:math:`W_{uh}, b_u` , :math:`W_{rh}, b_r` and :math:`W_{ch}, b_c` stand for
the weight matrix and bias used in update gate, reset gate, candidate hidden
calculations. For implementation, the three weight matrix are merged into a
tensor shaped :math:`[D, D \\times 3]` , the three bias are concatenated as
a tensor shaped :math:`[1, D \\times 3]` , where :math:`D` stands for the
hidden size; The data layout of weight tensor is: :math:`W_{uh}` and :math:`W_{rh}`
are concatenated with shape :math:`[D, D \\times 2]` lying on the first part,
and :math:`W_{ch}` lying on the latter part with shape :math:`[D, D]` .
Note that these :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` operations on
the input :math:`x_{t}` are NOT included in this operator. Users can choose
to use fully-connect layer before GRU layer.
Args:
input(Variable): The input of dynamic_gru layer, which supports
variable-time length input sequence. The underlying tensor in this
Variable is a matrix with shape :math:`(T \\times 3D)`, where
:math:`T` is the total time steps in this mini-batch, :math:`D`
is the hidden size.
size(int): The dimension of the gru cell.
param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weight matrix. Note:
- The shape of the weight matrix is :math:`(T \\times 3D)`, where
:math:`D` is the hidden size.
- All elements in the weight matrix can be divided into two parts.
The first part are weights of the update gate and reset gate with
shape :math:`(D \\times 2D)`, and the second part are weights for
candidate hidden state with shape :math:`(D \\times D)`.
If it is set to None or one attribute of ParamAttr, dynamic_gru will
create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates
the bias in the update gate, reset gate and candidate calculations.
If it is set to False, no bias will be applied to the update gate,
reset gate and candidate calculations. If it is set to None or one
attribute of ParamAttr, dynamic_gru will create ParamAttr as
bias_attr. If the Initializer of the bias_attr is not set, the bias
is initialized zero. Default: None.
is_reverse(bool): Whether to compute reversed GRU, default
:attr:`False`.
gate_activation(str): The activation for update gate and reset gate.
Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid".
candidate_activation(str): The activation for candidate hidden state.
Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
h_0 (Variable): This is initial hidden state. If not set, default is
zero. This is a tensor with shape (N x D), where N is the number of
total time steps of input mini-batch feature and D is the hidden
size.
input(Variable): A LoDTensor whose lod level is 1, representing the input
after linear projection. Its shape should be :math:`[T, D \\times 3]` ,
where :math:`T` stands for the total sequence lengths in this mini-batch,
:math:`D` for the hidden size. The data type should be float32 or float64.
size(int): Indicate the hidden size.
param_attr(ParamAttr, optional): To specify the weight parameter property.
Default: None, which means the default weight parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
bias_attr (ParamAttr, optional): To specify the bias parameter property.
Default: None, which means the default bias parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
is_reverse(bool, optional): Whether to compute in the reversed order of
input sequences. Default False.
gate_activation(str, optional): The activation fuction corresponding to
:math:`act_g` in the formula. "sigmoid", "tanh", "relu" and "identity"
are supported. Default "sigmoid".
candidate_activation(str, optional): The activation fuction corresponding to
:math:`act_c` in the formula. "sigmoid", "tanh", "relu" and "identity"
are supported. Default "tanh".
h_0 (Variable, optional): A Tensor representing the initial hidden state.
It not provided, the default initial hidden state is 0. The shape is
:math:`[N, D]` , where :math:`N` is the number of sequences in the
mini-batch, :math:`D` for the hidden size. The data type should be
same as ``input`` . Default None.
Returns:
Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \
and sequence length is the same with the input.
Variable: A LoDTensor whose lod level is 1 and shape is :math:`[T, D]` , \
where :math:`T` stands for the total sequence lengths in this mini-batch \
:math:`D` for the hidden size. It represents GRU transformed sequence output, \
and has the same lod and data type with ``input`` .
Examples:
......@@ -1307,9 +1308,11 @@ def dynamic_gru(input,
import paddle.fluid as fluid
dict_dim, emb_dim = 128, 64
data = fluid.layers.data(name='sequence', shape=[1],
dtype='int32', lod_level=1)
emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
data = fluid.data(name='sequence',
shape=[None],
dtype='int64',
lod_level=1)
emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
hidden_dim = 512
x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim)
......@@ -1365,79 +1368,85 @@ def gru_unit(input,
gate_activation='sigmoid',
origin_mode=False):
"""
**GRU unit layer**
Gated Recurrent Unit (GRU) RNN cell. This operator performs GRU calculations for
one time step and it supports these two modes:
if origin_mode is True, then the equation of a gru step is from paper
`Learning Phrase Representations using RNN Encoder-Decoder for Statistical
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
If ``origin_mode`` is True, then the formula used is from paper
`Learning Phrase Representations using RNN Encoder Decoder for Statistical
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_ .
.. math::
u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
if origin_mode is False, then the equation of a gru step is from paper
if ``origin_mode`` is False, then the formula used is from paper
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
.. math::
u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
.. math::
r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms
of the equation above, the :math:`z_t` is split into 3 parts -
:math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to
implement a full GRU unit operator for an input, a fully
connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`.
:math:`x_t` is the input of current time step, but it is not ``input`` .
This operator does not include the calculations :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` ,
**Note** thus a fully-connect layer whose size is 3 times of GRU hidden size should
be used before this operator, and the output should be used as ``input`` here.
:math:`h_{t-1}` is the hidden state from previous time step.
:math:`u_t` , :math:`r_t` , :math:`\\tilde{h_t}` and :math:`h_t` stand for
update gate, reset gate, candidate hidden and hidden output separately.
:math:`W_{uh}, b_u` , :math:`W_{rh}, b_r` and :math:`W_{ch}, b_c` stand for
the weight matrix and bias used in update gate, reset gate, candidate hidden
calculations. For implementation, the three weight matrix are merged into a
tensor shaped :math:`[D, D \\times 3]` , the three bias are concatenated as
a tensor shaped :math:`[1, D \\times 3]` , where :math:`D` stands for the
hidden size; The data layout of weight tensor is: :math:`W_{uh}` and :math:`W_{rh}`
are concatenated with shape :math:`[D, D \\times 2]` lying on the first part,
and :math:`W_{ch}` lying on the latter part with shape :math:`[D, D]` .
The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, there is
an intermediate candidate hidden output, which is denoted by :math:`m_t`.
This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})`
and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
Args:
input (Variable): The fc transformed input value of current step.
hidden (Variable): The hidden value of gru unit from previous step.
size (integer): The input dimension value.
param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weight matrix. Note:
- The shape of the weight matrix is :math:`(T \\times 3D)`, where
:math:`D` is the hidden size.
- All elements in the weight matrix can be divided into two parts.
The first part are weights of the update gate and reset gate with
shape :math:`(D \\times 2D)`, and the second part are weights for
candidate hidden state with shape :math:`(D \\times D)`.
If it is set to None or one attribute of ParamAttr, gru_unit will
create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates
the bias in the update gate, reset gate and candidate calculations.
If it is set to False, no bias will be applied to the update gate,
reset gate and candidate calculations. If it is set to None or one
attribute of ParamAttr, gru_unit will create ParamAttr as
bias_attr. If the Initializer of the bias_attr is not set, the bias
is initialized zero. Default: None.
activation (string): The activation type for cell (actNode).
Default: 'tanh'
gate_activation (string): The activation type for gates (actGate).
Default: 'sigmoid'
input(Variable): A 2D Tensor representing the input after linear projection
after linear projection. Its shape should be :math:`[N, D \\times 3]` ,
where :math:`N` stands for batch size, :math:`D` for the hidden size.
The data type should be float32 or float64.
hidden(Variable): A 2D Tensor representing the hidden state from previous step.
Its shape should be :math:`[N, D]` , where :math:`N` stands for batch size,
:math:`D` for the hidden size. The data type should be same as ``input`` .
size(int): Indicate the hidden size.
param_attr(ParamAttr, optional): To specify the weight parameter property.
Default: None, which means the default weight parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
bias_attr (ParamAttr, optional): To specify the bias parameter property.
Default: None, which means the default bias parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
activation(str, optional): The activation fuction corresponding to
:math:`act_c` in the formula. "sigmoid", "tanh", "relu" and "identity"
are supported. Default "tanh".
gate_activation(str, optional): The activation fuction corresponding to
:math:`act_g` in the formula. "sigmoid", "tanh", "relu" and "identity"
are supported. Default "sigmoid".
Returns:
tuple: The hidden value, reset-hidden value and gate values.
tuple: The tuple contains three Tensor variables with the same data type \
as ``input`` . They represent the hidden state for next time step ( :math:`h_t` ), \
reseted previous hidden state ( :math:`r_t \odot h_{t-1}` ), and the \
concatenation of :math:`h_t, r_t, \\tilde{h_t}` . And they have shape \
:math:`[N, D]` , :math:`[N, D]` , :math:`[N, D \times 3]` separately. \
Usually only the hidden state for next time step ( :math:`h_t` ) is used \
as output and state, the other two are intermediate results of calculations.
Examples:
......@@ -1446,12 +1455,12 @@ def gru_unit(input,
import paddle.fluid as fluid
dict_dim, emb_dim = 128, 64
data = fluid.layers.data(name='step_data', shape=[1], dtype='int32')
emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
data = fluid.data(name='step_data', shape=[None], dtype='int64')
emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
hidden_dim = 512
x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
pre_hidden = fluid.layers.data(
name='pre_hidden', shape=[hidden_dim], dtype='float32')
pre_hidden = fluid.data(
name='pre_hidden', shape=[None, hidden_dim], dtype='float32')
hidden = fluid.layers.gru_unit(
input=x, hidden=pre_hidden, size=hidden_dim * 3)
......@@ -2028,17 +2037,14 @@ def chunk_eval(input,
excluded_chunk_types=None,
seq_length=None):
"""
**Chunk Evaluator**
This function computes and outputs the precision, recall and
F1-score of chunk detection.
This operator computes the precision, recall and F1-score for chunk detection.
It is often used in sequence tagging tasks, such as Named Entity Recognition(NER).
For some basics of chunking, please refer to
`Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .
ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
Here is a NER example of labeling for these tagging schemes:
This operator supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
Here is a NER example for the usage of these tagging schemes:
.. code-block:: python
......@@ -2052,11 +2058,11 @@ def chunk_eval(input,
====== ====== ====== ===== == ============ ===== ===== ===== == =========
There are three chunk types(named entity types) including PER(person), ORG(organization)
and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chunk type>.
and LOC(location), and we can see that the labels have the form `<tag type>-<chunk type>` .
Since the calculations actually use label ids rather than labels, extra attention
should be paid when mapping labels to ids to make CheckEvalOp work. The key point
is that the listed equations are satisfied by ids.
Since the implementation of this operator actually uses label ids rather than
label strings, to make it work, there should be a way to map label ids to
tag types and chunk types. This operator uses the following way to do mapping:
.. code-block:: python
......@@ -2074,8 +2080,8 @@ def chunk_eval(input,
IOE - 0 1 -
IOBES 0 1 2 3
Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG,
PER and LOC. To satisfy the above equations, the label map can be like this:
Accordingly, in the above NER example, if the tagging scheme is IOB and chunk
types are ORG, PER and LOC, then the label ids would be as follows:
.. code-block:: python
......@@ -2087,23 +2093,32 @@ def chunk_eval(input,
I-LOC 5
O 6
It's not hard to verify the equations noting that the num of chunk types
is 3 and the num of tag types in IOB scheme is 2. For example, the label
id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of
I-LOC is 2, which consistent with the results from the equations.
With which we can map each label id to the corresponding tag type and chunk
type correctly.
Args:
input (Variable): prediction output of the network.
label (Variable): label of the test data set.
chunk_scheme (str): ${chunk_scheme_comment}
num_chunk_types (int): ${num_chunk_types_comment}
excluded_chunk_types (list): ${excluded_chunk_types_comment}
seq_length(Variable): 1-D Tensor specifying sequence length when input and label are Tensor type.
input (Variable): A Tensor or LoDTensor, representing the predicted labels
from the network. When it is a Tensor, its shape would be `[N, M, 1]`,
where `N` stands for batch size, `M` for sequence length; When it is
a LoDTensor, its shape would be `[N, 1]` where `N` stands for the total
sequence lengths in this mini-batch. The data type should be int64.
label (Variable): A Tensor or LoDTensor representing the ground-truth labels.
It shoud have the same shape, lod and data type as ``input`` .
chunk_scheme (str): Indicate the tagging schemes used here. The value must
be IOB, IOE, IOBES or plain.
num_chunk_types (int): The number of chunk types.
excluded_chunk_types (list, optional): Indicate the chunk types shouldn't
be taken into account. It should be a list of chunk type ids(integer).
Default None.
seq_length(Variable, optional): A 1D Tensor containing the length of each
sequence when ``input`` and ``label`` are Tensor. It needn't be
provided if ``input`` and ``label`` are LoDTensor. Default None.
Returns:
tuple: tuple containing: precision, recall, f1_score,
num_infer_chunks, num_label_chunks,
num_correct_chunks
tuple: A tuple including precision, recall, F1-score, chunk number detected, \
chunk number in ground-truth, chunk number correctly detected. Each \
is a Tensor with shape `[1]`. The data type of precision, recall and \
F1-score all is float32, and the others' data type all is int64.
Examples:
.. code-block:: python
......@@ -2112,9 +2127,9 @@ def chunk_eval(input,
dict_size = 10000
label_dict_len = 7
sequence = fluid.layers.data(
name='id', shape=[1], lod_level=1, dtype='int64')
embedding = fluid.layers.embedding(
sequence = fluid.data(
name='id', shape=[-1, 1], lod_level=1, dtype='int64')
embedding = fluid.embedding(
input=sequence, size=[dict_size, 512])
hidden = fluid.layers.fc(input=embedding, size=512)
label = fluid.layers.data(
......@@ -5644,64 +5659,71 @@ def beam_search(pre_ids,
Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
for more details.
This layer does the search in beams for one time step. Specifically, it
selects the top-K candidate word ids of current step from :attr:`ids`
according to their :attr:`scores` for all source sentences, where K is
:attr:`beam_size` and :attr:`ids, scores` are predicted results from the
computation cell. If :attr:`ids` is not set, it will be calculated out
according to :attr:`scores`. Additionally, :attr:`pre_ids` and
:attr:`pre_scores` are the output of beam_search at previous step, they
**This operator only supports LoDTensor.** It is used after finishing
scores calculation to perform beam search for one time step. Specifically,
after ``ids`` and ``scores`` have been produced, it selects the top-K
( `k` is ``beam_size`` ) candidate word ids of current step from ``ids``
according to the correspongding ``scores``. Additionally, ``pre_id`` and
``pre_scores`` are the output of `beam_search` at previous step, they
are needed for special use to handle ended candidate translations.
Note that if :attr:`is_accumulated` is :attr:`True`, the :attr:`scores`
passed in should be accumulated scores. Else, the :attr:`scores` are
considered as the straightforward scores and will be transformed to the
log field and accumulated the :attr:`pre_scores` in this operator.
Length penalty should be done with extra operators before calculating the
accumulated scores if needed.
Note that if ``is_accumulated`` is True, the ``scores`` passed in should
be accumulated scores. Otherwise, the ``scores`` are
considered as the probabilities of single step and would be transformed to
the log field and added up with ``pre_scores`` for final scores in this
operator. Length penalty should be done with extra operators before calculating
the accumulated scores if needed.
Please see the following demo for a fully beam search usage example:
fluid/tests/book/test_machine_translation.py
Args:
pre_ids(Variable): The LodTensor variable which is the output of
beam_search at previous step. It should be a LodTensor with shape
:math:`(batch_size, 1)` and lod
:math:`[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the
first step.
pre_scores(Variable): The LodTensor variable which is the output of
beam_search at previous step.
ids(Variable): The LodTensor variable containing the candidates ids.
Its shape should be :math:`(batch_size \\times beam_size, K)`,
where :math:`K` supposed to be :attr:`beam_size`.
scores(Variable): The LodTensor variable containing the accumulated
scores corresponding to :attr:`ids` and its shape is the same as
the shape of :attr:`ids`.
pre_ids(Variable): A LodTensor variable (lod level is 2), representing
the selected ids of previous step. It is the output of beam_search
at previous step. Its shape is `[batch_size, 1]` and its lod is
`[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the
first step. The data type should be int64.
pre_scores(Variable): A LodTensor variable has the same shape and lod
with ``pre_ids`` , representing the accumulated scores corresponding
to the selected ids of previous step. It is the output of
beam_search at previous step. The data type should be float32.
ids(Variable|None): A LodTensor variable containing the candidates ids.
It has the same lod with ``pre_ids`` and its shape should be
`[batch_size * beam_size, K]`, where `K` supposed to be greater than
``beam_size`` and the first dimension size (decrease as samples reach
to the end) should be same as that of ``pre_ids`` . The data type
should be int64. It can be None, which use indice in ``scores`` as
ids.
scores(Variable): A LodTensor variable containing the accumulated
scores corresponding to ``ids`` . Both its shape and lod are same as
thoes of ``ids`` . The data type should be float32.
beam_size(int): The beam width used in beam search.
end_id(int): The id of end token.
level(int, default 0): It can be ignored and mustn't change currently.
It means the source level of lod, which is explained as following.
The lod level of :attr:`ids` should be 2. The first level is source
level which describes how many prefixes (branchs) for each source
sentece (beam), and the second level is sentence level which
describes how these candidates belong to the prefix. The paths
linking prefixes and selected candidates are organized and reserved
in lod.
is_accumulated(bool, default True): Whether the input :attr:`score` is
accumulated scores.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
return_parent_idx(bool): Whether to return an extra Tensor variable
preserving the selected_ids' parent indice in pre_ids
in output, which can be used to gather cell states at
the next time step.
level(int): **It can be ignored and mustn't change currently.**
The 2 level lod used in this operator has the following
meaning: The first level describes how many beams each sample has,
which would change to 0 when beams of the sample all end (batch reduce);
The second level describes how many times each beam is selected.
Default 0, which shouldn't be changed currently.
is_accumulated(bool): Whether the input ``score`` is accumulated scores.
Default True.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
return_parent_idx(bool, optional): Whether to return an extra Tensor variable
in output, which stores the selected ids' parent indice in
``pre_ids`` and can be used to update RNN's states by gather operator.
Default False.
Returns:
Variable: The LodTensor tuple containing the selected ids and the \
corresponding scores. If :attr:`return_parent_idx` is :attr:`True`, \
an extra Tensor variable preserving the selected_ids' parent indice \
is included.
tuple: The tuple contains two or three LodTensor variables. The two LodTensor, \
representing the selected ids and the corresponding accumulated scores of \
current step, have the same shape `[batch_size, beam_size]` and lod with 2 levels, \
and have data types int64 and float32. If ``return_parent_idx`` is True, \
an extra Tensor variable preserving the selected ids' parent indice \
is included, whose shape is `[batch_size * beam_size]` and data type \
is int64.
Examples:
.. code-block:: python
......@@ -5713,12 +5735,12 @@ def beam_search(pre_ids,
# at previous step.
beam_size = 4
end_id = 1
pre_ids = fluid.layers.data(
name='pre_id', shape=[1], lod_level=2, dtype='int64')
pre_scores = fluid.layers.data(
name='pre_scores', shape=[1], lod_level=2, dtype='float32')
probs = fluid.layers.data(
name='probs', shape=[10000], dtype='float32')
pre_ids = fluid.data(
name='pre_id', shape=[None, 1], lod_level=2, dtype='int64')
pre_scores = fluid.data(
name='pre_scores', shape=[None, 1], lod_level=2, dtype='float32')
probs = fluid.data(
name='probs', shape=[None, 10000], dtype='float32')
topk_scores, topk_indices = fluid.layers.topk(probs, k=beam_size)
accu_scores = fluid.layers.elementwise_add(
x=fluid.layers.log(x=topk_scores),
......@@ -5772,28 +5794,46 @@ def beam_search(pre_ids,
def beam_search_decode(ids, scores, beam_size, end_id, name=None):
"""
Beam Search Decode Layer. This layer constructs the full hypotheses for
each source sentence by walking back along the LoDTensorArray :attr:`ids`
whose lods can be used to restore the path in the beam search tree.
This operator is used after beam search has completed. It constructs the
full predicted sequences for each sample by walking back along the search
paths stored in lod of ``ids`` . The result sequences are stored in a
LoDTensor, which uses the following way to parse:
.. code-block:: text
If lod = [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
The first level of lod stands for: There are 2 samples each having 3
(beam width) predicted sequence.
The second level of lod stands for: The lengths of the first sample's
3 predicted sequences are 12, 12, 16; The lengths of the second sample's
3 predicted sequences are 14, 13, 15.
Please see the following demo for a fully beam search usage example:
fluid/tests/book/test_machine_translation.py
Args:
ids(Variable): The LodTensorArray variable containing the selected ids
of all steps.
scores(Variable): The LodTensorArray variable containing the selected
scores of all steps.
ids(Variable): The LoDTensorArray variable containing the selected ids
of all steps. Each LoDTensor in it has int64 data type and 2 level
lod which can be used to get the search paths.
scores(Variable): The LodTensorArray variable containing the accumulated
scores corresponding to selected ids of all steps. It has the same size
as ``ids`` . Each LoDTensor in it has the same shape and lod as the
counterpart in ``ids`` , and has a float32 data type.
beam_size(int): The beam width used in beam search.
end_id(int): The id of end token.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
Variable: The LodTensor pair containing the generated id sequences \
and the corresponding scores. The shapes and lods of the two \
LodTensor are same. The lod level is 2 and the two levels \
separately indicate how many hypotheses each source sentence has \
and how many ids each hypothesis has.
tuple: The tuple contains two LodTensor variables. The two LodTensor, \
containing the full sequences of ids and the correspongding accumulated \
scores, have the same shape flattened to 1D and have the same 2 level \
lod. The lod can be used to get how many predicted sequences each sample \
has and how many ids each predicted sequence has.
Examples:
.. code-block:: python
......@@ -5832,71 +5872,67 @@ def lstm_unit(x_t,
param_attr=None,
bias_attr=None,
name=None):
"""Lstm unit layer. The equation of a lstm step is:
.. math::
i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i)
f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + b_f)
c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t + W_{h_c}h_{t-1} + b_c)
o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + b_o)
h_t & = o_t tanh(c_t)
"""
Long-Short Term Memory (LSTM) RNN cell. This operator performs LSTM calculations for
one time step, whose implementation is based on calculations described in `RECURRENT
NEURAL NETWORK REGULARIZATION <http://arxiv.org/abs/1409.2329>`_ .
The inputs of lstm unit include :math:`x_t`, :math:`h_{t-1}` and
:math:`c_{t-1}`. The 2nd dimensions of :math:`h_{t-1}` and :math:`c_{t-1}`
should be same. The implementation separates the linear transformation and
non-linear transformation apart. Here, we take :math:`i_t` as an example.
The linear transformation is applied by calling a `fc` layer and the
equation is:
We add forget_bias to the biases of the forget gate in order to
reduce the scale of forgetting. The formula is as follows:
.. math::
.. math::
i_{t} & = \sigma(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i
f_{t} & = \sigma(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
The non-linear transformation is applied by calling `lstm_unit_op` and the
equation is:
c_{t} & = f_{t}c_{t-1} + i_{t} tanh (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
.. math::
o_{t} & = \sigma(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
i_t = \sigma(L_{i_t})
h_{t} & = o_{t} tanh (c_{t})
This layer has two outputs including :math:`h_t` and :math:`c_t`.
:math:`x_{t}` stands for ``x_t`` , corresponding to the input of current time step;
:math:`h_{t-1}` and :math:`c_{t-1}` correspond to ``hidden_t_prev`` and ``cell_t_prev`` ,
representing the output of from previous time step.
:math:`i_{t}, f_{t}, c_{t}, o_{t}, h_{t}` are input gate, forget gate, cell, output gate
and hidden calculation.
Args:
x_t (Variable): The input value of current step, a 2-D tensor with shape
M x N, M for batch size and N for input size.
hidden_t_prev (Variable): The hidden value of lstm unit, a 2-D tensor
with shape M x S, M for batch size and S for size of lstm unit.
cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor with
shape M x S, M for batch size and S for size of lstm unit.
forget_bias (float): The forget bias of lstm unit.
param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weights.
If it is set to None or one attribute of ParamAttr,
lstm_unit will create ParamAttr as param_attr.
If the Initializer of the param_attr is not set, the
parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|None): The bias attribute for the learnable bias
weights. If it is set to False, no bias will be added
to the output units. If it is set to None or one attribute of ParamAttr,
lstm_unit will create ParamAttr as bias_attr.
If the Initializer of the bias_attr is not set,
the bias is initialized zero. Default: None.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
x_t(Variable): A 2D Tensor representing the input of current time step.
Its shape should be :math:`[N, M]` , where :math:`N` stands for batch
size, :math:`M` for the feature size of input. The data type should
be float32 or float64.
hidden_t_prev(Variable): A 2D Tensor representing the hidden value from
previous step. Its shape should be :math:`[N, D]` , where :math:`N`
stands for batch size, :math:`D` for the hidden size. The data type
should be same as ``x_t`` .
cell_t_prev(Variable): A 2D Tensor representing the cell value from
previous step. It has the same shape and data type with ``hidden_t_prev`` .
forget_bias (float, optional): :math:`forget\\_bias` added to the biases
of the forget gate. Default 0.
param_attr(ParamAttr, optional): To specify the weight parameter property.
Default: None, which means the default weight parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
bias_attr (ParamAttr, optional): To specify the bias parameter property.
Default: None, which means the default bias parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
tuple: The hidden value and cell value of lstm unit.
tuple: The tuple contains two Tensor variables with the same shape and \
data type with ``hidden_t_prev`` , representing the hidden value and \
cell value which correspond to :math:`h_{t}` and :math:`c_{t}` in \
the formula.
Raises:
ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**
not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev**
and **cell_t_prev** not be the same or the 2nd dimensions of
**hidden_t_prev** and **cell_t_prev** not be the same.
ValueError: Rank of x_t must be 2.
ValueError: Rank of hidden_t_prev must be 2.
ValueError: Rank of cell_t_prev must be 2.
ValueError: The 1st dimensions of x_t, hidden_t_prev and cell_t_prev must be the same.
ValueError: The 2nd dimensions of hidden_t_prev and cell_t_prev must be the same.
Examples:
......@@ -5905,12 +5941,12 @@ def lstm_unit(x_t,
import paddle.fluid as fluid
dict_dim, emb_dim, hidden_dim = 128, 64, 512
data = fluid.layers.data(name='step_data', shape=[1], dtype='int32')
x = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
pre_hidden = fluid.layers.data(
name='pre_hidden', shape=[hidden_dim], dtype='float32')
pre_cell = fluid.layers.data(
name='pre_cell', shape=[hidden_dim], dtype='float32')
data = fluid.data(name='step_data', shape=[None], dtype='int64')
x = fluid.embedding(input=data, size=[dict_dim, emb_dim])
pre_hidden = fluid.data(
name='pre_hidden', shape=[None, hidden_dim], dtype='float32')
pre_cell = fluid.data(
name='pre_cell', shape=[None, hidden_dim], dtype='float32')
hidden = fluid.layers.lstm_unit(
x_t=x,
hidden_t_prev=pre_hidden,
......@@ -8308,7 +8344,7 @@ def one_hot(input, depth, allow_out_of_range=False):
attrs = {'depth': depth}
else:
if not isinstance(depth, Variable):
# user attribute
# user attribute
inputs = {'X': input}
attrs = {'depth': depth}
else:
......@@ -15035,12 +15071,13 @@ def teacher_student_sigmoid_loss(input,
def add_position_encoding(input, alpha, beta, name=None):
"""
**Add Position Encoding Layer**
This operator performs weighted sum of input feature at each position
(position in the sequence) and the corresponding position encoding.
This layer accepts an input 3D-Tensor of shape [N x M x P], and returns an
output Tensor of shape [N x M x P] with positional encoding value.
For more details of position encoding, please refer to `Attention Is All You
Need <http://arxiv.org/pdf/1706.03762.pdf>`_ .
Refer to `Attention Is All You Need <http://arxiv.org/pdf/1706.03762.pdf>`_ .
The formula is as follows:
.. math::
PE(pos, 2i) &= \\sin{(pos / 10000^{2i / P})} \\\\
......@@ -15048,28 +15085,36 @@ def add_position_encoding(input, alpha, beta, name=None):
Out(:, pos, i) &= \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
Where:
- :math:`PE(pos, 2i)` : the increment for the number at even position
- :math:`PE(pos, 2i + 1)` : the increment for the number at odd position
- :math:`PE(pos, 2i)` : the value at even index `2i` for encoding of position `pos`.
- :math:`PE(pos, 2i + 1)` : the value at odd index `2i+1` for encoding of position `pos`
Args:
input (Variable): 3-D input tensor with shape [N x M x P]
alpha (float): multiple of Input Tensor
beta (float): multiple of Positional Encoding Tensor
name (string): the name of position encoding layer
input(Variable): A Tensor or LoDTensor (lod level is 1). If it is a
Tensor, the shape should be `[N, M, P]`, where `N` stands for
batch size, `M` for sequence length, `P` for the size of feature
dimension. If it is a LoDTensor, the shape should be `[N, P]`,
where `N` stands for the total sequence lengths in this mini-batch,
`P` for the size of feature. The data type should be float32 or float64.
alpha(float): Indicate the weight coefficient for `input` when performing
weighted sum.
beta(float): Indicate the weight coefficient for position encoding when
performing weighted sum.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
Variable: A 3-D Tensor of shape [N x M x P] with positional encoding.
Variable: A Tensor or LoDTensor. It has the same shape, data type and lod as `input`.
Examples:
.. code-block:: python
import paddle.fluid as fluid
tensor = fluid.layers.data(
tensor = fluid.data(
name='tensor',
shape=[32, 64, 512],
dtype='float32',
append_batch_size=False)
shape=[None, 64, 512],
dtype='float32')
position_tensor = fluid.layers.add_position_encoding(
input=tensor, alpha=1.0, beta=1.0)
......
......@@ -363,67 +363,67 @@ def scaled_dot_product_attention(queries,
num_heads=1,
dropout_rate=0.):
"""
The dot-product attention.
This interface Multi-Head Attention using scaled dot product.
Attention mechanism can be seen as mapping a query and a set of key-value
pairs to an output. The output is computed as a weighted sum of the values,
where the weight assigned to each value is computed by a compatibility
function (dot-product here) of the query with the corresponding key.
pairs to an output. Multi-Head Attention performs attention using multi-head
parallel, and the inputs of attention would be transformed by linear projection.
The formula is as follows:
The dot-product attention can be implemented through (batch) matrix
multipication as follows:
.. math::
.. math::
MultiHead(Q, K, V ) & = Concat(head_1, ..., head_h)
where \ head_i & = Attention(QW_i^Q , KW_i^K , VW_i^V )
Attention(Q, K, V)= softmax(QK^\mathrm{T})V
Attention(Q, K, V) & = softmax (\\frac{QK^\mathrm{T}}{\sqrt{d_k}}) V
Refer to `Attention Is All You Need
<https://arxiv.org/pdf/1706.03762.pdf>`_.
For more details, please refer to `Attention Is All You Need
<https://arxiv.org/pdf/1706.03762.pdf>`_ .
Note that the implementation is adapted to batch, and all matrix multiplication
in :math:`Attention(Q, K, V)` is batched matrix multiplication. Refer to
:ref:`api_fluid_layers_matmul` .
Args:
queries (Variable): The input variable which should be a 3-D Tensor.
keys (Variable): The input variable which should be a 3-D Tensor.
values (Variable): The input variable which should be a 3-D Tensor.
num_heads (int): Head number to compute the scaled dot product
attention. Default: 1.
dropout_rate (float): The dropout rate to drop the attention weight.
Default: 0.0.
queries (Variable): A 3-D Tensor with shape :math:`[N, L_q, d_k \\times h]` ,
where :math:`N` stands for batch size, :math:`L_q` for the sequence length
of query, :math:`d_k \\times h` for the feature size of query, :math:`h` for
head number. The data type should be float32 or float64.
keys (Variable): A 3-D Tensor with shape :math:`[N, L_k, d_k \\times h]` ,
where :math:`N` stands for batch size, :math:`L_k` for the sequence length
of key, :math:`d_k \\times h` for the feature size of key, :math:`h` for head
number. The data type should be the same as ``queries`` .
values (Variable): A 3-D Tensor with shape :math:`[N, L_k, d_v \\times h]` ,
where :math:`N` stands for batch size, :math:`L_k` for the sequence length
of key, :math:`d_v \\times h` for the feature size of value, :math:`h` for head
number. The data type should be the same as ``queries`` .
num_heads (int, optional): Indicate the number of head. If the numher
is 1, linear projection would not be performed on inputs. Default: 1.
dropout_rate (float, optional): The rate to drop the attention weight.
Default: 0.0, which means no dropout.
Returns:
Variable: A 3-D Tensor computed by multi-head scaled dot product\
attention.
Variable: A 3-D Tensor with shape :math:`[N, L_q, d_v \\times h]` , \
where :math:`N` stands for batch size, :math:`L_q` for the sequence \
length of query, :math:`d_v \\times h` for the feature size of value. \
It has the same data type with inputs, representing the output of \
Multi-Head Attention.
Raises:
ValueError: If input queries, keys, values are not 3-D Tensors.
NOTES:
1. When num_heads > 1, three linear projections are learned respectively
to map input queries, keys and values into queries', keys' and values'.
queries', keys' and values' have the same shapes with queries, keys
and values.
2. When num_heads == 1, scaled_dot_product_attention has no learnable
parameters.
ValueError: Inputs quries, keys and values should all be 3-D tensors.
ValueError: The hidden size of queries and keys should be the same.
ValueError: The max sequence length in query batch and in key batch should be the same.
ValueError: he hidden size of keys must be divisible by the number of attention heads.
ValueError: he hidden size of values must be divisible by the number of attention heads.
Examples:
.. code-block:: python
import paddle.fluid as fluid
queries = fluid.layers.data(name="queries",
shape=[3, 5, 9],
dtype="float32",
append_batch_size=False)
queries.stop_gradient = False
keys = fluid.layers.data(name="keys",
shape=[3, 6, 9],
dtype="float32",
append_batch_size=False)
keys.stop_gradient = False
values = fluid.layers.data(name="values",
shape=[3, 6, 10],
dtype="float32",
append_batch_size=False)
values.stop_gradient = False
queries = fluid.data(name="queries", shape=[3, 5, 9], dtype="float32")
keys = fluid.data(name="keys", shape=[3, 6, 9], dtype="float32")
values = fluid.data(name="values", shape=[3, 6, 10], dtype="float32")
contexts = fluid.nets.scaled_dot_product_attention(queries, keys, values)
contexts.shape # [3, 5, 10]
"""
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册