diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index dce4f95eea6e2cb4206fd902be2828607dbc3260..1913b1b34afc6ae41d44c8b27f367e6c1b503d82 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -127,15 +127,15 @@ paddle.fluid.layers.center_loss (ArgSpec(args=['input', 'label', 'num_classes', paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', 'c51fcac7a4f5786ca41f27fa60bd22c5')) paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'd4a82e2f5feb20c4a23ced8054e047ed')) paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b35fe3e0c2ecca15a8be658277e064ec')) -paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '83617c165827e030636c80486d5de6f3')) -paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', 
'33974b9bfa69f2f1eb85e6f956dff04e')) +paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', 'a3364b36fb3190b9bd75e419aa75573b')) +paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', '0b10a755b469d0b85b3a5cac38b4cf01')) paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b28bdb43160e9667be2a3457d19d9f5b')) paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', '708ce0348b74d3e0c7885c2c524b7fa7')) paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', '48ec1ba2d75c4e2faf8d9a47350462ae')) paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', 'd1985a930a59c3bd41a7c1d72594f5b9')) paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ae57e6e5136dade436f0df1f11770afa')) paddle.fluid.layers.square_error_cost (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', '4ed09e115b50ec7393674c4c09d223a2')) -paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types', 'seq_length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b02844e0ad4bd713c5fe6802aa13219c')) +paddle.fluid.layers.chunk_eval 
(ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types', 'seq_length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'a8aa2071cae18df1e8dde9183d64bfb1')) paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'padding_start', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, True, None, None, None, None, None)), ('document', 'ebddcc5a1073ef065d22b4673e36b1d2')) paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None, 'NCHW')), ('document', 'e91c63b8ac8c35982c0ac518537e44bf')) paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None, 'NCDHW')), ('document', 'feff9c8ebb4d4d0be5345f9042f57c8e')) @@ -149,14 +149,14 @@ paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_t paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '1400433bae7876d0407ae205be39b7a1')) paddle.fluid.layers.instance_norm (ArgSpec(args=['input', 'epsilon', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None)), ('document', '23d6fba8ad8495f67a66d8878be5b0be')) 
paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', '5ba4cdb4ea5c03382da545335ffc05b7')) -paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '83e08f21af41ac8bac37aeab1f86fdd0')) +paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'eafa177a7fed6178a51c1affa7f46a40')) paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None, 'NCHW')), ('document', 'ed24c2d0f82cd9a3b40488157285a584')) paddle.fluid.layers.conv3d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name', 'data_format'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None, 'NCDHW')), ('document', 'efb1e3bc87339cb26faa2edae210e8b0')) paddle.fluid.layers.sequence_expand (ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '10e122eb755c2bd1f78ef2332b28f1a0')) paddle.fluid.layers.sequence_expand_as (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '858c432e7cbd8bb952cc2eb555457d50')) paddle.fluid.layers.sequence_pad (ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, 
None)), ('document', 'df08b9c499ab3a90f95d08ab5b6c6c62')) paddle.fluid.layers.sequence_unpad (ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e478180d5bc010a84f35af958cafa62c')) -paddle.fluid.layers.lstm_unit (ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)), ('document', 'fe126c58e4339410e875ab1eba246d21')) +paddle.fluid.layers.lstm_unit (ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)), ('document', 'f5a878b6166f34878376a58d7e6fa95c')) paddle.fluid.layers.reduce_sum (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'ecb55075fdf89a866bcede85e60aebad')) paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '968c9b17affaf714e5021c3dc8d68c73')) paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd37e3a9a05c00e032d4b7876c4f6b414')) @@ -181,7 +181,7 @@ paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', '38297567127888e01542857839058d52')) paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, 
keywords=None, defaults=(1, True, False, None, None, 0)), ('document', 'd4435a63d34203339831ee6a86ef9242')) paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', 'b83e7dfa81059b39bb137922dc914f50')) -paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', '1270395ce97a4e1b556104abbb14f096')) +paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', '2b505ddaa309fd7b9be5445e41ca76d5')) paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'a6477957b44907787b3c74157400b80c')) paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '2c4d1ae83da6ed35e3b36ba1b3b51d23')) paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', '678de6d6d0c93da74189990b039daae8')) @@ -281,7 +281,7 @@ paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'a0b73c21be618cec0281e7903039e5e3')) paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, 
keywords=None, defaults=(None,)), ('document', '90c74742f48c70b103f1fbb9eb129066')) paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)), ('document', 'ef1701e11d60508fe8f02dd2a8c60bdf')) -paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e399f9436fed5f7ff480d8532e42c937')) +paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bd8b28e6c1640b13a42b0524f86f7800')) paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '6755168c4b2308e1e4f54cb56fa7dcb2')) paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2b0e5d5c155ce24bafc38b78cd0b164')) paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2c568321feb4d16c41a83df43f95089d')) @@ -922,7 +922,7 @@ paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ (ArgSpec(args=['self paddle.fluid.nets.simple_img_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)), ('document', '5e89c978199c4ecce2b26d5fed1ec52b')) paddle.fluid.nets.sequence_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)), ('document', 
'b2d435f782ac8ea3ca480b8d24e7f5b4')) paddle.fluid.nets.glu (ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)), ('document', '3efe197c8e3e75f84a4c464d8b74e943')) -paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)), ('document', 'b1a07a0000eb9103e3a143ca8c13de5b')) +paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)), ('document', '375898e47266633635f4c2096e1ac296')) paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)), ('document', 'a59c581d5969266427e841abe69f694a')) paddle.fluid.optimizer.SGDOptimizer ('paddle.fluid.optimizer.SGDOptimizer', ('document', 'fc09d6e6c1083cec2dce51f6f9f4ecaf')) paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 5357d0b3dac60f06171c08035a4f10af7fe84739..37a5c6640ac7ceed4b08deac7ddb80e6d186bcc9 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -452,6 +452,7 @@ def center_loss(input, centers_param = helper.create_parameter( attr=param_attr, shape=centers_shape, dtype=dtype) centers_param.stop_gradient = True + if isinstance(alpha, Variable): alpha_param = alpha else: @@ -1215,13 +1216,16 @@ def dynamic_gru(input, h_0=None, origin_mode=False): """ - **Gated Recurrent Unit (GRU) Layer** + **Note: The input type of this must be 
LoDTensor. If the input type to be + processed is Tensor, use** :ref:`api_fluid_layers_StaticRNN` . - if origin_mode is False, then the equation of a gru step is from paper - `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence - Modeling `_ . + This operator is used to perform the calculations for a single layer of + Gated Recurrent Unit (GRU) on full sequences step by step. The calculations + in one time step support these two modes: - The formula is as follows: + If ``origin_mode`` is True, then the formula used is from paper + `Learning Phrase Representations using RNN Encoder Decoder for Statistical + Machine Translation `_ . .. math:: @@ -1231,12 +1235,12 @@ def dynamic_gru(input, \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) - h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t} + h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} - if origin_mode is True then the equation is from paper - Learning Phrase Representations using RNN Encoder-Decoder for Statistical - Machine Translation `_ + if ``origin_mode`` is False, then the formula used is from paper + `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence + Modeling `_ .. math:: @@ -1246,59 +1250,56 @@ def dynamic_gru(input, \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) - h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t} - The :math:`\odot` is the element-wise product of the vectors. :math:`act_g` - is the update gate and reset gate activation function and :math:`sigmoid` - is usually used for it. :math:`act_c` is the activation function for - candidate hidden state and :math:`tanh` is usually used for it. + :math:`x_t` is the input of current time step, but it is not from ``input`` . 
+ This operator does not include the calculations :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` , + **Note:** thus a fully-connected layer whose size is 3 times ``size`` should + be used before this operator, and the output should be used as ``input`` here. + :math:`h_{t-1}` is the hidden state from previous time step. + :math:`u_t` , :math:`r_t` , :math:`\\tilde{h_t}` and :math:`h_t` stand for + update gate, reset gate, candidate hidden and hidden output respectively. + :math:`W_{uh}, b_u` , :math:`W_{rh}, b_r` and :math:`W_{ch}, b_c` stand for + the weight matrices and biases used in update gate, reset gate, candidate hidden + calculations. For implementation, the three weight matrices are merged into a + tensor shaped :math:`[D, D \\times 3]` , the three biases are concatenated as + a tensor shaped :math:`[1, D \\times 3]` , where :math:`D` stands for the + hidden size. The data layout of the weight tensor is: :math:`W_{uh}` and :math:`W_{rh}` + are concatenated with shape :math:`[D, D \\times 2]` lying on the first part, + and :math:`W_{ch}` lying on the latter part with shape :math:`[D, D]` . - Note that these :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` operations on - the input :math:`x_{t}` are NOT included in this operator. Users can choose - to use fully-connect layer before GRU layer. Args: - input(Variable): The input of dynamic_gru layer, which supports - variable-time length input sequence. The underlying tensor in this - Variable is a matrix with shape :math:`(T \\times 3D)`, where - :math:`T` is the total time steps in this mini-batch, :math:`D` - is the hidden size. - size(int): The dimension of the gru cell. - param_attr(ParamAttr|None): The parameter attribute for the learnable - hidden-hidden weight matrix. Note: - - - The shape of the weight matrix is :math:`(T \\times 3D)`, where - :math:`D` is the hidden size. - - All elements in the weight matrix can be divided into two parts.
- The first part are weights of the update gate and reset gate with - shape :math:`(D \\times 2D)`, and the second part are weights for - candidate hidden state with shape :math:`(D \\times D)`. - - If it is set to None or one attribute of ParamAttr, dynamic_gru will - create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|bool|None): The parameter attribute for the bias - of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates - the bias in the update gate, reset gate and candidate calculations. - If it is set to False, no bias will be applied to the update gate, - reset gate and candidate calculations. If it is set to None or one - attribute of ParamAttr, dynamic_gru will create ParamAttr as - bias_attr. If the Initializer of the bias_attr is not set, the bias - is initialized zero. Default: None. - is_reverse(bool): Whether to compute reversed GRU, default - :attr:`False`. - gate_activation(str): The activation for update gate and reset gate. - Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid". - candidate_activation(str): The activation for candidate hidden state. - Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh". - h_0 (Variable): This is initial hidden state. If not set, default is - zero. This is a tensor with shape (N x D), where N is the number of - total time steps of input mini-batch feature and D is the hidden - size. + input(Variable): A LoDTensor whose lod level is 1, representing the input + after linear projection. Its shape should be :math:`[T, D \\times 3]` , + where :math:`T` stands for the total sequence lengths in this mini-batch, + :math:`D` for the hidden size. The data type should be float32 or float64. + size(int): Indicate the hidden size. + param_attr(ParamAttr, optional): To specify the weight parameter property. 
+ Default: None, which means the default weight parameter property is used. + See usage for details in :ref:`api_fluid_ParamAttr` . + bias_attr (ParamAttr, optional): To specify the bias parameter property. + Default: None, which means the default bias parameter property is used. + See usage for details in :ref:`api_fluid_ParamAttr` . + is_reverse(bool, optional): Whether to compute in the reversed order of + input sequences. Default False. + gate_activation(str, optional): The activation function corresponding to + :math:`act_g` in the formula. "sigmoid", "tanh", "relu" and "identity" + are supported. Default "sigmoid". + candidate_activation(str, optional): The activation function corresponding to + :math:`act_c` in the formula. "sigmoid", "tanh", "relu" and "identity" + are supported. Default "tanh". + h_0 (Variable, optional): A Tensor representing the initial hidden state. + If not provided, the default initial hidden state is 0. The shape is + :math:`[N, D]` , where :math:`N` is the number of sequences in the + mini-batch, :math:`D` for the hidden size. The data type should be + same as ``input`` . Default None. Returns: - Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \ - and sequence length is the same with the input. + Variable: A LoDTensor whose lod level is 1 and shape is :math:`[T, D]` , \ + where :math:`T` stands for the total sequence lengths in this mini-batch, \ + :math:`D` for the hidden size. It represents GRU transformed sequence output, \ + and has the same lod and data type as ``input`` .
Examples: @@ -1307,9 +1308,11 @@ def dynamic_gru(input, import paddle.fluid as fluid dict_dim, emb_dim = 128, 64 - data = fluid.layers.data(name='sequence', shape=[1], - dtype='int32', lod_level=1) - emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) + data = fluid.data(name='sequence', + shape=[None], + dtype='int64', + lod_level=1) + emb = fluid.embedding(input=data, size=[dict_dim, emb_dim]) hidden_dim = 512 x = fluid.layers.fc(input=emb, size=hidden_dim * 3) hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim) @@ -1365,79 +1368,85 @@ def gru_unit(input, gate_activation='sigmoid', origin_mode=False): """ - **GRU unit layer** + Gated Recurrent Unit (GRU) RNN cell. This operator performs GRU calculations for + one time step and it supports these two modes: - if origin_mode is True, then the equation of a gru step is from paper - `Learning Phrase Representations using RNN Encoder-Decoder for Statistical - Machine Translation `_ + If ``origin_mode`` is True, then the formula used is from paper + `Learning Phrase Representations using RNN Encoder Decoder for Statistical + Machine Translation `_ . - .. math:: - u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u) + .. math:: + + u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) + + r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) - r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) + \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) - m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) + h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} - h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t) - if origin_mode is False, then the equation of a gru step is from paper + if ``origin_mode`` is False, then the formula used is from paper `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence - Modeling `_ + Modeling `_ - .. math:: - u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u) + .. 
math:: - r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) + u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) - m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) + r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) - h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t) + \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) + h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t} - The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms - of the equation above, the :math:`z_t` is split into 3 parts - - :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to - implement a full GRU unit operator for an input, a fully - connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`. + :math:`x_t` is the input of current time step, but it is not ``input`` . + This operator does not include the calculations :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` , + **Note:** thus a fully-connected layer whose size is 3 times the GRU hidden size should + be used before this operator, and the output should be used as ``input`` here. + :math:`h_{t-1}` is the hidden state from previous time step. + :math:`u_t` , :math:`r_t` , :math:`\\tilde{h_t}` and :math:`h_t` stand for + update gate, reset gate, candidate hidden and hidden output respectively. + :math:`W_{uh}, b_u` , :math:`W_{rh}, b_r` and :math:`W_{ch}, b_c` stand for + the weight matrices and biases used in update gate, reset gate, candidate hidden + calculations. For implementation, the three weight matrices are merged into a + tensor shaped :math:`[D, D \\times 3]` , the three biases are concatenated as + a tensor shaped :math:`[1, D \\times 3]` , where :math:`D` stands for the + hidden size. The data layout of the weight tensor is: :math:`W_{uh}` and :math:`W_{rh}` + are concatenated with shape :math:`[D, D \\times 2]` lying on the first part, + and :math:`W_{ch}` lying on the latter part with shape :math:`[D, D]` .
- The terms :math:`u_t` and :math:`r_t` represent the update and reset gates - of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, there is - an intermediate candidate hidden output, which is denoted by :math:`m_t`. - This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})` - and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`. Args: - input (Variable): The fc transformed input value of current step. - hidden (Variable): The hidden value of gru unit from previous step. - size (integer): The input dimension value. - param_attr(ParamAttr|None): The parameter attribute for the learnable - hidden-hidden weight matrix. Note: - - - The shape of the weight matrix is :math:`(T \\times 3D)`, where - :math:`D` is the hidden size. - - All elements in the weight matrix can be divided into two parts. - The first part are weights of the update gate and reset gate with - shape :math:`(D \\times 2D)`, and the second part are weights for - candidate hidden state with shape :math:`(D \\times D)`. - - If it is set to None or one attribute of ParamAttr, gru_unit will - create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|bool|None): The parameter attribute for the bias - of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates - the bias in the update gate, reset gate and candidate calculations. - If it is set to False, no bias will be applied to the update gate, - reset gate and candidate calculations. If it is set to None or one - attribute of ParamAttr, gru_unit will create ParamAttr as - bias_attr. If the Initializer of the bias_attr is not set, the bias - is initialized zero. Default: None. - activation (string): The activation type for cell (actNode). - Default: 'tanh' - gate_activation (string): The activation type for gates (actGate). 
- Default: 'sigmoid' + input(Variable): A 2D Tensor representing the input + after linear projection. Its shape should be :math:`[N, D \\times 3]` , + where :math:`N` stands for batch size, :math:`D` for the hidden size. + The data type should be float32 or float64. + hidden(Variable): A 2D Tensor representing the hidden state from previous step. + Its shape should be :math:`[N, D]` , where :math:`N` stands for batch size, + :math:`D` for the hidden size. The data type should be same as ``input`` . + size(int): Indicate the hidden size. + param_attr(ParamAttr, optional): To specify the weight parameter property. + Default: None, which means the default weight parameter property is used. + See usage for details in :ref:`api_fluid_ParamAttr` . + bias_attr (ParamAttr, optional): To specify the bias parameter property. + Default: None, which means the default bias parameter property is used. + See usage for details in :ref:`api_fluid_ParamAttr` . + activation(str, optional): The activation function corresponding to + :math:`act_c` in the formula. "sigmoid", "tanh", "relu" and "identity" + are supported. Default "tanh". + gate_activation(str, optional): The activation function corresponding to + :math:`act_g` in the formula. "sigmoid", "tanh", "relu" and "identity" + are supported. Default "sigmoid". Returns: - tuple: The hidden value, reset-hidden value and gate values. + tuple: The tuple contains three Tensor variables with the same data type \ + as ``input`` . They represent the hidden state for next time step ( :math:`h_t` ), \ + reset previous hidden state ( :math:`r_t \odot h_{t-1}` ), and the \ + concatenation of :math:`u_t, r_t, \\tilde{h_t}` . And they have shape \ + :math:`[N, D]` , :math:`[N, D]` , :math:`[N, D \\times 3]` respectively. \ + Usually only the hidden state for next time step ( :math:`h_t` ) is used \ + as output and state, the other two are intermediate results of calculations.
Examples: @@ -1446,12 +1455,12 @@ def gru_unit(input, import paddle.fluid as fluid dict_dim, emb_dim = 128, 64 - data = fluid.layers.data(name='step_data', shape=[1], dtype='int32') - emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) + data = fluid.data(name='step_data', shape=[None], dtype='int64') + emb = fluid.embedding(input=data, size=[dict_dim, emb_dim]) hidden_dim = 512 x = fluid.layers.fc(input=emb, size=hidden_dim * 3) - pre_hidden = fluid.layers.data( - name='pre_hidden', shape=[hidden_dim], dtype='float32') + pre_hidden = fluid.data( + name='pre_hidden', shape=[None, hidden_dim], dtype='float32') hidden = fluid.layers.gru_unit( input=x, hidden=pre_hidden, size=hidden_dim * 3) @@ -2028,17 +2037,14 @@ def chunk_eval(input, excluded_chunk_types=None, seq_length=None): """ - **Chunk Evaluator** - - This function computes and outputs the precision, recall and - F1-score of chunk detection. + This operator computes the precision, recall and F1-score for chunk detection. + It is often used in sequence tagging tasks, such as Named Entity Recognition(NER). For some basics of chunking, please refer to `Chunking with Support Vector Machines `_ . - ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, - and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. - Here is a NER example of labeling for these tagging schemes: + This operator supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. + Here is a NER example for the usage of these tagging schemes: .. code-block:: python @@ -2052,11 +2058,11 @@ def chunk_eval(input, ====== ====== ====== ===== == ============ ===== ===== ===== == ========= There are three chunk types(named entity types) including PER(person), ORG(organization) - and LOC(LOCATION), and we can see that the labels have the form -. + and LOC(location), and we can see that the labels have the form `-` . 
- Since the calculations actually use label ids rather than labels, extra attention - should be paid when mapping labels to ids to make CheckEvalOp work. The key point - is that the listed equations are satisfied by ids. + Since the implementation of this operator actually uses label ids rather than + label strings, to make it work, there should be a way to map label ids to + tag types and chunk types. This operator uses the following way to do mapping: .. code-block:: python @@ -2074,8 +2080,8 @@ def chunk_eval(input, IOE - 0 1 - IOBES 0 1 2 3 - Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG, - PER and LOC. To satisfy the above equations, the label map can be like this: + Accordingly, in the above NER example, if the tagging scheme is IOB and chunk + types are ORG, PER and LOC, then the label ids would be as follows: .. code-block:: python @@ -2087,23 +2093,32 @@ def chunk_eval(input, I-LOC 5 O 6 - It's not hard to verify the equations noting that the num of chunk types - is 3 and the num of tag types in IOB scheme is 2. For example, the label - id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of - I-LOC is 2, which consistent with the results from the equations. + With which we can map each label id to the corresponding tag type and chunk + type correctly. Args: - input (Variable): prediction output of the network. - label (Variable): label of the test data set. - chunk_scheme (str): ${chunk_scheme_comment} - num_chunk_types (int): ${num_chunk_types_comment} - excluded_chunk_types (list): ${excluded_chunk_types_comment} - seq_length(Variable): 1-D Tensor specifying sequence length when input and label are Tensor type. + input (Variable): A Tensor or LoDTensor, representing the predicted labels + from the network. 
When it is a Tensor, its shape would be `[N, M, 1]`, + where `N` stands for batch size, `M` for sequence length; When it is + a LoDTensor, its shape would be `[N, 1]` where `N` stands for the total + sequence lengths in this mini-batch. The data type should be int64. + label (Variable): A Tensor or LoDTensor representing the ground-truth labels. + It should have the same shape, lod and data type as ``input`` . + chunk_scheme (str): Indicate the tagging schemes used here. The value must + be IOB, IOE, IOBES or plain. + num_chunk_types (int): The number of chunk types. + excluded_chunk_types (list, optional): Indicate the chunk types that shouldn't + be taken into account. It should be a list of chunk type ids(integer). + Default None. + seq_length(Variable, optional): A 1D Tensor containing the length of each + sequence when ``input`` and ``label`` are Tensor. It needn't be + provided if ``input`` and ``label`` are LoDTensor. Default None. Returns: - tuple: tuple containing: precision, recall, f1_score, - num_infer_chunks, num_label_chunks, - num_correct_chunks + tuple: A tuple including precision, recall, F1-score, chunk number detected, \ + chunk number in ground-truth, chunk number correctly detected. Each \ + is a Tensor with shape `[1]`. The data type of precision, recall and \ + F1-score is float32, and the data type of the others is int64. Examples: .. code-block:: python @@ -2112,9 +2127,9 @@ def chunk_eval(input, dict_size = 10000 label_dict_len = 7 - sequence = fluid.layers.data( - name='id', shape=[1], lod_level=1, dtype='int64') - embedding = fluid.layers.embedding( + sequence = fluid.data( + name='id', shape=[-1, 1], lod_level=1, dtype='int64') + embedding = fluid.embedding( input=sequence, size=[dict_size, 512]) hidden = fluid.layers.fc(input=embedding, size=512) label = fluid.layers.data( @@ -5644,64 +5659,71 @@ def beam_search(pre_ids, Refer to `Beam search `_ for more details. - This layer does the search in beams for one time step.
Specifically, it - selects the top-K candidate word ids of current step from :attr:`ids` - according to their :attr:`scores` for all source sentences, where K is - :attr:`beam_size` and :attr:`ids, scores` are predicted results from the - computation cell. If :attr:`ids` is not set, it will be calculated out - according to :attr:`scores`. Additionally, :attr:`pre_ids` and - :attr:`pre_scores` are the output of beam_search at previous step, they + **This operator only supports LoDTensor.** It is used after finishing + scores calculation to perform beam search for one time step. Specifically, + after ``ids`` and ``scores`` have been produced, it selects the top-K + ( `K` is ``beam_size`` ) candidate word ids of current step from ``ids`` + according to the corresponding ``scores``. Additionally, ``pre_ids`` and + ``pre_scores`` are the output of `beam_search` at previous step, they are needed for special use to handle ended candidate translations. - Note that if :attr:`is_accumulated` is :attr:`True`, the :attr:`scores` - passed in should be accumulated scores. Else, the :attr:`scores` are - considered as the straightforward scores and will be transformed to the - log field and accumulated the :attr:`pre_scores` in this operator. - Length penalty should be done with extra operators before calculating the - accumulated scores if needed. + Note that if ``is_accumulated`` is True, the ``scores`` passed in should + be accumulated scores. Otherwise, the ``scores`` are + considered as the probabilities of single step and would be transformed to + the log field and added up with ``pre_scores`` for final scores in this + operator. Length penalty should be done with extra operators before calculating + the accumulated scores if needed. Please see the following demo for a fully beam search usage example: fluid/tests/book/test_machine_translation.py Args: - pre_ids(Variable): The LodTensor variable which is the output of - beam_search at previous step.
It should be a LodTensor with shape - :math:`(batch_size, 1)` and lod - :math:`[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the - first step. - pre_scores(Variable): The LodTensor variable which is the output of - beam_search at previous step. - ids(Variable): The LodTensor variable containing the candidates ids. - Its shape should be :math:`(batch_size \\times beam_size, K)`, - where :math:`K` supposed to be :attr:`beam_size`. - scores(Variable): The LodTensor variable containing the accumulated - scores corresponding to :attr:`ids` and its shape is the same as - the shape of :attr:`ids`. + pre_ids(Variable): A LodTensor variable (lod level is 2), representing + the selected ids of previous step. It is the output of beam_search + at previous step. Its shape is `[batch_size, 1]` and its lod is + `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the + first step. The data type should be int64. + pre_scores(Variable): A LodTensor variable that has the same shape and lod + as ``pre_ids`` , representing the accumulated scores corresponding + to the selected ids of previous step. It is the output of + beam_search at previous step. The data type should be float32. + ids(Variable|None): A LodTensor variable containing the candidate ids. + It has the same lod as ``pre_ids`` and its shape should be + `[batch_size * beam_size, K]`, where `K` is supposed to be greater than + ``beam_size`` and the first dimension size (decrease as samples reach + to the end) should be same as that of ``pre_ids`` . The data type + should be int64. It can be None, in which case the indices in ``scores`` are used as + ids. + scores(Variable): A LodTensor variable containing the accumulated + scores corresponding to ``ids`` . Both its shape and lod are same as + those of ``ids`` . The data type should be float32. beam_size(int): The beam width used in beam search. end_id(int): The id of end token. - level(int, default 0): It can be ignored and mustn't change currently.
- It means the source level of lod, which is explained as following. - The lod level of :attr:`ids` should be 2. The first level is source - level which describes how many prefixes (branchs) for each source - sentece (beam), and the second level is sentence level which - describes how these candidates belong to the prefix. The paths - linking prefixes and selected candidates are organized and reserved - in lod. - is_accumulated(bool, default True): Whether the input :attr:`score` is - accumulated scores. - name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. - return_parent_idx(bool): Whether to return an extra Tensor variable - preserving the selected_ids' parent indice in pre_ids - in output, which can be used to gather cell states at - the next time step. + level(int): **It can be ignored and mustn't change currently.** + The 2 level lod used in this operator has the following + meaning: The first level describes how many beams each sample has, + which would change to 0 when beams of the sample all end (batch reduce); + The second level describes how many times each beam is selected. + Default 0, which shouldn't be changed currently. + is_accumulated(bool): Whether the input ``score`` is accumulated scores. + Default True. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + return_parent_idx(bool, optional): Whether to return an extra Tensor variable + in output, which stores the selected ids' parent indice in + ``pre_ids`` and can be used to update RNN's states by gather operator. + Default False. Returns: - Variable: The LodTensor tuple containing the selected ids and the \ - corresponding scores. If :attr:`return_parent_idx` is :attr:`True`, \ - an extra Tensor variable preserving the selected_ids' parent indice \ - is included. + tuple: The tuple contains two or three LodTensor variables. 
The two LodTensor, \ + representing the selected ids and the corresponding accumulated scores of \ + current step, have the same shape `[batch_size, beam_size]` and lod with 2 levels, \ + and have data types int64 and float32. If ``return_parent_idx`` is True, \ + an extra Tensor variable preserving the selected ids' parent indice \ + is included, whose shape is `[batch_size * beam_size]` and data type \ + is int64. Examples: .. code-block:: python @@ -5713,12 +5735,12 @@ def beam_search(pre_ids, # at previous step. beam_size = 4 end_id = 1 - pre_ids = fluid.layers.data( - name='pre_id', shape=[1], lod_level=2, dtype='int64') - pre_scores = fluid.layers.data( - name='pre_scores', shape=[1], lod_level=2, dtype='float32') - probs = fluid.layers.data( - name='probs', shape=[10000], dtype='float32') + pre_ids = fluid.data( + name='pre_id', shape=[None, 1], lod_level=2, dtype='int64') + pre_scores = fluid.data( + name='pre_scores', shape=[None, 1], lod_level=2, dtype='float32') + probs = fluid.data( + name='probs', shape=[None, 10000], dtype='float32') topk_scores, topk_indices = fluid.layers.topk(probs, k=beam_size) accu_scores = fluid.layers.elementwise_add( x=fluid.layers.log(x=topk_scores), @@ -5772,28 +5794,46 @@ def beam_search(pre_ids, def beam_search_decode(ids, scores, beam_size, end_id, name=None): """ - Beam Search Decode Layer. This layer constructs the full hypotheses for - each source sentence by walking back along the LoDTensorArray :attr:`ids` - whose lods can be used to restore the path in the beam search tree. + This operator is used after beam search has completed. It constructs the + full predicted sequences for each sample by walking back along the search + paths stored in lod of ``ids`` . The result sequences are stored in a + LoDTensor, which uses the following way to parse: + + .. 
code-block:: text + + If lod = [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]] + + The first level of lod stands for: There are 2 samples each having 3 + (beam width) predicted sequence. + + The second level of lod stands for: The lengths of the first sample's + 3 predicted sequences are 12, 12, 16; The lengths of the second sample's + 3 predicted sequences are 14, 13, 15. + + Please see the following demo for a fully beam search usage example: fluid/tests/book/test_machine_translation.py Args: - ids(Variable): The LodTensorArray variable containing the selected ids - of all steps. - scores(Variable): The LodTensorArray variable containing the selected - scores of all steps. + ids(Variable): The LoDTensorArray variable containing the selected ids + of all steps. Each LoDTensor in it has int64 data type and 2 level + lod which can be used to get the search paths. + scores(Variable): The LodTensorArray variable containing the accumulated + scores corresponding to selected ids of all steps. It has the same size + as ``ids`` . Each LoDTensor in it has the same shape and lod as the + counterpart in ``ids`` , and has a float32 data type. beam_size(int): The beam width used in beam search. end_id(int): The id of end token. - name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: - Variable: The LodTensor pair containing the generated id sequences \ - and the corresponding scores. The shapes and lods of the two \ - LodTensor are same. The lod level is 2 and the two levels \ - separately indicate how many hypotheses each source sentence has \ - and how many ids each hypothesis has. + tuple: The tuple contains two LodTensor variables. 
The two LodTensor, \ + containing the full sequences of ids and the corresponding accumulated \ + scores, have the same shape flattened to 1D and have the same 2 level \ + lod. The lod can be used to get how many predicted sequences each sample \ + has and how many ids each predicted sequence has. Examples: .. code-block:: python @@ -5832,71 +5872,67 @@ def lstm_unit(x_t, param_attr=None, bias_attr=None, name=None): - """Lstm unit layer. The equation of a lstm step is: - - .. math:: - - i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i) - - f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + b_f) - - c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t + W_{h_c}h_{t-1} + b_c) - - o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + b_o) - - h_t & = o_t tanh(c_t) + """ + Long-Short Term Memory (LSTM) RNN cell. This operator performs LSTM calculations for + one time step, whose implementation is based on calculations described in `RECURRENT + NEURAL NETWORK REGULARIZATION `_ . - The inputs of lstm unit include :math:`x_t`, :math:`h_{t-1}` and - :math:`c_{t-1}`. The 2nd dimensions of :math:`h_{t-1}` and :math:`c_{t-1}` - should be same. The implementation separates the linear transformation and - non-linear transformation apart. Here, we take :math:`i_t` as an example. - The linear transformation is applied by calling a `fc` layer and the - equation is: + We add forget_bias to the biases of the forget gate in order to + reduce the scale of forgetting. The formula is as follows: + + .. math:: - .. math:: + i_{t} & = \sigma(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) - L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i + f_{t} & = \sigma(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) - The non-linear transformation is applied by calling `lstm_unit_op` and the - equation is: + c_{t} & = f_{t}c_{t-1} + i_{t} tanh (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) - ..
math:: + o_{t} & = \sigma(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) - i_t = \sigma(L_{i_t}) + h_{t} & = o_{t} tanh (c_{t}) - This layer has two outputs including :math:`h_t` and :math:`c_t`. + :math:`x_{t}` stands for ``x_t`` , corresponding to the input of current time step; + :math:`h_{t-1}` and :math:`c_{t-1}` correspond to ``hidden_t_prev`` and ``cell_t_prev`` , + representing the outputs from the previous time step. + :math:`i_{t}, f_{t}, c_{t}, o_{t}, h_{t}` are input gate, forget gate, cell, output gate + and hidden calculation. Args: - x_t (Variable): The input value of current step, a 2-D tensor with shape - M x N, M for batch size and N for input size. - hidden_t_prev (Variable): The hidden value of lstm unit, a 2-D tensor - with shape M x S, M for batch size and S for size of lstm unit. - cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor with - shape M x S, M for batch size and S for size of lstm unit. - forget_bias (float): The forget bias of lstm unit. - param_attr(ParamAttr|None): The parameter attribute for the learnable - hidden-hidden weights. - If it is set to None or one attribute of ParamAttr, - lstm_unit will create ParamAttr as param_attr. - If the Initializer of the param_attr is not set, the - parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|None): The bias attribute for the learnable bias - weights. If it is set to False, no bias will be added - to the output units. If it is set to None or one attribute of ParamAttr, - lstm_unit will create ParamAttr as bias_attr. - If the Initializer of the bias_attr is not set, - the bias is initialized zero. Default: None. - name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + x_t(Variable): A 2D Tensor representing the input of current time step. + Its shape should be :math:`[N, M]` , where :math:`N` stands for batch + size, :math:`M` for the feature size of input. The data type should + be float32 or float64.
+ hidden_t_prev(Variable): A 2D Tensor representing the hidden value from + previous step. Its shape should be :math:`[N, D]` , where :math:`N` + stands for batch size, :math:`D` for the hidden size. The data type + should be same as ``x_t`` . + cell_t_prev(Variable): A 2D Tensor representing the cell value from + previous step. It has the same shape and data type with ``hidden_t_prev`` . + forget_bias (float, optional): :math:`forget\\_bias` added to the biases + of the forget gate. Default 0. + param_attr(ParamAttr, optional): To specify the weight parameter property. + Default: None, which means the default weight parameter property is used. + See usage for details in :ref:`api_fluid_ParamAttr` . + bias_attr (ParamAttr, optional): To specify the bias parameter property. + Default: None, which means the default bias parameter property is used. + See usage for details in :ref:`api_fluid_ParamAttr` . + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: - tuple: The hidden value and cell value of lstm unit. + tuple: The tuple contains two Tensor variables with the same shape and \ + data type with ``hidden_t_prev`` , representing the hidden value and \ + cell value which correspond to :math:`h_{t}` and :math:`c_{t}` in \ + the formula. Raises: - ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev** - not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** - and **cell_t_prev** not be the same or the 2nd dimensions of - **hidden_t_prev** and **cell_t_prev** not be the same. + ValueError: Rank of x_t must be 2. + ValueError: Rank of hidden_t_prev must be 2. + ValueError: Rank of cell_t_prev must be 2. + ValueError: The 1st dimensions of x_t, hidden_t_prev and cell_t_prev must be the same. + ValueError: The 2nd dimensions of hidden_t_prev and cell_t_prev must be the same. 
Examples: @@ -5905,12 +5941,12 @@ def lstm_unit(x_t, import paddle.fluid as fluid dict_dim, emb_dim, hidden_dim = 128, 64, 512 - data = fluid.layers.data(name='step_data', shape=[1], dtype='int32') - x = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) - pre_hidden = fluid.layers.data( - name='pre_hidden', shape=[hidden_dim], dtype='float32') - pre_cell = fluid.layers.data( - name='pre_cell', shape=[hidden_dim], dtype='float32') + data = fluid.data(name='step_data', shape=[None], dtype='int64') + x = fluid.embedding(input=data, size=[dict_dim, emb_dim]) + pre_hidden = fluid.data( + name='pre_hidden', shape=[None, hidden_dim], dtype='float32') + pre_cell = fluid.data( + name='pre_cell', shape=[None, hidden_dim], dtype='float32') hidden = fluid.layers.lstm_unit( x_t=x, hidden_t_prev=pre_hidden, @@ -8308,7 +8344,7 @@ def one_hot(input, depth, allow_out_of_range=False): attrs = {'depth': depth} else: if not isinstance(depth, Variable): - # user attribute + # user attribute inputs = {'X': input} attrs = {'depth': depth} else: @@ -15035,12 +15071,13 @@ def teacher_student_sigmoid_loss(input, def add_position_encoding(input, alpha, beta, name=None): """ - **Add Position Encoding Layer** + This operator performs weighted sum of input feature at each position + (position in the sequence) and the corresponding position encoding. - This layer accepts an input 3D-Tensor of shape [N x M x P], and returns an - output Tensor of shape [N x M x P] with positional encoding value. + For more details of position encoding, please refer to `Attention Is All You + Need `_ . - Refer to `Attention Is All You Need `_ . + The formula is as follows: .. 
math:: PE(pos, 2i) &= \\sin{(pos / 10000^{2i / P})} \\\\ @@ -15048,28 +15085,36 @@ def add_position_encoding(input, alpha, beta, name=None): Out(:, pos, i) &= \\alpha * input(:, pos, i) + \\beta * PE(pos, i) Where: - - :math:`PE(pos, 2i)` : the increment for the number at even position - - :math:`PE(pos, 2i + 1)` : the increment for the number at odd position + - :math:`PE(pos, 2i)` : the value at even index `2i` for encoding of position `pos`. + - :math:`PE(pos, 2i + 1)` : the value at odd index `2i+1` for encoding of position `pos` Args: - input (Variable): 3-D input tensor with shape [N x M x P] - alpha (float): multiple of Input Tensor - beta (float): multiple of Positional Encoding Tensor - name (string): the name of position encoding layer + input(Variable): A Tensor or LoDTensor (lod level is 1). If it is a + Tensor, the shape should be `[N, M, P]`, where `N` stands for + batch size, `M` for sequence length, `P` for the size of feature + dimension. If it is a LoDTensor, the shape should be `[N, P]`, + where `N` stands for the total sequence lengths in this mini-batch, + `P` for the size of feature. The data type should be float32 or float64. + alpha(float): Indicate the weight coefficient for `input` when performing + weighted sum. + beta(float): Indicate the weight coefficient for position encoding when + performing weighted sum. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: - Variable: A 3-D Tensor of shape [N x M x P] with positional encoding. + Variable: A Tensor or LoDTensor. It has the same shape, data type and lod as `input`. Examples: .. 
code-block:: python import paddle.fluid as fluid - tensor = fluid.layers.data( + tensor = fluid.data( name='tensor', - shape=[32, 64, 512], - dtype='float32', - append_batch_size=False) + shape=[None, 64, 512], + dtype='float32') position_tensor = fluid.layers.add_position_encoding( input=tensor, alpha=1.0, beta=1.0) diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 23a34403179379534fd77b9754d5dc5d2e7ca3ba..f8ad54751fe131c86bf00ce65a97319e6d00fdc0 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -363,67 +363,67 @@ def scaled_dot_product_attention(queries, num_heads=1, dropout_rate=0.): """ - The dot-product attention. - + This interface implements Multi-Head Attention using scaled dot product. Attention mechanism can be seen as mapping a query and a set of key-value - pairs to an output. The output is computed as a weighted sum of the values, - where the weight assigned to each value is computed by a compatibility - function (dot-product here) of the query with the corresponding key. + pairs to an output. Multi-Head Attention performs attention with multiple heads in + parallel, and the inputs of attention would be transformed by linear projection. + The formula is as follows: - The dot-product attention can be implemented through (batch) matrix - multipication as follows: + .. math:: - .. math:: + MultiHead(Q, K, V ) & = Concat(head_1, ..., head_h) + + where \ head_i & = Attention(QW_i^Q , KW_i^K , VW_i^V ) - Attention(Q, K, V)= softmax(QK^\mathrm{T})V + Attention(Q, K, V) & = softmax (\\frac{QK^\mathrm{T}}{\sqrt{d_k}}) V - Refer to `Attention Is All You Need - `_. + For more details, please refer to `Attention Is All You Need + `_ . + + Note that the implementation is adapted to batch, and all matrix multiplication + in :math:`Attention(Q, K, V)` is batched matrix multiplication. Refer to + :ref:`api_fluid_layers_matmul` . Args: - queries (Variable): The input variable which should be a 3-D Tensor.
- keys (Variable): The input variable which should be a 3-D Tensor. - values (Variable): The input variable which should be a 3-D Tensor. - num_heads (int): Head number to compute the scaled dot product - attention. Default: 1. - dropout_rate (float): The dropout rate to drop the attention weight. - Default: 0.0. + queries (Variable): A 3-D Tensor with shape :math:`[N, L_q, d_k \\times h]` , + where :math:`N` stands for batch size, :math:`L_q` for the sequence length + of query, :math:`d_k \\times h` for the feature size of query, :math:`h` for + head number. The data type should be float32 or float64. + keys (Variable): A 3-D Tensor with shape :math:`[N, L_k, d_k \\times h]` , + where :math:`N` stands for batch size, :math:`L_k` for the sequence length + of key, :math:`d_k \\times h` for the feature size of key, :math:`h` for head + number. The data type should be the same as ``queries`` . + values (Variable): A 3-D Tensor with shape :math:`[N, L_k, d_v \\times h]` , + where :math:`N` stands for batch size, :math:`L_k` for the sequence length + of key, :math:`d_v \\times h` for the feature size of value, :math:`h` for head + number. The data type should be the same as ``queries`` . + num_heads (int, optional): Indicate the number of heads. If the number + is 1, linear projection would not be performed on inputs. Default: 1. + dropout_rate (float, optional): The rate to drop the attention weight. + Default: 0.0, which means no dropout. Returns: - Variable: A 3-D Tensor computed by multi-head scaled dot product\ - attention. + Variable: A 3-D Tensor with shape :math:`[N, L_q, d_v \\times h]` , \ + where :math:`N` stands for batch size, :math:`L_q` for the sequence \ + length of query, :math:`d_v \\times h` for the feature size of value. \ + It has the same data type as the inputs, representing the output of \ + Multi-Head Attention. Raises: - ValueError: If input queries, keys, values are not 3-D Tensors. - - NOTES: - 1.
When num_heads > 1, three linear projections are learned respectively - to map input queries, keys and values into queries', keys' and values'. - queries', keys' and values' have the same shapes with queries, keys - and values. - 2. When num_heads == 1, scaled_dot_product_attention has no learnable - parameters. + ValueError: Inputs queries, keys and values should all be 3-D tensors. + ValueError: The hidden size of queries and keys should be the same. + ValueError: The max sequence length in key batch and in value batch should be the same. + ValueError: The hidden size of keys must be divisible by the number of attention heads. + ValueError: The hidden size of values must be divisible by the number of attention heads. Examples: .. code-block:: python import paddle.fluid as fluid - queries = fluid.layers.data(name="queries", - shape=[3, 5, 9], - dtype="float32", - append_batch_size=False) - queries.stop_gradient = False - keys = fluid.layers.data(name="keys", - shape=[3, 6, 9], - dtype="float32", - append_batch_size=False) - keys.stop_gradient = False - values = fluid.layers.data(name="values", - shape=[3, 6, 10], - dtype="float32", - append_batch_size=False) - values.stop_gradient = False + queries = fluid.data(name="queries", shape=[3, 5, 9], dtype="float32") + keys = fluid.data(name="keys", shape=[3, 6, 9], dtype="float32") + values = fluid.data(name="values", shape=[3, 6, 10], dtype="float32") contexts = fluid.nets.scaled_dot_product_attention(queries, keys, values) contexts.shape # [3, 5, 10] """